From b307c9765c6fa8d88d8c2fcf9de44eec5a7765d6 Mon Sep 17 00:00:00 2001
From: "yinzefeng.yzf" <yinzefeng.yzf@alibaba-inc.com>
Date: Mon, 2 Feb 2026 22:25:18 +0800
Subject: [PATCH 01/11] add buffer pool & spsc_queue

---
 src/ailego/buffer/buffer_manager.cc          |   1 +
 src/include/zvec/ailego/buffer/buffer_pool.h | 311 +++++++++++++++++++
 2 files changed, 312 insertions(+)
 create mode 100644 src/include/zvec/ailego/buffer/buffer_pool.h
diff --git a/src/ailego/buffer/buffer_manager.cc b/src/ailego/buffer/buffer_manager.cc
index ac2945b0..307e80ce 100644
--- a/src/ailego/buffer/buffer_manager.cc
+++ b/src/ailego/buffer/buffer_manager.cc
@@ -20,6 +20,7 @@
 #include <zvec/ailego/buffer/buffer_manager.h>
 #include <zvec/ailego/internal/platform.h>
 #include <zvec/ailego/logger/logger.h>
+#include <zvec/ailego/buffer/buffer_pool.h>
 
 #ifdef __clang__
 #pragma clang diagnostic push
diff --git a/src/include/zvec/ailego/buffer/buffer_pool.h b/src/include/zvec/ailego/buffer/buffer_pool.h
new file mode 100644
index 00000000..5a09abfa
--- /dev/null
+++ b/src/include/zvec/ailego/buffer/buffer_pool.h
@@ -0,0 +1,311 @@
+#pragma once
+
+#include <atomic>
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <mutex>
+#include <queue>
+#include <string>
+#include <unordered_map>
+#include <map>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <stdexcept>
+#include <limits>
+#include <iostream>
+#include <boost/lockfree/spsc_queue.hpp>
+
+using block_id_t = int;
+
+#define BLOCK_SIZE (4 * 1024 * 1024)  // 2 MB
+#define BLOCK_MASK (BLOCK_SIZE - 1)
+#define BLOCK_ID(offset) (offset >> 22)
+#define BLOCK_OFFSET(offset) (offset & BLOCK_MASK)
+
+class LRUCache {
+    boost::lockfree::spsc_queue<int, boost::lockfree::capacity<1024>> q;
+};
+
+class LPMap {
+    struct Entry {
+        std::atomic<int> ref_count;
+        char* buffer;
+    };
+
+  public:
+    LPMap() : entry_num_(0), entries_(nullptr) {}
+    ~LPMap() {
+        delete[] entries_;
+    }
+
+    void init(size_t entry_num) {
+        if (entries_) {
+            delete[] entries_;
+        }
+        entry_num_ = entry_num;
+        entries_ = new Entry[entry_num_];
+        for (size_t i = 0; i < entry_num_; i++) {
+            // entries_[i].ref_count.store(0);
+            entries_[i].ref_count.store(std::numeric_limits<int>::min());
+            entries_[i].buffer = nullptr;
+        }
+    }
+
+    char* acquire_block(block_id_t block_id) {
+        assert(block_id < entry_num_);
+        Entry& entry = entries_[block_id];
+        int rc = entry.ref_count.fetch_add(1);
+        if (rc < 0) {
+            return nullptr;
+        }
+        return entry.buffer;
+    }
+
+    void release_block(block_id_t block_id) {
+        assert(block_id < entry_num_);
+        Entry& entry = entries_[block_id];
+        int rc = entry.ref_count.fetch_sub(1);
+        assert(rc > 0);
+    }
+
+    // need be called under lock
+    char* evict_block(block_id_t block_id) {
+        assert(block_id < entry_num_);
+        Entry& entry = entries_[block_id];
+        int expected = 0;
+        if (entry.ref_count.compare_exchange_strong(expected, std::numeric_limits<int>::min())) {
+            char* buffer = entry.buffer;
+            entry.buffer = nullptr;
+            return buffer;
+        } else {
+            return nullptr;
+        }
+    }
+
+    // need be called under lock
+    char* set_block_acquired(block_id_t block_id, char* buffer) {
+        // std::cout << "Set block " << block_id << std::endl;
+        assert(block_id < entry_num_);
+        Entry& entry = entries_[block_id];
+        if (entry.ref_count.load() >= 0) {
+            entry.ref_count.fetch_add(1);
+            return entry.buffer;
+        }
+        entry.buffer = buffer;
+        entry.ref_count.store(1);
+        return buffer;
+    }
+
+    // need be called under lock
+    void recycle(std::queue<char*>& free_buffers) {
+        for (size_t i = 0; i < entry_num_; i++) {
+            Entry& entry = entries_[i];
+            if (entry.ref_count.load() == 0) {
+                char* buffer = evict_block(i);
+                if (buffer) {
+                    free_buffers.push(buffer);
+                }
+            }
+        }
+    }
+
+    size_t entry_num() const {
+        return entry_num_;
+    }
+
+  private:
+    Entry* entries_;
+    size_t entry_num_;
+};
+
+class BufferPool;
+
+struct BufferPoolHandle {
+    BufferPoolHandle(BufferPool& pool);
+    BufferPoolHandle(BufferPoolHandle&& other) : pool(other.pool), local_cache(std::move(other.local_cache)), hit_num_(other.hit_num_) {
+        other.local_cache.clear();
+        other.hit_num_ = 0;
+    }
+    ~BufferPoolHandle();
+
+    char* get_block(size_t offset, size_t size);
+
+    void release_all();
+
+    BufferPool& pool;
+#ifdef USE_LOCAL_CACHE
+    // std::unordered_map<block_id_t, char*> local_cache;
+    phmap::flat_hash_map<block_id_t, char*> local_cache;
+#else
+    std::vector<block_id_t> local_cache;
+#endif
+    int hit_num_;
+};
+
+class BufferPool {
+  public:
+    BufferPool(const std::string& filename, size_t pool_capacity) : pool_capacity_(pool_capacity){
+        fd_ = open(filename.c_str(), O_RDONLY);
+        if (fd_ < 0) {
+            throw std::runtime_error("Failed to open file: " + filename);
+        }
+        struct stat st;
+        if (fstat(fd_, &st) < 0) {
+            throw std::runtime_error("Failed to stat file: " + filename);
+        }
+        file_size_ = st.st_size;
+        lp_map_.init((file_size_ + BLOCK_SIZE - 1) / BLOCK_SIZE);
+
+        size_t buffer_num = pool_capacity_ / BLOCK_SIZE;
+        for (size_t i = 0; i < buffer_num; i++) {
+            char* buffer = (char*)aligned_alloc(64, BLOCK_SIZE);
+            free_buffers_.push(buffer);
+        }
+        std::cout << "buffer_num: " << buffer_num << std::endl;
+        std::cout << "entry_num: " << lp_map_.entry_num() << std::endl;
+    }
+    ~BufferPool() {
+        close(fd_);
+    }
+
+    BufferPoolHandle get_handle() {
+        return BufferPoolHandle(*this);
+    }
+
+    char* acquire_buffer(block_id_t block_id, int retry = 0) {
+        char* buffer = lp_map_.acquire_block(block_id);
+        if (buffer) {
+            return buffer;
+        }
+        {
+            std::lock_guard<std::mutex> lock(mutex_);
+            if (free_buffers_.empty()) {
+                for (int i = 0; i < retry; i++) {
+                    lp_map_.recycle(free_buffers_);
+                    if (!free_buffers_.empty()) {
+                        break;
+                    }
+                }
+            }
+            if (free_buffers_.empty()) {
+                return nullptr;
+            }
+            buffer = free_buffers_.front();
+            free_buffers_.pop();
+        }
+        size_t read_offset = static_cast<size_t>(block_id) * BLOCK_SIZE;
+        size_t to_read = std::min<size_t>(BLOCK_SIZE, file_size_ - read_offset);
+
+        ssize_t read_bytes = pread(fd_, buffer, to_read, read_offset);
+        if (read_bytes != static_cast<ssize_t>(to_read)) {
+            std::cerr << "Failed to read file at offset " << read_offset << std::endl;
+            exit(-1);
+        }
+
+        {
+            std::lock_guard<std::mutex> lock(mutex_);
+            char* placed_buffer = lp_map_.set_block_acquired(block_id, buffer);
+            if (placed_buffer != buffer) {
+                // another thread has set the block
+                free_buffers_.push(buffer);
+            }
+            return placed_buffer;
+        }
+    }
+
+    size_t file_size() const {
+        return file_size_;
+    }
+
+  private:
+    int fd_;
+    size_t file_size_;
+    size_t pool_capacity_;
+
+  public:
+    LPMap lp_map_;
+
+  private:
+    std::mutex mutex_;
+    std::queue<char*> free_buffers_;
+};
+
+
+struct Counter {
+    ~Counter() = default;
+
+    static Counter& get_instance() {
+        static Counter instance;
+        return instance;
+    }
+
+    void record(const std::string& name, int64_t value) {
+        auto it = static_counters.find(name);
+        if (it == static_counters.end()) {
+            auto counter = std::make_unique<std::atomic<int64_t>>(0);
+            it = static_counters.emplace(name, std::move(counter)).first;
+        }
+        it->second->fetch_add(value);
+    }
+
+    void display() {
+        for (const auto& pair : static_counters) {
+            std::cout << pair.first << ": " << pair.second->load() << std::endl;
+        }
+    }
+
+    void clear() {
+        static_counters.clear();
+    }
+
+  private:
+    Counter() {}
+    std::map<std::string, std::unique_ptr<std::atomic<int64_t>>> static_counters;
+};
+
+BufferPoolHandle::BufferPoolHandle(BufferPool& pool) : pool(pool), hit_num_(0) {}
+BufferPoolHandle::~BufferPoolHandle() {
+    Counter::get_instance().record("buffer_pool_handle_hit_num", hit_num_);
+    release_all();
+}
+
+char* BufferPoolHandle::get_block(size_t offset, size_t size) {
+    block_id_t block_id = BLOCK_ID(offset);
+    assert(block_id == BLOCK_ID(offset + size - 1));
+#ifdef USE_LOCAL_CACHE
+    auto it = local_cache.find(block_id);
+    if (it != local_cache.end()) {
+        hit_num_++;
+        return it->second + BLOCK_OFFSET(offset);
+    }
+#endif
+
+    char* buffer = pool.acquire_buffer(block_id, 3);
+    if (buffer) {
+#ifdef USE_LOCAL_CACHE
+        local_cache[block_id] = buffer;
+#else
+        local_cache.push_back(block_id);
+#endif
+        return buffer + BLOCK_OFFSET(offset);
+    }
+
+    return nullptr;
+}
+
+void BufferPoolHandle::release_all() {
+#ifdef USE_LOCAL_CACHE
+    Counter::get_instance().record("buffer_pool_handle_release_call", local_cache.size());
+    for (const auto& pair : local_cache) {
+        pool.lp_map_.release_block(pair.first);
+    }
+#else
+    for (block_id_t block_id : local_cache) {
+        pool.lp_map_.release_block(block_id);
+    }
+#endif
+    local_cache.clear();
+}
\ No newline at end of file

From a96e684d276767ff4fc0c6019cf923d9f2707080 Mon Sep 17 00:00:00 2001
From: "yinzefeng.yzf" <yinzefeng.yzf@alibaba-inc.com>
Date: Wed, 4 Feb 2026 16:09:11 +0800
Subject: [PATCH 02/11] add buffer pool & open buffer storage ut

---
 src/ailego/buffer/buffer_manager.cc           |    1 -
 src/core/utility/buffer1_storage.cc           |  438 ++
 src/core/utility/buffer_storage.cc            |    2 +-
 src/include/zvec/ailego/buffer/buffer_pool.h  |  520 ++-
 .../zvec/ailego/buffer/concurrentqueue.h      | 3747 +++++++++++++++++
 ..._test.cpp => flat_streamer_buffer_test.cc} |    0
 ....cpp => flat_streamer_buffer_time_test.cc} |    0
 7 files changed, 4474 insertions(+), 234 deletions(-)
 create mode 100644 src/core/utility/buffer1_storage.cc
 create mode 100644 src/include/zvec/ailego/buffer/concurrentqueue.h
 rename tests/core/algorithm/flat/{flat_streamer_buffer_test.cpp => flat_streamer_buffer_test.cc} (100%)
 rename tests/core/algorithm/flat/{flat_streamer_buffer_time_test.cpp => flat_streamer_buffer_time_test.cc} (100%)

diff --git a/src/ailego/buffer/buffer_manager.cc b/src/ailego/buffer/buffer_manager.cc
index 307e80ce..ac2945b0 100644
--- a/src/ailego/buffer/buffer_manager.cc
+++ b/src/ailego/buffer/buffer_manager.cc
@@ -20,7 +20,6 @@
 #include <zvec/ailego/buffer/buffer_manager.h>
 #include <zvec/ailego/internal/platform.h>
 #include <zvec/ailego/logger/logger.h>
-#include <zvec/ailego/buffer/buffer_pool.h>
 
 #ifdef __clang__
 #pragma clang diagnostic push
diff --git a/src/core/utility/buffer1_storage.cc b/src/core/utility/buffer1_storage.cc
new file mode 100644
index 00000000..0ea591d9
--- /dev/null
+++ b/src/core/utility/buffer1_storage.cc
@@ -0,0 +1,438 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <mutex>
+// #include <zvec/ailego/buffer/buffer_manager.h>
+#include <zvec/ailego/buffer/buffer_pool.h>
+#include <zvec/core/framework/index_error.h>
+#include <zvec/core/framework/index_factory.h>
+#include <zvec/core/framework/index_mapping.h>
+#include <zvec/core/framework/index_version.h>
+#include "utility_params.h"
+
+#include <zvec/ailego/utility/time_helper.h>
+
+namespace zvec {
+namespace core {
+
+/*! Buffer Pool File Storage
+ */
+class Buffer1Storage : public IndexStorage {
+ public:
+  /*! Index Storage Segment
+   */
+  class Segment : public IndexStorage::Segment,
+                  public std::enable_shared_from_this<Segment> {
+   public:
+    //! Index Storage Pointer
+    typedef std::shared_ptr<Segment> Pointer;
+
+    //! Constructor
+    Segment(Buffer1Storage *owner, IndexMapping::Segment *segment, size_t segment_id)
+        : segment_(segment),
+          owner_(owner),
+          segment_id_(segment_id),
+          capacity_(static_cast<size_t>(segment->meta()->data_size +
+                                        segment->meta()->padding_size)) {}
+
+    //! Destructor
+    virtual ~Segment(void) {}
+
+    //! Retrieve size of data
+    size_t data_size(void) const override {
+      return static_cast<size_t>(segment_->meta()->data_size);
+    }
+
+    //! Retrieve crc of data
+    uint32_t data_crc(void) const override {
+      return segment_->meta()->data_crc;
+    }
+
+    //! Retrieve size of padding
+    size_t padding_size(void) const override {
+      return static_cast<size_t>(segment_->meta()->padding_size);
+    }
+
+    //! Retrieve capacity of segment
+    size_t capacity(void) const override {
+      return capacity_;
+    }
+
+    //! Fetch data from segment into the caller's buffer (addressed like read())
+    size_t fetch(size_t offset, void *buf, size_t len) const override {
+      if (ailego_unlikely(offset + len > segment_->meta()->data_size)) {
+        auto meta = segment_->meta();
+        if (offset > meta->data_size) {
+          offset = meta->data_size;
+        }
+        len = meta->data_size - offset;
+      }
+      size_t base = segment_->meta()->data_index + owner_->get_context_offset();
+      memmove(buf, owner_->get_buffer(base, capacity_, segment_id_) + offset, len);
+      return len;
+    }
+
+    //! Read data from segment
+    size_t read(size_t offset, const void **data, size_t len) override {
+      
+      if (ailego_unlikely(offset + len > segment_->meta()->data_size)) {
+        auto meta = segment_->meta();
+        if (offset > meta->data_size) {
+          offset = meta->data_size;
+        }
+        len = meta->data_size - offset;
+      }
+      size_t segment_offset = segment_->meta()->data_index + owner_->get_context_offset();
+      *data = owner_->get_buffer(segment_offset, capacity_, segment_id_) + offset;
+      return len;
+    }
+
+    size_t read(size_t offset, MemoryBlock &data, size_t len) override {
+      if (ailego_unlikely(offset + len > segment_->meta()->data_size)) {
+        auto meta = segment_->meta();
+        if (offset > meta->data_size) {
+          offset = meta->data_size;
+        }
+        len = meta->data_size - offset;
+      }
+      size_t segment_offset = segment_->meta()->data_index + owner_->get_context_offset();
+      data.reset(owner_->get_buffer(segment_offset, capacity_, segment_id_) + offset);
+      if (data.data()) {
+        return len;
+      } else {
+        LOG_ERROR("read error.");
+        return -1;
+      }
+    }
+
+    //! Write data into the storage with offset
+    size_t write(size_t /*offset*/, const void * /*data*/,
+                 size_t len) override {
+      return len;
+    }
+
+    //! Resize size of data
+    size_t resize(size_t /*size*/) override {
+      return 0;
+    }
+
+    //! Update crc of data
+    void update_data_crc(uint32_t /*crc*/) override {}
+
+    //! Clone the segment
+    IndexStorage::Segment::Pointer clone(void) override {
+      return shared_from_this();
+    }
+
+   private:
+    IndexMapping::Segment *segment_{};
+    Buffer1Storage *owner_{nullptr};
+    size_t capacity_{};
+    size_t segment_id_{};
+  };
+
+  //! Destructor
+  virtual ~Buffer1Storage(void) {
+    this->cleanup();
+  }
+
+  //! Initialize storage
+  int init(const ailego::Params & /*params*/) override {
+    return 0;
+  }
+
+  //! Cleanup storage
+  int cleanup(void) override {
+    this->close_index();
+    return 0;
+  }
+
+  //! Open storage: build the buffer pool over `path`, then parse the index meta
+  int open(const std::string &path, bool /*create*/) override {
+    LOG_INFO("open buffer storage 1");
+    file_name_ = path;
+    // Must be a 64-bit product: in 32-bit unsigned math 10 GiB wraps to 2 GiB.
+    constexpr size_t kPoolCapacity = 10ull * 1024 * 1024 * 1024;
+    buffer_pool_ = std::make_unique<VecBufferPool>(path, kPoolCapacity, 2490368 * 2);
+    buffer_pool_handle_ =
+        std::make_unique<VecBufferPoolHandle>(buffer_pool_->get_handle());
+    const int ret = ParseToMapping();
+    LOG_INFO("segment count: %zu, max_segment_size: %zu", segments_.size(),
+             max_segment_size_);
+    return ret;
+  }
+
+  char *get_buffer(size_t offset, size_t length, size_t block_id) {
+    return buffer_pool_handle_->get_block(offset, length, block_id);
+  }
+
+  int get_meta(size_t offset, size_t length, char *out) {
+    return buffer_pool_handle_->get_meta(offset, length, out);
+  }
+
+  int ParseHeader(size_t offset) {
+    char *buffer = new char[sizeof(header_)];
+    get_meta(offset, sizeof(header_), buffer);
+    uint8_t *header_ptr = reinterpret_cast<uint8_t *>(buffer);
+    memcpy(&header_, header_ptr, sizeof(header_));
+    delete[] buffer;
+    if (header_.meta_header_size != sizeof(IndexFormat::MetaHeader)) {
+      LOG_ERROR("Header meta size is invalid.");
+      return IndexError_InvalidLength;
+    }
+    if (ailego::Crc32c::Hash(&header_, sizeof(header_), header_.header_crc) !=
+        header_.header_crc) {
+      LOG_ERROR("Header meta checksum is invalid.");
+      return IndexError_InvalidChecksum;
+    }
+    return 0;
+  }
+
+  int ParseFooter(size_t offset) {
+    char *buffer = new char[sizeof(footer_)];
+    get_meta(offset, sizeof(footer_), buffer);
+    uint8_t *footer_ptr = reinterpret_cast<uint8_t *>(buffer);
+    memcpy(&footer_, footer_ptr, sizeof(footer_));
+    delete[] buffer;
+    if (offset < (size_t)footer_.segments_meta_size) {
+      LOG_ERROR("Footer meta size is invalid.");
+      return IndexError_InvalidLength;
+    }
+    if (ailego::Crc32c::Hash(&footer_, sizeof(footer_), footer_.footer_crc) !=
+        footer_.footer_crc) {
+      LOG_ERROR("Footer meta checksum is invalid.");
+      return IndexError_InvalidChecksum;
+    }
+    return 0;
+  }
+
+  int ParseSegment(size_t offset) {
+    segment_buffer_ = std::make_unique<char[]>(footer_.segments_meta_size);
+    get_meta(offset, footer_.segments_meta_size, segment_buffer_.get());
+    if (ailego::Crc32c::Hash(segment_buffer_.get(), footer_.segments_meta_size, 0u) !=
+        footer_.segments_meta_crc) {
+      LOG_ERROR("Index segments meta checksum is invalid.");
+      return IndexError_InvalidChecksum;
+    }
+    IndexFormat::SegmentMeta *segment_start =
+        reinterpret_cast<IndexFormat::SegmentMeta *>(segment_buffer_.get());
+    uint32_t segment_ids_offset = footer_.segments_meta_size;
+    for (IndexFormat::SegmentMeta *iter = segment_start,
+                                  *end = segment_start + footer_.segment_count;
+         iter != end; ++iter) {
+      if (iter->segment_id_offset > footer_.segments_meta_size) {
+        return IndexError_InvalidValue;
+      }
+      if (iter->data_index > footer_.content_size) {
+        return IndexError_InvalidValue;
+      }
+      if (iter->data_index + iter->data_size > footer_.content_size) {
+        return IndexError_InvalidLength;
+      }
+
+      if (iter->segment_id_offset < segment_ids_offset) {
+        segment_ids_offset = iter->segment_id_offset;
+      }
+      id_hash_.emplace(
+          std::string(reinterpret_cast<const char *>(segment_start) +
+                      iter->segment_id_offset),
+          segments_.size());
+      segments_.emplace(
+          std::string(reinterpret_cast<const char *>(segment_start) +
+                      iter->segment_id_offset),
+          iter);
+      max_segment_size_ = std::max(max_segment_size_, iter->data_size + iter->padding_size);
+      if (sizeof(IndexFormat::SegmentMeta) * footer_.segment_count >
+          footer_.segments_meta_size) {
+        return IndexError_InvalidLength;
+      }
+    }
+    return 0;
+  }
+
+  //! Parse header, footer and segment table; returns 0 or the first error code
+  int ParseToMapping() {
+    int ret = ParseHeader(0);
+    if (ret != 0) return ret;
+    // Unpack footer
+    if (header_.meta_footer_size != sizeof(IndexFormat::MetaFooter)) {
+      return IndexError_InvalidLength;
+    }
+    if ((int32_t)header_.meta_footer_offset < 0) {
+      return IndexError_Unsupported;
+    }
+    const size_t footer_offset = header_.meta_footer_offset;
+    ret = ParseFooter(footer_offset);
+    if (ret != 0) return ret;
+    // Unpack segment table
+    if (sizeof(IndexFormat::SegmentMeta) * footer_.segment_count >
+        footer_.segments_meta_size) {
+      return IndexError_InvalidLength;
+    }
+    return ParseSegment(footer_offset - footer_.segments_meta_size);
+  }
+
+  //! Flush storage
+  int flush(void) override {
+    return this->flush_index();
+  }
+
+  //! Close storage
+  int close(void) override {
+    this->close_index();
+    return 0;
+  }
+
+  //! Append a segment into storage
+  int append(const std::string &id, size_t size) override {
+    return this->append_segment(id, size);
+  }
+
+  //! Refresh meta information (checksum, update time, etc.)
+  void refresh(uint64_t chkp) override {
+    this->refresh_index(chkp);
+  }
+
+  //! Retrieve check point of storage
+  uint64_t check_point(void) const override {
+    return footer_.check_point;
+  }
+
+  //! Retrieve a segment by id
+  IndexStorage::Segment::Pointer get(const std::string &id, int) override {
+    IndexMapping::Segment *segment = this->get_segment(id);
+    if (!segment) {
+      return Buffer1Storage::Segment::Pointer();
+    }
+    return std::make_shared<Buffer1Storage::Segment>(this, segment,
+                                                     id_hash_[id]);
+  }
+
+  //! Test if a segment exists
+  bool has(const std::string &id) const override {
+    return this->has_segment(id);
+  }
+
+  //! Retrieve magic number of index
+  uint32_t magic(void) const override {
+    return header_.magic;
+  }
+
+  uint32_t get_context_offset() {
+    return header_.content_offset;
+  }
+
+ protected:
+  //! Initialize index version segment
+  int init_version_segment(void) {
+    size_t data_size = std::strlen(IndexVersion::Details());
+    int error_code =
+        this->append_segment(INDEX_VERSION_SEGMENT_NAME, data_size);
+    if (error_code != 0) {
+      return error_code;
+    }
+
+    IndexMapping::Segment *segment = get_segment(INDEX_VERSION_SEGMENT_NAME);
+    if (!segment) {
+      return IndexError_MMapFile;
+    }
+    auto meta = segment->meta();
+    size_t capacity = static_cast<size_t>(meta->padding_size + meta->data_size);
+    memcpy(segment->data(), IndexVersion::Details(), data_size);
+    segment->set_dirty();
+    meta->data_crc = ailego::Crc32c::Hash(segment->data(), data_size, 0);
+    meta->data_size = data_size;
+    meta->padding_size = capacity - data_size;
+    return 0;
+  }
+
+  //! Initialize index file
+  int init_index(const std::string &path) {
+    // Add index version
+    int error_code = this->init_version_segment();
+    if (error_code != 0) {
+      return error_code;
+    }
+
+    // Refresh mapping
+    this->refresh_index(0);
+    return 0;
+  }
+
+  //! Set the index file as dirty
+  void set_as_dirty(void) {
+    index_dirty_ = true;
+  }
+
+  //! Refresh meta information (checksum, update time, etc.)
+  void refresh_index(uint64_t /*chkp*/) {}
+
+  //! Flush index storage
+  int flush_index(void) {
+    return 0;
+  }
+
+  //! Close index storage and free the parsed segment-meta buffer
+  void close_index(void) {
+    std::lock_guard<std::mutex> latch(mapping_mutex_);
+    file_name_.clear();
+    segments_.clear();
+    memset(&header_, 0, sizeof(header_));
+    memset(&footer_, 0, sizeof(footer_));
+    segment_buffer_.reset();  // reset() frees; release() would leak the array
+  }
+
+  //! Append a segment into storage
+  int append_segment(const std::string & /*id*/, size_t /*size*/) {
+    return 0;
+  }
+
+  //! Test if a segment exists
+  bool has_segment(const std::string &id) const {
+    std::lock_guard<std::mutex> latch(mapping_mutex_);
+    return (segments_.find(id) != segments_.end());
+  }
+
+  //! Get a segment from storage
+  IndexMapping::Segment *get_segment(const std::string &id) {
+    std::lock_guard<std::mutex> latch(mapping_mutex_);
+    auto iter = segments_.find(id);
+    if (iter == segments_.end()) {
+      return nullptr;
+    }
+    IndexMapping::Segment *item = &iter->second;
+    return item;
+  }
+
+ private:
+  bool index_dirty_{false};
+  mutable std::mutex mapping_mutex_{};
+
+  // buffer manager
+  std::string file_name_;
+  IndexFormat::MetaHeader header_;
+  IndexFormat::MetaFooter footer_;
+  std::map<std::string, IndexMapping::Segment> segments_{};
+  std::map<std::string, size_t> id_hash_{};
+  size_t max_segment_size_{0};
+  std::unique_ptr<char[]> segment_buffer_{nullptr};
+
+  std::unique_ptr<VecBufferPool> buffer_pool_{nullptr};
+  std::unique_ptr<VecBufferPoolHandle> buffer_pool_handle_{nullptr};
+};
+
+INDEX_FACTORY_REGISTER_STORAGE_ALIAS(BufferStorage, Buffer1Storage);
+
+}  // namespace core
+}  // namespace zvec
\ No newline at end of file
diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc
index 4ac3c6b3..d4b23c87 100644
--- a/src/core/utility/buffer_storage.cc
+++ b/src/core/utility/buffer_storage.cc
@@ -436,7 +436,7 @@ class BufferStorage : public IndexStorage {
   std::map<std::string, IndexMapping::Segment> segments_{};
 };
 
-INDEX_FACTORY_REGISTER_STORAGE(BufferStorage);
+// INDEX_FACTORY_REGISTER_STORAGE(BufferStorage);
 
 }  // namespace core
 }  // namespace zvec
diff --git a/src/include/zvec/ailego/buffer/buffer_pool.h b/src/include/zvec/ailego/buffer/buffer_pool.h
index 5a09abfa..d86cffec 100644
--- a/src/include/zvec/ailego/buffer/buffer_pool.h
+++ b/src/include/zvec/ailego/buffer/buffer_pool.h
@@ -1,311 +1,367 @@
 #pragma once
 
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
 #include <atomic>
 #include <cassert>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <iostream>
+#include <limits>
+#include <map>
 #include <mutex>
 #include <queue>
+#include <stdexcept>
 #include <string>
 #include <unordered_map>
-#include <map>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <stdexcept>
-#include <limits>
-#include <iostream>
-#include <boost/lockfree/spsc_queue.hpp>
+#include "concurrentqueue.h"
 
 using block_id_t = int;
-
-#define BLOCK_SIZE (4 * 1024 * 1024)  // 2 MB
-#define BLOCK_MASK (BLOCK_SIZE - 1)
-#define BLOCK_ID(offset) (offset >> 22)
-#define BLOCK_OFFSET(offset) (offset & BLOCK_MASK)
+using version_t = int;
 
 class LRUCache {
-    boost::lockfree::spsc_queue<int, boost::lockfree::capacity<1024>> q;
-};
-
-class LPMap {
-    struct Entry {
-        std::atomic<int> ref_count;
-        char* buffer;
-    };
-
   public:
-    LPMap() : entry_num_(0), entries_(nullptr) {}
-    ~LPMap() {
-        delete[] entries_;
+    typedef std::pair<block_id_t, version_t> BlockType;
+    typedef moodycamel::ConcurrentQueue<BlockType> ConcurrentQueue;
+
+    int init(size_t block_size) {
+      for(int i = 0; i < CATCH_QUEUE_NUM; i++) {
+        queues_.push_back(ConcurrentQueue(block_size));
+      }
+      return 0;
     }
 
-    void init(size_t entry_num) {
-        if (entries_) {
-            delete[] entries_;
-        }
-        entry_num_ = entry_num;
-        entries_ = new Entry[entry_num_];
-        for (size_t i = 0; i < entry_num_; i++) {
-            // entries_[i].ref_count.store(0);
-            entries_[i].ref_count.store(std::numeric_limits<int>::min());
-            entries_[i].buffer = nullptr;
+    BlockType evict_single_block() {
+      BlockType item;
+      for(int i = 0; i < CATCH_QUEUE_NUM; i++) {
+        bool found = queues_[i].try_dequeue(item);
+        if(found) {
+          break;
         }
+      }
+      return item;
     }
 
-    char* acquire_block(block_id_t block_id) {
-        assert(block_id < entry_num_);
-        Entry& entry = entries_[block_id];
-        int rc = entry.ref_count.fetch_add(1);
-        if (rc < 0) {
-            return nullptr;
-        }
-        return entry.buffer;
+    bool add_single_block(const BlockType &block, int block_type) {
+      std::cout << "in LRU: " << block.first << ", " << block.second << std::endl;
+      return queues_[block_type].try_enqueue(block);
     }
 
-    void release_block(block_id_t block_id) {
-        assert(block_id < entry_num_);
-        Entry& entry = entries_[block_id];
-        int rc = entry.ref_count.fetch_sub(1);
-        assert(rc > 0);
-    }
+  private:
+    constexpr static size_t CATCH_QUEUE_NUM = 3;
+    std::vector<ConcurrentQueue> queues_;
+};
 
-    // need be called under lock
-    char* evict_block(block_id_t block_id) {
-        assert(block_id < entry_num_);
-        Entry& entry = entries_[block_id];
-        int expected = 0;
-        if (entry.ref_count.compare_exchange_strong(expected, std::numeric_limits<int>::min())) {
-            char* buffer = entry.buffer;
-            entry.buffer = nullptr;
-            return buffer;
-        } else {
-            return nullptr;
-        }
+class LPMap {
+  struct Entry {
+    std::atomic<int> ref_count;
+    std::atomic<int> load_count;
+    char *buffer;
+  };
+
+ public:
+  LPMap() : entry_num_(0), entries_(nullptr) {}
+  ~LPMap() {
+    delete[] entries_;
+  }
+
+  void init(size_t entry_num) {
+    if (entries_) {
+      delete[] entries_;
     }
-
-    // need be called under lock
-    char* set_block_acquired(block_id_t block_id, char* buffer) {
-        // std::cout << "Set block " << block_id << std::endl;
-        assert(block_id < entry_num_);
-        Entry& entry = entries_[block_id];
-        if (entry.ref_count.load() >= 0) {
-            entry.ref_count.fetch_add(1);
-            return entry.buffer;
-        }
-        entry.buffer = buffer;
-        entry.ref_count.store(1);
-        return buffer;
+    entry_num_ = entry_num;
+    entries_ = new Entry[entry_num_];
+    for (size_t i = 0; i < entry_num_; i++) {
+      entries_[i].ref_count.store(std::numeric_limits<int>::min());
+      entries_[i].load_count.store(0);
+      entries_[i].buffer = nullptr;
     }
-
-    // need be called under lock
-    void recycle(std::queue<char*>& free_buffers) {
-        for (size_t i = 0; i < entry_num_; i++) {
-            Entry& entry = entries_[i];
-            if (entry.ref_count.load() == 0) {
-                char* buffer = evict_block(i);
-                if (buffer) {
-                    free_buffers.push(buffer);
-                }
-            }
-        }
+  }
+
+  char *acquire_block(block_id_t block_id) {
+    assert(block_id < entry_num_);
+    Entry &entry = entries_[block_id];
+    int rc = entry.ref_count.fetch_add(1);
+    if (rc < 0) {
+      return nullptr;
     }
-
-    size_t entry_num() const {
-        return entry_num_;
+    return entry.buffer;
+  }
+
+  // Drops one reference; the last release queues the block for eviction.
+  void release_block(block_id_t block_id) {
+    assert(block_id < entry_num_);
+    Entry &entry = entries_[block_id];
+    // fetch_sub returns the value *before* the decrement.
+    const int prev = entry.ref_count.fetch_sub(1);
+    assert(prev > 0);
+    if (prev == 1) {  // count just reached zero
+      const LRUCache::BlockType block(block_id, entry.load_count.load());
+      cache_.add_single_block(block, 0);
+    }
+  }
+
+  // Must be called under lock. Evicts block_id only if nobody references it.
+  char *evict_block(block_id_t block_id) {
+    assert(block_id < entry_num_);
+    Entry &entry = entries_[block_id];
+    int expected = 0;  // only a block with ref_count == 0 may be evicted
+    if (entry.ref_count.compare_exchange_strong(
+            expected, std::numeric_limits<int>::min())) {  // INT_MIN = evicted
+      char *buffer = entry.buffer;
+      entry.buffer = nullptr;
+      return buffer;  // hand the detached buffer back to the caller
+    } else {
+      return nullptr;  // ref_count was not 0, nothing evicted
     }
+  }
+
+  // Must be called under lock. Returns the buffer now mapped to block_id.
+  char *set_block_acquired(block_id_t block_id, char *buffer) {
+    // std::cout << "Set block " << block_id << std::endl;
+    assert(block_id < entry_num_);
+    Entry &entry = entries_[block_id];
+    if (entry.ref_count.load() >= 0) {  // a buffer is already installed
+      entry.ref_count.fetch_add(1);
+      return entry.buffer;  // existing buffer wins; caller's is not stored
+    }
+    entry.buffer = buffer;
+    entry.ref_count.store(1);  // the caller holds the first reference
+    entry.load_count.fetch_add(1);  // new generation for this block slot
+    return buffer;
+  }
+
+  // Must be called under lock. Tries to reclaim one buffer into free_buffers.
+  void recycle(std::queue<char *> &free_buffers) {
+    LRUCache::BlockType block;
+    do {  // keep popping until isDeadBlock() returns false
+      block = cache_.evict_single_block();  // NOTE(review): if cache is empty?
+    } while(isDeadBlock(block));
+    char *buffer = evict_block(block.first);  // nullptr if eviction failed
+    if (buffer) {
+      free_buffers.push(buffer);
+    }
+  }
 
-  private:
-    Entry* entries_;
-    size_t entry_num_;
+  size_t entry_num() const {
+    return entry_num_;
+  }
+
+ private:
+  Entry *entries_;
+  size_t entry_num_;
+  LRUCache cache_;
+
+  bool isDeadBlock(LRUCache::BlockType block) {  // true if LRU entry is stale
+    Entry &entry = entries_[block.first];
+    return block.second != entry.load_count.load();  // reloaded since queued
+  }
 };
 
-class BufferPool;
+class VecBufferPool;
 
-struct BufferPoolHandle {
-    BufferPoolHandle(BufferPool& pool);
-    BufferPoolHandle(BufferPoolHandle&& other) : pool(other.pool), local_cache(std::move(other.local_cache)), hit_num_(other.hit_num_) {
-        other.local_cache.clear();
-        other.hit_num_ = 0;
-    }
-    ~BufferPoolHandle();
+struct VecBufferPoolHandle {
+  VecBufferPoolHandle(VecBufferPool &pool);
+  VecBufferPoolHandle(VecBufferPoolHandle &&other)
+      : pool(other.pool),
+        local_cache(std::move(other.local_cache)),
+        hit_num_(other.hit_num_) {
+    other.local_cache.clear();
+    other.hit_num_ = 0;
+  }
+  ~VecBufferPoolHandle();
 
-    char* get_block(size_t offset, size_t size);
+  char *get_block(size_t offset, size_t size, size_t block_id);
 
-    void release_all();
+  int get_meta(size_t offset, size_t length, char *buffer);
 
-    BufferPool& pool;
+  void release_all();
+
+  VecBufferPool &pool;
 #ifdef USE_LOCAL_CACHE
-    // std::unordered_map<block_id_t, char*> local_cache;
-    phmap::flat_hash_map<block_id_t, char*> local_cache;
+  // std::unordered_map<block_id_t, char*> local_cache;
+  phmap::flat_hash_map<block_id_t, char *> local_cache;
 #else
-    std::vector<block_id_t> local_cache;
+  std::vector<block_id_t> local_cache;
 #endif
-    int hit_num_;
+  int hit_num_;
 };
 
-class BufferPool {
-  public:
-    BufferPool(const std::string& filename, size_t pool_capacity) : pool_capacity_(pool_capacity){
-        fd_ = open(filename.c_str(), O_RDONLY);
-        if (fd_ < 0) {
-            throw std::runtime_error("Failed to open file: " + filename);
-        }
-        struct stat st;
-        if (fstat(fd_, &st) < 0) {
-            throw std::runtime_error("Failed to stat file: " + filename);
-        }
-        file_size_ = st.st_size;
-        lp_map_.init((file_size_ + BLOCK_SIZE - 1) / BLOCK_SIZE);
-
-        size_t buffer_num = pool_capacity_ / BLOCK_SIZE;
-        for (size_t i = 0; i < buffer_num; i++) {
-            char* buffer = (char*)aligned_alloc(64, BLOCK_SIZE);
-            free_buffers_.push(buffer);
-        }
-        std::cout << "buffer_num: " << buffer_num << std::endl;
-        std::cout << "entry_num: " << lp_map_.entry_num() << std::endl;
+class VecBufferPool {
+ public:
+  VecBufferPool(const std::string &filename, size_t pool_capacity, size_t block_size)
+      : pool_capacity_(pool_capacity) {
+    fd_ = open(filename.c_str(), O_RDONLY);
+    if (fd_ < 0) {
+      throw std::runtime_error("Failed to open file: " + filename);
     }
-    ~BufferPool() {
-        close(fd_);
+    struct stat st;
+    if (fstat(fd_, &st) < 0) {
+      throw std::runtime_error("Failed to stat file: " + filename);
     }
+    file_size_ = st.st_size;
 
-    BufferPoolHandle get_handle() {
-        return BufferPoolHandle(*this);
+    size_t buffer_num = pool_capacity_ / block_size;
+    lp_map_.init(buffer_num);
+    for (size_t i = 0; i < buffer_num; i++) {
+      char *buffer = (char *)aligned_alloc(64, block_size);
+      free_buffers_.push(buffer);
     }
-
-    char* acquire_buffer(block_id_t block_id, int retry = 0) {
-        char* buffer = lp_map_.acquire_block(block_id);
-        if (buffer) {
-            return buffer;
-        }
-        {
-            std::lock_guard<std::mutex> lock(mutex_);
-            if (free_buffers_.empty()) {
-                for (int i = 0; i < retry; i++) {
-                    lp_map_.recycle(free_buffers_);
-                    if (!free_buffers_.empty()) {
-                        break;
-                    }
-                }
-            }
-            if (free_buffers_.empty()) {
-                return nullptr;
-            }
-            buffer = free_buffers_.front();
-            free_buffers_.pop();
+    std::cout << "buffer_num: " << buffer_num << std::endl;
+    std::cout << "entry_num: " << lp_map_.entry_num() << std::endl;
+  }
+  ~VecBufferPool() {
+    close(fd_);
+  }
+
+  VecBufferPoolHandle get_handle() {
+    return VecBufferPoolHandle(*this);
+  }
+
+  char *acquire_buffer(block_id_t block_id, size_t offset, size_t size, int retry = 0) {
+    char *buffer = lp_map_.acquire_block(block_id);
+    if (buffer) {
+      return buffer;
+    }
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      if (free_buffers_.empty()) {
+        for (int i = 0; i < retry; i++) {
+          lp_map_.recycle(free_buffers_);
+          if (!free_buffers_.empty()) {
+            break;
+          }
         }
-        size_t read_offset = static_cast<size_t>(block_id) * BLOCK_SIZE;
-        size_t to_read = std::min<size_t>(BLOCK_SIZE, file_size_ - read_offset);
+      }
+      if (free_buffers_.empty()) {
+        return nullptr;
+      }
+      buffer = free_buffers_.front();
+      free_buffers_.pop();
+    }
 
-        ssize_t read_bytes = pread(fd_, buffer, to_read, read_offset);
-        if (read_bytes != static_cast<ssize_t>(to_read)) {
-            std::cerr << "Failed to read file at offset " << read_offset << std::endl;
-            exit(-1);
-        }
+    ssize_t read_bytes = pread(fd_, buffer, size, offset);
+    if (read_bytes != static_cast<ssize_t>(size)) {
+      std::cerr << "Failed to read file at offset " << offset << std::endl;
+      exit(-1);
+    }
 
-        {
-            std::lock_guard<std::mutex> lock(mutex_);
-            char* placed_buffer = lp_map_.set_block_acquired(block_id, buffer);
-            if (placed_buffer != buffer) {
-                // another thread has set the block
-                free_buffers_.push(buffer);
-            }
-            return placed_buffer;
-        }
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      char *placed_buffer = lp_map_.set_block_acquired(block_id, buffer);
+      if (placed_buffer != buffer) {
+        // another thread has set the block
+        free_buffers_.push(buffer);
+      }
+      return placed_buffer;
     }
+  }
 
-    size_t file_size() const {
-        return file_size_;
+  int get_meta(size_t offset, size_t length, char *buffer) {  // uncached read
+    ssize_t read_bytes = pread(fd_, buffer, length, offset);
+    if (read_bytes != static_cast<ssize_t>(length)) {  // error or short read
+      std::cerr << "Failed to read file at offset " << offset << std::endl;
+      exit(-1);  // terminates the whole process instead of reporting failure
     }
+    return 0;  // always 0; the failure path never returns
+  }
 
-  private:
-    int fd_;
-    size_t file_size_;
-    size_t pool_capacity_;
+  size_t file_size() const {
+    return file_size_;
+  }
 
-  public:
-    LPMap lp_map_;
+ private:
+  int fd_;
+  size_t file_size_;
+  size_t pool_capacity_;
 
-  private:
-    std::mutex mutex_;
-    std::queue<char*> free_buffers_;
+ public:
+  LPMap lp_map_;
+
+ private:
+  std::mutex mutex_;
+  std::queue<char *> free_buffers_;
 };
 
 
 struct Counter {
-    ~Counter() = default;
-
-    static Counter& get_instance() {
-        static Counter instance;
-        return instance;
+  ~Counter() = default;
+
+  static Counter &get_instance() {
+    static Counter instance;
+    return instance;
+  }
+
+  void record(const std::string &name, int64_t value) {
+    auto it = static_counters.find(name);
+    if (it == static_counters.end()) {
+      auto counter = std::make_unique<std::atomic<int64_t>>(0);
+      it = static_counters.emplace(name, std::move(counter)).first;
     }
+    it->second->fetch_add(value);
+  }
 
-    void record(const std::string& name, int64_t value) {
-        auto it = static_counters.find(name);
-        if (it == static_counters.end()) {
-            auto counter = std::make_unique<std::atomic<int64_t>>(0);
-            it = static_counters.emplace(name, std::move(counter)).first;
-        }
-        it->second->fetch_add(value);
+  void display() {
+    for (const auto &pair : static_counters) {
+      std::cout << pair.first << ": " << pair.second->load() << std::endl;
     }
+  }
 
-    void display() {
-        for (const auto& pair : static_counters) {
-            std::cout << pair.first << ": " << pair.second->load() << std::endl;
-        }
-    }
-
-    void clear() {
-        static_counters.clear();
-    }
+  void clear() {
+    static_counters.clear();
+  }
 
-  private:
-    Counter() {}
-    std::map<std::string, std::unique_ptr<std::atomic<int64_t>>> static_counters;
+ private:
+  Counter() {}
+  std::map<std::string, std::unique_ptr<std::atomic<int64_t>>> static_counters;
 };
 
-BufferPoolHandle::BufferPoolHandle(BufferPool& pool) : pool(pool), hit_num_(0) {}
-BufferPoolHandle::~BufferPoolHandle() {
-    Counter::get_instance().record("buffer_pool_handle_hit_num", hit_num_);
-    release_all();
+VecBufferPoolHandle::VecBufferPoolHandle(VecBufferPool &pool)
+    : pool(pool), hit_num_(0) {}
+VecBufferPoolHandle::~VecBufferPoolHandle() {
+  Counter::get_instance().record("buffer_pool_handle_hit_num", hit_num_);
+  release_all();
 }
 
-char* BufferPoolHandle::get_block(size_t offset, size_t size) {
-    block_id_t block_id = BLOCK_ID(offset);
-    assert(block_id == BLOCK_ID(offset + size - 1));
+char *VecBufferPoolHandle::get_block(size_t offset, size_t size, size_t block_id) {
 #ifdef USE_LOCAL_CACHE
-    auto it = local_cache.find(block_id);
-    if (it != local_cache.end()) {
-        hit_num_++;
-        return it->second + BLOCK_OFFSET(offset);
-    }
+  auto it = local_cache.find(block_id);
+  if (it != local_cache.end()) {
+    hit_num_++;
+    return it->second;
+  }
 #endif
 
-    char* buffer = pool.acquire_buffer(block_id, 3);
-    if (buffer) {
+  char *buffer = pool.acquire_buffer(block_id, offset, size, 3);
+  if (buffer) {
 #ifdef USE_LOCAL_CACHE
-        local_cache[block_id] = buffer;
+    local_cache[block_id] = buffer;
 #else
-        local_cache.push_back(block_id);
+    local_cache.push_back(block_id);
 #endif
-        return buffer + BLOCK_OFFSET(offset);
-    }
+    return buffer;
+  }
 
-    return nullptr;
+  return nullptr;
 }
 
-void BufferPoolHandle::release_all() {
+int VecBufferPoolHandle::get_meta(size_t offset, size_t length, char *out) {
+  return pool.get_meta(offset, length, out);
+}
+
+void VecBufferPoolHandle::release_all() {
 #ifdef USE_LOCAL_CACHE
-    Counter::get_instance().record("buffer_pool_handle_release_call", local_cache.size());
-    for (const auto& pair : local_cache) {
-        pool.lp_map_.release_block(pair.first);
-    }
+  Counter::get_instance().record("buffer_pool_handle_release_call",
+                                 local_cache.size());
+  for (const auto &pair : local_cache) {
+    pool.lp_map_.release_block(pair.first);
+  }
 #else
-    for (block_id_t block_id : local_cache) {
-        pool.lp_map_.release_block(block_id);
-    }
+  for (block_id_t block_id : local_cache) {
+    pool.lp_map_.release_block(block_id);
+  }
 #endif
-    local_cache.clear();
+  local_cache.clear();
 }
\ No newline at end of file
diff --git a/src/include/zvec/ailego/buffer/concurrentqueue.h b/src/include/zvec/ailego/buffer/concurrentqueue.h
new file mode 100644
index 00000000..db4835b1
--- /dev/null
+++ b/src/include/zvec/ailego/buffer/concurrentqueue.h
@@ -0,0 +1,3747 @@
+// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue.
+// An overview, including benchmark results, is provided here:
+//     http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++
+// The full design is also described in excruciating detail at:
+//    http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue
+
+// Simplified BSD license:
+// Copyright (c) 2013-2020, Cameron Desrochers.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice, this list of
+// conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice, this list of
+// conditions and the following disclaimer in the documentation and/or other materials
+// provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Also dual-licensed under the Boost Software License (see LICENSE.md)
+
+#pragma once
+
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
+// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and
+// Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings
+// upon assigning any computed values)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+
+#ifdef MCDBGQ_USE_RELACY
+#pragma GCC diagnostic ignored "-Wint-to-pointer-cast"
+#endif
+#endif
+
+#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17)
+// VS2019 with /W4 warns about constant conditional expressions but unless /std=c++17 or higher
+// does not support `if constexpr`, so we have no choice but to simply disable the warning
+#pragma warning(push)
+#pragma warning(disable: 4127)  // conditional expression is constant
+#endif
+
+#if defined(__APPLE__)
+#include "TargetConditionals.h"
+#endif
+
+#ifdef MCDBGQ_USE_RELACY
+#include "relacy/relacy_std.hpp"
+#include "relacy_shims.h"
+// We only use malloc/free anyway, and the delete macro messes up `= delete` method declarations.
+// We'll override the default trait malloc ourselves without a macro.
+#undef new
+#undef delete
+#undef malloc
+#undef free
+#else
+#include <atomic>		// Requires C++11. Sorry VS2010.
+#include <cassert>
+#endif
+#include <cstddef>              // for max_align_t
+#include <cstdint>
+#include <cstdlib>
+#include <type_traits>
+#include <algorithm>
+#include <utility>
+#include <limits>
+#include <climits>		// for CHAR_BIT
+#include <array>
+#include <thread>		// partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading
+#include <mutex>        // used for thread exit synchronization
+
+// Platform-specific definitions of a numeric thread ID type and an invalid value
+namespace moodycamel { namespace details {
+	template<typename thread_id_t> struct thread_id_converter {
+		typedef thread_id_t thread_id_numeric_size_t;
+		typedef thread_id_t thread_id_hash_t;
+		static thread_id_hash_t prehash(thread_id_t const& x) { return x; }
+	};
+} }
+#if defined(MCDBGQ_USE_RELACY)
+namespace moodycamel { namespace details {
+	typedef std::uint32_t thread_id_t;
+	static const thread_id_t invalid_thread_id  = 0xFFFFFFFFU;
+	static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU;
+	static inline thread_id_t thread_id() { return rl::thread_index(); }
+} }
+#elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__)
+// No sense pulling in windows.h in a header, we'll manually declare the function
+// we use and rely on backwards-compatibility for this not to break
+extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void);
+namespace moodycamel { namespace details {
+	static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows");
+	typedef std::uint32_t thread_id_t;
+	static const thread_id_t invalid_thread_id  = 0;			// See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx
+	static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU;	// Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4.
+	static inline thread_id_t thread_id() { return static_cast<thread_id_t>(::GetCurrentThreadId()); }
+} }
+#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) || defined(__MVS__) || defined(MOODYCAMEL_NO_THREAD_LOCAL)
+namespace moodycamel { namespace details {
+	static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, "std::thread::id is expected to be either 4 or 8 bytes");
+	
+	typedef std::thread::id thread_id_t;
+	static const thread_id_t invalid_thread_id;         // Default ctor creates invalid ID
+
+	// Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's
+	// only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't
+	// be.
+	static inline thread_id_t thread_id() { return std::this_thread::get_id(); }
+
+	template<std::size_t> struct thread_id_size { };
+	template<> struct thread_id_size<4> { typedef std::uint32_t numeric_t; };
+	template<> struct thread_id_size<8> { typedef std::uint64_t numeric_t; };
+
+	template<> struct thread_id_converter<thread_id_t> {
+		typedef thread_id_size<sizeof(thread_id_t)>::numeric_t thread_id_numeric_size_t;
+#ifndef __APPLE__
+		typedef std::size_t thread_id_hash_t;
+#else
+		typedef thread_id_numeric_size_t thread_id_hash_t;
+#endif
+
+		static thread_id_hash_t prehash(thread_id_t const& x)
+		{
+#ifndef __APPLE__
+			return std::hash<std::thread::id>()(x);
+#else
+			return *reinterpret_cast<thread_id_hash_t const*>(&x);
+#endif
+		}
+	};
+} }
+#else
+// Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475
+// In order to get a numeric thread ID in a platform-independent way, we use a thread-local
+// static variable's address as a thread identifier :-)
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+#define MOODYCAMEL_THREADLOCAL __thread
+#elif defined(_MSC_VER)
+#define MOODYCAMEL_THREADLOCAL __declspec(thread)
+#else
+// Assume C++11 compliant compiler
+#define MOODYCAMEL_THREADLOCAL thread_local
+#endif
+namespace moodycamel { namespace details {
+	typedef std::uintptr_t thread_id_t;
+	static const thread_id_t invalid_thread_id  = 0;		// Address can't be nullptr
+	static const thread_id_t invalid_thread_id2 = 1;		// Member accesses off a null pointer are also generally invalid. Plus it's not aligned.
+	inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast<thread_id_t>(&x); }
+} }
+#endif
+
+// Constexpr if
+#ifndef MOODYCAMEL_CONSTEXPR_IF
+#if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || __cplusplus > 201402L
+#define MOODYCAMEL_CONSTEXPR_IF if constexpr
+#define MOODYCAMEL_MAYBE_UNUSED [[maybe_unused]]
+#else
+#define MOODYCAMEL_CONSTEXPR_IF if
+#define MOODYCAMEL_MAYBE_UNUSED
+#endif
+#endif
+
+// Exceptions
+#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED
+#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__))
+#define MOODYCAMEL_EXCEPTIONS_ENABLED
+#endif
+#endif
+#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED
+#define MOODYCAMEL_TRY try
+#define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__)
+#define MOODYCAMEL_RETHROW throw
+#define MOODYCAMEL_THROW(expr) throw (expr)
+#else
+#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF (true)
+#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF (false)
+#define MOODYCAMEL_RETHROW
+#define MOODYCAMEL_THROW(expr)
+#endif
+
+#ifndef MOODYCAMEL_NOEXCEPT
+#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED)
+#define MOODYCAMEL_NOEXCEPT
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true
+#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800
+// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-(
+// We have to assume *all* non-trivial constructors may throw on VS2012!
+#define MOODYCAMEL_NOEXCEPT _NOEXCEPT
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference<valueType>::value && std::is_move_constructible<type>::value ? std::is_trivially_move_constructible<type>::value : std::is_trivially_copy_constructible<type>::value)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference<valueType>::value && std::is_move_assignable<type>::value ? std::is_trivially_move_assignable<type>::value || std::is_nothrow_move_assignable<type>::value : std::is_trivially_copy_assignable<type>::value || std::is_nothrow_copy_assignable<type>::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr))
+#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900
+#define MOODYCAMEL_NOEXCEPT _NOEXCEPT
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference<valueType>::value && std::is_move_constructible<type>::value ? std::is_trivially_move_constructible<type>::value || std::is_nothrow_move_constructible<type>::value : std::is_trivially_copy_constructible<type>::value || std::is_nothrow_copy_constructible<type>::value)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference<valueType>::value && std::is_move_assignable<type>::value ? std::is_trivially_move_assignable<type>::value || std::is_nothrow_move_assignable<type>::value : std::is_trivially_copy_assignable<type>::value || std::is_nothrow_copy_assignable<type>::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr))
+#else
+#define MOODYCAMEL_NOEXCEPT noexcept
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr)
+#endif
+#endif
+
+#ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+#ifdef MCDBGQ_USE_RELACY
+#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+#else
+// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445
+// g++ <=4.7 doesn't support thread_local either.
+// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work
+#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(__MVS__)
+// Assume `thread_local` is fully supported in all other C++11 compilers/platforms
+#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED    // tentatively enabled for now; years ago several users report having problems with it on
+#endif
+#endif
+#endif
+
+// VS2012 doesn't support deleted functions. 
+// In this case, we declare the function normally but don't define it. A link error will be generated if the function is called.
+#ifndef MOODYCAMEL_DELETE_FUNCTION
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#define MOODYCAMEL_DELETE_FUNCTION
+#else
+#define MOODYCAMEL_DELETE_FUNCTION = delete
+#endif
+#endif
+
+namespace moodycamel { namespace details {
+#ifndef MOODYCAMEL_ALIGNAS
+// VS2013 doesn't support alignas or alignof, and align() requires a constant literal
+#if defined(_MSC_VER) && _MSC_VER <= 1800
+#define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment))
+#define MOODYCAMEL_ALIGNOF(obj) __alignof(obj)
+#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) typename details::Vs2013Aligned<std::alignment_of<obj>::value, T>::type
+	template<int Align, typename T> struct Vs2013Aligned { };  // default, unsupported alignment
+	template<typename T> struct Vs2013Aligned<1, T> { typedef __declspec(align(1)) T type; };
+	template<typename T> struct Vs2013Aligned<2, T> { typedef __declspec(align(2)) T type; };
+	template<typename T> struct Vs2013Aligned<4, T> { typedef __declspec(align(4)) T type; };
+	template<typename T> struct Vs2013Aligned<8, T> { typedef __declspec(align(8)) T type; };
+	template<typename T> struct Vs2013Aligned<16, T> { typedef __declspec(align(16)) T type; };
+	template<typename T> struct Vs2013Aligned<32, T> { typedef __declspec(align(32)) T type; };
+	template<typename T> struct Vs2013Aligned<64, T> { typedef __declspec(align(64)) T type; };
+	template<typename T> struct Vs2013Aligned<128, T> { typedef __declspec(align(128)) T type; };
+	template<typename T> struct Vs2013Aligned<256, T> { typedef __declspec(align(256)) T type; };
+#else
+	template<typename T> struct identity { typedef T type; };
+#define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment)
+#define MOODYCAMEL_ALIGNOF(obj) alignof(obj)
+#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) alignas(alignof(obj)) typename details::identity<T>::type
+#endif
+#endif
+} }
+
+
+// TSAN can false report races in lock-free code.  To enable TSAN to be used from projects that use this one,
+// we can apply per-function compile-time suppression.
+// See https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer
+#define MOODYCAMEL_NO_TSAN
+#if defined(__has_feature)
+ #if __has_feature(thread_sanitizer)
+  #undef MOODYCAMEL_NO_TSAN
+  #define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread")))
+ #endif // TSAN
+#endif // TSAN
+
+// Compiler-specific likely/unlikely hints
+namespace moodycamel { namespace details {
+#if defined(__GNUC__)
+	static inline bool (likely)(bool x) { return __builtin_expect((x), true); }
+	static inline bool (unlikely)(bool x) { return __builtin_expect((x), false); }
+#else
+	static inline bool (likely)(bool x) { return x; }
+	static inline bool (unlikely)(bool x) { return x; }
+#endif
+} }
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+#include "internal/concurrentqueue_internal_debug.h"
+#endif
+
+namespace moodycamel {
+namespace details {
+	template<typename T>
+	struct const_numeric_max {
+		static_assert(std::is_integral<T>::value, "const_numeric_max can only be used with integers");
+		static const T value = std::numeric_limits<T>::is_signed
+			? (static_cast<T>(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast<T>(1)
+			: static_cast<T>(-1);
+	};
+
+#if defined(__GLIBCXX__)
+	typedef ::max_align_t std_max_align_t;      // libstdc++ forgot to add it to std:: for a while
+#else
+	typedef std::max_align_t std_max_align_t;   // Others (e.g. MSVC) insist it can *only* be accessed via std::
+#endif
+
+	// Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting
+	// 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64.
+	typedef union {
+		std_max_align_t x;
+		long long y;
+		void* z;
+	} max_align_t;
+}
+
+// Default traits for the ConcurrentQueue. To change some of the
+// traits without re-implementing all of them, inherit from this
+// struct and shadow the declarations you wish to be different;
+// since the traits are used as a template type parameter, the
+// shadowed declarations will be used where defined, and the defaults
+// otherwise.
+struct ConcurrentQueueDefaultTraits
+{
+	// General-purpose size type. std::size_t is strongly recommended.
+	typedef std::size_t size_t;
+	
+	// The type used for the enqueue and dequeue indices. Must be at least as
+	// large as size_t. Should be significantly larger than the number of elements
+	// you expect to hold at once, especially if you have a high turnover rate;
+	// for example, on 32-bit x86, if you expect to have over a hundred million
+	// elements or pump several million elements through your queue in a very
+	// short space of time, using a 32-bit type *may* trigger a race condition.
+	// A 64-bit int type is recommended in that case, and in practice will
+	// prevent a race condition no matter the usage of the queue. Note that
+	// whether the queue is lock-free with a 64-int type depends on the whether
+	// std::atomic<std::uint64_t> is lock-free, which is platform-specific.
+	typedef std::size_t index_t;
+	
+	// Internally, all elements are enqueued and dequeued from multi-element
+	// blocks; this is the smallest controllable unit. If you expect few elements
+	// but many producers, a smaller block size should be favoured. For few producers
+	// and/or many elements, a larger block size is preferred. A sane default
+	// is provided. Must be a power of 2.
+	static const size_t BLOCK_SIZE = 32;
+	
+	// For explicit producers (i.e. when using a producer token), the block is
+	// checked for being empty by iterating through a list of flags, one per element.
+	// For large block sizes, this is too inefficient, and switching to an atomic
+	// counter-based approach is faster. The switch is made for block sizes strictly
+	// larger than this threshold.
+	static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32;
+	
+	// How many full blocks can be expected for a single explicit producer? This should
+	// reflect that number's maximum for optimal performance. Must be a power of 2.
+	static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32;
+	
+	// How many full blocks can be expected for a single implicit producer? This should
+	// reflect that number's maximum for optimal performance. Must be a power of 2.
+	static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32;
+	
+	// The initial size of the hash table mapping thread IDs to implicit producers.
+	// Note that the hash is resized every time it becomes half full.
+	// Must be a power of two, and either 0 or at least 1. If 0, implicit production
+	// (using the enqueue methods without an explicit producer token) is disabled.
+	static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32;
+	
+	// Controls the number of items that an explicit consumer (i.e. one with a token)
+	// must consume before it causes all consumers to rotate and move on to the next
+	// internal queue.
+	static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256;
+	
+	// The maximum number of elements (inclusive) that can be enqueued to a sub-queue.
+	// Enqueue operations that would cause this limit to be surpassed will fail. Note
+	// that this limit is enforced at the block level (for performance reasons), i.e.
+	// it's rounded up to the nearest block size.
+	static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max<size_t>::value;
+
+	// The number of times to spin before sleeping when waiting on a semaphore.
+	// Recommended values are on the order of 1000-10000 unless the number of
+	// consumer threads exceeds the number of idle cores (in which case try 0-100).
+	// Only affects instances of the BlockingConcurrentQueue.
+	static const int MAX_SEMA_SPINS = 10000;
+
+	// Whether to recycle dynamically-allocated blocks into an internal free list or
+	// not. If false, only pre-allocated blocks (controlled by the constructor
+	// arguments) will be recycled, and all others will be `free`d back to the heap.
+	// Note that blocks consumed by explicit producers are only freed on destruction
+	// of the queue (not following destruction of the token) regardless of this trait.
+	static const bool RECYCLE_ALLOCATED_BLOCKS = false;
+
+	
+#ifndef MCDBGQ_USE_RELACY
+	// Memory allocation can be customized if needed.
+	// malloc should return nullptr on failure, and handle alignment like std::malloc.
+#if defined(malloc) || defined(free)
+	// Gah, this is 2015, stop defining macros that break standard code already!
+	// Work around malloc/free being special macros:
+	static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); }
+	static inline void WORKAROUND_free(void* ptr) { return free(ptr); }
+	static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); }
+	static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); }
+#else
+	static inline void* malloc(size_t size) { return std::malloc(size); }
+	static inline void free(void* ptr) { return std::free(ptr); }
+#endif
+#else
+	// Debug versions when running under the Relacy race detector (ignore
+	// these in user code)
+	static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); }
+	static inline void free(void* ptr) { return rl::rl_free(ptr, $); }
+#endif
+};
+
+
+// When producing or consuming many elements, the most efficient way is to:
+//    1) Use one of the bulk-operation methods of the queue with a token
+//    2) Failing that, use the bulk-operation methods without a token
+//    3) Failing that, create a token and use that with the single-item methods
+//    4) Failing that, use the single-parameter methods of the queue
+// Having said that, don't create tokens willy-nilly -- ideally there should be
+// a maximum of one token per thread (of each kind).
+struct ProducerToken;
+struct ConsumerToken;
+
+template<typename T, typename Traits> class ConcurrentQueue;
+template<typename T, typename Traits> class BlockingConcurrentQueue;
+class ConcurrentQueueTests;
+
+
+namespace details
+{
+	// Bookkeeping shared by every producer that does not depend on the queue's
+	// element type.
+	struct ConcurrentQueueProducerTypelessBase
+	{
+		// Pointer to another producer record; nullptr until something links it.
+		ConcurrentQueueProducerTypelessBase* next;
+		// Set to true (release order) by ~ProducerToken when the bound token dies.
+		std::atomic<bool> inactive;
+		// Back-pointer to the ProducerToken currently bound to this producer, or
+		// nullptr. The token rewrites it whenever it is moved or swapped.
+		ProducerToken* token;
+		
+		// A fresh record is unlinked, active, and has no token.
+		ConcurrentQueueProducerTypelessBase()
+			: next(nullptr),
+			inactive(false),
+			token(nullptr)
+		{ }
+	};
+	
+	// Integer bit mixers. The primary template (selected when the flag is false)
+	// mixes 32-bit values; the `true` specialization mixes 64-bit values.
+	template<bool use64> struct _hash_32_or_64 {
+		static inline std::uint32_t hash(std::uint32_t h)
+		{
+			// MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
+			// The thread ID fed in here is already unique, so the only goal is to spread
+			// that uniqueness across all the bits; a subset of the bits can then be used
+			// with far fewer collisions.
+			const std::uint32_t firstMultiplier = 0x85ebca6b;
+			const std::uint32_t secondMultiplier = 0xc2b2ae35;
+			h = (h ^ (h >> 16)) * firstMultiplier;
+			h = (h ^ (h >> 13)) * secondMultiplier;
+			return h ^ (h >> 16);
+		}
+	};
+	template<> struct _hash_32_or_64<true> {
+		static inline std::uint64_t hash(std::uint64_t h)
+		{
+			// Same xor-shift / multiply structure as the 32-bit mixer, with 64-bit constants.
+			const std::uint64_t firstMultiplier = 0xff51afd7ed558ccd;
+			const std::uint64_t secondMultiplier = 0xc4ceb9fe1a85ec53;
+			h = (h ^ (h >> 33)) * firstMultiplier;
+			h = (h ^ (h >> 33)) * secondMultiplier;
+			return h ^ (h >> 33);
+		}
+	};
+	// Picks the mixer by operand size in bytes: anything wider than 4 bytes uses the 64-bit one.
+	template<std::size_t size> struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> {  };
+	
+	// Hashes a thread ID to a size_t: the ID is first passed through
+	// thread_id_converter<>::prehash, then through the 32- or 64-bit mixer that
+	// matches the width of the converter's thread_id_hash_t.
+	static inline size_t hash_thread_id(thread_id_t id)
+	{
+		static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values");
+		typedef thread_id_converter<thread_id_t> converter_t;
+		typedef hash_32_or_64<sizeof(converter_t::thread_id_hash_t)> mixer_t;
+		return static_cast<size_t>(mixer_t::hash(converter_t::prehash(id)));
+	}
+	
+	// Wrap-around aware "a < b" for unsigned counters: true when the modular
+	// distance (a - b) lies strictly in the upper half of T's range, i.e. when
+	// `a` is behind `b` even if the counters have wrapped.
+	template<typename T>
+	static inline bool circular_less_than(T a, T b)
+	{
+		static_assert(std::is_integral<T>::value && !std::numeric_limits<T>::is_signed, "circular_less_than is intended to be used only with unsigned integer types");
+		// Value with only the top bit of T set.
+		const T halfRange = static_cast<T>(static_cast<T>(1) << (static_cast<T>(sizeof(T) * CHAR_BIT - 1)));
+		// Note: extra parens around rhs of operator<< is MSVC bug: https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931
+		//       silencing the bug requires #pragma warning(disable: 4554) around the calling code and has no effect when done here.
+		const T distance = static_cast<T>(a - b);
+		return distance > halfRange;
+	}
+	
+	// Returns the first address at or after `ptr` that satisfies U's alignment.
+	template<typename U>
+	static inline char* align_for(char* ptr)
+	{
+		const std::size_t alignment = std::alignment_of<U>::value;
+		const std::size_t misalignment = static_cast<std::size_t>(reinterpret_cast<std::uintptr_t>(ptr) % alignment);
+		if (misalignment == 0) {
+			return ptr;		// already aligned
+		}
+		return ptr + (alignment - misalignment);
+	}
+
+	// Rounds `x` up to the nearest power of two (powers of two map to themselves).
+	// An input of 0, or one above the largest representable power of two, wraps to 0.
+	template<typename T>
+	static inline T ceil_to_pow_2(T x)
+	{
+		static_assert(std::is_integral<T>::value && !std::numeric_limits<T>::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types");
+
+		// Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+		// Copy the highest set bit of (x - 1) into every lower position by OR-ing in
+		// shifts of 1, 2, 4, ... bits up to half the width of T, then add one.
+		--x;
+		const std::size_t bitWidth = sizeof(T) << 3;
+		for (std::size_t shift = 1; shift < bitWidth; shift <<= 1) {
+			x |= x >> shift;
+		}
+		++x;
+		return x;
+	}
+	
+	// Exchanges the values held by two atomics using relaxed loads and stores.
+	// The exchange as a whole is not atomic: it is two loads followed by two stores.
+	template<typename T>
+	static inline void swap_relaxed(std::atomic<T>& left, std::atomic<T>& right)
+	{
+		const T leftValue = left.load(std::memory_order_relaxed);
+		const T rightValue = right.load(std::memory_order_relaxed);
+		left.store(rightValue, std::memory_order_relaxed);
+		right.store(leftValue, std::memory_order_relaxed);
+	}
+	
+	// Returns its argument as a const lvalue reference, so that it can only be
+	// copied from, never moved from.
+	template<typename T>
+	static inline T const& nomove(T const& value)
+	{
+		return value;
+	}
+	
+	// Conditional form of nomove. This primary template (used for
+	// nomove_if<true>) hands back a const lvalue reference.
+	template<bool Enable>
+	struct nomove_if
+	{
+		template<typename T>
+		static inline T const& eval(T const& value)
+		{
+			return value;
+		}
+	};
+	
+	// nomove_if<false>: perfectly forwards the argument, preserving its value category.
+	template<>
+	struct nomove_if<false>
+	{
+		template<typename U>
+		static inline auto eval(U&& value)
+			-> decltype(std::forward<U>(value))
+		{
+			return std::forward<U>(value);
+		}
+	};
+	
+	// Dereferences an iterator inside a function declared MOODYCAMEL_NOEXCEPT,
+	// returning exactly what `*iterator` yields.
+	template<typename It>
+	static inline auto deref_noexcept(It& iterator) MOODYCAMEL_NOEXCEPT -> decltype(*iterator)
+	{
+		return *iterator;
+	}
+	
+	// Portable spelling of std::is_trivially_destructible: GCC older than 4.8
+	// (when not masquerading under clang) only ships the pre-standard
+	// std::has_trivial_destructor.
+#if !defined(__clang__) && defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8))
+	template<typename T> struct is_trivially_destructible : std::has_trivial_destructor<T> { };
+#else
+	template<typename T> struct is_trivially_destructible : std::is_trivially_destructible<T> { };
+#endif
+	
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+#ifdef MCDBGQ_USE_RELACY
+	typedef RelacyThreadExitListener ThreadExitListener;
+	typedef RelacyThreadExitNotifier ThreadExitNotifier;
+#else
+	class ThreadExitNotifier;
+
+	// Intrusive list node describing one callback to run when a thread exits.
+	// The subscriber fills in `callback` and `userData`; `next` and `chain` are
+	// written by ThreadExitNotifier.
+	struct ThreadExitListener
+	{
+		typedef void (*callback_t)(void*);
+		callback_t callback;		// invoked as callback(userData) from ~ThreadExitNotifier
+		void* userData;
+		
+		ThreadExitListener* next;		// reserved for use by the ThreadExitNotifier
+		ThreadExitNotifier* chain;		// reserved for use by the ThreadExitNotifier
+	};
+
+	// Per-thread registry of ThreadExitListener nodes. Each thread gets its own
+	// instance (a function-local `static thread_local`), and that instance's
+	// destructor invokes every listener still linked to it. All list manipulation
+	// is serialized by a single mutex shared by every thread.
+	class ThreadExitNotifier
+	{
+	public:
+		// Links `listener` at the head of the calling thread's list and records
+		// which notifier owns it.
+		static void subscribe(ThreadExitListener* listener)
+		{
+			// Fetch (and, on first use, construct) the thread-local instance before taking the lock.
+			auto& tlsInst = instance();
+			std::lock_guard<std::mutex> guard(mutex());
+			listener->next = tlsInst.tail;
+			listener->chain = &tlsInst;
+			tlsInst.tail = listener;
+		}
+		
+		// Unlinks `listener` from whichever notifier it was subscribed to. The owning
+		// notifier is found through listener->chain rather than instance(), so this
+		// does not depend on which thread makes the call.
+		static void unsubscribe(ThreadExitListener* listener)
+		{
+			std::lock_guard<std::mutex> guard(mutex());
+			if (!listener->chain) {
+				return;  // race with ~ThreadExitNotifier
+			}
+			auto& tlsInst = *listener->chain;
+			listener->chain = nullptr;
+			// `prev` always addresses the pointer that leads to `ptr`, so the node
+			// can be spliced out without special-casing the head.
+			ThreadExitListener** prev = &tlsInst.tail;
+			for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) {
+				if (ptr == listener) {
+					*prev = ptr->next;
+					break;
+				}
+				prev = &ptr->next;
+			}
+		}
+		
+	private:
+		ThreadExitNotifier() : tail(nullptr) { }
+		ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION;
+		ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION;
+		
+		// Runs every remaining listener's callback. The callbacks execute while the
+		// shared mutex is held, so a callback that called subscribe() or
+		// unsubscribe() would try to lock the same std::mutex a second time.
+		~ThreadExitNotifier()
+		{
+			// This thread is about to exit, let everyone know!
+			assert(this == &instance() && "If this assert fails, you likely have a buggy compiler! Change the preprocessor conditions such that MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined.");
+			std::lock_guard<std::mutex> guard(mutex());
+			for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) {
+				// Clearing `chain` first makes a later unsubscribe() of this node return early.
+				ptr->chain = nullptr;
+				ptr->callback(ptr->userData);
+			}
+		}
+		
+		// Thread-local
+		static inline ThreadExitNotifier& instance()
+		{
+			static thread_local ThreadExitNotifier notifier;
+			return notifier;
+		}
+
+		// The one mutex guarding every thread's listener list.
+		static inline std::mutex& mutex()
+		{
+			// Must be static because the ThreadExitNotifier could be destroyed while unsubscribe is called
+			static std::mutex mutex;
+			return mutex;
+		}
+		
+	private:
+		// Head of the singly linked listener list (new listeners are linked in front of it).
+		ThreadExitListener* tail;
+	};
+#endif
+#endif
+	
+	// Compile-time lookup of the ATOMIC_*_LOCK_FREE macro value for a type.
+	// static_is_lock_free_num maps the signed integer types to their macro and
+	// yields 0 for any type without a specialization. static_is_lock_free first
+	// converts T with std::make_signed, and has dedicated cases for bool and for
+	// pointers.
+	template<typename T> struct static_is_lock_free_num { enum { value = 0 }; };
+	template<> struct static_is_lock_free_num<signed char> { enum { value = ATOMIC_CHAR_LOCK_FREE }; };
+	template<> struct static_is_lock_free_num<short> { enum { value = ATOMIC_SHORT_LOCK_FREE }; };
+	template<> struct static_is_lock_free_num<int> { enum { value = ATOMIC_INT_LOCK_FREE }; };
+	template<> struct static_is_lock_free_num<long> { enum { value = ATOMIC_LONG_LOCK_FREE }; };
+	template<> struct static_is_lock_free_num<long long> { enum { value = ATOMIC_LLONG_LOCK_FREE }; };
+	template<typename T> struct static_is_lock_free : static_is_lock_free_num<typename std::make_signed<T>::type> {  };
+	template<> struct static_is_lock_free<bool> { enum { value = ATOMIC_BOOL_LOCK_FREE }; };
+	template<typename U> struct static_is_lock_free<U*> { enum { value = ATOMIC_POINTER_LOCK_FREE }; };
+}
+
+
+// Move-only handle that ties its holder to one producer record of a queue.
+// The producer keeps a back-pointer to its token, so every operation that
+// relocates the token rewrites that pointer.
+struct ProducerToken
+{
+	// Constructors taking a queue are defined out of line.
+	template<typename T, typename Traits>
+	explicit ProducerToken(ConcurrentQueue<T, Traits>& queue);
+	
+	template<typename T, typename Traits>
+	explicit ProducerToken(BlockingConcurrentQueue<T, Traits>& queue);
+	
+	// Takes over `other`'s producer and leaves `other` invalid.
+	ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT
+		: producer(other.producer)
+	{
+		other.producer = nullptr;
+		if (producer == nullptr) {
+			return;
+		}
+		producer->token = this;
+	}
+	
+	// Move assignment is a swap: afterwards `other` holds whatever this token held.
+	inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT
+	{
+		swap(other);
+		return *this;
+	}
+	
+	// Exchanges the producers of the two tokens and repairs both back-pointers.
+	void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT
+	{
+		details::ConcurrentQueueProducerTypelessBase* const previous = producer;
+		producer = other.producer;
+		other.producer = previous;
+		
+		if (producer != nullptr) {
+			producer->token = this;
+		}
+		if (other.producer != nullptr) {
+			other.producer->token = &other;
+		}
+	}
+	
+	// A token is always valid unless:
+	//     1) Memory allocation failed during construction
+	//     2) It was moved via the move constructor
+	//        (Note: assignment does a swap, leaving both potentially valid)
+	//     3) The associated queue was destroyed
+	// A true result only says the token is usable with *some* queue; tracking
+	// which queue that is remains the caller's job.
+	inline bool valid() const { return producer != nullptr; }
+	
+	// Detaches from the producer (if any) and flags it inactive.
+	~ProducerToken()
+	{
+		if (producer == nullptr) {
+			return;
+		}
+		producer->token = nullptr;
+		producer->inactive.store(true, std::memory_order_release);
+	}
+	
+	// Disable copying and assignment
+	ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+	ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+	
+private:
+	template<typename T, typename Traits> friend class ConcurrentQueue;
+	friend class ConcurrentQueueTests;
+	
+protected:
+	// The bound producer record, or nullptr when the token is invalid.
+	details::ConcurrentQueueProducerTypelessBase* producer;
+};
+
+
+// Move-only per-consumer cursor. ConcurrentQueue (a friend) reads and writes
+// all of the fields directly.
+struct ConsumerToken
+{
+	// Constructors taking a queue are defined out of line.
+	template<typename T, typename Traits>
+	explicit ConsumerToken(ConcurrentQueue<T, Traits>& q);
+	
+	template<typename T, typename Traits>
+	explicit ConsumerToken(BlockingConcurrentQueue<T, Traits>& q);
+	
+	// Copies every field from `other`; `other` itself is left untouched.
+	ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT
+		: initialOffset(other.initialOffset),
+		lastKnownGlobalOffset(other.lastKnownGlobalOffset),
+		itemsConsumedFromCurrent(other.itemsConsumedFromCurrent),
+		currentProducer(other.currentProducer),
+		desiredProducer(other.desiredProducer)
+	{
+	}
+	
+	// Move assignment is implemented as a swap with `other`.
+	inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT
+	{
+		swap(other);
+		return *this;
+	}
+	
+	// Exchanges all five fields with `other`.
+	void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT
+	{
+		// Producer cursors
+		std::swap(currentProducer, other.currentProducer);
+		std::swap(desiredProducer, other.desiredProducer);
+		// Counters
+		std::swap(initialOffset, other.initialOffset);
+		std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset);
+		std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent);
+	}
+	
+	// Disable copying and assignment
+	ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+	ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+
+private:
+	template<typename T, typename Traits> friend class ConcurrentQueue;
+	friend class ConcurrentQueueTests;
+	
+private: // but shared with ConcurrentQueue
+	std::uint32_t initialOffset;
+	std::uint32_t lastKnownGlobalOffset;
+	std::uint32_t itemsConsumedFromCurrent;
+	details::ConcurrentQueueProducerTypelessBase* currentProducer;
+	details::ConcurrentQueueProducerTypelessBase* desiredProducer;
+};
+
+// Need to forward-declare this swap because it's in a namespace.
+// See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces
+template<typename T, typename Traits>
+inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a, typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT;
+
+
+template<typename T, typename Traits = ConcurrentQueueDefaultTraits>
+class ConcurrentQueue
+{
+public:
+	typedef ::moodycamel::ProducerToken producer_token_t;
+	typedef ::moodycamel::ConsumerToken consumer_token_t;
+	
+	typedef typename Traits::index_t index_t;
+	typedef typename Traits::size_t size_t;
+	
+	// Class-level copies of the Traits tunables, each cast to the type the queue
+	// uses internally.
+	static const size_t BLOCK_SIZE = static_cast<size_t>(Traits::BLOCK_SIZE);
+	static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast<size_t>(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD);
+	static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::EXPLICIT_INITIAL_INDEX_SIZE);
+	static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::IMPLICIT_INITIAL_INDEX_SIZE);
+	static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast<size_t>(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE);
+	static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast<std::uint32_t>(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE);
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4307)		// + integral constant overflow (that's what the ternary expression is for!)
+#pragma warning(disable: 4309)		// static_cast: Truncation of constant value
+#endif
+	// Traits::MAX_SUBQUEUE_SIZE rounded up to a whole number of blocks. When the
+	// trait is within BLOCK_SIZE of the maximum size_t value the rounding could
+	// overflow, so the maximum value itself is used instead.
+	static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max<size_t>::value - static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? details::const_numeric_max<size_t>::value : ((static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE);
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+	static_assert(!std::numeric_limits<size_t>::is_signed && std::is_integral<size_t>::value, "Traits::size_t must be an unsigned integral type");
+	static_assert(!std::numeric_limits<index_t>::is_signed && std::is_integral<index_t>::value, "Traits::index_t must be an unsigned integral type");
+	static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t");
+	static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)");
+	static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)");
+	static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)");
+	static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)");
+	static_assert((INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2");
+	static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)");
+
+public:
+	// Builds a queue with room for at least `capacity` elements up front.
+	// How many elements can really be inserted without further allocation
+	// depends on the block size and on the number of producers, because blocks
+	// are never shared between producers (e.g. when `capacity` equals the block
+	// size only one block is pre-allocated, so only one producer can enqueue
+	// without triggering an allocation).
+	// Not thread-safe: the caller must make sure construction has finished, and
+	// that its memory effects are visible (possibly via a memory barrier),
+	// before any other thread touches the queue.
+	explicit ConcurrentQueue(size_t capacity = 32 * BLOCK_SIZE)
+		: producerListTail(nullptr),
+		producerCount(0),
+		initialBlockPoolIndex(0),
+		nextExplicitConsumerId(0),
+		globalExplicitConsumerOffset(0)
+	{
+		implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
+		populate_initial_implicit_producer_hash();
+		
+		// Round the capacity up to whole blocks. BLOCK_SIZE is a power of two,
+		// so masking with (BLOCK_SIZE - 1) yields the remainder.
+		size_t initialBlocks = capacity / BLOCK_SIZE;
+		if ((capacity & (BLOCK_SIZE - 1)) != 0) {
+			++initialBlocks;
+		}
+		populate_initial_block_list(initialBlocks);
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+		// Keep a fully typed list of each kind of producer so they can be
+		// inspected from the root queue object in a debugger (the alternative
+		// is casts that the debugger's expression evaluator cannot compile).
+		explicitProducers.store(nullptr, std::memory_order_relaxed);
+		implicitProducers.store(nullptr, std::memory_order_relaxed);
+#endif
+	}
+	
+	// Computes the number of pre-allocated blocks for you, based on the minimum
+	// number of elements you want available at any given time and the maximum
+	// concurrent number of each type of producer.
+	//   minCapacity          - minimum number of element slots wanted; 0 is allowed
+	//                          and requests only the per-producer blocks.
+	//   maxExplicitProducers - maximum number of concurrent explicit (token) producers.
+	//   maxImplicitProducers - maximum number of concurrent implicit producers.
+	// Not thread-safe; the same visibility rules apply as for any construction.
+	ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers)
+		: producerListTail(nullptr),
+		producerCount(0),
+		initialBlockPoolIndex(0),
+		nextExplicitConsumerId(0),
+		globalExplicitConsumerOffset(0)
+	{
+		implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
+		populate_initial_implicit_producer_hash();
+		// Whole blocks needed to hold minCapacity elements. The formula uses one
+		// fewer than that per (explicit producer + 1); with minCapacity == 0 the
+		// unguarded "- 1" wrapped around on the unsigned type and produced a bogus
+		// block count, so clamp that term at zero.
+		const size_t wholeBlocks = (minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE;
+		const size_t sharedBlocks = (wholeBlocks == 0) ? 0 : (wholeBlocks - 1);
+		size_t blocks = sharedBlocks * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers);
+		populate_initial_block_list(blocks);
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+		explicitProducers.store(nullptr, std::memory_order_relaxed);
+		implicitProducers.store(nullptr, std::memory_order_relaxed);
+#endif
+	}
+	
+	// Note: The queue should not be accessed concurrently while it's
+	// being deleted. It's up to the user to synchronize this.
+	// This method is not thread safe.
+	// Teardown runs in four stages: producers, dynamically allocated implicit
+	// producer hash tables, dynamically allocated blocks on the global free
+	// list, and finally the initial block pool.
+	~ConcurrentQueue()
+	{
+		// Destroy producers
+		auto ptr = producerListTail.load(std::memory_order_relaxed);
+		while (ptr != nullptr) {
+			// Read the successor before `ptr` is destroyed.
+			auto next = ptr->next_prod();
+			if (ptr->token != nullptr) {
+				// Invalidate the outstanding token so that its valid() returns false
+				// and its destructor no longer dereferences the producer.
+				ptr->token->producer = nullptr;
+			}
+			destroy(ptr);
+			ptr = next;
+		}
+		
+		// Destroy implicit producer hash tables
+		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) {
+			auto hash = implicitProducerHash.load(std::memory_order_relaxed);
+			while (hash != nullptr) {
+				auto prev = hash->prev;
+				if (prev != nullptr) {		// The last hash is part of this object and was not allocated dynamically
+					for (size_t i = 0; i != hash->capacity; ++i) {
+						hash->entries[i].~ImplicitProducerKVP();
+					}
+					hash->~ImplicitProducerHash();
+					(Traits::free)(hash);
+				}
+				hash = prev;
+			}
+		}
+		
+		// Destroy global free list
+		auto block = freeList.head_unsafe();
+		while (block != nullptr) {
+			auto next = block->freeListNext.load(std::memory_order_relaxed);
+			// Only blocks flagged as dynamically allocated are destroyed individually here.
+			if (block->dynamicallyAllocated) {
+				destroy(block);
+			}
+			block = next;
+		}
+		
+		// Destroy initial free list
+		destroy_array(initialBlockPool, initialBlockPoolSize);
+	}
+
+	// Disable copying and copy assignment
+	ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION;
+	ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION;
+	
+	// Moving is supported, but note that it is *not* a thread-safe operation.
+	// Nobody can use the queue while it's being moved, and the memory effects
+	// of that move must be propagated to other threads before they can use it.
+	// Note: When a queue is moved, its tokens are still valid but can only be
+	// used with the destination queue (i.e. semantically they are moved along
+	// with the queue itself).
+	ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT
+		// Scalar and pointer state is read out of `other` with relaxed loads; the
+		// free list is move-constructed.
+		: producerListTail(other.producerListTail.load(std::memory_order_relaxed)),
+		producerCount(other.producerCount.load(std::memory_order_relaxed)),
+		initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)),
+		initialBlockPool(other.initialBlockPool),
+		initialBlockPoolSize(other.initialBlockPoolSize),
+		freeList(std::move(other.freeList)),
+		nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)),
+		globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed))
+	{
+		// Move the other one into this, and leave the other one as an empty queue
+		implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
+		// Set up this queue's own initial hash first, then exchange hashes with `other`.
+		populate_initial_implicit_producer_hash();
+		swap_implicit_producer_hashes(other);
+		
+		// Reset `other` to the same values a default-constructed queue starts with.
+		other.producerListTail.store(nullptr, std::memory_order_relaxed);
+		other.producerCount.store(0, std::memory_order_relaxed);
+		other.nextExplicitConsumerId.store(0, std::memory_order_relaxed);
+		other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed);
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+		explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed);
+		other.explicitProducers.store(nullptr, std::memory_order_relaxed);
+		implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed);
+		other.implicitProducers.store(nullptr, std::memory_order_relaxed);
+#endif
+		
+		// `other` no longer owns the initial block pool.
+		other.initialBlockPoolIndex.store(0, std::memory_order_relaxed);
+		other.initialBlockPoolSize = 0;
+		other.initialBlockPool = nullptr;
+		
+		// NOTE(review): presumably repoints the adopted producers at this queue --
+		// reown_producers() is defined outside this excerpt; confirm there.
+		reown_producers();
+	}
+	
+	// Move assignment exchanges state with `other` instead of discarding this
+	// queue's contents. Not thread-safe.
+	inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT
+	{
+		ConcurrentQueue& self = swap_internal(other);
+		return self;
+	}
+	
+	// Exchanges the complete state of this queue with `other`. Not thread-safe.
+	// Tokens survive the swap, but they follow the state they were created for:
+	// a token made for one queue must afterwards be used with the queue that
+	// now holds that state.
+	inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT
+	{
+		static_cast<void>(swap_internal(other));
+	}
+	
+private:
+	// Shared implementation of swap() and move assignment. Exchanges every piece
+	// of queue state with `other` and returns *this; self-swap is a no-op.
+	ConcurrentQueue& swap_internal(ConcurrentQueue& other)
+	{
+		if (this != &other) {
+			details::swap_relaxed(producerListTail, other.producerListTail);
+			details::swap_relaxed(producerCount, other.producerCount);
+			details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex);
+			std::swap(initialBlockPool, other.initialBlockPool);
+			std::swap(initialBlockPoolSize, other.initialBlockPoolSize);
+			freeList.swap(other.freeList);
+			details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId);
+			details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset);
+			
+			swap_implicit_producer_hashes(other);
+			
+			// Both queues now hold the other's producers; fix up each side.
+			reown_producers();
+			other.reown_producers();
+			
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+			details::swap_relaxed(explicitProducers, other.explicitProducers);
+			details::swap_relaxed(implicitProducers, other.implicitProducers);
+#endif
+		}
+		
+		return *this;
+	}
+	
+public:
+	// Enqueues one item by copy, without a producer token.
+	// May allocate. Fails only if allocation fails, if implicit production is
+	// disabled (Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0), or if
+	// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed.
+	// Thread-safe.
+	inline bool enqueue(T const& item)
+	{
+		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) {
+			return inner_enqueue<CanAlloc>(item);
+		}
+		else {
+			return false;
+		}
+	}
+	
+	// Enqueues one item by move (where possible), without a producer token.
+	// May allocate. Fails only if allocation fails, if implicit production is
+	// disabled (Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0), or if
+	// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed.
+	// Thread-safe.
+	inline bool enqueue(T&& item)
+	{
+		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) {
+			return inner_enqueue<CanAlloc>(std::move(item));
+		}
+		else {
+			return false;
+		}
+	}
+	
+	// Enqueues one item by copy through an explicit producer token.
+	// May allocate. Fails only if allocation fails, or if
+	// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed.
+	// Thread-safe.
+	inline bool enqueue(producer_token_t const& token, T const& item)
+	{
+		const bool enqueued = inner_enqueue<CanAlloc>(token, item);
+		return enqueued;
+	}
+	
+	// Enqueues one item by move (where possible) through an explicit producer token.
+	// May allocate. Fails only if allocation fails, or if
+	// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed.
+	// Thread-safe.
+	inline bool enqueue(producer_token_t const& token, T&& item)
+	{
+		const bool enqueued = inner_enqueue<CanAlloc>(token, std::move(item));
+		return enqueued;
+	}
+	
+	// Enqueues `count` items read from `itemFirst`, without a producer token.
+	// May allocate. Fails only if allocation fails, if implicit production is
+	// disabled (Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0), or if
+	// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed.
+	// Note: wrap the iterator with std::make_move_iterator to move the elements
+	// instead of copying them.
+	// Thread-safe.
+	template<typename It>
+	bool enqueue_bulk(It itemFirst, size_t count)
+	{
+		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) {
+			return inner_enqueue_bulk<CanAlloc>(itemFirst, count);
+		}
+		else {
+			return false;
+		}
+	}
+	
+	// Enqueues `count` items read from `itemFirst` through an explicit producer token.
+	// May allocate. Fails only if allocation fails, or if
+	// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed.
+	// Note: wrap the iterator with std::make_move_iterator to move the elements
+	// instead of copying them.
+	// Thread-safe.
+	template<typename It>
+	bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
+	{
+		const bool enqueued = inner_enqueue_bulk<CanAlloc>(token, itemFirst, count);
+		return enqueued;
+	}
+	
+	// Enqueues one item by copy, without a producer token.
+	// Does not allocate memory. Fails if there is not enough room, or if
+	// implicit production is disabled because
+	// Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0.
+	// Thread-safe.
+	inline bool try_enqueue(T const& item)
+	{
+		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) {
+			return inner_enqueue<CannotAlloc>(item);
+		}
+		else {
+			return false;
+		}
+	}
+	
+	// Enqueues one item by move (where possible), without a producer token.
+	// Does not allocate memory (except for one-time implicit producer).
+	// Fails if there is not enough room, or if implicit production is
+	// disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0.
+	// Thread-safe.
+	inline bool try_enqueue(T&& item)
+	{
+		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) {
+			return inner_enqueue<CannotAlloc>(std::move(item));
+		}
+		else {
+			return false;
+		}
+	}
+	
+	// Enqueues one item by copy through an explicit producer token.
+	// Does not allocate memory. Fails if there is not enough room.
+	// Thread-safe.
+	inline bool try_enqueue(producer_token_t const& token, T const& item)
+	{
+		const bool enqueued = inner_enqueue<CannotAlloc>(token, item);
+		return enqueued;
+	}
+	
+	// Enqueues one item by move (where possible) through an explicit producer token.
+	// Does not allocate memory. Fails if there is not enough room.
+	// Thread-safe.
+	inline bool try_enqueue(producer_token_t const& token, T&& item)
+	{
+		const bool enqueued = inner_enqueue<CannotAlloc>(token, std::move(item));
+		return enqueued;
+	}
+	
+	// Enqueues `count` items read from `itemFirst`, without a producer token.
+	// Does not allocate memory (except for one-time implicit producer).
+	// Fails if there is not enough room, or if implicit production is
+	// disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0.
+	// Note: wrap the iterator with std::make_move_iterator to move the elements
+	// instead of copying them.
+	// Thread-safe.
+	template<typename It>
+	bool try_enqueue_bulk(It itemFirst, size_t count)
+	{
+		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) {
+			return inner_enqueue_bulk<CannotAlloc>(itemFirst, count);
+		}
+		else {
+			return false;
+		}
+	}
+	
+	// Enqueues `count` items read from `itemFirst` through an explicit producer token.
+	// Does not allocate memory. Fails if there is not enough room.
+	// Note: wrap the iterator with std::make_move_iterator to move the elements
+	// instead of copying them.
+	// Thread-safe.
+	template<typename It>
+	bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
+	{
+		const bool enqueued = inner_enqueue_bulk<CannotAlloc>(token, itemFirst, count);
+		return enqueued;
+	}
+	
+	
+	
+	// Tries to dequeue one element into `item`.
+	// Returns false if every producer stream looked empty at the moment it was
+	// checked, so the queue is likely (but not guaranteed) to be empty.
+	// Never allocates. Thread-safe.
+	template<typename U>
+	bool try_dequeue(U& item)
+	{
+		// Trying producers strictly in order would pile contention onto the first
+		// one, so sample up to three non-empty producers and prefer the one that
+		// appears to hold the most elements.
+		size_t candidatesSeen = 0;
+		size_t largestSize = 0;
+		ProducerBase* preferred = nullptr;
+		auto producer = producerListTail.load(std::memory_order_acquire);
+		while (candidatesSeen < 3 && producer != nullptr) {
+			auto approxSize = producer->size_approx();
+			if (approxSize > 0) {
+				if (approxSize > largestSize) {
+					largestSize = approxSize;
+					preferred = producer;
+				}
+				++candidatesSeen;
+			}
+			producer = producer->next_prod();
+		}
+		
+		if (candidatesSeen == 0) {
+			return false;
+		}
+		
+		if ((details::likely)(preferred->dequeue(item))) {
+			return true;
+		}
+		
+		// The preferred producer looked non-empty but was drained before we got
+		// to it, so every other producer has to be tried before giving up.
+		for (auto other = producerListTail.load(std::memory_order_acquire); other != nullptr; other = other->next_prod()) {
+			if (other == preferred) {
+				continue;
+			}
+			if (other->dequeue(item)) {
+				return true;
+			}
+		}
+		return false;
+	}
+	
+	// Tries to dequeue one element into `item`, visiting the producers strictly
+	// in list order.
+	// Returns false if every producer stream looked empty at the moment it was
+	// checked, so the queue is likely (but not guaranteed) to be empty.
+	// Unlike try_dequeue(item), this makes no attempt to spread consumers across
+	// producers. That can lower throughput under contention, but it gives more
+	// predictable results with a single consumer thread. It is mostly useful
+	// for internal unit tests.
+	// Never allocates. Thread-safe.
+	template<typename U>
+	bool try_dequeue_non_interleaved(U& item)
+	{
+		auto producer = producerListTail.load(std::memory_order_acquire);
+		while (producer != nullptr) {
+			if (producer->dequeue(item)) {
+				return true;
+			}
+			producer = producer->next_prod();
+		}
+		return false;
+	}
+	
+	// Attempts to dequeue from the queue using an explicit consumer token.
+	// Returns false if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename U>
+	bool try_dequeue(consumer_token_t& token, U& item)
+	{
+		// The idea is roughly as follows:
+		// Every EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less
+		// If you see that the global offset has changed, you must reset your consumption counter and move to your designated place
+		// If there are no items where you're supposed to be, keep moving until you find a producer with some items
+		// If the global offset has not changed but you've run out of items to consume, move over from your current position until you find a producer with something in it
+		
+		if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {
+			if (!update_current_producer_after_rotation(token)) {
+				return false;	// the token was never positioned and the producer list is empty
+			}
+		}
+		
+		// Fast path: take from the producer this token is currently assigned to.
+		// Hitting the quota exactly bumps the global offset, which makes every token re-position itself.
+		if (static_cast<ProducerBase*>(token.currentProducer)->dequeue(item)) {
+			if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {
+				globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);
+			}
+			return true;
+		}
+		// Slow path: the assigned producer looked empty, so walk the list once (wrapping from its end to the tail).
+		auto tail = producerListTail.load(std::memory_order_acquire);
+		auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod();
+		if (ptr == nullptr) {
+			ptr = tail;
+		}
+		while (ptr != static_cast<ProducerBase*>(token.currentProducer)) {
+			if (ptr->dequeue(item)) {
+				token.currentProducer = ptr;	// stick with the producer that had something
+				token.itemsConsumedFromCurrent = 1;	// restart the quota count for the new producer
+				return true;
+			}
+			ptr = ptr->next_prod();
+			if (ptr == nullptr) {
+				ptr = tail;
+			}
+		}
+		return false;	// came full circle without finding an element
+	}
+	
+	// Tries to pop up to `max` elements, draining producers in list order.
+	// Returns how many elements were actually dequeued.
+	// A result of 0 means every producer stream appeared empty when it was
+	// checked (so the queue is likely, but not guaranteed, to be empty).
+	// Never allocates. Thread-safe.
+	template<typename It>
+	size_t try_dequeue_bulk(It itemFirst, size_t max)
+	{
+		size_t total = 0;
+		auto producer = producerListTail.load(std::memory_order_acquire);
+		while (producer != nullptr) {
+			total += producer->dequeue_bulk(itemFirst, max - total);
+			// Once the request is satisfied the walk ends; otherwise move on.
+			producer = (total == max) ? nullptr : producer->next_prod();
+		}
+		return total;
+	}
+	
+	// Attempts to dequeue several elements from the queue using an explicit consumer token.
+	// Returns the number of items actually dequeued (at most `max`), written through itemFirst.
+	// Returns 0 if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename It>
+	size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max)
+	{
+		if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {
+			if (!update_current_producer_after_rotation(token)) {
+				return 0;	// the token was never positioned and the producer list is empty
+			}
+		}
+		// Fast path: try to satisfy the whole request from the token's current producer.
+		size_t count = static_cast<ProducerBase*>(token.currentProducer)->dequeue_bulk(itemFirst, max);
+		if (count == max) {
+			if ((token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {
+				globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);	// quota reached: make every token rotate
+			}
+			return max;
+		}
+		token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(count);
+		max -= count;	// from here on `max` is the number of elements still wanted
+		// Slow path: collect the remainder from the other producers, wrapping from the list's end to the tail.
+		auto tail = producerListTail.load(std::memory_order_acquire);
+		auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod();
+		if (ptr == nullptr) {
+			ptr = tail;
+		}
+		while (ptr != static_cast<ProducerBase*>(token.currentProducer)) {
+			auto dequeued = ptr->dequeue_bulk(itemFirst, max);	// ProducerBase::dequeue_bulk takes itemFirst by reference
+			count += dequeued;
+			if (dequeued != 0) {
+				token.currentProducer = ptr;	// also moves the loop's end sentinel to this producer
+				token.itemsConsumedFromCurrent = static_cast<std::uint32_t>(dequeued);
+			}
+			if (dequeued == max) {
+				break;	// request fully satisfied
+			}
+			max -= dequeued;
+			ptr = ptr->next_prod();
+			if (ptr == nullptr) {
+				ptr = tail;
+			}
+		}
+		return count;
+	}
+	
+	
+	
+	// Tries to pop one element directly from the sub-queue behind `producer`.
+	// When the producer to drain is already known, this is significantly
+	// faster than the general-case try_dequeue methods.
+	// Returns false if that sub-queue appeared empty when checked (so it is
+	// likely, but not guaranteed, to be empty). Never allocates. Thread-safe.
+	template<typename U>
+	inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item)
+	{
+		auto source = static_cast<ExplicitProducer*>(producer.producer);
+		return source->dequeue(item);
+	}
+	
+	// Tries to pop up to `max` elements directly from the sub-queue behind
+	// `producer`, returning the number of elements actually dequeued.
+	// When the producer to drain is already known, this is significantly
+	// faster than the general-case try_dequeue methods.
+	// Returns 0 if that sub-queue appeared empty when checked (so it is likely,
+	// but not guaranteed, to be empty). Never allocates. Thread-safe.
+	template<typename It>
+	inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max)
+	{
+		auto source = static_cast<ExplicitProducer*>(producer.producer);
+		return source->dequeue_bulk(itemFirst, max);
+	}
+	
+	
+	// Estimates how many elements are currently in the queue by summing the
+	// approximate size of every producer. The figure is only accurate once the
+	// queue has fully stabilized: all enqueue/dequeue operations finished, their
+	// effects visible to this thread, and none started meanwhile. Thread-safe.
+	size_t size_approx() const
+	{
+		size_t total = 0;
+		auto producer = producerListTail.load(std::memory_order_acquire);
+		while (producer != nullptr) {
+			total += producer->size_approx();
+			producer = producer->next_prod();
+		}
+		return total;
+	}
+	
+	
+	// Reports whether all of the atomic types the queue relies on are
+	// lock-free on this platform (they should be on most platforms).
+	// Evaluated entirely at compile time. Thread-safe.
+	static constexpr bool is_lock_free()
+	{
+		return
+			details::static_is_lock_free<void*>::value == 2 &&
+			details::static_is_lock_free<bool>::value == 2 &&
+			details::static_is_lock_free<std::uint32_t>::value == 2 &&
+			details::static_is_lock_free<size_t>::value == 2 &&
+			details::static_is_lock_free<index_t>::value == 2 &&
+			details::static_is_lock_free<typename details::thread_id_converter<details::thread_id_t>::thread_id_numeric_size_t>::value == 2;
+	}
+
+
+private:
+	friend struct ProducerToken;	// friends listed here may use the private members declared below
+	friend struct ConsumerToken;
+	struct ExplicitProducer;	// forward declaration; the definition appears further down in this class
+	friend struct ExplicitProducer;
+	struct ImplicitProducer;	// forward declaration
+	friend struct ImplicitProducer;
+	friend class ConcurrentQueueTests;	// NOTE(review): presumably the unit-test fixture -- confirm
+	// Template argument of the inner_enqueue helpers; with CannotAlloc, ExplicitProducer::enqueue fails rather than allocating a new block index.
+	enum AllocationMode { CanAlloc, CannotAlloc };
+	
+	
+	///////////////////////////////
+	// Queue methods
+	///////////////////////////////
+	
+	template<AllocationMode canAlloc, typename U>	// Enqueues one element through the explicit producer referenced by `token`.
+	inline bool inner_enqueue(producer_token_t const& token, U&& element)
+	{
+		return static_cast<ExplicitProducer*>(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue<canAlloc>(std::forward<U>(element));	// returns the producer's success flag unchanged
+	}
+	
+	template<AllocationMode canAlloc, typename U>
+	inline bool inner_enqueue(U&& element)
+	{
+		auto implicitProducer = get_or_add_implicit_producer();	// nullptr: no producer available, so the enqueue fails
+		return implicitProducer != nullptr && implicitProducer->ConcurrentQueue::ImplicitProducer::template enqueue<canAlloc>(std::forward<U>(element));
+	}
+	
+	template<AllocationMode canAlloc, typename It>	// Enqueues `count` elements starting at itemFirst through the explicit producer referenced by `token`.
+	inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
+	{
+		return static_cast<ExplicitProducer*>(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue_bulk<canAlloc>(itemFirst, count);	// returns the producer's success flag unchanged
+	}
+	
+	template<AllocationMode canAlloc, typename It>
+	inline bool inner_enqueue_bulk(It itemFirst, size_t count)
+	{
+		auto implicitProducer = get_or_add_implicit_producer();	// nullptr: no producer available, so the enqueue fails
+		return implicitProducer != nullptr && implicitProducer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk<canAlloc>(itemFirst, count);
+	}
+	
+	inline bool update_current_producer_after_rotation(consumer_token_t& token)	// Re-positions `token` on its designated producer; false only if it was never positioned and no producer exists.
+	{
+		// Ah, there's been a rotation, figure out where we should be!
+		auto tail = producerListTail.load(std::memory_order_acquire);
+		if (token.desiredProducer == nullptr && tail == nullptr) {
+			return false;
+		}
+		auto prodCount = producerCount.load(std::memory_order_relaxed);
+		auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed);
+		if ((details::unlikely)(token.desiredProducer == nullptr)) {
+			// Aha, first time we're dequeueing anything.
+			// Figure out our local position
+			// Note: offset is from start, not end, but we're traversing from end -- subtract from count first
+			std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount);
+			token.desiredProducer = tail;
+			for (std::uint32_t i = 0; i != offset; ++i) {
+				token.desiredProducer = static_cast<ProducerBase*>(token.desiredProducer)->next_prod();
+				if (token.desiredProducer == nullptr) {
+					token.desiredProducer = tail;	// ran off the end of the list: wrap around to the tail
+				}
+			}
+		}
+		// Advance by the number of rotations missed since the token last synchronized (unsigned subtraction).
+		std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset;
+		if (delta >= prodCount) {
+			delta = delta % prodCount;	// whole laps around the producer list change nothing
+		}
+		for (std::uint32_t i = 0; i != delta; ++i) {
+			token.desiredProducer = static_cast<ProducerBase*>(token.desiredProducer)->next_prod();
+			if (token.desiredProducer == nullptr) {
+				token.desiredProducer = tail;
+			}
+		}
+		// Record the offset we synchronized to and restart consumption at the designated producer.
+		token.lastKnownGlobalOffset = globalOffset;
+		token.currentProducer = token.desiredProducer;
+		token.itemsConsumedFromCurrent = 0;
+		return true;
+	}
+	
+	
+	///////////////////////////
+	// Free list
+	///////////////////////////
+	
+	template <typename N>
+	struct FreeListNode
+	{
+		std::atomic<std::uint32_t> freeListRefs;	// reference count, starts at 0
+		std::atomic<N*> freeListNext;	// link to the next node, starts as nullptr
+		
+		FreeListNode() : freeListRefs(0), freeListNext(nullptr) { }
+	};
+	
+	// A simple CAS-based lock-free free list. Not the fastest thing in the world under heavy contention, but
+	// simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly
+	// speedy under low contention. Each node's freeListRefs packs a count (REFS_MASK bits) and a SHOULD_BE_ON_FREELIST flag.
+	template<typename N>		// N must inherit FreeListNode or have the same fields (and initialization of them)
+	struct FreeList
+	{
+		FreeList() : freeListHead(nullptr) { }
+		FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); }	// takes over the head; `other` is left empty
+		void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); }
+		
+		FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION;	// not copyable
+		FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION;
+		// Puts `node` on the list now if nobody references it; otherwise only flags it, and the last referencing thread adds it.
+		inline void add(N* node)
+		{
+#ifdef MCDBGQ_NOLOCKFREE_FREELIST
+			debug::DebugLock lock(mutex);
+#endif		
+			// We know that the should-be-on-freelist bit is 0 at this point, so it's safe to
+			// set it using a fetch_add
+			if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) {
+				// Oh look! We were the last ones referencing this node, and we know
+				// we want to add it to the free list, so let's do it!
+		 		add_knowing_refcount_is_zero(node);
+			}
+		}
+		// Removes and returns one node, or nullptr once the head is observed to be null.
+		inline N* try_get()
+		{
+#ifdef MCDBGQ_NOLOCKFREE_FREELIST
+			debug::DebugLock lock(mutex);
+#endif		
+			auto head = freeListHead.load(std::memory_order_acquire);
+			while (head != nullptr) {
+				auto prevHead = head;	// the failed CAS below overwrites `head`, so remember whose refcount we bumped
+				auto refs = head->freeListRefs.load(std::memory_order_relaxed);
+				if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire)) {
+					head = freeListHead.load(std::memory_order_acquire);	// count was zero or changed under us: re-read the head and retry
+					continue;
+				}
+				
+				// Good, reference count has been incremented (it wasn't at zero), which means we can read the
+				// next and not worry about it changing between now and the time we do the CAS
+				auto next = head->freeListNext.load(std::memory_order_relaxed);
+				if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) {
+					// Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no
+					// matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on).
+					assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0);
+					
+					// Decrease refcount twice, once for our ref, and once for the list's ref
+					head->freeListRefs.fetch_sub(2, std::memory_order_release);
+					return head;
+				}
+				
+				// OK, the head must have changed on us, but we still need to decrease the refcount we increased.
+				// Note that we don't need to release any memory effects, but we do need to ensure that the reference
+				// count decrement happens-after the CAS on the head.
+				refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel);
+				if (refs == SHOULD_BE_ON_FREELIST + 1) {	// we held the last reference and the node is flagged for (re-)insertion
+					add_knowing_refcount_is_zero(prevHead);
+				}
+			}
+			
+			return nullptr;
+		}
+		
+		// Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes)
+		N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); }
+		
+	private:
+		inline void add_knowing_refcount_is_zero(N* node)	// pushes `node` as the new head; caller guarantees its count is zero
+		{
+			// Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run
+			// only one copy of this method per node at a time, i.e. the single thread case), then we know
+			// we can safely change the next pointer of the node; however, once the refcount is back above
+			// zero, then other threads could increase it (happens under heavy contention, when the refcount
+			// goes to zero in between a load and a refcount increment of a node in try_get, then back up to
+			// something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS
+			// to add the node to the actual list fails, decrease the refcount and leave the add operation to
+			// the next thread who puts the refcount back at zero (which could be us, hence the loop).
+			auto head = freeListHead.load(std::memory_order_relaxed);
+			while (true) {
+				node->freeListNext.store(head, std::memory_order_relaxed);
+				node->freeListRefs.store(1, std::memory_order_release);	// 1 == the list's own reference, flag bit cleared
+				if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) {
+					// Hmm, the add failed, but we can only try again when the refcount goes back to zero
+					if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_acq_rel) == 1) {	// drops the list's reference and sets the flag in one step
+						continue;
+					}
+				}
+				return;
+			}
+		}
+		
+	private:
+		// Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention)
+		std::atomic<N*> freeListHead;
+	// Layout of freeListRefs: the low 31 bits hold the reference count, the top bit is the should-be-on-freelist flag.
+	static const std::uint32_t REFS_MASK = 0x7FFFFFFF;
+	static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000;
+		
+#ifdef MCDBGQ_NOLOCKFREE_FREELIST
+		debug::DebugMutex mutex;
+#endif
+	};
+	
+	
+	///////////////////////////
+	// Block
+	///////////////////////////
+	
+	enum InnerQueueContext { implicit_context = 0, explicit_context = 1 };	// template tag selecting how Block tracks emptiness
+	// Storage for BLOCK_SIZE elements of T plus the bookkeeping that tells when every slot has been dequeued.
+	struct Block
+	{
+		Block()
+			: next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), dynamicallyAllocated(true)
+		{	// emptyFlags is not set here; ExplicitProducer::enqueue calls reset_empty() on a block before using it
+#ifdef MCDBGQ_TRACKMEM
+			owner = nullptr;
+#endif
+		}
+		// True once every slot has been marked dequeued (per-slot flags for small explicit blocks, a counter otherwise).
+		template<InnerQueueContext context>
+		inline bool is_empty() const
+		{
+			MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Check flags
+				for (size_t i = 0; i < BLOCK_SIZE; ++i) {
+					if (!emptyFlags[i].load(std::memory_order_relaxed)) {
+						return false;
+					}
+				}
+				
+				// Aha, empty; make sure we have all other memory effects that happened before the empty flags were set
+				std::atomic_thread_fence(std::memory_order_acquire);
+				return true;
+			}
+			else {
+				// Check counter
+				if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) {
+					std::atomic_thread_fence(std::memory_order_acquire);
+					return true;
+				}
+				assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE);
+				return false;
+			}
+		}
+		
+		// Marks the slot of index i as dequeued. Returns true if the block is now empty (does not apply in explicit context)
+		template<InnerQueueContext context>
+		inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i)
+		{
+			MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Set flag (flags are stored in reverse slot order: slot s maps to emptyFlags[BLOCK_SIZE - 1 - s])
+				assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1))].load(std::memory_order_relaxed));
+				emptyFlags[BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1))].store(true, std::memory_order_release);
+				return false;
+			}
+			else {
+				// Increment counter
+				auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_acq_rel);
+				assert(prevVal < BLOCK_SIZE);
+				return prevVal == BLOCK_SIZE - 1;	// true only for the call that marks the last outstanding slot
+			}
+		}
+		
+		// Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0).
+		// Returns true if the block is now empty (does not apply in explicit context).
+		template<InnerQueueContext context>
+		inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, size_t count)
+		{
+			MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Set flags (one release fence up front, so the individual stores can be relaxed)
+				std::atomic_thread_fence(std::memory_order_release);
+				i = BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1)) - count + 1;	// lowest flag index of the reversed range
+				for (size_t j = 0; j != count; ++j) {
+					assert(!emptyFlags[i + j].load(std::memory_order_relaxed));
+					emptyFlags[i + j].store(true, std::memory_order_relaxed);
+				}
+				return false;
+			}
+			else {
+				// Increment counter
+				auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_acq_rel);
+				assert(prevVal + count <= BLOCK_SIZE);
+				return prevVal + count == BLOCK_SIZE;
+			}
+		}
+		// Marks the whole block as empty (relaxed stores only).
+		template<InnerQueueContext context>
+		inline void set_all_empty()
+		{
+			MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Set all flags
+				for (size_t i = 0; i != BLOCK_SIZE; ++i) {
+					emptyFlags[i].store(true, std::memory_order_relaxed);
+				}
+			}
+			else {
+				// Set the counter to its "everything dequeued" value
+				elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed);
+			}
+		}
+		// Marks the whole block as not empty, i.e. no slot dequeued yet (relaxed stores only).
+		template<InnerQueueContext context>
+		inline void reset_empty()
+		{
+			MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Reset flags
+				for (size_t i = 0; i != BLOCK_SIZE; ++i) {
+					emptyFlags[i].store(false, std::memory_order_relaxed);
+				}
+			}
+			else {
+				// Reset counter
+				elementsCompletelyDequeued.store(0, std::memory_order_relaxed);
+			}
+		}
+		// Address of the slot for queue index idx; the index is masked with BLOCK_SIZE - 1 to get the in-block offset.
+		inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast<T*>(static_cast<void*>(elements)) + static_cast<size_t>(idx & static_cast<index_t>(BLOCK_SIZE - 1)); }
+		inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast<T const*>(static_cast<void const*>(elements)) + static_cast<size_t>(idx & static_cast<index_t>(BLOCK_SIZE - 1)); }
+		
+	private:
+		static_assert(std::alignment_of<T>::value <= sizeof(T), "The queue does not support types with an alignment greater than their size at this time");
+		MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements;	// raw storage; elements are placement-constructed and destroyed by the producers
+	public:
+		Block* next;	// link used by ExplicitProducer to keep its blocks in a circular list
+		std::atomic<size_t> elementsCompletelyDequeued;	// counter used when the flag array is not
+		std::atomic<bool> emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? BLOCK_SIZE : 1];	// per-slot flags; shrunk to one entry when the counter is used instead
+	public:
+		std::atomic<std::uint32_t> freeListRefs;	// freeListRefs/freeListNext are the fields FreeList<N> requires of its nodes
+		std::atomic<Block*> freeListNext;
+		bool dynamicallyAllocated;		// Perhaps a better name for this would be 'isNotPartOfInitialBlockPool'
+		
+#ifdef MCDBGQ_TRACKMEM
+		void* owner;
+#endif
+	};
+	static_assert(std::alignment_of<Block>::value >= std::alignment_of<T>::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping");
+
+
+#ifdef MCDBGQ_TRACKMEM
+public:
+	struct MemStats;	// forward declaration, made public only when MCDBGQ_TRACKMEM is defined
+private:
+#endif
+	
+	///////////////////////////
+	// Producer base
+	///////////////////////////
+	
+	struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase
+	{
+		// Begins with an empty sub-queue (indices at zero, no tail block) and
+		// remembers the owning queue and which concrete producer kind this is.
+		ProducerBase(ConcurrentQueue* parent_, bool isExplicit_)
+			: tailIndex(0), headIndex(0),
+			  dequeueOptimisticCount(0), dequeueOvercommit(0),
+			  tailBlock(nullptr),
+			  isExplicit(isExplicit_), parent(parent_)
+		{
+		}
+		
+		virtual ~ProducerBase() { }
+		
+		// Pops one element, dispatching on isExplicit to the concrete producer type.
+		template<typename U>
+		inline bool dequeue(U& element)
+		{
+			return isExplicit
+				? static_cast<ExplicitProducer*>(this)->dequeue(element)
+				: static_cast<ImplicitProducer*>(this)->dequeue(element);
+		}
+		
+		// Bulk counterpart of dequeue(); returns the number of elements popped.
+		template<typename It>
+		inline size_t dequeue_bulk(It& itemFirst, size_t max)
+		{
+			return isExplicit
+				? static_cast<ExplicitProducer*>(this)->dequeue_bulk(itemFirst, max)
+				: static_cast<ImplicitProducer*>(this)->dequeue_bulk(itemFirst, max);
+		}
+		
+		// The `next` link of the producer list, viewed as a ProducerBase.
+		inline ProducerBase* next_prod() const { return static_cast<ProducerBase*>(next); }
+		
+		// Approximate element count: tail - head, or 0 when head is not circularly behind tail.
+		inline size_t size_approx() const
+		{
+			auto tailSnapshot = tailIndex.load(std::memory_order_relaxed);
+			auto headSnapshot = headIndex.load(std::memory_order_relaxed);
+			if (!details::circular_less_than(headSnapshot, tailSnapshot)) {
+				return 0;
+			}
+			return static_cast<size_t>(tailSnapshot - headSnapshot);
+		}
+		
+		inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); }
+	protected:
+		std::atomic<index_t> tailIndex;		// Where to enqueue to next
+		std::atomic<index_t> headIndex;		// Where to dequeue from next
+		
+		std::atomic<index_t> dequeueOptimisticCount;
+		std::atomic<index_t> dequeueOvercommit;
+		
+		Block* tailBlock;
+		
+	public:
+		bool isExplicit;
+		ConcurrentQueue* parent;
+		
+	protected:
+#ifdef MCDBGQ_TRACKMEM
+		friend struct MemStats;
+#endif
+	};
+	
+	
+	///////////////////////////
+	// Explicit queue
+	///////////////////////////
+		
+	struct ExplicitProducer : public ProducerBase
+	{
+		// Sets up an empty explicit producer for `parent_` and requests its first block index.
+		explicit ExplicitProducer(ConcurrentQueue* parent_)
+			: ProducerBase(parent_, true),
+			  blockIndex(nullptr),
+			  pr_blockIndexSlotsUsed(0),
+			  pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1),
+			  pr_blockIndexFront(0),
+			  pr_blockIndexEntries(nullptr),
+			  pr_blockIndexRaw(nullptr)
+		{
+			size_t indexSizeFromPool = details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1;
+			if (pr_blockIndexSize < indexSizeFromPool) {
+				pr_blockIndexSize = indexSizeFromPool;	// a larger initial block pool calls for a larger starting index
+			}
+			new_block_index(0);		// Per the original note: creates an index with double the current entry count, i.e. EXPLICIT_INITIAL_INDEX_SIZE
+		}
+		
+		~ExplicitProducer()	// Destroys leftover elements, hands the blocks back to the parent queue, and frees every block index.
+		{
+			// Destruct any elements not yet dequeued.
+			// Since we're in the destructor, we can assume all elements
+			// are either completely dequeued or completely not (no halfways).
+			if (this->tailBlock != nullptr) {		// Note this means there must be a block index too
+				// First find the block that's partially dequeued, if any
+				Block* halfDequeuedBlock = nullptr;
+				if ((this->headIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1)) != 0) {
+					// The head's not on a block boundary, meaning a block somewhere is partially dequeued
+					// (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary)
+					size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1);	// oldest used slot of the circular index
+					while (details::circular_less_than<index_t>(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) {
+						i = (i + 1) & (pr_blockIndexSize - 1);	// skip entries whose block lies entirely before the head
+					}
+					assert(details::circular_less_than<index_t>(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed)));
+					halfDequeuedBlock = pr_blockIndexEntries[i].block;
+				}
+				
+				// Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration)
+				auto block = this->tailBlock;
+				do {
+					block = block->next;
+					if (block->ConcurrentQueue::Block::template is_empty<explicit_context>()) {
+						continue;	// nothing left to destroy in this block; proceeds to the loop condition
+					}
+					
+					size_t i = 0;	// Offset into block
+					if (block == halfDequeuedBlock) {
+						i = static_cast<size_t>(this->headIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1));	// start after the already-dequeued prefix
+					}
+					
+					// Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index
+					auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 ? BLOCK_SIZE : static_cast<size_t>(this->tailIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1));
+					while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) {
+						(*block)[i++]->~T();
+					}
+				} while (block != this->tailBlock);
+			}
+			
+			// Hand every block we own back to the parent queue
+			if (this->tailBlock != nullptr) {
+				auto block = this->tailBlock;
+				do {
+					auto nextBlock = block->next;	// read the link before giving the block away
+					this->parent->add_block_to_free_list(block);
+					block = nextBlock;
+				} while (block != this->tailBlock);
+			}
+			
+			// Destroy the block indices (the headers are chained through `prev`)
+			auto header = static_cast<BlockIndexHeader*>(pr_blockIndexRaw);
+			while (header != nullptr) {
+				auto prev = static_cast<BlockIndexHeader*>(header->prev);
+				header->~BlockIndexHeader();
+				(Traits::free)(header);
+				header = prev;
+			}
+		}
+		
+		template<AllocationMode allocMode, typename U>	// Appends one element at the tail; returns false when no block or block index could be obtained.
+		inline bool enqueue(U&& element)
+		{
+			index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+			index_t newTailIndex = 1 + currentTailIndex;
+			if ((currentTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+				// We reached the end of a block, start a new one
+				auto startBlock = this->tailBlock;	// startBlock and originalBlockIndexSlotsUsed are kept for rollback if T's constructor throws
+				auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed;
+				if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty<explicit_context>()) {
+					// We can re-use the block ahead of us, it's empty!
+					this->tailBlock = this->tailBlock->next;
+					this->tailBlock->ConcurrentQueue::Block::template reset_empty<explicit_context>();
+					
+					// We'll put the block on the block index (guaranteed to be room since we're conceptually removing the
+					// last block from it first -- except instead of removing then adding, we can just overwrite).
+					// Note that there must be a valid block index here, since even if allocation failed in the ctor,
+					// it would have been re-attempted when adding the first block to the queue; since there is such
+					// a block, a block index must have been successfully allocated.
+				}
+				else {
+					// Whatever head value we see here is >= the last value we saw here (relatively),
+					// and <= its current value. Since we have the most recent tail, the head must be
+					// <= it.
+					auto head = this->headIndex.load(std::memory_order_relaxed);
+					assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+					if (!details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE)
+						|| (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) {
+						// We can't enqueue in another block because there's not enough leeway -- the
+						// tail could surpass the head by the time the block fills up! (Or we'll exceed
+						// the size limit, if the second part of the condition was true.)
+						return false;
+					}
+					// We're going to need a new block; check that the block index has room
+					if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) {
+						// Hmm, the circular block index is already full -- we'll need
+						// to allocate a new index. Note pr_blockIndexRaw can only be nullptr if
+						// the initial allocation failed in the constructor.
+						
+						MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) {
+							return false;
+						}
+						else if (!new_block_index(pr_blockIndexSlotsUsed)) {
+							return false;
+						}
+					}
+					
+					// Insert a new block in the circular linked list
+					auto newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>();
+					if (newBlock == nullptr) {
+						return false;
+					}
+#ifdef MCDBGQ_TRACKMEM
+					newBlock->owner = this;
+#endif
+					newBlock->ConcurrentQueue::Block::template reset_empty<explicit_context>();
+					if (this->tailBlock == nullptr) {
+						newBlock->next = newBlock;	// very first block: a ring of one
+					}
+					else {
+						newBlock->next = this->tailBlock->next;	// splice in right after the current tail block
+						this->tailBlock->next = newBlock;
+					}
+					this->tailBlock = newBlock;
+					++pr_blockIndexSlotsUsed;
+				}
+
+				MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) {
+					// The constructor may throw. We want the element not to appear in the queue in
+					// that case (without corrupting the queue):
+					MOODYCAMEL_TRY {
+						new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
+					}
+					MOODYCAMEL_CATCH (...) {
+						// Revert change to the current block, but leave the new block available
+						// for next time
+						pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+						this->tailBlock = startBlock == nullptr ? this->tailBlock : startBlock;
+						MOODYCAMEL_RETHROW;
+					}
+				}
+				else {
+					(void)startBlock;	// rollback state is unused when construction cannot throw
+					(void)originalBlockIndexSlotsUsed;
+				}
+				
+				// Add block to block index
+				auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront];
+				entry.base = currentTailIndex;
+				entry.block = this->tailBlock;
+				blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release);	// release-store `front` after the entry has been filled in
+				pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+				
+				MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) {
+					this->tailIndex.store(newTailIndex, std::memory_order_release);	// element was already constructed in the try block above
+					return true;
+				}
+			}
+			
+			// Enqueue
+			new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
+			
+			this->tailIndex.store(newTailIndex, std::memory_order_release);	// release-store of the new tail, after the element is constructed
+			return true;
+		}
+		
+		template<typename U>
+		bool dequeue(U& element)		// Tries to move one element out of this producer's sub-queue into `element`; returns true on success, false if nothing could be claimed
+		{
+			auto tail = this->tailIndex.load(std::memory_order_relaxed);
+			auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+			if (details::circular_less_than<index_t>(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) {
+				// Might be something to dequeue, let's give it a try
+				
+				// Note that this `if` is purely for performance purposes in the common case when the queue is
+				// empty and the values are eventually consistent -- we may enter here spuriously.
+				
+				// Note that whatever the values of overcommit and tail are, they are not going to change (unless we
+				// change them) and must be the same value at this point (inside the if) as when the if condition was
+				// evaluated.
+
+				// We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below.
+				// This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptimisticCount in
+				// the fetch_add below will result in a value at least as recent as that (and therefore at least as large).
+				// Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all
+				// read-modify-write operations are guaranteed to work on the latest value in the modification order), but
+				// unfortunately that can't be shown to be correct using only the C++11 standard.
+				// See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case
+				std::atomic_thread_fence(std::memory_order_acquire);
+				
+				// Increment optimistic counter, then check if it went over the boundary
+				auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed);
+				
+				// Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever
+				// incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now
+				// have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon
+				// incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount.
+				// However, we can't assert this since both dequeueOptimisticCount and dequeueOvercommit may (independently)
+				// overflow; in such a case, though, the logic still holds since the difference between the two is maintained.
+				
+				// Note that we reload tail here in case it changed; it will be the same value as before or greater, since
+				// this load is sequenced after (happens after) the earlier load above. This is supported by read-read
+				// coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order
+				tail = this->tailIndex.load(std::memory_order_acquire);
+				if ((details::likely)(details::circular_less_than<index_t>(myDequeueCount - overcommit, tail))) {
+					// Guaranteed to be at least one element to dequeue!
+					
+					// Get the index. Note that since there's guaranteed to be at least one element, this
+					// will never exceed tail. We need to do an acquire-release fence here since it's possible
+					// that whatever condition got us to this point was for an earlier enqueued element (that
+					// we already see the memory effects for), but that by the time we increment somebody else
+					// has incremented it, and we need to see the memory effects for *that* element, which
+					// in such a case is necessarily visible on the thread that incremented it in the first
+					// place with the more current condition (they must have acquired a tail that is at least
+					// as recent).
+					auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel);
+					
+					
+					// Determine which block the element is in
+					
+					auto localBlockIndex = blockIndex.load(std::memory_order_acquire);
+					auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire);
+					
+					// We need to be careful here about subtracting and dividing because of index wrap-around.
+					// When an index wraps, we need to preserve the sign of the offset when dividing it by the
+					// block size (in order to get a correct signed block count offset in all cases):
+					auto headBase = localBlockIndex->entries[localBlockIndexHead].base;
+					auto blockBaseIndex = index & ~static_cast<index_t>(BLOCK_SIZE - 1);
+					auto offset = static_cast<size_t>(static_cast<typename std::make_signed<index_t>::type>(blockBaseIndex - headBase) / static_cast<typename std::make_signed<index_t>::type>(BLOCK_SIZE));
+					auto block = localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block;
+					
+					// Dequeue
+					auto& el = *((*block)[index]);
+					if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) {
+						// Make sure the element is still fully dequeued and destroyed even if the assignment
+						// throws
+						struct Guard {
+							Block* block;
+							index_t index;
+							
+							~Guard()
+							{
+								(*block)[index]->~T();
+								block->ConcurrentQueue::Block::template set_empty<explicit_context>(index);
+							}
+						} guard = { block, index };
+
+						element = std::move(el); // NOLINT
+					}
+					else {
+						element = std::move(el); // NOLINT
+						el.~T(); // NOLINT
+						block->ConcurrentQueue::Block::template set_empty<explicit_context>(index);
+					}
+					
+					return true;
+				}
+				else {
+					// Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent
+					this->dequeueOvercommit.fetch_add(1, std::memory_order_release);		// Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write
+				}
+			}
+		
+			return false;
+		}
+		
+		template<AllocationMode allocMode, typename It>
+		bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count)		// Enqueues `count` items read through `itemFirst`; returns false (producer bookkeeping reverted) if the needed blocks/index slots could not be obtained
+		{
+			// First, we need to make sure we have enough room to enqueue all of the elements;
+			// this means pre-allocating blocks and putting them in the block index (but only if
+			// all the allocations succeeded).
+			index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+			auto startBlock = this->tailBlock;
+			auto originalBlockIndexFront = pr_blockIndexFront;
+			auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed;
+			
+			Block* firstAllocatedBlock = nullptr;
+			
+			// Figure out how many blocks we'll need to allocate, and do so
+			size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1));		// Index distance between the block base of the last new element and that of the last existing one; consumed BLOCK_SIZE at a time below
+			index_t currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+			if (blockBaseDiff > 0) {
+				// Allocate as many blocks as possible from ahead
+				while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock && this->tailBlock->next->ConcurrentQueue::Block::template is_empty<explicit_context>()) {
+					blockBaseDiff -= static_cast<index_t>(BLOCK_SIZE);
+					currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+					
+					this->tailBlock = this->tailBlock->next;
+					firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock;
+					
+					auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront];
+					entry.base = currentTailIndex;
+					entry.block = this->tailBlock;
+					pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+				}
+				
+				// Now allocate as many blocks as necessary from the block pool
+				while (blockBaseDiff > 0) {
+					blockBaseDiff -= static_cast<index_t>(BLOCK_SIZE);
+					currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+					
+					auto head = this->headIndex.load(std::memory_order_relaxed);
+					assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+					bool full = !details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head));
+					if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) {
+						MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) {
+							// Failed to allocate, undo changes (but keep injected blocks)
+							pr_blockIndexFront = originalBlockIndexFront;
+							pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+							this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
+							return false;
+						}
+						else if (full || !new_block_index(originalBlockIndexSlotsUsed)) {
+							// Failed to allocate, undo changes (but keep injected blocks)
+							pr_blockIndexFront = originalBlockIndexFront;
+							pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+							this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
+							return false;
+						}
+						
+						// pr_blockIndexFront is updated inside new_block_index, so we need to
+						// update our fallback value too (since we keep the new index even if we
+						// later fail)
+						originalBlockIndexFront = originalBlockIndexSlotsUsed;
+					}
+					
+					// Insert a new block in the circular linked list
+					auto newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>();
+					if (newBlock == nullptr) {
+						pr_blockIndexFront = originalBlockIndexFront;
+						pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+						this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
+						return false;
+					}
+					
+#ifdef MCDBGQ_TRACKMEM
+					newBlock->owner = this;
+#endif
+					newBlock->ConcurrentQueue::Block::template set_all_empty<explicit_context>();
+					if (this->tailBlock == nullptr) {
+						newBlock->next = newBlock;
+					}
+					else {
+						newBlock->next = this->tailBlock->next;
+						this->tailBlock->next = newBlock;
+					}
+					this->tailBlock = newBlock;
+					firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock;
+					
+					++pr_blockIndexSlotsUsed;
+					
+					auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront];
+					entry.base = currentTailIndex;
+					entry.block = this->tailBlock;
+					pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+				}
+				
+				// Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and
+				// publish the new block index front
+				auto block = firstAllocatedBlock;
+				while (true) {
+					block->ConcurrentQueue::Block::template reset_empty<explicit_context>();
+					if (block == this->tailBlock) {
+						break;
+					}
+					block = block->next;
+				}
+				
+				MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))) {
+					blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release);
+				}
+			}
+			
+			// Enqueue, one block at a time
+			index_t newTailIndex = startTailIndex + static_cast<index_t>(count);
+			currentTailIndex = startTailIndex;
+			auto endBlock = this->tailBlock;
+			this->tailBlock = startBlock;		// Rewind to the starting block; the loop below advances block by block until endBlock
+			assert((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0);
+			if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) {
+				this->tailBlock = firstAllocatedBlock;
+			}
+			while (true) {
+				index_t stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+				if (details::circular_less_than<index_t>(newTailIndex, stopIndex)) {
+					stopIndex = newTailIndex;
+				}
+				MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))) {
+					while (currentTailIndex != stopIndex) {
+						new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++);
+					}
+				}
+				else {
+					MOODYCAMEL_TRY {
+						while (currentTailIndex != stopIndex) {
+							// Must use copy constructor even if move constructor is available
+							// because we may have to revert if there's an exception.
+							// Sorry about the horrible templated next line, but it was the only way
+							// to disable moving *at compile time*, which is important because a type
+							// may only define a (noexcept) move constructor, and so calls to the
+							// cctor will not compile, even if they are in an if branch that will never
+							// be executed
+							new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if<!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst));
+							++currentTailIndex;
+							++itemFirst;
+						}
+					}
+					MOODYCAMEL_CATCH (...) {
+						// Oh dear, an exception's been thrown -- destroy the elements that
+						// were enqueued so far and revert the entire bulk operation (we'll keep
+						// any allocated blocks in our linked list for later, though).
+						auto constructedStopIndex = currentTailIndex;
+						auto lastBlockEnqueued = this->tailBlock;
+						
+						pr_blockIndexFront = originalBlockIndexFront;
+						pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+						this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
+						
+						if (!details::is_trivially_destructible<T>::value) {
+							auto block = startBlock;
+							if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+								block = firstAllocatedBlock;
+							}
+							currentTailIndex = startTailIndex;
+							while (true) {
+								stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+								if (details::circular_less_than<index_t>(constructedStopIndex, stopIndex)) {
+									stopIndex = constructedStopIndex;
+								}
+								while (currentTailIndex != stopIndex) {
+									(*block)[currentTailIndex++]->~T();
+								}
+								if (block == lastBlockEnqueued) {
+									break;
+								}
+								block = block->next;
+							}
+						}
+						MOODYCAMEL_RETHROW;
+					}
+				}
+				
+				if (this->tailBlock == endBlock) {
+					assert(currentTailIndex == newTailIndex);
+					break;
+				}
+				this->tailBlock = this->tailBlock->next;
+			}
+			
+			MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))) {
+				if (firstAllocatedBlock != nullptr)
+					blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release);		// Throwing-ctor case: the index front is only published once every element has been constructed
+			}
+			
+			this->tailIndex.store(newTailIndex, std::memory_order_release);
+			return true;
+		}
+		
+		template<typename It>
+		size_t dequeue_bulk(It& itemFirst, size_t max)		// Moves up to `max` elements out through `itemFirst` (which is advanced); returns how many were dequeued, 0 if none
+		{
+			auto tail = this->tailIndex.load(std::memory_order_relaxed);
+			auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+			auto desiredCount = static_cast<size_t>(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit));
+			if (details::circular_less_than<size_t>(0, desiredCount)) {
+				desiredCount = desiredCount < max ? desiredCount : max;
+				std::atomic_thread_fence(std::memory_order_acquire);
+				
+				auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed);
+				
+				tail = this->tailIndex.load(std::memory_order_acquire);
+				auto actualCount = static_cast<size_t>(tail - (myDequeueCount - overcommit));
+				if (details::circular_less_than<size_t>(0, actualCount)) {
+					actualCount = desiredCount < actualCount ? desiredCount : actualCount;
+					if (actualCount < desiredCount) {
+						this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release);		// Record the part of the optimistic claim that could not be satisfied
+					}
+					
+					// Get the first index. Note that since there's guaranteed to be at least actualCount elements, this
+					// will never exceed tail.
+					auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel);
+					
+					// Determine which block the first element is in
+					auto localBlockIndex = blockIndex.load(std::memory_order_acquire);
+					auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire);
+					
+					auto headBase = localBlockIndex->entries[localBlockIndexHead].base;
+					auto firstBlockBaseIndex = firstIndex & ~static_cast<index_t>(BLOCK_SIZE - 1);
+					auto offset = static_cast<size_t>(static_cast<typename std::make_signed<index_t>::type>(firstBlockBaseIndex - headBase) / static_cast<typename std::make_signed<index_t>::type>(BLOCK_SIZE));
+					auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1);
+					
+					// Iterate the blocks and dequeue
+					auto index = firstIndex;
+					do {
+						auto firstIndexInBlock = index;
+						index_t endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+						endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex;
+						auto block = localBlockIndex->entries[indexIndex].block;
+						if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) {
+							while (index != endIndex) {
+								auto& el = *((*block)[index]);
+								*itemFirst++ = std::move(el);
+								el.~T();
+								++index;
+							}
+						}
+						else {
+							MOODYCAMEL_TRY {
+								while (index != endIndex) {
+									auto& el = *((*block)[index]);
+									*itemFirst = std::move(el);
+									++itemFirst;
+									el.~T();
+									++index;
+								}
+							}
+							MOODYCAMEL_CATCH (...) {
+								// It's too late to revert the dequeue, but we can make sure that all
+								// the dequeued objects are properly destroyed and the block index
+								// (and empty count) are properly updated before we propagate the exception
+								do {
+									block = localBlockIndex->entries[indexIndex].block;
+									while (index != endIndex) {
+										(*block)[index++]->~T();
+									}
+									block->ConcurrentQueue::Block::template set_many_empty<explicit_context>(firstIndexInBlock, static_cast<size_t>(endIndex - firstIndexInBlock));
+									indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1);
+									
+									firstIndexInBlock = index;
+									endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+									endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex;
+								} while (index != firstIndex + actualCount);
+								
+								MOODYCAMEL_RETHROW;
+							}
+						}
+						block->ConcurrentQueue::Block::template set_many_empty<explicit_context>(firstIndexInBlock, static_cast<size_t>(endIndex - firstIndexInBlock));
+						indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1);
+					} while (index != firstIndex + actualCount);
+					
+					return actualCount;
+				}
+				else {
+					// Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent
+					this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release);
+				}
+			}
+			
+			return 0;
+		}
+		
+	private:
+		struct BlockIndexEntry		// One slot of the explicit producer's block index
+		{
+			index_t base;		// Tail index recorded by the producer when the block was added to the index
+			Block* block;		// The block registered in this slot
+		};
+		
+		struct BlockIndexHeader		// Header placed at the start of each block index allocation (see new_block_index)
+		{
+			size_t size;		// Number of slots in `entries`; readers wrap positions with `& (size - 1)`
+			std::atomic<size_t> front;		// Current slot (not next, like pr_blockIndexFront)
+			BlockIndexEntry* entries;		// Slot array, located after the header in the same allocation
+			void* prev;		// Raw pointer to the previous index allocation, kept so it can be freed later
+		};
+		
+		
+		bool new_block_index(size_t numberOfFilledSlotsToExpose)		// Replaces the block index with one twice as large; returns false (size restored) if the allocation fails
+		{
+			auto prevBlockSizeMask = pr_blockIndexSize - 1;
+			
+			// Create the new block index: header + alignment slack + the doubled entry array in one allocation
+			pr_blockIndexSize <<= 1;
+			auto newRawPtr = static_cast<char*>((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of<BlockIndexEntry>::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize));
+			if (newRawPtr == nullptr) {
+				pr_blockIndexSize >>= 1;		// Reset to allow graceful retry
+				return false;
+			}
+			
+			auto newBlockIndexEntries = reinterpret_cast<BlockIndexEntry*>(details::align_for<BlockIndexEntry>(newRawPtr + sizeof(BlockIndexHeader)));
+			
+			// Copy in all the old indices, if any (oldest first, so they land in slots 0..j-1 of the new array)
+			size_t j = 0;
+			if (pr_blockIndexSlotsUsed != 0) {
+				auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask;
+				do {
+					newBlockIndexEntries[j++] = pr_blockIndexEntries[i];
+					i = (i + 1) & prevBlockSizeMask;
+				} while (i != pr_blockIndexFront);
+			}
+			
+			// Update everything
+			auto header = new (newRawPtr) BlockIndexHeader;
+			header->size = pr_blockIndexSize;
+			header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed);		// `front` is the current slot, hence the -1
+			header->entries = newBlockIndexEntries;
+			header->prev = pr_blockIndexRaw;		// we link the new block to the old one so we can free it later
+			
+			pr_blockIndexFront = j;
+			pr_blockIndexEntries = newBlockIndexEntries;
+			pr_blockIndexRaw = newRawPtr;
+			blockIndex.store(header, std::memory_order_release);		// Release-store publishes the fully initialised header
+			
+			return true;
+		}
+		
+	private:
+		std::atomic<BlockIndexHeader*> blockIndex;
+		
+		// To be used by producer only -- consumer must use the ones referenced by blockIndex
+		size_t pr_blockIndexSlotsUsed;
+		size_t pr_blockIndexSize;
+		size_t pr_blockIndexFront;		// Next slot (not current)
+		BlockIndexEntry* pr_blockIndexEntries;
+		void* pr_blockIndexRaw;
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+	public:
+		ExplicitProducer* nextExplicitProducer;
+	private:
+#endif
+		
+#ifdef MCDBGQ_TRACKMEM
+		friend struct MemStats;
+#endif
+	};
+	
+	
+	//////////////////////////////////
+	// Implicit queue
+	//////////////////////////////////
+	
+	struct ImplicitProducer : public ProducerBase
+	{			
+		ImplicitProducer(ConcurrentQueue* parent_) :		// Builds an implicit producer attached to the queue `parent_`
+			ProducerBase(parent_, false),		// NOTE(review): the `false` presumably marks this producer as non-explicit -- confirm against ProducerBase
+			nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE),
+			blockIndex(nullptr)
+		{
+			new_block_index();		// Set up the first block index (definition not in view here); any result is ignored
+		}
+		
+		~ImplicitProducer()		// Destroys the remaining elements, hands blocks back to the parent's free list and frees the chain of block indexes
+		{
+			// Note that since we're in the destructor we can assume that all enqueue/dequeue operations
+			// completed already; this means that all undequeued elements are placed contiguously across
+			// contiguous blocks, and that only the first and last remaining blocks can be only partially
+			// empty (all other remaining blocks must be completely full).
+			
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+			// Unregister ourselves for thread termination notification
+			if (!this->inactive.load(std::memory_order_relaxed)) {
+				details::ThreadExitNotifier::unsubscribe(&threadExitListener);
+			}
+#endif
+			
+			// Destroy all remaining elements!
+			auto tail = this->tailIndex.load(std::memory_order_relaxed);
+			auto index = this->headIndex.load(std::memory_order_relaxed);
+			Block* block = nullptr;
+			assert(index == tail || details::circular_less_than(index, tail));
+			bool forceFreeLastBlock = index != tail;		// If we enter the loop, then the last (tail) block will not be freed
+			while (index != tail) {
+				if ((index & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 || block == nullptr) {
+					if (block != nullptr) {
+						// Free the old block
+						this->parent->add_block_to_free_list(block);
+					}
+					
+					block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed);		// Look up the block holding `index` (first iteration, or a block boundary was crossed)
+				}
+				
+				((*block)[index])->~T();
+				++index;
+			}
+			// Even if the queue is empty, there's still one block that's not on the free list
+			// (unless the head index reached the end of it, in which case the tail will be poised
+			// to create a new block).
+			if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast<index_t>(BLOCK_SIZE - 1)) != 0)) {
+				this->parent->add_block_to_free_list(this->tailBlock);
+			}
+			
+			// Destroy block index: entries of the most recent index first, then every header along the prev chain
+			auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);
+			if (localBlockIndex != nullptr) {
+				for (size_t i = 0; i != localBlockIndex->capacity; ++i) {
+					localBlockIndex->index[i]->~BlockIndexEntry();
+				}
+				do {
+					auto prev = localBlockIndex->prev;
+					localBlockIndex->~BlockIndexHeader();
+					(Traits::free)(localBlockIndex);
+					localBlockIndex = prev;
+				} while (localBlockIndex != nullptr);
+			}
+		}
+		
+		template<AllocationMode allocMode, typename U>
+		inline bool enqueue(U&& element)		// Appends one element; returns false if the sub-queue is full or an index slot / block could not be obtained
+		{
+			index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+			index_t newTailIndex = 1 + currentTailIndex;
+			if ((currentTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+				// We reached the end of a block, start a new one
+				auto head = this->headIndex.load(std::memory_order_relaxed);
+				assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+				if (!details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) {
+					return false;
+				}
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+				debug::DebugLock lock(mutex);
+#endif
+				// Find out where we'll be inserting this block in the block index
+				BlockIndexEntry* idxEntry;
+				if (!insert_block_index_entry<allocMode>(idxEntry, currentTailIndex)) {
+					return false;
+				}
+				
+				// Get ahold of a new block; on failure undo the index insertion made just above
+				auto newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>();
+				if (newBlock == nullptr) {
+					rewind_block_index_tail();
+					idxEntry->value.store(nullptr, std::memory_order_relaxed);
+					return false;
+				}
+#ifdef MCDBGQ_TRACKMEM
+				newBlock->owner = this;
+#endif
+				newBlock->ConcurrentQueue::Block::template reset_empty<implicit_context>();
+
+				MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) {
+					// May throw, try to insert now before we publish the fact that we have this new block
+					MOODYCAMEL_TRY {
+						new ((*newBlock)[currentTailIndex]) T(std::forward<U>(element));
+					}
+					MOODYCAMEL_CATCH (...) {
+						rewind_block_index_tail();
+						idxEntry->value.store(nullptr, std::memory_order_relaxed);
+						this->parent->add_block_to_free_list(newBlock);
+						MOODYCAMEL_RETHROW;
+					}
+				}
+				
+				// Insert the new block into the index
+				idxEntry->value.store(newBlock, std::memory_order_relaxed);
+				
+				this->tailBlock = newBlock;
+				
+				MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) {
+					this->tailIndex.store(newTailIndex, std::memory_order_release);		// Element was already constructed in the try block above, so only publish the new tail
+					return true;
+				}
+			}
+			
+			// Enqueue
+			new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
+			
+			this->tailIndex.store(newTailIndex, std::memory_order_release);
+			return true;
+		}
+		
+		template<typename U>
+		bool dequeue(U& element)		// Tries to move one element out into `element`; returns true on success, false if nothing could be claimed
+		{
+			// See ExplicitProducer::dequeue for rationale and explanation
+			index_t tail = this->tailIndex.load(std::memory_order_relaxed);
+			index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+			if (details::circular_less_than<index_t>(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) {
+				std::atomic_thread_fence(std::memory_order_acquire);
+				
+				index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed);
+				tail = this->tailIndex.load(std::memory_order_acquire);
+				if ((details::likely)(details::circular_less_than<index_t>(myDequeueCount - overcommit, tail))) {
+					index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel);
+					
+					// Determine which block the element is in
+					auto entry = get_block_index_entry_for_index(index);
+					
+					// Dequeue
+					auto block = entry->value.load(std::memory_order_relaxed);
+					auto& el = *((*block)[index]);
+					
+					if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) {
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+						// Note: Acquiring the mutex with every dequeue instead of only when a block
+						// is released is very sub-optimal, but it is, after all, purely debug code.
+						debug::DebugLock lock(mutex);		// This producer's own debug mutex, the same one the other debug locks in ImplicitProducer take
+#endif
+						struct Guard {		// Destroys the slot and releases an emptied block even if the assignment below throws
+							Block* block;
+							index_t index;
+							BlockIndexEntry* entry;
+							ConcurrentQueue* parent;
+							
+							~Guard()
+							{
+								(*block)[index]->~T();
+								if (block->ConcurrentQueue::Block::template set_empty<implicit_context>(index)) {
+									entry->value.store(nullptr, std::memory_order_relaxed);
+									parent->add_block_to_free_list(block);
+								}
+							}
+						} guard = { block, index, entry, this->parent };
+
+						element = std::move(el); // NOLINT
+					}
+					else {
+						element = std::move(el); // NOLINT
+						el.~T(); // NOLINT
+
+						if (block->ConcurrentQueue::Block::template set_empty<implicit_context>(index)) {
+							{
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+								debug::DebugLock lock(mutex);
+#endif
+								// Add the block back into the global free pool (and remove from block index)
+								entry->value.store(nullptr, std::memory_order_relaxed);
+							}
+							this->parent->add_block_to_free_list(block);		// releases the above store
+						}
+					}
+					
+					return true;
+				}
+				else {
+					this->dequeueOvercommit.fetch_add(1, std::memory_order_release);		// Nothing to dequeue after all; keeps the effective dequeue count eventually consistent
+				}
+			}
+		
+			return false;
+		}
+		
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4706)  // assignment within conditional expression
+#endif
+		template<AllocationMode allocMode, typename It>
+		bool enqueue_bulk(It itemFirst, size_t count)
+		{
+			// First, we need to make sure we have enough room to enqueue all of the elements;
+			// this means pre-allocating blocks and putting them in the block index (but only if
+			// all the allocations succeeded).
+			
+			// Note that the tailBlock we start off with may not be owned by us any more;
+			// this happens if it was filled up exactly to the top (setting tailIndex to
+			// the first index of the next block which is not yet allocated), then dequeued
+			// completely (putting it on the free list) before we enqueue again.
+			
+			index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+			auto startBlock = this->tailBlock;
+			Block* firstAllocatedBlock = nullptr;
+			auto endBlock = this->tailBlock;
+			
+			// Figure out how many blocks we'll need to allocate, and do so
+			size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1));
+			index_t currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+			if (blockBaseDiff > 0) {
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+				debug::DebugLock lock(mutex);
+#endif
+				do {
+					blockBaseDiff -= static_cast<index_t>(BLOCK_SIZE);
+					currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+					
+					// Find out where we'll be inserting this block in the block index
+					BlockIndexEntry* idxEntry = nullptr;  // initialization here unnecessary but compiler can't always tell
+					Block* newBlock;
+					bool indexInserted = false;
+					auto head = this->headIndex.load(std::memory_order_relaxed);
+					assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+					bool full = !details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head));
+
+					if (full || !(indexInserted = insert_block_index_entry<allocMode>(idxEntry, currentTailIndex)) || (newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>()) == nullptr) {
+						// Index allocation or block allocation failed; revert any other allocations
+						// and index insertions done so far for this operation
+						if (indexInserted) {
+							rewind_block_index_tail();
+							idxEntry->value.store(nullptr, std::memory_order_relaxed);
+						}
+						currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+						for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) {
+							currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+							idxEntry = get_block_index_entry_for_index(currentTailIndex);
+							idxEntry->value.store(nullptr, std::memory_order_relaxed);
+							rewind_block_index_tail();
+						}
+						this->parent->add_blocks_to_free_list(firstAllocatedBlock);
+						this->tailBlock = startBlock;
+						
+						return false;
+					}
+					
+#ifdef MCDBGQ_TRACKMEM
+					newBlock->owner = this;
+#endif
+					newBlock->ConcurrentQueue::Block::template reset_empty<implicit_context>();
+					newBlock->next = nullptr;
+					
+					// Insert the new block into the index
+					idxEntry->value.store(newBlock, std::memory_order_relaxed);
+					
+					// Store the chain of blocks so that we can undo if later allocations fail,
+					// and so that we can find the blocks when we do the actual enqueueing
+					if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) {
+						assert(this->tailBlock != nullptr);
+						this->tailBlock->next = newBlock;
+					}
+					this->tailBlock = newBlock;
+					endBlock = newBlock;
+					firstAllocatedBlock = firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock;
+				} while (blockBaseDiff > 0);
+			}
+			
+			// Enqueue, one block at a time
+			index_t newTailIndex = startTailIndex + static_cast<index_t>(count);
+			currentTailIndex = startTailIndex;
+			this->tailBlock = startBlock;
+			assert((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0);
+			if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) {
+				this->tailBlock = firstAllocatedBlock;
+			}
+			while (true) {
+				index_t stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+				if (details::circular_less_than<index_t>(newTailIndex, stopIndex)) {
+					stopIndex = newTailIndex;
+				}
+				MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))) {
+					while (currentTailIndex != stopIndex) {
+						new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++);
+					}
+				}
+				else {
+					MOODYCAMEL_TRY {
+						while (currentTailIndex != stopIndex) {
+							new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if<!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst));
+							++currentTailIndex;
+							++itemFirst;
+						}
+					}
+					MOODYCAMEL_CATCH (...) {
+						auto constructedStopIndex = currentTailIndex;
+						auto lastBlockEnqueued = this->tailBlock;
+						
+						if (!details::is_trivially_destructible<T>::value) {
+							auto block = startBlock;
+							if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+								block = firstAllocatedBlock;
+							}
+							currentTailIndex = startTailIndex;
+							while (true) {
+								stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+								if (details::circular_less_than<index_t>(constructedStopIndex, stopIndex)) {
+									stopIndex = constructedStopIndex;
+								}
+								while (currentTailIndex != stopIndex) {
+									(*block)[currentTailIndex++]->~T();
+								}
+								if (block == lastBlockEnqueued) {
+									break;
+								}
+								block = block->next;
+							}
+						}
+						
+						currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+						for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) {
+							currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+							auto idxEntry = get_block_index_entry_for_index(currentTailIndex);
+							idxEntry->value.store(nullptr, std::memory_order_relaxed);
+							rewind_block_index_tail();
+						}
+						this->parent->add_blocks_to_free_list(firstAllocatedBlock);
+						this->tailBlock = startBlock;
+						MOODYCAMEL_RETHROW;
+					}
+				}
+				
+				if (this->tailBlock == endBlock) {
+					assert(currentTailIndex == newTailIndex);
+					break;
+				}
+				this->tailBlock = this->tailBlock->next;
+			}
+			this->tailIndex.store(newTailIndex, std::memory_order_release);
+			return true;
+		}
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+		
+		template<typename It>		// It: iterator type that dequeued elements are move-assigned through
+		size_t dequeue_bulk(It& itemFirst, size_t max)		// Dequeues up to `max` elements into itemFirst (advanced once per element written); returns how many were dequeued, 0 if none
+		{
+			auto tail = this->tailIndex.load(std::memory_order_relaxed);
+			auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+			auto desiredCount = static_cast<size_t>(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit));		// tail minus the net number of elements already claimed by dequeuers
+			if (details::circular_less_than<size_t>(0, desiredCount)) {
+				desiredCount = desiredCount < max ? desiredCount : max;
+				std::atomic_thread_fence(std::memory_order_acquire);
+				// Claim desiredCount elements optimistically; the claim is re-checked against a freshly loaded tail below.
+				auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed);
+				
+				tail = this->tailIndex.load(std::memory_order_acquire);
+				auto actualCount = static_cast<size_t>(tail - (myDequeueCount - overcommit));		// what is really available to this call, given the claims counted before ours
+				if (details::circular_less_than<size_t>(0, actualCount)) {
+					actualCount = desiredCount < actualCount ? desiredCount : actualCount;
+					if (actualCount < desiredCount) {
+						this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release);		// we claimed more than we can take: record the excess so later estimates subtract it
+					}
+					
+					// Get the first index. Note that since there's guaranteed to be at least actualCount elements, this
+					// will never exceed tail.
+					auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel);
+					
+					// Iterate the blocks and dequeue
+					auto index = firstIndex;
+					BlockIndexHeader* localBlockIndex;
+					auto indexIndex = get_block_index_index_for_index(index, localBlockIndex);		// slot of the block holding firstIndex; later blocks are reached by stepping this slot forward
+					do {
+						auto blockStartIndex = index;
+						index_t endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+						endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex;
+						// endIndex is now the end of the current block or the end of our claimed range, whichever comes first.
+						auto entry = localBlockIndex->index[indexIndex];
+						auto block = entry->value.load(std::memory_order_relaxed);
+						if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) {
+							while (index != endIndex) {
+								auto& el = *((*block)[index]);
+								*itemFirst++ = std::move(el);
+								el.~T();		// the slot's element is destroyed as soon as it has been moved out
+								++index;
+							}
+						}
+						else {
+							MOODYCAMEL_TRY {
+								while (index != endIndex) {
+									auto& el = *((*block)[index]);
+									*itemFirst = std::move(el);
+									++itemFirst;
+									el.~T();
+									++index;
+								}
+							}
+							MOODYCAMEL_CATCH (...) {		// a move-assignment threw part-way through the claimed range
+								do {		// headIndex was already advanced past the whole range, so destroy every element of it that is still alive, block by block
+									entry = localBlockIndex->index[indexIndex];
+									block = entry->value.load(std::memory_order_relaxed);
+									while (index != endIndex) {
+										(*block)[index++]->~T();
+									}
+									
+									if (block->ConcurrentQueue::Block::template set_many_empty<implicit_context>(blockStartIndex, static_cast<size_t>(endIndex - blockStartIndex))) {
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+										debug::DebugLock lock(mutex);
+#endif
+										entry->value.store(nullptr, std::memory_order_relaxed);
+										this->parent->add_block_to_free_list(block);
+									}
+									indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1);
+									
+									blockStartIndex = index;
+									endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+									endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex;
+								} while (index != firstIndex + actualCount);
+								
+								MOODYCAMEL_RETHROW;		// the caller still sees the original exception
+							}
+						}
+						if (block->ConcurrentQueue::Block::template set_many_empty<implicit_context>(blockStartIndex, static_cast<size_t>(endIndex - blockStartIndex))) {		// report [blockStartIndex, endIndex) as emptied; a true result means the block is released below
+							{
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+								debug::DebugLock lock(mutex);
+#endif
+								// Note that the set_many_empty above did a release, meaning that anybody who acquires the block
+								// we're about to free can use it safely since our writes (and reads!) will have happened-before then.
+								entry->value.store(nullptr, std::memory_order_relaxed);
+							}
+							this->parent->add_block_to_free_list(block);		// releases the above store
+						}
+						indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1);		// advance to the slot of the next block, wrapping around
+					} while (index != firstIndex + actualCount);
+					
+					return actualCount;
+				}
+				else {
+					this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release);		// nothing was available after all: give the entire claim back
+				}
+			}
+			
+			return 0;
+		}
+		
+	private:
+		// Key stored in block index entries that have never been used. The block size must be > 1, so any number with the low bit set is an invalid block base index.
+		static const index_t INVALID_BLOCK_BASE = 1;
+		
+		struct BlockIndexEntry		// One entry of the implicit producer's block index: associates a block base index with its block
+		{
+			std::atomic<index_t> key;		// base index of the block, or INVALID_BLOCK_BASE while the entry has never been used
+			std::atomic<Block*> value;		// the block for that base index; reset to nullptr when the block is handed to the free list
+		};
+		
+		struct BlockIndexHeader		// Header of one generation of the block index; built and published by new_block_index()
+		{
+			size_t capacity;		// number of slots in `index`; always used as a mask (capacity - 1), so presumably a power of two
+			std::atomic<size_t> tail;		// slot in `index` of the most recently inserted entry
+			BlockIndexEntry* entries;		// the entries that were allocated together with this header
+			BlockIndexEntry** index;		// circular array of entry pointers; also holds the entries carried over from `prev`
+			BlockIndexHeader* prev;		// the header this one replaced, or nullptr for the first one
+		};
+		
+		template<AllocationMode allocMode>		// with allocMode == CannotAlloc the block index is never grown
+		inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex)		// Reserves the slot after the current tail for the block based at blockStartIndex; idxEntry receives the slot. Returns false if no slot could be obtained.
+		{
+			auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);		// We're the only writer thread, relaxed is OK
+			if (localBlockIndex == nullptr) {
+				return false;  // this can happen if new_block_index failed in the constructor
+			}
+			size_t newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1);		// slot right after the current tail, wrapping around
+			idxEntry = localBlockIndex->index[newTail];
+			if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE ||
+				idxEntry->value.load(std::memory_order_relaxed) == nullptr) {
+				// The slot is free: never used, or its value has been reset to nullptr. Only the key is written here; this function never stores the block pointer.
+				idxEntry->key.store(blockStartIndex, std::memory_order_relaxed);
+				localBlockIndex->tail.store(newTail, std::memory_order_release);
+				return true;
+			}
+			
+			// No room in the old block index, try to allocate another one!
+			MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) {
+				return false;
+			}
+			else if (!new_block_index()) {
+				return false;
+			}
+			else {
+				localBlockIndex = blockIndex.load(std::memory_order_relaxed);		// the header that new_block_index() just published
+				newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1);
+				idxEntry = localBlockIndex->index[newTail];
+				assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE);		// must be one of the freshly created entries
+				idxEntry->key.store(blockStartIndex, std::memory_order_relaxed);
+				localBlockIndex->tail.store(newTail, std::memory_order_release);
+				return true;
+			}
+		}
+		
+		inline void rewind_block_index_tail()		// Moves the current block index's tail back by one slot, wrapping around
+		{
+			BlockIndexHeader* const current = blockIndex.load(std::memory_order_relaxed);
+			current->tail.store((current->tail.load(std::memory_order_relaxed) - 1) & (current->capacity - 1), std::memory_order_relaxed);
+		}
+		
+		inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const		// Block index entry of the block that contains element `index`
+		{
+			BlockIndexHeader* header;		// set by the lookup on the next line
+			const auto slot = get_block_index_index_for_index(index, header);
+			return header->index[slot];
+		}
+		
+		inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const		// Returns the slot (position in localBlockIndex->index) of the block containing `index`; localBlockIndex receives the header that was consulted
+		{
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+			debug::DebugLock lock(mutex);
+#endif
+			index &= ~static_cast<index_t>(BLOCK_SIZE - 1);		// round down to the base index of the containing block
+			localBlockIndex = blockIndex.load(std::memory_order_acquire);
+			auto tail = localBlockIndex->tail.load(std::memory_order_acquire);
+			auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed);		// base index of the most recently inserted block
+			assert(tailBase != INVALID_BLOCK_BASE);
+			// Note: Must use division instead of shift because the index may wrap around, causing a negative
+			// offset, whose negativity we want to preserve
+			auto offset = static_cast<size_t>(static_cast<typename std::make_signed<index_t>::type>(index - tailBase) / static_cast<typename std::make_signed<index_t>::type>(BLOCK_SIZE));		// distance from the tail block, counted in whole blocks
+			size_t idx = (tail + offset) & (localBlockIndex->capacity - 1);
+			assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr);
+			return idx;
+		}
+		
+		bool new_block_index()		// Allocates and publishes a block index of capacity nextBlockIndexCapacity that carries over every slot of the current one; returns false if the allocation fails
+		{
+			auto prev = blockIndex.load(std::memory_order_relaxed);
+			size_t prevCapacity = prev == nullptr ? 0 : prev->capacity;
+			auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity;		// number of brand-new entries created by this call
+			auto raw = static_cast<char*>((Traits::malloc)(		// single allocation holding the header, the new entries and the slot-pointer array, plus alignment slack
+				sizeof(BlockIndexHeader) +
+				std::alignment_of<BlockIndexEntry>::value - 1 + sizeof(BlockIndexEntry) * entryCount +
+				std::alignment_of<BlockIndexEntry*>::value - 1 + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity));
+			if (raw == nullptr) {
+				return false;
+			}
+			// Carve the three regions out of the raw allocation.
+			auto header = new (raw) BlockIndexHeader;
+			auto entries = reinterpret_cast<BlockIndexEntry*>(details::align_for<BlockIndexEntry>(raw + sizeof(BlockIndexHeader)));
+			auto index = reinterpret_cast<BlockIndexEntry**>(details::align_for<BlockIndexEntry*>(reinterpret_cast<char*>(entries) + sizeof(BlockIndexEntry) * entryCount));
+			if (prev != nullptr) {
+				auto prevTail = prev->tail.load(std::memory_order_relaxed);
+				auto prevPos = prevTail;
+				size_t i = 0;
+				do {		// copy the old slot pointers starting just after the old tail, so the old tail entry lands in slot prevCapacity - 1
+					prevPos = (prevPos + 1) & (prev->capacity - 1);
+					index[i++] = prev->index[prevPos];
+				} while (prevPos != prevTail);
+				assert(i == prevCapacity);
+			}
+			for (size_t i = 0; i != entryCount; ++i) {		// the new entries follow the carried-over ones and start out marked unused
+				new (entries + i) BlockIndexEntry;
+				entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed);
+				index[prevCapacity + i] = entries + i;
+			}
+			header->prev = prev;		// the old header is kept linked here rather than freed
+			header->entries = entries;
+			header->index = index;
+			header->capacity = nextBlockIndexCapacity;
+			header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed);		// slot of the old tail entry; with no previous index this wraps to the last slot
+			
+			blockIndex.store(header, std::memory_order_release);
+			
+			nextBlockIndexCapacity <<= 1;		// the next index, if ever needed, gets twice this capacity
+			
+			return true;
+		}
+		
+	private:
+		size_t nextBlockIndexCapacity;		// capacity the next new_block_index() call will use; doubled after every successful call
+		std::atomic<BlockIndexHeader*> blockIndex;		// current block index header; replaced with a release store by new_block_index()
+
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+	public:
+		details::ThreadExitListener threadExitListener;		// subscribed to details::ThreadExitNotifier by get_or_add_implicit_producer()
+	private:
+#endif
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+	public:
+		ImplicitProducer* nextImplicitProducer;		// link in the debug-only list of implicit producers maintained by add_producer()
+	private:
+#endif
+
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+		mutable debug::DebugMutex mutex;		// held via debug::DebugLock around block index accesses in this debug configuration
+#endif
+#ifdef MCDBGQ_TRACKMEM
+		friend struct MemStats;
+#endif
+	};
+	
+	
+	//////////////////////////////////
+	// Block pool manipulation
+	//////////////////////////////////
+	
+	void populate_initial_block_list(size_t blockCount)		// Creates the initial pool of blockCount blocks and clears dynamicallyAllocated on each of them
+	{
+		initialBlockPoolSize = blockCount;
+		if (initialBlockPoolSize == 0) {
+			initialBlockPool = nullptr;		// nothing requested, so there is no pool
+			return;
+		}
+		initialBlockPool = create_array<Block>(blockCount);
+		if (initialBlockPool == nullptr) {
+			initialBlockPoolSize = 0;		// creation failed: record an empty pool
+			return;
+		}
+		for (Block* blk = initialBlockPool; blk != initialBlockPool + initialBlockPoolSize; ++blk) {
+			blk->dynamicallyAllocated = false;
+		}
+	}
+	
+	inline Block* try_get_block_from_initial_pool()		// Next unused block of the initial pool, or nullptr once the pool is used up
+	{
+		// Pre-check with a plain load so the counter is not incremented once it has reached the pool size.
+		if (!(initialBlockPoolIndex.load(std::memory_order_relaxed) < initialBlockPoolSize)) {
+			return nullptr;
+		}
+		const auto claimed = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed);
+		if (claimed >= initialBlockPoolSize) { return nullptr; }		// the counter may have moved past the end between the check and the increment
+		return initialBlockPool + claimed;
+	}
+	
+	inline void add_block_to_free_list(Block* block)		// Puts the block on the free list, or destroys it when it is dynamicallyAllocated and Traits::RECYCLE_ALLOCATED_BLOCKS is off
+	{
+#ifdef MCDBGQ_TRACKMEM
+		block->owner = nullptr;
+#endif
+		const bool keepForReuse = Traits::RECYCLE_ALLOCATED_BLOCKS || !block->dynamicallyAllocated;
+		if (keepForReuse) {
+			freeList.add(block);
+			return;
+		}
+		destroy(block);
+	}
+	
+	// Passes every block of the `next`-linked chain that starts at `block` to add_block_to_free_list().
+	inline void add_blocks_to_free_list(Block* block)
+	{
+		for (Block* following = nullptr; block != nullptr; block = following) {
+			following = block->next;		// read the link first: the call below may destroy the block
+			add_block_to_free_list(block);
+		}
+	}
+	
+	inline Block* try_get_block_from_free_list()		// Forwards to freeList.try_get() and returns its result unchanged
+	{
+		return freeList.try_get();
+	}
+	
+	// Obtains a block for a producer. Sources are tried in this order: the
+	// pre-allocated initial pool, then the free list, and finally -- only when
+	// canAlloc == CanAlloc -- a newly created block from create<Block>().
+	// Returns nullptr when no block could be obtained.
+	template<AllocationMode canAlloc>
+	Block* requisition_block()
+	{
+		if (Block* pooled = try_get_block_from_initial_pool()) {
+			return pooled;
+		}
+		if (Block* recycled = try_get_block_from_free_list()) {
+			return recycled;
+		}
+		
+		MOODYCAMEL_CONSTEXPR_IF (canAlloc != CanAlloc) {
+			return nullptr;
+		}
+		else {
+			return create<Block>();
+		}
+	}
+	
+
+#ifdef MCDBGQ_TRACKMEM
+	public:
+		struct MemStats {		// Snapshot of block and memory usage, compiled in only with MCDBGQ_TRACKMEM; filled in by getFor()
+			size_t allocatedBlocks;		// free-list blocks + blocks found with producers + unused blocks of the initial pool
+			size_t usedBlocks;
+			size_t freeBlocks;		// free-list blocks + unused blocks of the initial pool
+			size_t ownedBlocksExplicit;
+			size_t ownedBlocksImplicit;
+			size_t implicitProducers;
+			size_t explicitProducers;
+			size_t elementsEnqueued;		// value of size_approx() when the snapshot was taken
+			size_t blockClassBytes;		// sizeof(Block) * allocatedBlocks
+			size_t queueClassBytes;		// sizeof of every producer object plus sizeof(ConcurrentQueue)
+			size_t implicitBlockIndexBytes;
+			size_t explicitBlockIndexBytes;
+			
+			friend class ConcurrentQueue;
+			
+		private:
+			static MemStats getFor(ConcurrentQueue* q)		// Walks q's free list, producer list and initial pool to build the snapshot
+			{
+				MemStats stats = { 0 };
+				
+				stats.elementsEnqueued = q->size_approx();
+				// Every block on the free list counts as both allocated and free.
+				auto block = q->freeList.head_unsafe();
+				while (block != nullptr) {
+					++stats.allocatedBlocks;
+					++stats.freeBlocks;
+					block = block->freeListNext.load(std::memory_order_relaxed);
+				}
+				
+				for (auto ptr = q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+					bool implicit = dynamic_cast<ImplicitProducer*>(ptr) != nullptr;
+					stats.implicitProducers += implicit ? 1 : 0;
+					stats.explicitProducers += implicit ? 0 : 1;
+					
+					if (implicit) {
+						auto prod = static_cast<ImplicitProducer*>(ptr);
+						stats.queueClassBytes += sizeof(ImplicitProducer);
+						auto head = prod->headIndex.load(std::memory_order_relaxed);
+						auto tail = prod->tailIndex.load(std::memory_order_relaxed);
+						auto hash = prod->blockIndex.load(std::memory_order_relaxed);
+						if (hash != nullptr) {
+							for (size_t i = 0; i != hash->capacity; ++i) {		// an index slot with a valid key and a non-null block is a block owned by this producer
+								if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) {
+									++stats.allocatedBlocks;
+									++stats.ownedBlocksImplicit;
+								}
+							}
+							stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry);
+							for (; hash != nullptr; hash = hash->prev) {		// add the header and pointer array of every index generation still linked
+								stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*);
+							}
+						}
+						for (; details::circular_less_than<index_t>(head, tail); head += BLOCK_SIZE) {		// one used block per BLOCK_SIZE step between head and tail
+							//auto block = prod->get_block_index_entry_for_index(head);
+							++stats.usedBlocks;
+						}
+					}
+					else {
+						auto prod = static_cast<ExplicitProducer*>(ptr);
+						stats.queueClassBytes += sizeof(ExplicitProducer);
+						auto tailBlock = prod->tailBlock;
+						bool wasNonEmpty = false;
+						if (tailBlock != nullptr) {
+							auto block = tailBlock;
+							do {		// follow `next` from tailBlock until the walk arrives back at tailBlock
+								++stats.allocatedBlocks;
+								if (!block->ConcurrentQueue::Block::template is_empty<explicit_context>() || wasNonEmpty) {
+									++stats.usedBlocks;
+									wasNonEmpty = wasNonEmpty || block != tailBlock;
+								}
+								++stats.ownedBlocksExplicit;
+								block = block->next;
+							} while (block != tailBlock);
+						}
+						auto index = prod->blockIndex.load(std::memory_order_relaxed);
+						while (index != nullptr) {
+							stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry);
+							index = static_cast<typename ExplicitProducer::BlockIndexHeader*>(index->prev);
+						}
+					}
+				}
+				// Blocks of the initial pool that were never handed out count as allocated and free.
+				auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 0 : q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed);
+				stats.allocatedBlocks += freeOnInitialPool;
+				stats.freeBlocks += freeOnInitialPool;
+				
+				stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks;
+				stats.queueClassBytes += sizeof(ConcurrentQueue);
+				
+				return stats;
+			}
+		};
+		
+		// For debugging only. Not thread-safe. Returns the snapshot computed by MemStats::getFor(this).
+		MemStats getMemStats()
+		{
+			return MemStats::getFor(this);
+		}
+	private:
+		friend struct MemStats;
+#endif
+	
+	
+	//////////////////////////////////
+	// Producer list manipulation
+	//////////////////////////////////	
+	
+	ProducerBase* recycle_or_create_producer(bool isExplicit)		// Reactivates an inactive producer of the requested kind, or creates and registers a new one; nullptr if creation fails
+	{
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+		debug::DebugLock lock(implicitProdMutex);
+#endif
+		// First pass: scan the producer list for an inactive producer of the right kind.
+		for (auto candidate = producerListTail.load(std::memory_order_acquire); candidate != nullptr; candidate = candidate->next_prod()) {
+			if (!candidate->inactive.load(std::memory_order_relaxed) || candidate->isExplicit != isExplicit) {
+				continue;
+			}
+			bool expected = true;
+			if (candidate->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) {
+				return candidate;		// the flag flipped from inactive to active under our hand, so this producer is ours
+			}
+		}
+		ProducerBase* fresh = isExplicit ? static_cast<ProducerBase*>(create<ExplicitProducer>(this)) : create<ImplicitProducer>(this);
+		return add_producer(fresh);
+	}
+	
+	ProducerBase* add_producer(ProducerBase* producer)		// Registers `producer` with the queue and returns it; a nullptr argument is passed straight back
+	{
+		// Handle failed memory allocation
+		if (producer == nullptr) {
+			return nullptr;
+		}
+		
+		producerCount.fetch_add(1, std::memory_order_relaxed);
+		
+		// Add it to the lock-free list
+		auto prevTail = producerListTail.load(std::memory_order_relaxed);
+		do {
+			producer->next = prevTail;		// a failed CAS refreshes prevTail, so the link is rewritten on every retry
+		} while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed));
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+		if (producer->isExplicit) {		// debug builds also push the producer onto a separate per-kind list, using the same CAS loop
+			auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed);
+			do {
+				static_cast<ExplicitProducer*>(producer)->nextExplicitProducer = prevTailExplicit;
+			} while (!explicitProducers.compare_exchange_weak(prevTailExplicit, static_cast<ExplicitProducer*>(producer), std::memory_order_release, std::memory_order_relaxed));
+		}
+		else {
+			auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed);
+			do {
+				static_cast<ImplicitProducer*>(producer)->nextImplicitProducer = prevTailImplicit;
+			} while (!implicitProducers.compare_exchange_weak(prevTailImplicit, static_cast<ImplicitProducer*>(producer), std::memory_order_release, std::memory_order_relaxed));
+		}
+#endif
+		
+		return producer;
+	}
+	
+	void reown_producers()		// Points the `parent` of every producer in this queue's list at this queue
+	{
+		// Needed after a move or swap: producers taken over from the other instance still name it as their parent.
+		auto prod = producerListTail.load(std::memory_order_relaxed);
+		while (prod != nullptr) {
+			prod->parent = this;
+			prod = prod->next_prod();
+		}
+	}
+	
+	
+	//////////////////////////////////
+	// Implicit producer hash
+	//////////////////////////////////
+	
+	struct ImplicitProducerKVP		// One slot of the implicit producer hash: a thread id and the producer stored for it
+	{
+		std::atomic<details::thread_id_t> key;
+		ImplicitProducer* value;		// Plain pointer: it is only read back by the thread that stored it
+		
+		ImplicitProducerKVP() : value(nullptr) { }
+		
+		ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT
+			: value(other.value)
+		{
+			key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed);
+		}
+		
+		inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT
+		{
+			this->swap(other);		// move-assignment exchanges contents with `other`
+			return *this;
+		}
+		
+		inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT
+		{
+			if (this == &other) { return; }
+			// Exchange both halves of the pair; the key goes through the relaxed-atomic swap helper.
+			details::swap_relaxed(key, other.key);
+			std::swap(value, other.value);
+		}
+	};
+	
+	template<typename XT, typename XTraits>
+	friend void moodycamel::swap(typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP&, typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT;
+	
+	struct ImplicitProducerHash		// One generation of the thread-id -> implicit producer table
+	{
+		size_t capacity;		// number of slots in `entries`; probing masks with capacity - 1
+		ImplicitProducerKVP* entries;		// the slot array
+		ImplicitProducerHash* prev;		// the table this one superseded (still searched on lookup), or nullptr
+	};
+	
+	inline void populate_initial_implicit_producer_hash()		// Sets up the embedded initial hash table and makes it the current one; does nothing when its size is 0
+	{
+		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) {
+			return;
+		}
+		else {
+			implicitProducerHashCount.store(0, std::memory_order_relaxed);
+			for (size_t slot = 0; slot != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++slot) {
+				initialImplicitProducerHashEntries[slot].key.store(details::invalid_thread_id, std::memory_order_relaxed);
+			}
+			initialImplicitProducerHash.capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE;
+			initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0];
+			initialImplicitProducerHash.prev = nullptr;
+			// Install the table only after every field and slot has been initialised.
+			implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed);
+		}
+	}
+	
+	void swap_implicit_producer_hashes(ConcurrentQueue& other)		// Exchanges the implicit producer hash state of *this and other, then repairs pointers into each queue's embedded initial table
+	{
+		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) {
+			return;
+		}
+		else {
+			// Swap (assumes our implicit producer hash is initialized)
+			initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries);
+			initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0];		// each embedded table must keep pointing at its own queue's entry array
+			other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0];
+			
+			details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount);
+			
+			details::swap_relaxed(implicitProducerHash, other.implicitProducerHash);
+			if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) {		// our current table is now other's embedded one: use our own embedded table instead
+				implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed);
+			}
+			else {
+				ImplicitProducerHash* hash;
+				for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) {		// find the table whose prev is other's embedded table
+					continue;
+				}
+				hash->prev = &initialImplicitProducerHash;
+			}
+			if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) {		// the same repair, mirrored for `other`
+				other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed);
+			}
+			else {
+				ImplicitProducerHash* hash;
+				for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) {
+					continue;
+				}
+				hash->prev = &other.initialImplicitProducerHash;
+			}
+		}
+	}
+	
+	// Only fails (returns nullptr) if memory allocation fails
+	ImplicitProducer* get_or_add_implicit_producer()
+	{
+		// Note that since the data is essentially thread-local (key is thread ID),
+		// there's a reduced need for fences (memory ordering is already consistent
+		// for any individual thread), except for the current table itself.
+		
+		// Start by looking for the thread ID in the current and all previous hash tables.
+		// If it's not found, it must not be in there yet, since this same thread would
+		// have added it previously to one of the tables that we traversed.
+		
+		// Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table
+		
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+		debug::DebugLock lock(implicitProdMutex);
+#endif
+		
+		auto id = details::thread_id();
+		auto hashedId = details::hash_thread_id(id);
+		
+		auto mainHash = implicitProducerHash.load(std::memory_order_acquire);
+		assert(mainHash != nullptr);  // silence clang-tidy and MSVC warnings (hash cannot be null)
+		for (auto hash = mainHash; hash != nullptr; hash = hash->prev) {
+			// Look for the id in this hash
+			auto index = hashedId;
+			while (true) {		// Not an infinite loop because at least one slot is free in the hash table
+				index &= hash->capacity - 1u;
+				
+				auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed);
+				if (probedKey == id) {
+					// Found it! If we had to search several hashes deep, though, we should lazily add it
+					// to the current main hash table to avoid the extended search next time.
+					// Note there's guaranteed to be room in the current hash table since every subsequent
+					// table implicitly reserves space for all previous tables (there's only one
+					// implicitProducerHashCount).
+					auto value = hash->entries[index].value;
+					if (hash != mainHash) {
+						index = hashedId;
+						while (true) {
+							index &= mainHash->capacity - 1u;
+							auto empty = details::invalid_thread_id;
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+							auto reusable = details::invalid_thread_id2;
+							if (mainHash->entries[index].key.compare_exchange_strong(empty,    id, std::memory_order_seq_cst, std::memory_order_relaxed) ||
+								mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) {
+#else
+							if (mainHash->entries[index].key.compare_exchange_strong(empty,    id, std::memory_order_seq_cst, std::memory_order_relaxed)) {
+#endif
+								mainHash->entries[index].value = value;
+								break;
+							}
+							++index;
+						}
+					}
+					
+					return value;
+				}
+				if (probedKey == details::invalid_thread_id) {
+					break;		// Not in this hash table
+				}
+				++index;
+			}
+		}
+		
+		// Insert!
+		auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed);
+		while (true) {
+			// NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
+			if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) {
+				// We've acquired the resize lock, try to allocate a bigger hash table.
+				// Note the acquire fence synchronizes with the release fence at the end of this block, and hence when
+				// we reload implicitProducerHash it must be the most recent version (it only gets changed within this
+				// locked block).
+				mainHash = implicitProducerHash.load(std::memory_order_acquire);
+				if (newCount >= (mainHash->capacity >> 1)) {
+					size_t newCapacity = mainHash->capacity << 1;
+					while (newCount >= (newCapacity >> 1)) {
+						newCapacity <<= 1;
+					}
+					auto raw = static_cast<char*>((Traits::malloc)(sizeof(ImplicitProducerHash) + std::alignment_of<ImplicitProducerKVP>::value - 1 + sizeof(ImplicitProducerKVP) * newCapacity));
+					if (raw == nullptr) {
+						// Allocation failed
+						implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed);
+						implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
+						return nullptr;
+					}
+					
+					auto newHash = new (raw) ImplicitProducerHash;
+					newHash->capacity = static_cast<size_t>(newCapacity);
+					newHash->entries = reinterpret_cast<ImplicitProducerKVP*>(details::align_for<ImplicitProducerKVP>(raw + sizeof(ImplicitProducerHash)));
+					for (size_t i = 0; i != newCapacity; ++i) {
+						new (newHash->entries + i) ImplicitProducerKVP;
+						newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed);
+					}
+					newHash->prev = mainHash;
+					implicitProducerHash.store(newHash, std::memory_order_release);
+					implicitProducerHashResizeInProgress.clear(std::memory_order_release);
+					mainHash = newHash;
+				}
+				else {
+					implicitProducerHashResizeInProgress.clear(std::memory_order_release);
+				}
+			}
+			
+			// If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table
+			// to finish being allocated by another thread (and if we just finished allocating above, the condition will
+			// always be true)
+			if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) {
+				auto producer = static_cast<ImplicitProducer*>(recycle_or_create_producer(false));
+				if (producer == nullptr) {
+					implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed);
+					return nullptr;
+				}
+				
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+				producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback;
+				producer->threadExitListener.userData = producer;
+				details::ThreadExitNotifier::subscribe(&producer->threadExitListener);
+#endif
+				
+				auto index = hashedId;
+				while (true) {
+					index &= mainHash->capacity - 1u;
+					auto empty = details::invalid_thread_id;
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+					auto reusable = details::invalid_thread_id2;
+					if (mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) {
+						implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed);  // already counted as a used slot
+						mainHash->entries[index].value = producer;
+						break;
+					}
+#endif
+					if (mainHash->entries[index].key.compare_exchange_strong(empty,    id, std::memory_order_seq_cst, std::memory_order_relaxed)) {
+						mainHash->entries[index].value = producer;
+						break;
+					}
+					++index;
+				}
+				return producer;
+			}
+			
+			// Hmm, the old hash is quite full and somebody else is busy allocating a new one.
+			// We need to wait for the allocating thread to finish (if it succeeds, we add, if not,
+			// we try to allocate ourselves).
+			mainHash = implicitProducerHash.load(std::memory_order_acquire);
+		}
+	}
+	
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+	void implicit_producer_thread_exited(ImplicitProducer* producer)
+	{
+		// Remove the calling thread's key from every implicit-producer hash table
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+		debug::DebugLock lock(implicitProdMutex);
+#endif
+		auto hash = implicitProducerHash.load(std::memory_order_acquire);
+		assert(hash != nullptr);		// The thread exit listener is only registered if we were added to a hash in the first place
+		auto id = details::thread_id();
+		auto hashedId = details::hash_thread_id(id);
+		details::thread_id_t probedKey;
+		
+		// We need to traverse all the hashes just in case other threads aren't on the current one yet and are
+		// trying to add an entry thinking there's a free slot (because they reused a producer)
+		for (; hash != nullptr; hash = hash->prev) {
+			auto index = hashedId;
+			do {
+				index &= hash->capacity - 1u;
+				probedKey = id;		// CAS expected value; on failure it is overwritten with the key actually stored in the slot
+				if (hash->entries[index].key.compare_exchange_strong(probedKey, details::invalid_thread_id2, std::memory_order_seq_cst, std::memory_order_relaxed)) {
+					break;		// Our slot was found and is now tagged invalid_thread_id2
+				}
+				++index;		// Linear probe; the mask at the top of the loop wraps the index
+			} while (probedKey != details::invalid_thread_id);		// Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place
+		}
+		
+		// Mark the queue as being recyclable
+		producer->inactive.store(true, std::memory_order_release);
+	}
+	
+	static void implicit_producer_thread_exited_callback(void* userData)
+	{
+		auto producer = static_cast<ImplicitProducer*>(userData);		// userData is the ImplicitProducer stored in threadExitListener.userData at registration
+		auto queue = producer->parent;
+		queue->implicit_producer_thread_exited(producer);		// Forward to the owning queue's member handler
+	}
+#endif
+	
+	//////////////////////////////////
+	// Utility functions
+	//////////////////////////////////
+
+	template<typename TAlign>
+	static inline void* aligned_malloc(size_t size)
+	{
+		MOODYCAMEL_CONSTEXPR_IF (std::alignment_of<TAlign>::value <= std::alignment_of<details::max_align_t>::value)
+			return (Traits::malloc)(size);		// TAlign needs no more than max_align_t alignment: use Traits::malloc directly
+		else {
+			size_t alignment = std::alignment_of<TAlign>::value;
+			void* raw = (Traits::malloc)(size + alignment - 1 + sizeof(void*));		// Extra room for alignment padding plus one stashed pointer
+			if (!raw)
+				return nullptr;
+			char* ptr = details::align_for<TAlign>(reinterpret_cast<char*>(raw) + sizeof(void*));
+			*(reinterpret_cast<void**>(ptr) - 1) = raw;		// Keep the raw allocation just below the returned pointer; aligned_free reads it back
+			return ptr;
+		}
+	}
+
+	template<typename TAlign>
+	static inline void aligned_free(void* ptr)
+	{
+		MOODYCAMEL_CONSTEXPR_IF (std::alignment_of<TAlign>::value <= std::alignment_of<details::max_align_t>::value)
+			return (Traits::free)(ptr);		// Counterpart of the plain Traits::malloc branch in aligned_malloc
+		else
+			(Traits::free)(ptr ? *(reinterpret_cast<void**>(ptr) - 1) : nullptr);		// Free the raw pointer stored just below ptr (a null ptr passes nullptr through)
+	}
+
+	template<typename U>
+	static inline U* create_array(size_t count)
+	{
+		// Allocates aligned storage for `count` objects and value-initializes each in place; nullptr if the allocation fails
+		assert(count > 0);
+		U* const first = static_cast<U*>(aligned_malloc<U>(sizeof(U) * count));
+		if (first != nullptr) {
+			for (U* slot = first; slot != first + count; ++slot)
+				new (slot) U();
+		}
+		return first;
+	}
+
+	template<typename U>
+	static inline void destroy_array(U* p, size_t count)
+	{
+		// Runs the destructors back-to-front, then releases the storage; a null `p` goes straight to the free
+		assert(p == nullptr || count > 0);
+		U* cursor = (p != nullptr) ? p + count : p;
+		while (cursor != p)
+			(--cursor)->~U();
+		aligned_free<U>(p);
+	}
+
+	template<typename U>
+	static inline U* create()
+	{
+		void* const storage = aligned_malloc<U>(sizeof(U));		// nullptr when the allocation fails
+		return storage == nullptr ? nullptr : new (storage) U;
+	}
+
+	template<typename U, typename A1>
+	static inline U* create(A1&& a1)
+	{
+		void* const storage = aligned_malloc<U>(sizeof(U));		// nullptr when the allocation fails
+		return storage == nullptr ? nullptr : new (storage) U(std::forward<A1>(a1));
+	}
+
+	template<typename U>
+	static inline void destroy(U* p)
+	{
+		// The destructor runs only for a non-null object; aligned_free is called either way
+		if (p) { p->~U(); }
+		aligned_free<U>(p);
+	}
+
+private:
+	std::atomic<ProducerBase*> producerListTail;
+	std::atomic<std::uint32_t> producerCount;
+	
+	std::atomic<size_t> initialBlockPoolIndex;
+	Block* initialBlockPool;
+	size_t initialBlockPoolSize;
+	
+#ifndef MCDBGQ_USEDEBUGFREELIST
+	FreeList<Block> freeList;
+#else
+	debug::DebugFreeList<Block> freeList;
+#endif
+	
+	std::atomic<ImplicitProducerHash*> implicitProducerHash;
+	std::atomic<size_t> implicitProducerHashCount;		// Number of slots logically used
+	ImplicitProducerHash initialImplicitProducerHash;
+	std::array<ImplicitProducerKVP, INITIAL_IMPLICIT_PRODUCER_HASH_SIZE> initialImplicitProducerHashEntries;
+	std::atomic_flag implicitProducerHashResizeInProgress;
+	
+	std::atomic<std::uint32_t> nextExplicitConsumerId;
+	std::atomic<std::uint32_t> globalExplicitConsumerOffset;
+	
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+	debug::DebugMutex implicitProdMutex;
+#endif
+	
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+	std::atomic<ExplicitProducer*> explicitProducers;
+	std::atomic<ImplicitProducer*> implicitProducers;
+#endif
+};
+
+
+template<typename T, typename Traits>
+ProducerToken::ProducerToken(ConcurrentQueue<T, Traits>& queue)
+	: producer(queue.recycle_or_create_producer(true))
+{
+	if (producer != nullptr) {		// producer may be nullptr; only a valid one gets linked
+		producer->token = this;		// Back-link the producer to this token
+	}
+}
+
+template<typename T, typename Traits>
+ProducerToken::ProducerToken(BlockingConcurrentQueue<T, Traits>& queue)
+	: producer(reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->recycle_or_create_producer(true))		// NOTE(review): reinterprets the blocking queue's address as a ConcurrentQueue — presumably its first member; confirm
+{
+	if (producer != nullptr) {		// producer may be nullptr; only a valid one gets linked
+		producer->token = this;		// Back-link the producer to this token
+	}
+}
+
+template<typename T, typename Traits>
+ConsumerToken::ConsumerToken(ConcurrentQueue<T, Traits>& queue)
+	: itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
+{
+	initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release);		// Each token takes the next sequential consumer id
+	lastKnownGlobalOffset = static_cast<std::uint32_t>(-1);		// All bits set, i.e. the maximum uint32 value
+}
+
+template<typename T, typename Traits>
+ConsumerToken::ConsumerToken(BlockingConcurrentQueue<T, Traits>& queue)
+	: itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
+{
+	initialOffset = reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->nextExplicitConsumerId.fetch_add(1, std::memory_order_release);		// Same id counter as the non-blocking overload, reached by reinterpreting the blocking queue's address
+	lastKnownGlobalOffset = static_cast<std::uint32_t>(-1);		// All bits set, i.e. the maximum uint32 value
+}
+
+template<typename T, typename Traits>
+inline void swap(ConcurrentQueue<T, Traits>& a, ConcurrentQueue<T, Traits>& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);		// Free-function form; delegates to the queue's member swap
+}
+
+inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);		// Free-function form; delegates to ProducerToken's member swap
+}
+
+inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);		// Free-function form; delegates to ConsumerToken's member swap
+}
+
+template<typename T, typename Traits>
+inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a, typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);		// Free-function form; delegates to ImplicitProducerKVP's member swap
+}
+
+}
+
+#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17)
+#pragma warning(pop)
+#endif
+
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
+#pragma GCC diagnostic pop
+#endif
\ No newline at end of file
diff --git a/tests/core/algorithm/flat/flat_streamer_buffer_test.cpp b/tests/core/algorithm/flat/flat_streamer_buffer_test.cc
similarity index 100%
rename from tests/core/algorithm/flat/flat_streamer_buffer_test.cpp
rename to tests/core/algorithm/flat/flat_streamer_buffer_test.cc
diff --git a/tests/core/algorithm/flat/flat_streamer_buffer_time_test.cpp b/tests/core/algorithm/flat/flat_streamer_buffer_time_test.cc
similarity index 100%
rename from tests/core/algorithm/flat/flat_streamer_buffer_time_test.cpp
rename to tests/core/algorithm/flat/flat_streamer_buffer_time_test.cc

From 03e4dbce5437ed29f3db5bc79a58186541a1935b Mon Sep 17 00:00:00 2001
From: "yinzefeng.yzf" <yinzefeng.yzf@alibaba-inc.com>
Date: Fri, 6 Feb 2026 21:08:24 +0800
Subject: [PATCH 03/11] modify buffer pool

---
 src/ailego/buffer/buffer_pool.cc              | 257 ++++++++++++++
 src/core/algorithm/hnsw/hnsw_entity.h         |   4 +-
 .../algorithm/hnsw/hnsw_streamer_entity.cc    |   2 +-
 src/core/utility/buffer1_storage.cc           |  14 +-
 src/include/zvec/ailego/buffer/buffer_pool.h  | 331 +++---------------
 .../zvec/core/framework/index_storage.h       |  57 ++-
 .../flat/flat_streamer_buffer_test.cc         | 176 +++++-----
 .../flat/flat_streamer_buffer_time_test.cc    | 129 ++++++-
 8 files changed, 554 insertions(+), 416 deletions(-)
 create mode 100644 src/ailego/buffer/buffer_pool.cc

diff --git a/src/ailego/buffer/buffer_pool.cc b/src/ailego/buffer/buffer_pool.cc
new file mode 100644
index 00000000..061ead37
--- /dev/null
+++ b/src/ailego/buffer/buffer_pool.cc
@@ -0,0 +1,257 @@
+#include <zvec/ailego/buffer/buffer_pool.h>
+
+namespace zvec {
+namespace ailego {
+
+void Counter::record(const std::string &name, int64_t value) {
+	// Adds `value` to the counter registered under `name`; a missing counter is first created at 0.
+	// NOTE(review): the find/emplace on the map is not synchronized — confirm callers never race on a new name.
+	auto slot = static_counters.find(name);
+	if (slot == static_counters.end())
+		slot = static_counters.emplace(name, std::make_unique<std::atomic<int64_t>>(0)).first;
+	slot->second->fetch_add(value);
+}
+
+void Counter::display() {
+	for (auto it = static_counters.begin(); it != static_counters.end(); ++it) {  // one "name: value" line per counter
+		std::cout << it->first << ": " << it->second->load() << std::endl;
+	}
+}
+
+int LRUCache::init(size_t block_size) {
+	// Re-runnable, like LPMap::init: drop queues left by an earlier init so stale block ids are never evicted later.
+	queues_.clear();
+	block_size_ = block_size;  // reused as the dead-entry sweep interval and per-sweep budget
+	for(size_t i = 0; i < CATCH_QUEUE_NUM; i++) queues_.push_back(ConcurrentQueue(block_size));
+	return 0;  // always succeeds
+}
+
+bool LRUCache::evict_single_block(BlockType &item) {
+	// Poll the queues in index order and hand back the first entry that can
+	// be dequeued into `item`. Returns false when every queue came up empty.
+	size_t q = 0;
+	while(q < CATCH_QUEUE_NUM) {
+		if(queues_[q].try_dequeue(item)) {
+			return true;
+		}
+		++q;
+	}
+	return false;
+}
+
+bool LRUCache::add_single_block(const LPMap *lp_map, const BlockType &block, int block_type) {
+	const bool enqueued = queues_[block_type].try_enqueue(block);
+	// Every block_size_-th call (counted whether or not the enqueue succeeded) triggers a sweep of dead entries.
+	const auto calls = ++evict_queue_insertions_;
+	if(calls % block_size_ == 0) this->clear_dead_node(lp_map);
+	return enqueued;
+}
+
+void LRUCache::clear_dead_node(const LPMap *lp_map) {
+	// Sweep each queue once: pull out at most block_size_ entries, drop the ones the
+	// map reports as dead (stale version), and put the survivors back.
+	for(size_t i = 0; i < CATCH_QUEUE_NUM; i++) {
+		int clear_count = 0;
+		ConcurrentQueue tmp(block_size_);
+		BlockType item;
+		// Check the budget BEFORE dequeuing: the reverse order pulled one extra entry and silently dropped it.
+		while((clear_count++ < block_size_) && queues_[i].try_dequeue(item)) {
+			if(!lp_map->isDeadBlock(item)) tmp.try_enqueue(item);
+		}
+		// Liveness is re-checked on the way back; an entry may have died while parked in tmp.
+		while(tmp.try_dequeue(item)) {
+			if(!lp_map->isDeadBlock(item)) queues_[i].try_enqueue(item);
+		}
+	}
+}
+
+void LPMap::init(size_t entry_num) {
+	// (Re)builds the table with every entry marked "not loaded": ref_count at
+	// INT_MIN, load_count 0, no buffer. delete[] on a null entries_ is a no-op.
+	delete[] entries_;
+	entry_num_ = entry_num;
+	entries_ = new Entry[entry_num_];
+	for (Entry *e = entries_; e != entries_ + entry_num_; ++e) {
+		e->ref_count.store(std::numeric_limits<int>::min());
+		e->load_count.store(0);
+		e->buffer = nullptr;
+	}
+	cache_.init(entry_num);
+}
+
+char* LPMap::acquire_block(block_id_t block_id) {
+	// Pins block_id and returns its buffer, or nullptr while the block is not resident (ref_count < 0).
+	assert(block_id < entry_num_);
+	Entry &entry = entries_[block_id];
+	if (entry.ref_count.load() == 0) {
+		// Idle block being revived: bump the version so its pending LRU entry is treated as dead.
+		++entry.load_count;
+	}
+	// Decide from the value the increment itself observed. Re-reading ref_count afterwards could see a
+	// concurrent set_block_acquired() store(1) and hand out the buffer without owning a reference.
+	if (entry.ref_count.fetch_add(1) < 0) {
+		return nullptr;
+	}
+	return entry.buffer;
+}
+
+void LPMap::release_block(block_id_t block_id) {
+	// Drops one reference; whoever releases the last one queues the block as an eviction candidate.
+	assert(block_id < entry_num_);
+	Entry &entry = entries_[block_id];
+	// fetch_sub returns the PREVIOUS value, so exactly one releaser observes the 1 -> 0 transition.
+	// Re-loading ref_count instead lets two concurrent releasers both see 0 and enqueue the block twice.
+	const int previous = entry.ref_count.fetch_sub(1);
+	if(previous == 1) {
+		// The version snapshot lets the LRU recognise this entry as stale if the block is revived later.
+		LRUCache::BlockType block(block_id, entry.load_count.load());
+		cache_.add_single_block(this, block, 0);
+	}
+}
+
+char* LPMap::evict_block(block_id_t block_id) {
+	// Detaches and returns the buffer of an idle block, or nullptr when ref_count
+	// is not exactly 0. Success flips ref_count from 0 to INT_MIN in one CAS.
+	assert(block_id < entry_num_);
+	Entry &victim = entries_[block_id];
+	int idle = 0;
+	const bool claimed = victim.ref_count.compare_exchange_strong(idle, std::numeric_limits<int>::min());
+	if (!claimed) {
+		return nullptr;
+	}
+	char *detached = victim.buffer;
+	victim.buffer = nullptr;
+	return detached;
+}
+
+char* LPMap::set_block_acquired(block_id_t block_id, char *buffer) {
+	assert(block_id < entry_num_);
+	Entry &entry = entries_[block_id];
+	if (entry.ref_count.load() >= 0) {
+		entry.ref_count.fetch_add(1);
+		// Block is already resident: take a reference and return the existing buffer, not the caller's.
+		return entry.buffer;
+	}
+	// Not resident: install the caller's buffer with a single reference and bump the load version.
+	entry.buffer = buffer;
+	entry.ref_count.store(1);
+	entry.load_count.fetch_add(1);
+	return buffer;
+}
+
+void LPMap::recycle(moodycamel::ConcurrentQueue<char *> &free_buffers) {
+	// Pops LRU candidates until one is still current (stale versions are skipped); gives up if the cache is empty.
+	LRUCache::BlockType block;
+	do {
+		if(!cache_.evict_single_block(block)) return;
+	} while(isDeadBlock(block));
+	// evict_block yields nullptr when the candidate is pinned again; nothing is freed on this call then.
+	char *buffer = evict_block(block.first);
+	// enqueue() may grow the queue; try_enqueue() never allocates, so a rejected buffer used to be leaked.
+	// If even enqueue() fails, release the buffer instead of losing track of it.
+	if (buffer && !free_buffers.enqueue(buffer)) {
+		free(buffer);
+	}
+}
+
+VecBufferPool::VecBufferPool(const std::string &filename, size_t pool_capacity, size_t block_size)
+		: pool_capacity_(pool_capacity) {
+	// Opens `filename` read-only, sizes the block map from the file size and pre-allocates the buffers; throws std::runtime_error on open/stat failure.
+	fd_ = open(filename.c_str(), O_RDONLY);
+	if (fd_ < 0) {
+		throw std::runtime_error("Failed to open file: " + filename);
+	}
+	struct stat st;
+	if (fstat(fd_, &st) < 0) {
+		close(fd_);  // the destructor never runs for a throwing constructor, so release the descriptor here
+		throw std::runtime_error("Failed to stat file: " + filename);
+	}
+	file_size_ = st.st_size;
+
+	size_t buffer_num = pool_capacity_ / block_size;
+	size_t block_num = file_size_ / block_size + 500;
+	lp_map_.init(block_num);
+	for (size_t i = 0; i < buffer_num; i++) {
+		char *buffer = static_cast<char *>(aligned_alloc(64, block_size));
+		// enqueue() grows the queue as needed; try_enqueue() never allocates and rejects (leaks) buffers once the initial capacity is used.
+		if (buffer != nullptr && !free_buffers_.enqueue(buffer)) free(buffer);
+	}
+	std::cout << "buffer_num: " << buffer_num << std::endl;
+	std::cout << "entry_num: " << lp_map_.entry_num() << std::endl;
+}
+
+VecBufferPoolHandle VecBufferPool::get_handle() {
+	return VecBufferPoolHandle(*this);  // handle is built from a reference to this pool — presumably the pool must outlive it; confirm
+}
+
+char* VecBufferPool::acquire_buffer(block_id_t block_id, size_t offset, size_t size, int retry) {
+	char *buffer = lp_map_.acquire_block(block_id);
+	if (buffer) {
+		return buffer;
+	}
+	{
+		// Miss: take a free buffer, recycling LRU victims (up to `retry` times) when none is available.
+		// NOTE(review): this section runs without mutex_; it relies on free_buffers_ being a concurrent queue.
+		bool found = free_buffers_.try_dequeue(buffer);
+		// `found` is false when no free buffer could be dequeued right now.
+		if (!found) {
+			for (int i = 0; i < retry; i++) {
+				lp_map_.recycle(free_buffers_);
+				found = free_buffers_.try_dequeue(buffer);
+				// recycle() may free nothing (victim pinned or cache empty), so re-check after each attempt.
+				if (found) {
+					break;
+				}
+			}
+		}
+		if (!found) {
+			std::cerr << "Failed to get free buffer " << std::endl;
+			return nullptr;
+		}
+	}
+
+	ssize_t read_bytes = pread(fd_, buffer, size, offset);
+	if (read_bytes != static_cast<ssize_t>(size)) {
+		std::cerr << "Failed to read file at offset " << offset << std::endl;
+		exit(-1);  // NOTE(review): a failed or short read terminates the whole process
+	}
+	char *placed_buffer = nullptr;
+	{
+		std::lock_guard<std::mutex> lock(mutex_);
+		placed_buffer = lp_map_.set_block_acquired(block_id, buffer);
+	}
+	if (placed_buffer != buffer) {
+		// Another thread installed the block first: hand our redundant copy back to the free list.
+		free_buffers_.try_enqueue(buffer);  // NOTE(review): result unchecked; a rejected enqueue would lose the buffer
+	}
+	return placed_buffer;
+}
+
+int VecBufferPool::get_meta(size_t offset, size_t length, char *buffer) {
+	// Reads exactly `length` bytes at `offset` of the backing file straight into `buffer`.
+	// Returns 0 on a full read; anything else prints the message below and exits the process.
+	const ssize_t got = pread(fd_, buffer, length, offset);
+	if (got == static_cast<ssize_t>(length)) return 0;
+	std::cerr << "Failed to read file at offset " << offset << std::endl;
+	exit(-1);
+}
+
+char* VecBufferPoolHandle::get_block(size_t offset, size_t size, size_t block_id) {
+	// Forwards to the pool with a fixed retry count of 5; the result is nullptr when no buffer could be obtained.
+	return pool.acquire_buffer(block_id, offset, size, 5);
+}
+
+int VecBufferPoolHandle::get_meta(size_t offset, size_t length, char *buffer) {
+	return pool.get_meta(offset, length, buffer);  // thin forwarder to VecBufferPool::get_meta
+}
+
+void VecBufferPoolHandle::release_one(block_id_t block_id) {
+	pool.lp_map_.release_block(block_id);  // drops one reference on the block via the pool's LPMap
+}
+
+void VecBufferPoolHandle::acquire_one(block_id_t block_id) {
+	pool.lp_map_.acquire_block(block_id);  // takes one more reference; the returned buffer (nullptr if not resident) is ignored
+}
+
+}  // namespace ailego
+}  // namespace zvec
\ No newline at end of file
diff --git a/src/core/algorithm/hnsw/hnsw_entity.h b/src/core/algorithm/hnsw/hnsw_entity.h
index 363a7252..65fdae9e 100644
--- a/src/core/algorithm/hnsw/hnsw_entity.h
+++ b/src/core/algorithm/hnsw/hnsw_entity.h
@@ -147,8 +147,8 @@ struct Neighbors {
   Neighbors(uint32_t cnt_in, const node_id_t *data_in)
       : cnt{cnt_in}, data{data_in} {}
 
-  Neighbors(IndexStorage::MemoryBlock &&mem_block)
-      : neighbor_block{std::move(mem_block)} {
+  Neighbors(IndexStorage::MemoryBlock &mem_block)
+      : neighbor_block{mem_block} {
     auto hd = reinterpret_cast<const NeighborsHeader *>(neighbor_block.data());
     cnt = hd->neighbor_cnt;
     data = hd->neighbors;
diff --git a/src/core/algorithm/hnsw/hnsw_streamer_entity.cc b/src/core/algorithm/hnsw/hnsw_streamer_entity.cc
index feafa573..734f11f1 100644
--- a/src/core/algorithm/hnsw/hnsw_streamer_entity.cc
+++ b/src/core/algorithm/hnsw/hnsw_streamer_entity.cc
@@ -127,7 +127,7 @@ const Neighbors HnswStreamerEntity::get_neighbors(level_t level,
     LOG_ERROR("Read neighbor header failed, ret=%zu", size);
     return Neighbors();
   }
-  return Neighbors(std::move(neighbor_block));
+  return Neighbors(neighbor_block);
 }
 
 //! Get vector data by key
diff --git a/src/core/utility/buffer1_storage.cc b/src/core/utility/buffer1_storage.cc
index 0ea591d9..1c582198 100644
--- a/src/core/utility/buffer1_storage.cc
+++ b/src/core/utility/buffer1_storage.cc
@@ -85,7 +85,6 @@ class Buffer1Storage : public IndexStorage {
 
     //! Read data from segment
     size_t read(size_t offset, const void **data, size_t len) override {
-      
       if (ailego_unlikely(offset + len > segment_->meta()->data_size)) {
         auto meta = segment_->meta();
         if (offset > meta->data_size) {
@@ -107,7 +106,8 @@ class Buffer1Storage : public IndexStorage {
         len = meta->data_size - offset;
       }
       size_t segment_offset = segment_->meta()->data_index + owner_->get_context_offset();
-      data.reset(owner_->get_buffer(segment_offset, capacity_, segment_id_) + offset);
+      data.reset(owner_->buffer_pool_handle_.get(), segment_id_, owner_->get_buffer(segment_offset, capacity_, segment_id_) + offset);
+      // data.reset(owner_->get_buffer(segment_offset, capacity_, segment_id_) + offset);
       if (data.data()) {
         return len;
       } else {
@@ -138,8 +138,8 @@ class Buffer1Storage : public IndexStorage {
    private:
     IndexMapping::Segment *segment_{};
     Buffer1Storage *owner_{nullptr};
-    size_t capacity_{};
     size_t segment_id_{};
+    size_t capacity_{};
   };
 
   //! Destructor
@@ -162,9 +162,9 @@ class Buffer1Storage : public IndexStorage {
   int open(const std::string &path, bool /*create*/) override {
     LOG_INFO("open buffer storage 1");
     file_name_ = path;
-    buffer_pool_ = std::make_unique<VecBufferPool>(path, 10u * 1024 * 1024 * 1024, 2490368 * 2);
+    buffer_pool_ = std::make_shared<ailego::VecBufferPool>(path, 10u * 1024 * 1024 * 1024, 2490368 * 2);
     buffer_pool_handle_ =
-        std::make_unique<VecBufferPoolHandle>(buffer_pool_->get_handle());
+        std::make_shared<ailego::VecBufferPoolHandle>(buffer_pool_->get_handle());
     int ret = ParseToMapping();
     LOG_ERROR("segment count: %lu, max_segment_size: %lu", segments_.size(), max_segment_size_);
     if(ret != 0) {
@@ -428,8 +428,8 @@ class Buffer1Storage : public IndexStorage {
   size_t max_segment_size_{0};
   std::unique_ptr<char[]> segment_buffer_{nullptr};
 
-  std::unique_ptr<VecBufferPool> buffer_pool_{nullptr};
-  std::unique_ptr<VecBufferPoolHandle> buffer_pool_handle_{nullptr};
+  ailego::VecBufferPool::Pointer buffer_pool_{nullptr};
+  ailego::VecBufferPoolHandle::Pointer buffer_pool_handle_{nullptr};
 };
 
 INDEX_FACTORY_REGISTER_STORAGE_ALIAS(BufferStorage, Buffer1Storage);
diff --git a/src/include/zvec/ailego/buffer/buffer_pool.h b/src/include/zvec/ailego/buffer/buffer_pool.h
index d86cffec..34c69d51 100644
--- a/src/include/zvec/ailego/buffer/buffer_pool.h
+++ b/src/include/zvec/ailego/buffer/buffer_pool.h
@@ -16,48 +16,41 @@
 #include <stdexcept>
 #include <string>
 #include <unordered_map>
+#include <memory>
 #include "concurrentqueue.h"
 
-using block_id_t = int;
-using version_t = int;
+namespace zvec {
+namespace ailego {
+
+using block_id_t = size_t;
+using version_t = size_t;
+
+class LPMap;
 
 class LRUCache {
   public:
     typedef std::pair<block_id_t, version_t> BlockType;
     typedef moodycamel::ConcurrentQueue<BlockType> ConcurrentQueue;
 
-    int init(size_t block_size) {
-      for(int i = 0; i < CATCH_QUEUE_NUM; i++) {
-        queues_.push_back(ConcurrentQueue(block_size));
-      }
-      return 0;
-    }
-
-    BlockType evict_single_block() {
-      BlockType item;
-      for(int i = 0; i < CATCH_QUEUE_NUM; i++) {
-        bool found = queues_[i].try_dequeue(item);
-        if(found) {
-          break;
-        }
-      }
-      return item;
-    }
-
-    bool add_single_block(const BlockType &block, int block_type) {
-      std::cout << "in LRU: " << block.first << ", " << block.second << std::endl;
-      return queues_[block_type].try_enqueue(block);
-    }
+    int init(size_t block_size);
+
+    bool evict_single_block(BlockType &item);
+
+    bool add_single_block(const LPMap *lp_map, const BlockType &block, int block_type);
+
+    void clear_dead_node(const LPMap *lp_map);
 
   private:
     constexpr static size_t CATCH_QUEUE_NUM = 3;
+    int block_size_;
     std::vector<ConcurrentQueue> queues_;
+    alignas(64) std::atomic<size_t> evict_queue_insertions_{0};
 };
 
 class LPMap {
   struct Entry {
-    std::atomic<int> ref_count;
-    std::atomic<int> load_count;
+    alignas(64) std::atomic<int> ref_count;
+    alignas(64) std::atomic<version_t> load_count;
     char *buffer;
   };
 
@@ -67,206 +60,52 @@ class LPMap {
     delete[] entries_;
   }
 
-  void init(size_t entry_num) {
-    if (entries_) {
-      delete[] entries_;
-    }
-    entry_num_ = entry_num;
-    entries_ = new Entry[entry_num_];
-    for (size_t i = 0; i < entry_num_; i++) {
-      entries_[i].ref_count.store(std::numeric_limits<int>::min());
-      entries_[i].load_count.store(0);
-      entries_[i].buffer = nullptr;
-    }
-  }
+  void init(size_t entry_num);
 
-  char *acquire_block(block_id_t block_id) {
-    assert(block_id < entry_num_);
-    Entry &entry = entries_[block_id];
-    int rc = entry.ref_count.fetch_add(1);
-    if (rc < 0) {
-      return nullptr;
-    }
-    return entry.buffer;
-  }
+  char *acquire_block(block_id_t block_id);
 
-  void release_block(block_id_t block_id) {
-    assert(block_id < entry_num_);
-    Entry &entry = entries_[block_id];
-    int rc = entry.ref_count.fetch_sub(1);
-    assert(rc >= 0);
-    if(rc == 0) {
-      LRUCache::BlockType block;
-      block.first = block_id;
-      block.second = entry.load_count.load();
-      cache_.add_single_block(block, 0);
-    }
-  }
+  void release_block(block_id_t block_id);
 
   // need be called under lock
-  char *evict_block(block_id_t block_id) {
-    assert(block_id < entry_num_);
-    Entry &entry = entries_[block_id];
-    int expected = 0;
-    if (entry.ref_count.compare_exchange_strong(
-            expected, std::numeric_limits<int>::min())) {
-      char *buffer = entry.buffer;
-      entry.buffer = nullptr;
-      return buffer;
-    } else {
-      return nullptr;
-    }
-  }
+  char *evict_block(block_id_t block_id);
 
   // need be called under lock
-  char *set_block_acquired(block_id_t block_id, char *buffer) {
-    // std::cout << "Set block " << block_id << std::endl;
-    assert(block_id < entry_num_);
-    Entry &entry = entries_[block_id];
-    if (entry.ref_count.load() >= 0) {
-      entry.ref_count.fetch_add(1);
-      return entry.buffer;
-    }
-    entry.buffer = buffer;
-    entry.ref_count.store(1);
-    entry.load_count.fetch_add(1);
-    return buffer;
-  }
+  char *set_block_acquired(block_id_t block_id, char *buffer);
 
   // need be called under lock
-  void recycle(std::queue<char *> &free_buffers) {
-    LRUCache::BlockType block;
-    do {
-      block = cache_.evict_single_block();
-    } while(isDeadBlock(block));
-    char *buffer = evict_block(block.first);
-    if (buffer) {
-      free_buffers.push(buffer);
-    }
-  }
+  void recycle(moodycamel::ConcurrentQueue<char *> &free_buffers);
 
   size_t entry_num() const {
     return entry_num_;
   }
 
- private:
-  Entry *entries_;
-  size_t entry_num_;
-  LRUCache cache_;
-
-  bool isDeadBlock(LRUCache::BlockType block) {
+  bool isDeadBlock(LRUCache::BlockType block) const {
     Entry &entry = entries_[block.first];
-    return block.second == entry.load_count.load();
-  }
-};
-
-class VecBufferPool;
-
-struct VecBufferPoolHandle {
-  VecBufferPoolHandle(VecBufferPool &pool);
-  VecBufferPoolHandle(VecBufferPoolHandle &&other)
-      : pool(other.pool),
-        local_cache(std::move(other.local_cache)),
-        hit_num_(other.hit_num_) {
-    other.local_cache.clear();
-    other.hit_num_ = 0;
+    return block.second != entry.load_count.load();
   }
-  ~VecBufferPoolHandle();
 
-  char *get_block(size_t offset, size_t size, size_t block_id);
-
-  int get_meta(size_t offset, size_t length, char *buffer);
-
-  void release_all();
-
-  VecBufferPool &pool;
-#ifdef USE_LOCAL_CACHE
-  // std::unordered_map<block_id_t, char*> local_cache;
-  phmap::flat_hash_map<block_id_t, char *> local_cache;
-#else
-  std::vector<block_id_t> local_cache;
-#endif
-  int hit_num_;
+ private:
+  size_t entry_num_{0};
+  Entry *entries_{nullptr};
+  LRUCache cache_;
 };
 
+class VecBufferPoolHandle;
+
 class VecBufferPool {
  public:
-  VecBufferPool(const std::string &filename, size_t pool_capacity, size_t block_size)
-      : pool_capacity_(pool_capacity) {
-    fd_ = open(filename.c_str(), O_RDONLY);
-    if (fd_ < 0) {
-      throw std::runtime_error("Failed to open file: " + filename);
-    }
-    struct stat st;
-    if (fstat(fd_, &st) < 0) {
-      throw std::runtime_error("Failed to stat file: " + filename);
-    }
-    file_size_ = st.st_size;
-
-    size_t buffer_num = pool_capacity_ / block_size;
-    lp_map_.init(buffer_num);
-    for (size_t i = 0; i < buffer_num; i++) {
-      char *buffer = (char *)aligned_alloc(64, block_size);
-      free_buffers_.push(buffer);
-    }
-    std::cout << "buffer_num: " << buffer_num << std::endl;
-    std::cout << "entry_num: " << lp_map_.entry_num() << std::endl;
-  }
+  typedef std::shared_ptr<VecBufferPool> Pointer;
+  
+  VecBufferPool(const std::string &filename, size_t pool_capacity, size_t block_size);
   ~VecBufferPool() {
     close(fd_);
   }
 
-  VecBufferPoolHandle get_handle() {
-    return VecBufferPoolHandle(*this);
-  }
+  VecBufferPoolHandle get_handle();
 
-  char *acquire_buffer(block_id_t block_id, size_t offset, size_t size, int retry = 0) {
-    char *buffer = lp_map_.acquire_block(block_id);
-    if (buffer) {
-      return buffer;
-    }
-    {
-      std::lock_guard<std::mutex> lock(mutex_);
-      if (free_buffers_.empty()) {
-        for (int i = 0; i < retry; i++) {
-          lp_map_.recycle(free_buffers_);
-          if (!free_buffers_.empty()) {
-            break;
-          }
-        }
-      }
-      if (free_buffers_.empty()) {
-        return nullptr;
-      }
-      buffer = free_buffers_.front();
-      free_buffers_.pop();
-    }
-
-    ssize_t read_bytes = pread(fd_, buffer, size, offset);
-    if (read_bytes != static_cast<ssize_t>(size)) {
-      std::cerr << "Failed to read file at offset " << offset << std::endl;
-      exit(-1);
-    }
-
-    {
-      std::lock_guard<std::mutex> lock(mutex_);
-      char *placed_buffer = lp_map_.set_block_acquired(block_id, buffer);
-      if (placed_buffer != buffer) {
-        // another thread has set the block
-        free_buffers_.push(buffer);
-      }
-      return placed_buffer;
-    }
-  }
+  char *acquire_buffer(block_id_t block_id, size_t offset, size_t size, int retry = 0);
 
-  int get_meta(size_t offset, size_t length, char *buffer) {
-    ssize_t read_bytes = pread(fd_, buffer, length, offset);
-    if (read_bytes != static_cast<ssize_t>(length)) {
-      std::cerr << "Failed to read file at offset " << offset << std::endl;
-      exit(-1);
-    }
-    return 0;
-  }
+  int get_meta(size_t offset, size_t length, char *buffer);
 
   size_t file_size() const {
     return file_size_;
@@ -282,86 +121,32 @@ class VecBufferPool {
 
  private:
   std::mutex mutex_;
-  std::queue<char *> free_buffers_;
+  moodycamel::ConcurrentQueue<char *> free_buffers_;
 };
 
-
-struct Counter {
-  ~Counter() = default;
-
-  static Counter &get_instance() {
-    static Counter instance;
-    return instance;
-  }
-
-  void record(const std::string &name, int64_t value) {
-    auto it = static_counters.find(name);
-    if (it == static_counters.end()) {
-      auto counter = std::make_unique<std::atomic<int64_t>>(0);
-      it = static_counters.emplace(name, std::move(counter)).first;
-    }
-    it->second->fetch_add(value);
+struct VecBufferPoolHandle {
+  VecBufferPoolHandle(VecBufferPool &pool) : pool(pool), hit_num_(0) {};
+  VecBufferPoolHandle(VecBufferPoolHandle &&other)
+      : pool(other.pool),
+        hit_num_(other.hit_num_) {
+    other.hit_num_ = 0;
   }
+    
+  ~VecBufferPoolHandle() = default;
 
-  void display() {
-    for (const auto &pair : static_counters) {
-      std::cout << pair.first << ": " << pair.second->load() << std::endl;
-    }
-  }
+  typedef std::shared_ptr<VecBufferPoolHandle> Pointer;
 
-  void clear() {
-    static_counters.clear();
-  }
+  char *get_block(size_t offset, size_t size, size_t block_id);
 
- private:
-  Counter() {}
-  std::map<std::string, std::unique_ptr<std::atomic<int64_t>>> static_counters;
-};
+  int get_meta(size_t offset, size_t length, char *buffer);
 
-VecBufferPoolHandle::VecBufferPoolHandle(VecBufferPool &pool)
-    : pool(pool), hit_num_(0) {}
-VecBufferPoolHandle::~VecBufferPoolHandle() {
-  Counter::get_instance().record("buffer_pool_handle_hit_num", hit_num_);
-  release_all();
-}
-
-char *VecBufferPoolHandle::get_block(size_t offset, size_t size, size_t block_id) {
-#ifdef USE_LOCAL_CACHE
-  auto it = local_cache.find(block_id);
-  if (it != local_cache.end()) {
-    hit_num_++;
-    return it->second;
-  }
-#endif
-
-  char *buffer = pool.acquire_buffer(block_id, offset, size, 3);
-  if (buffer) {
-#ifdef USE_LOCAL_CACHE
-    local_cache[block_id] = buffer;
-#else
-    local_cache.push_back(block_id);
-#endif
-    return buffer;
-  }
+  void release_one(block_id_t block_id);
 
-  return nullptr;
-}
+  void acquire_one(block_id_t block_id);
 
-int VecBufferPoolHandle::get_meta(size_t offset, size_t length, char *out) {
-  return pool.get_meta(offset, length, out);
-}
+  VecBufferPool &pool;
+  int hit_num_;
+};
 
-void VecBufferPoolHandle::release_all() {
-#ifdef USE_LOCAL_CACHE
-  Counter::get_instance().record("buffer_pool_handle_release_call",
-                                 local_cache.size());
-  for (const auto &pair : local_cache) {
-    pool.lp_map_.release_block(pair.first);
-  }
-#else
-  for (block_id_t block_id : local_cache) {
-    pool.lp_map_.release_block(block_id);
-  }
-#endif
-  local_cache.clear();
-}
\ No newline at end of file
+}  // namespace ailego
+}  // namespace zvec
\ No newline at end of file
diff --git a/src/include/zvec/core/framework/index_storage.h b/src/include/zvec/core/framework/index_storage.h
index 8673d63e..346b8da4 100644
--- a/src/include/zvec/core/framework/index_storage.h
+++ b/src/include/zvec/core/framework/index_storage.h
@@ -14,7 +14,7 @@
 
 #pragma once
 
-#include <zvec/ailego/buffer/buffer_manager.h>
+#include <zvec/ailego/buffer/buffer_pool.h>
 #include <zvec/ailego/container/params.h>
 #include <zvec/core/framework/index_error.h>
 #include <zvec/core/framework/index_module.h>
@@ -37,10 +37,11 @@ class IndexStorage : public IndexModule {
     };
 
     MemoryBlock() {}
-    MemoryBlock(ailego::BufferHandle::Pointer &&buffer_handle)
-        : type_(MemoryBlockType::MBT_BUFFERPOOL),
-          buffer_handle_(std::move(buffer_handle)) {
-      data_ = buffer_handle_->pin_vector_data();
+    MemoryBlock(ailego::VecBufferPoolHandle* buffer_pool_handle, int block_id, void *data)
+        : type_(MemoryBlockType::MBT_BUFFERPOOL) {
+      buffer_pool_handle_ = buffer_pool_handle;
+      buffer_block_id_ = block_id;
+      data_ = data;
     }
     MemoryBlock(void *data) : type_(MemoryBlockType::MBT_MMAP), data_(data) {}
 
@@ -50,7 +51,8 @@ class IndexStorage : public IndexModule {
           this->reset(rhs.data_);
           break;
         case MemoryBlockType::MBT_BUFFERPOOL:
-          this->reset(rhs.buffer_handle_);
+          this->reset(rhs.buffer_pool_handle_, rhs.buffer_block_id_, rhs.data_);
+          buffer_pool_handle_->acquire_one(buffer_block_id_);
           break;
         default:
           break;
@@ -63,7 +65,7 @@ class IndexStorage : public IndexModule {
           this->reset(std::move(rhs.data_));
           break;
         case MemoryBlockType::MBT_BUFFERPOOL:
-          this->reset(std::move(rhs.buffer_handle_));
+          this->reset(std::move(rhs.buffer_pool_handle_), std::move(rhs.buffer_block_id_), std::move(rhs.data_));
           break;
         default:
           break;
@@ -77,7 +79,8 @@ class IndexStorage : public IndexModule {
             this->reset(rhs.data_);
             break;
           case MemoryBlockType::MBT_BUFFERPOOL:
-            this->reset(rhs.buffer_handle_);
+            this->reset(rhs.buffer_pool_handle_, rhs.buffer_block_id_, rhs.data_);
+            buffer_pool_handle_->acquire_one(buffer_block_id_);
             break;
           default:
             break;
@@ -93,7 +96,7 @@ class IndexStorage : public IndexModule {
             this->reset(std::move(rhs.data_));
             break;
           case MemoryBlockType::MBT_BUFFERPOOL:
-            this->reset(std::move(rhs.buffer_handle_));
+            this->reset(std::move(rhs.buffer_pool_handle_), std::move(rhs.buffer_block_id_), std::move(rhs.data_));
             break;
           default:
             break;
@@ -107,9 +110,8 @@ class IndexStorage : public IndexModule {
         case MemoryBlockType::MBT_MMAP:
           break;
         case MemoryBlockType::MBT_BUFFERPOOL:
-          if (buffer_handle_) {
-            buffer_handle_->unpin_vector_data();
-            // buffer_handle_.reset();
+          if (buffer_pool_handle_) {
+            buffer_pool_handle_->release_one(buffer_block_id_);
           }
           break;
         default:
@@ -122,34 +124,20 @@ class IndexStorage : public IndexModule {
       return data_;
     }
 
-    void reset(ailego::BufferHandle::Pointer &buffer_handle) {
+    void reset(ailego::VecBufferPoolHandle* buffer_pool_handle, int block_id, void *data) {
       if (type_ == MemoryBlockType::MBT_BUFFERPOOL) {
-        buffer_handle_->unpin_vector_data();
-        buffer_handle_.reset();
+        buffer_pool_handle_->release_one(buffer_block_id_);  // release the block currently held via the stored handle, not the incoming one
       }
       type_ = MemoryBlockType::MBT_BUFFERPOOL;
-      if (buffer_handle) {
-        buffer_handle_.reset(buffer_handle.release());
-      }
-      data_ = buffer_handle_->pin_vector_data();
-    }
-
-    void reset(ailego::BufferHandle::Pointer &&buffer_handle) {
-      if (type_ == MemoryBlockType::MBT_BUFFERPOOL) {
-        buffer_handle_->unpin_vector_data();
-        buffer_handle_.reset();
-      }
-      type_ = MemoryBlockType::MBT_BUFFERPOOL;
-      if (buffer_handle) {
-        buffer_handle_ = std::move(buffer_handle);
-      }
-      data_ = buffer_handle_->pin_vector_data();
+      buffer_pool_handle_ = buffer_pool_handle;
+      buffer_block_id_ = block_id;
+      data_ = data;
     }
 
     void reset(void *data) {
       if (type_ == MemoryBlockType::MBT_BUFFERPOOL) {
-        buffer_handle_->unpin_vector_data();
-        buffer_handle_.reset();
+        buffer_pool_handle_->release_one(buffer_block_id_);
+        buffer_pool_handle_ = nullptr;
       }
       type_ = MemoryBlockType::MBT_MMAP;
       data_ = data;
@@ -157,7 +145,8 @@ class IndexStorage : public IndexModule {
 
     MemoryBlockType type_{MBT_UNKNOWN};
     void *data_{nullptr};
-    mutable ailego::BufferHandle::Pointer buffer_handle_{nullptr};
+    mutable ailego::VecBufferPoolHandle* buffer_pool_handle_{nullptr};
+    int buffer_block_id_{0};
   };
 
   struct SegmentData {
diff --git a/tests/core/algorithm/flat/flat_streamer_buffer_test.cc b/tests/core/algorithm/flat/flat_streamer_buffer_test.cc
index 62b25e23..fbc404b4 100644
--- a/tests/core/algorithm/flat/flat_streamer_buffer_test.cc
+++ b/tests/core/algorithm/flat/flat_streamer_buffer_test.cc
@@ -50,7 +50,6 @@ void FlatStreamerTest::TearDown(void) {
 }
 
 TEST_F(FlatStreamerTest, TestLinearSearch) {
-  BufferManager::Instance().init(300 * 1024 / 2 * 1024, 1);
   IndexStreamer::Pointer write_streamer =
       IndexFactory::CreateStreamer("FlatStreamer");
   ASSERT_TRUE(write_streamer != nullptr);
@@ -165,31 +164,33 @@ TEST_F(FlatStreamerTest, TestLinearSearch) {
     ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key());
     ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key());
   }
+  cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl;
 
   read_streamer->close();
   read_streamer.reset();
-  cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl;
 }
 
-TEST_F(FlatStreamerTest, TestLinearSearchMMap) {
-  BufferManager::Instance().init(3 * 1024 / 2 * 1024, 1);
+TEST_F(FlatStreamerTest, TestLinearSearchWithLRU) {
+  constexpr size_t static dim = 1600;
   IndexStreamer::Pointer write_streamer =
       IndexFactory::CreateStreamer("FlatStreamer");
   ASSERT_TRUE(write_streamer != nullptr);
 
   Params params;
-  ASSERT_EQ(0, write_streamer->init(*index_meta_ptr_, params));
+  IndexMeta meta = IndexMeta(IndexMeta::DataType::DT_FP32, dim);
+  meta.set_metric("SquaredEuclidean", 0, Params());
+  ASSERT_EQ(0, write_streamer->init(meta, params));
   auto storage = IndexFactory::CreateStorage("MMapFileStorage");
   ASSERT_NE(nullptr, storage);
   Params stg_params;
   ASSERT_EQ(0, storage->init(stg_params));
-  ASSERT_EQ(0, storage->open(dir_ + "/Test/LinearSearchMMap", true));
+  ASSERT_EQ(0, storage->open(dir_ + "/Test/LinearSearchWithLRU", true));
   ASSERT_EQ(0, write_streamer->open(storage));
 
   auto ctx = write_streamer->create_context();
   ASSERT_TRUE(!!ctx);
 
-  size_t cnt = 10000UL;
+  size_t cnt = 1000000UL;
   IndexQueryMeta qmeta(IndexMeta::DT_FP32, dim);
   for (size_t i = 0; i < cnt; i++) {
     NumericalVector<float> vec(dim);
@@ -202,18 +203,19 @@ TEST_F(FlatStreamerTest, TestLinearSearchMMap) {
   write_streamer->close();
   write_streamer.reset();
 
-  ElapsedTime elapsed_time;
+
   IndexStreamer::Pointer read_streamer =
       IndexFactory::CreateStreamer("FlatStreamer");
-  ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params));
-  auto read_storage = IndexFactory::CreateStorage("MMapFileStorage");
+  ASSERT_EQ(0, read_streamer->init(meta, params));
+  auto read_storage = IndexFactory::CreateStorage("BufferStorage");
   ASSERT_NE(nullptr, read_storage);
   ASSERT_EQ(0, read_storage->init(stg_params));
-  ASSERT_EQ(0, read_storage->open(dir_ + "/Test/LinearSearchMMap", false));
+  ASSERT_EQ(0, read_storage->open(dir_ + "/Test/LinearSearchWithLRU", false));
   ASSERT_EQ(0, read_streamer->open(read_storage));
   size_t topk = 3;
   auto provider = read_streamer->create_provider();
-  for (size_t i = 0; i < cnt; i += 1) {
+  ElapsedTime elapsed_time;
+  for (size_t i = 0; i < 10; i += 1) {
     NumericalVector<float> vec(dim);
     for (size_t j = 0; j < dim; ++j) {
       vec[j] = i;
@@ -241,122 +243,132 @@ TEST_F(FlatStreamerTest, TestLinearSearchMMap) {
     ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key());
     ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key());
   }
-
-  ctx->set_topk(100U);
-  NumericalVector<float> vec(dim);
-  for (size_t j = 0; j < dim; ++j) {
-    vec[j] = 10.1f;
-  }
-  ASSERT_EQ(0, read_streamer->search_bf_impl(vec.data(), qmeta, ctx));
-  auto &result = ctx->result();
-  ASSERT_EQ(100U, result.size());
-  ASSERT_EQ(10, result[0].key());
-  ASSERT_EQ(11, result[1].key());
-  ASSERT_EQ(5, result[10].key());
-  ASSERT_EQ(0, result[20].key());
-  ASSERT_EQ(30, result[30].key());
-  ASSERT_EQ(35, result[35].key());
-  ASSERT_EQ(99, result[99].key());
+  cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl;
 
   read_streamer->close();
   read_streamer.reset();
-  cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl;
 }
 
-TEST_F(FlatStreamerTest, TestBufferStorage) {
-  BufferManager::Instance().init(10 * 1024 * 1024, 1);
-  IndexStreamer::Pointer streamer =
+TEST_F(FlatStreamerTest, TestLinearSearchMMap) {
+  IndexStreamer::Pointer write_streamer =
       IndexFactory::CreateStreamer("FlatStreamer");
-  ASSERT_TRUE(streamer != nullptr);
-  const int dim = 16;
-  IndexMeta meta = IndexMeta(IndexMeta::DT_FP32, dim);
-  meta.set_metric("SquaredEuclidean", 0, Params());
+  ASSERT_TRUE(write_streamer != nullptr);
 
   Params params;
-  EXPECT_EQ(0, streamer->init(meta, params));
+  ASSERT_EQ(0, write_streamer->init(*index_meta_ptr_, params));
   auto storage = IndexFactory::CreateStorage("MMapFileStorage");
   ASSERT_NE(nullptr, storage);
   Params stg_params;
-  EXPECT_EQ(0, storage->init(stg_params));
-  EXPECT_EQ(0, storage->open(dir_ + "/Test/LinearSearch", true));
-  EXPECT_EQ(0, streamer->open(storage));
+  ASSERT_EQ(0, storage->init(stg_params));
+  ASSERT_EQ(0, storage->open(dir_ + "/Test/LinearSearchMMap", true));
+  ASSERT_EQ(0, write_streamer->open(storage));
 
-  auto ctx = streamer->create_context();
+  auto ctx = write_streamer->create_context();
   ASSERT_TRUE(!!ctx);
 
-  size_t cnt = 1000UL;
+  size_t cnt = 10000UL;
   IndexQueryMeta qmeta(IndexMeta::DT_FP32, dim);
   for (size_t i = 0; i < cnt; i++) {
     NumericalVector<float> vec(dim);
     for (size_t j = 0; j < dim; ++j) {
       vec[j] = i;
     }
-    streamer->add_impl(i, vec.data(), qmeta, ctx);
+    write_streamer->add_impl(i, vec.data(), qmeta, ctx);
   }
-  streamer->flush(0UL);
-  streamer.reset();
+  write_streamer->flush(0UL);
+  write_streamer->close();
+  write_streamer.reset();
 
   IndexStreamer::Pointer read_streamer =
       IndexFactory::CreateStreamer("FlatStreamer");
-  ASSERT_TRUE(read_streamer != nullptr);
-  EXPECT_EQ(0, read_streamer->init(meta, params));
-  auto read_storage = IndexFactory::CreateStorage("BufferStorage");
+  ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params));
+  auto read_storage = IndexFactory::CreateStorage("MMapFileStorage");
   ASSERT_NE(nullptr, read_storage);
-  EXPECT_EQ(0, read_storage->init(stg_params));
-  EXPECT_EQ(0, read_storage->open(dir_ + "/Test/LinearSearch", false));
-  EXPECT_EQ(0, read_streamer->open(read_storage));
-  auto read_ctx = read_streamer->create_context();
-  auto provider = read_streamer->create_provider();
-
+  ASSERT_EQ(0, read_storage->init(stg_params));
+  ASSERT_EQ(0, read_storage->open(dir_ + "/Test/LinearSearchMMap", false));
+  ASSERT_EQ(0, read_streamer->open(read_storage));
   size_t topk = 3;
+  auto provider = read_streamer->create_provider();
   for (size_t i = 0; i < cnt; i += 1) {
     NumericalVector<float> vec(dim);
     for (size_t j = 0; j < dim; ++j) {
       vec[j] = i;
     }
-    read_ctx->set_topk(topk);
-    EXPECT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, read_ctx));
-    auto &result1 = read_ctx->result();
-    EXPECT_EQ(topk, result1.size());
+    ctx->set_topk(topk);
+    ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    auto &result1 = ctx->result();
+    ASSERT_EQ(topk, result1.size());
+    IndexStorage::MemoryBlock block;
+    ASSERT_EQ(0, provider->get_vector(result1[0].key(), block));
+    const float *data = (float *)block.data();
     for (size_t j = 0; j < dim; ++j) {
-      const float *data = (float *)provider->get_vector(result1[0].key());
-      EXPECT_EQ(data[j], i);
+      ASSERT_EQ(data[j], i);
     }
-    EXPECT_EQ(i, result1[0].key());
+    ASSERT_EQ(i, result1[0].key());
 
     for (size_t j = 0; j < dim; ++j) {
       vec[j] = i + 0.1f;
     }
-    read_ctx->set_topk(topk);
-    EXPECT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, read_ctx));
-    auto &result2 = read_ctx->result();
-    EXPECT_EQ(topk, result2.size());
-    EXPECT_EQ(i, result2[0].key());
-    EXPECT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key());
-    EXPECT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key());
+    ctx->set_topk(topk);
+    ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    auto &result2 = ctx->result();
+    ASSERT_EQ(topk, result2.size());
+    ASSERT_EQ(i, result2[0].key());
+    ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key());
+    ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key());
   }
 
-  read_ctx->set_topk(100U);
+  ctx->set_topk(100U);
   NumericalVector<float> vec(dim);
   for (size_t j = 0; j < dim; ++j) {
     vec[j] = 10.1f;
   }
-  EXPECT_EQ(0, read_streamer->search_bf_impl(vec.data(), qmeta, read_ctx));
-  auto &result = read_ctx->result();
-  EXPECT_EQ(100U, result.size());
-  EXPECT_EQ(10, result[0].key());
-  EXPECT_EQ(11, result[1].key());
-  EXPECT_EQ(5, result[10].key());
-  EXPECT_EQ(0, result[20].key());
-  EXPECT_EQ(30, result[30].key());
-  EXPECT_EQ(35, result[35].key());
-  EXPECT_EQ(99, result[99].key());
-
-  read_streamer->flush(0UL);
+  ASSERT_EQ(0, read_streamer->search_bf_impl(vec.data(), qmeta, ctx));
+  auto &result = ctx->result();
+  ASSERT_EQ(100U, result.size());
+  ASSERT_EQ(10, result[0].key());
+  ASSERT_EQ(11, result[1].key());
+  ASSERT_EQ(5, result[10].key());
+  ASSERT_EQ(0, result[20].key());
+  ASSERT_EQ(30, result[30].key());
+  ASSERT_EQ(35, result[35].key());
+  ASSERT_EQ(99, result[99].key());
+
+  ElapsedTime elapsed_time;
+  for (size_t i = 0; i < cnt; i += 1) {
+    NumericalVector<float> vec(dim);
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i;
+    }
+    ctx->set_topk(topk);
+    ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    auto &result1 = ctx->result();
+    ASSERT_EQ(topk, result1.size());
+    IndexStorage::MemoryBlock block;
+    ASSERT_EQ(0, provider->get_vector(result1[0].key(), block));
+    const float *data = (float *)block.data();
+    for (size_t j = 0; j < dim; ++j) {
+      ASSERT_EQ(data[j], i);
+    }
+    ASSERT_EQ(i, result1[0].key());
+
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i + 0.1f;
+    }
+    ctx->set_topk(topk);
+    ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    auto &result2 = ctx->result();
+    ASSERT_EQ(topk, result2.size());
+    ASSERT_EQ(i, result2[0].key());
+    ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key());
+    ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key());
+  }
+
+  read_streamer->close();
   read_streamer.reset();
+  cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl;
 }
 
-
 #if defined(__GNUC__) || defined(__GNUG__)
 #pragma GCC diagnostic pop
 #endif
\ No newline at end of file
diff --git a/tests/core/algorithm/flat/flat_streamer_buffer_time_test.cc b/tests/core/algorithm/flat/flat_streamer_buffer_time_test.cc
index c919e9fe..435ecccc 100644
--- a/tests/core/algorithm/flat/flat_streamer_buffer_time_test.cc
+++ b/tests/core/algorithm/flat/flat_streamer_buffer_time_test.cc
@@ -83,7 +83,7 @@ TEST_F(FlatStreamerTest, TestLinearSearchMMap) {
   IndexStreamer::Pointer read_streamer =
       IndexFactory::CreateStreamer("FlatStreamer");
   ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params));
-  auto read_storage = IndexFactory::CreateStorage("BufferStorage");
+  auto read_storage = IndexFactory::CreateStorage("MMapFileStorage");
   ASSERT_NE(nullptr, read_storage);
   ASSERT_EQ(0, read_storage->init(stg_params));
   ASSERT_EQ(0, read_storage->open(dir_ + "/Test/LinearSearchMMap", false));
@@ -113,26 +113,121 @@ TEST_F(FlatStreamerTest, TestLinearSearchMMap) {
     // ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key());
   }
   cout << "Elapsed time: " << elapsed_time.micro_seconds() << " us" << endl;
+  for (size_t i = 0; i < cnt; i += 1) {
+    NumericalVector<float> vec(dim);
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i;
+    }
+    ctx->set_topk(topk);
+    ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    // auto &result1 = ctx->result();
+    // ASSERT_EQ(topk, result1.size());
+    // ASSERT_EQ(i, result1[0].key());
 
-  // ctx->set_topk(100U);
-  // NumericalVector<float> vec(dim);
-  // for (size_t j = 0; j < dim; ++j) {
-  //   vec[j] = 10.1f;
-  // }
-  // ASSERT_EQ(0, read_streamer->search_bf_impl(vec.data(), qmeta, ctx));
-  // auto &result = ctx->result();
-  // ASSERT_EQ(100U, result.size());
-  // ASSERT_EQ(10, result[0].key());
-  // ASSERT_EQ(11, result[1].key());
-  // ASSERT_EQ(5, result[10].key());
-  // ASSERT_EQ(0, result[20].key());
-  // ASSERT_EQ(30, result[30].key());
-  // ASSERT_EQ(35, result[35].key());
-  // ASSERT_EQ(99, result[99].key());
+    // for (size_t j = 0; j < dim; ++j) {
+    //   vec[j] = i + 0.1f;
+    // }
+    // ctx->set_topk(topk);
+    // ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    // auto &result2 = ctx->result();
+    // ASSERT_EQ(topk, result2.size());
+    // ASSERT_EQ(i, result2[0].key());
+    // ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key());
+    // ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key());
+  }
+  cout << "Elapsed time: " << elapsed_time.micro_seconds() << " us" << endl;
+  read_streamer->close();
+  read_streamer.reset();
+}
 
+TEST_F(FlatStreamerTest, TestLinearSearchBuffer) {
+  IndexStreamer::Pointer write_streamer =
+      IndexFactory::CreateStreamer("FlatStreamer");
+  ASSERT_TRUE(write_streamer != nullptr);
+
+  Params params;
+  ASSERT_EQ(0, write_streamer->init(*index_meta_ptr_, params));
+  auto storage = IndexFactory::CreateStorage("MMapFileStorage");
+  ASSERT_NE(nullptr, storage);
+  Params stg_params;
+  ASSERT_EQ(0, storage->init(stg_params));
+  ASSERT_EQ(0, storage->open(dir_ + "/Test/LinearSearchBuffer", true));
+  ASSERT_EQ(0, write_streamer->open(storage));
+
+  auto ctx = write_streamer->create_context();
+  ASSERT_TRUE(!!ctx);
+
+  size_t data_cnt = 300000UL, cnt = 500UL;
+  IndexQueryMeta qmeta(IndexMeta::DT_FP32, dim);
+  for (size_t i = 0; i < data_cnt; i++) {
+    NumericalVector<float> vec(dim);
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i;
+    }
+    write_streamer->add_impl(i, vec.data(), qmeta, ctx);
+  }
+  write_streamer->flush(0UL);
+  write_streamer->close();
+  write_streamer.reset();
+
+  IndexStreamer::Pointer read_streamer =
+      IndexFactory::CreateStreamer("FlatStreamer");
+  ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params));
+  auto read_storage = IndexFactory::CreateStorage("BufferStorage");
+  ASSERT_NE(nullptr, read_storage);
+  ASSERT_EQ(0, read_storage->init(stg_params));
+  ASSERT_EQ(0, read_storage->open(dir_ + "/Test/LinearSearchBuffer", false));
+  ASSERT_EQ(0, read_streamer->open(read_storage));
+  size_t topk = 30;
+  ElapsedTime elapsed_time;
+  for (size_t i = 0; i < cnt; i += 1) {
+    NumericalVector<float> vec(dim);
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i;
+    }
+    ctx->set_topk(topk);
+    ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    // auto &result1 = ctx->result();
+    // ASSERT_EQ(topk, result1.size());
+    // ASSERT_EQ(i, result1[0].key());
+
+    // for (size_t j = 0; j < dim; ++j) {
+    //   vec[j] = i + 0.1f;
+    // }
+    // ctx->set_topk(topk);
+    // ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    // auto &result2 = ctx->result();
+    // ASSERT_EQ(topk, result2.size());
+    // ASSERT_EQ(i, result2[0].key());
+    // ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key());
+    // ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key());
+  }
+  cout << "Elapsed time: " << elapsed_time.micro_seconds() << " us" << endl;
+  for (size_t i = 0; i < cnt; i += 1) {
+    NumericalVector<float> vec(dim);
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i;
+    }
+    ctx->set_topk(topk);
+    ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    // auto &result1 = ctx->result();
+    // ASSERT_EQ(topk, result1.size());
+    // ASSERT_EQ(i, result1[0].key());
+
+    // for (size_t j = 0; j < dim; ++j) {
+    //   vec[j] = i + 0.1f;
+    // }
+    // ctx->set_topk(topk);
+    // ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    // auto &result2 = ctx->result();
+    // ASSERT_EQ(topk, result2.size());
+    // ASSERT_EQ(i, result2[0].key());
+    // ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key());
+    // ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key());
+  }
+  cout << "Elapsed time: " << elapsed_time.micro_seconds() << " us" << endl;
   read_streamer->close();
   read_streamer.reset();
-  // cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl;
 }
 
 #if defined(__GNUC__) || defined(__GNUG__)

From 7df2716d2ac969e665422b2d4a85ae51cc3d47cf Mon Sep 17 00:00:00 2001
From: "yinzefeng.yzf" <yinzefeng.yzf@alibaba-inc.com>
Date: Mon, 9 Feb 2026 15:33:34 +0800
Subject: [PATCH 04/11] upd buffer pool

---
 src/ailego/buffer/buffer_pool.cc              |  75 +--
 src/core/utility/buffer1_storage.cc           | 438 ------------------
 src/core/utility/buffer_storage.cc            | 130 +++---
 ..._test.cpp => hnsw_streamer_buffer_test.cc} |   0
 4 files changed, 91 insertions(+), 552 deletions(-)
 delete mode 100644 src/core/utility/buffer1_storage.cc
 rename tests/core/algorithm/hnsw/{hnsw_streamer_buffer_test.cpp => hnsw_streamer_buffer_test.cc} (100%)

diff --git a/src/ailego/buffer/buffer_pool.cc b/src/ailego/buffer/buffer_pool.cc
index 061ead37..3ed461c1 100644
--- a/src/ailego/buffer/buffer_pool.cc
+++ b/src/ailego/buffer/buffer_pool.cc
@@ -1,23 +1,9 @@
 #include <zvec/ailego/buffer/buffer_pool.h>
+#include <zvec/core/framework/index_logger.h>
 
 namespace zvec {
 namespace ailego {
 
-void Counter::record(const std::string &name, int64_t value) {
-	auto it = static_counters.find(name);
-	if (it == static_counters.end()) {
-			auto counter = std::make_unique<std::atomic<int64_t>>(0);
-			it = static_counters.emplace(name, std::move(counter)).first;
-	}
-	it->second->fetch_add(value);
-}
-
-void Counter::display() {
-	for (const auto &pair : static_counters) {
-		std::cout << pair.first << ": " << pair.second->load() << std::endl;
-	}
-}
-
 int LRUCache::init(size_t block_size) {
 	block_size_ = block_size;
 	for(size_t i = 0; i < CATCH_QUEUE_NUM; i++) {
@@ -27,11 +13,9 @@ int LRUCache::init(size_t block_size) {
 }
 
 bool LRUCache::evict_single_block(BlockType &item) {
-	// std::cerr << "dequeue: " << item.first << std::endl;
 	bool found = false;
 	for(size_t i = 0; i < CATCH_QUEUE_NUM; i++) {
 		found = queues_[i].try_dequeue(item);
-		// std::cerr << "dequeue: " << found << std::endl;
 		if(found) {
 			break;
 		}
@@ -41,7 +25,8 @@ bool LRUCache::evict_single_block(BlockType &item) {
 
 bool LRUCache::add_single_block(const LPMap *lp_map, const BlockType &block, int block_type) {
 	bool ok = queues_[block_type].try_enqueue(block);
-	if(++evict_queue_insertions_ % block_size_ == 0) {
+	const auto insertions = evict_queue_insertions_.fetch_add(1, std::memory_order_relaxed) + 1;  // test the value produced by this RMW, not a second racy load
+	if(insertions % block_size_ == 0) {
 		this->clear_dead_node(lp_map);
 	}
 	return ok;
@@ -49,10 +34,14 @@ bool LRUCache::add_single_block(const LPMap *lp_map, const BlockType &block, int
 
 void LRUCache::clear_dead_node(const LPMap *lp_map) {
 	for(int i = 0; i < CATCH_QUEUE_NUM; i++) {
+		int clear_size = block_size_ * 2;
+		if (queues_[i].size_approx() < clear_size * 4) {
+			continue;
+		}
 		int clear_count = 0;
 		ConcurrentQueue tmp(block_size_);
 		BlockType item;
-		while(queues_[i].try_dequeue(item) && (clear_count++ < block_size_)) {
+		for(; clear_count < clear_size && queues_[i].try_dequeue(item); ++clear_count) {  // check the budget before dequeuing so no item is dropped
 			if(!lp_map->isDeadBlock(item)) {
 				tmp.try_enqueue(item);
 			}
@@ -82,14 +71,11 @@ void LPMap::init(size_t entry_num) {
 char* LPMap::acquire_block(block_id_t block_id) {
 	assert(block_id < entry_num_);
 	Entry &entry = entries_[block_id];
-	if (entry.ref_count.load() == 0) {
-		++entry.load_count;
-		// std::cout << entry.load_count.load() << std::endl;
+	if (entry.ref_count.load(std::memory_order_relaxed) == 0) {
+		entry.load_count.fetch_add(1, std::memory_order_relaxed);
 	}
-	++entry.ref_count;
-	// std::cout << entry.ref_count.load() << std::endl;
-	if (entry.ref_count.load() < 0) {
-		// std::cout << "acquire block failed: " << block_id << ", " << entry.ref_count.load() << std::endl;
+	entry.ref_count.fetch_add(1, std::memory_order_relaxed);
+	if (entry.ref_count.load(std::memory_order_relaxed) < 0) {
 		return nullptr;
 	}
 	return entry.buffer;
@@ -98,10 +84,9 @@ char* LPMap::acquire_block(block_id_t block_id) {
 void LPMap::release_block(block_id_t block_id) {
 	assert(block_id < entry_num_);
 	Entry &entry = entries_[block_id];
-	int rc = entry.ref_count.fetch_sub(1);
-	// std::cout << "release block: " << block_id << ", " << entry.ref_count.load() << std::endl;
-	// assert(rc > 0);
-	if(entry.ref_count.load() == 0) {
+
+	if (entry.ref_count.fetch_sub(1, std::memory_order_release) == 1) {
+		std::atomic_thread_fence(std::memory_order_acquire);
 		LRUCache::BlockType block;
 		block.first = block_id;
 		block.second = entry.load_count.load();
@@ -110,7 +95,6 @@ void LPMap::release_block(block_id_t block_id) {
 }
 
 char* LPMap::evict_block(block_id_t block_id) {
-	// std::cout << "evict block: " << block_id << std::endl;
 	assert(block_id < entry_num_);
 	Entry &entry = entries_[block_id];
 	int expected = 0;
@@ -127,15 +111,13 @@ char* LPMap::evict_block(block_id_t block_id) {
 char* LPMap::set_block_acquired(block_id_t block_id, char *buffer) {
 	assert(block_id < entry_num_);
 	Entry &entry = entries_[block_id];
-	if (entry.ref_count.load() >= 0) {
-		entry.ref_count.fetch_add(1);
-		// std::cout << "Set block2 " << block_id << std::endl;
+	if (entry.ref_count.load(std::memory_order_relaxed) >= 0) {
+		entry.ref_count.fetch_add(1, std::memory_order_relaxed);
 		return entry.buffer;
 	}
-	// if (buffer == nullptr) std::cout << "Set block " << block_id << std::endl;
 	entry.buffer = buffer;
-	entry.ref_count.store(1);
-	entry.load_count.fetch_add(1);
+	entry.ref_count.store(1, std::memory_order_relaxed);
+	entry.load_count.fetch_add(1, std::memory_order_relaxed);
 	return buffer;
 }
 
@@ -147,7 +129,6 @@ void LPMap::recycle(moodycamel::ConcurrentQueue<char *> &free_buffers) {
 			return;
 		}
 	} while(isDeadBlock(block));
-	// std::cout << "evict_block done: " << block.first << ", " << block.second << std::endl;
 	char *buffer = evict_block(block.first);
 	if (buffer) {
 		free_buffers.try_enqueue(buffer);
@@ -173,11 +154,9 @@ VecBufferPool::VecBufferPool(const std::string &filename, size_t pool_capacity,
 		char *buffer = (char *)aligned_alloc(64, block_size);
 		if (buffer != nullptr) {
 			bool ok = free_buffers_.try_enqueue(buffer);
-			// if(!ok) std::cerr << i << std::endl;
 		}
 	}
-	std::cout << "buffer_num: " << buffer_num << std::endl;
-	std::cout << "entry_num: " << lp_map_.entry_num() << std::endl;
+	LOG_DEBUG("Buffer pool num: %zu, entry num: %zu", buffer_num, lp_map_.entry_num());
 }
 
 VecBufferPoolHandle VecBufferPool::get_handle() {
@@ -190,30 +169,26 @@ char* VecBufferPool::acquire_buffer(block_id_t block_id, size_t offset, size_t s
 		return buffer;
 	}
 	{
-		// std::cerr << "block_id: " << block_id << ", offset: " << offset << ", size: " << size << std::endl;
-		// std::lock_guard<std::mutex> lock(mutex_);
 		bool found = free_buffers_.try_dequeue(buffer);
-		// std::cerr << "dequeue: " << found << std::endl;
 		if (!found) {
 			for (int i = 0; i < retry; i++) {
 				lp_map_.recycle(free_buffers_);
 				found = free_buffers_.try_dequeue(buffer);
-				// std::cerr << "dequeue: " << i << std::endl;
 				if (found) {
 					break;
 				}
 			}
 		}
 		if (!found) {
-			std::cerr << "Failed to get free buffer " << std::endl;
+			LOG_ERROR("Buffer pool failed to get free buffer");
 			return nullptr;
 		}
 	}
 
 	ssize_t read_bytes = pread(fd_, buffer, size, offset);
 	if (read_bytes != static_cast<ssize_t>(size)) {
-		std::cerr << "Failed to read file at offset " << offset << std::endl;
-		exit(-1);
+		LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset);
+		return nullptr;
 	}
 	char *placed_buffer = nullptr;
 	{
@@ -230,8 +205,8 @@ char* VecBufferPool::acquire_buffer(block_id_t block_id, size_t offset, size_t s
 int VecBufferPool::get_meta(size_t offset, size_t length, char *buffer) {
 	ssize_t read_bytes = pread(fd_, buffer, length, offset);
 	if (read_bytes != static_cast<ssize_t>(length)) {
-		std::cerr << "Failed to read file at offset " << offset << std::endl;
-		exit(-1);
+		LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset);
+		return -1;
 	}
 	return 0;
 }
diff --git a/src/core/utility/buffer1_storage.cc b/src/core/utility/buffer1_storage.cc
deleted file mode 100644
index 1c582198..00000000
--- a/src/core/utility/buffer1_storage.cc
+++ /dev/null
@@ -1,438 +0,0 @@
-// Copyright 2025-present the zvec project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <mutex>
-// #include <zvec/ailego/buffer/buffer_manager.h>
-#include <zvec/ailego/buffer/buffer_pool.h>
-#include <zvec/core/framework/index_error.h>
-#include <zvec/core/framework/index_factory.h>
-#include <zvec/core/framework/index_mapping.h>
-#include <zvec/core/framework/index_version.h>
-#include "utility_params.h"
-
-#include <zvec/ailego/utility/time_helper.h>
-
-namespace zvec {
-namespace core {
-
-/*! MMap File Storage
- */
-class Buffer1Storage : public IndexStorage {
- public:
-  /*! Index Storage Segment
-   */
-  class Segment : public IndexStorage::Segment,
-                  public std::enable_shared_from_this<Segment> {
-   public:
-    //! Index Storage Pointer
-    typedef std::shared_ptr<Segment> Pointer;
-
-    //! Constructor
-    Segment(Buffer1Storage *owner, IndexMapping::Segment *segment, size_t segment_id)
-        : segment_(segment),
-          owner_(owner),
-          segment_id_(segment_id),
-          capacity_(static_cast<size_t>(segment->meta()->data_size +
-                                        segment->meta()->padding_size)) {}
-
-    //! Destructor
-    virtual ~Segment(void) {}
-
-    //! Retrieve size of data
-    size_t data_size(void) const override {
-      return static_cast<size_t>(segment_->meta()->data_size);
-    }
-
-    //! Retrieve crc of data
-    uint32_t data_crc(void) const override {
-      return segment_->meta()->data_crc;
-    }
-
-    //! Retrieve size of padding
-    size_t padding_size(void) const override {
-      return static_cast<size_t>(segment_->meta()->padding_size);
-    }
-
-    //! Retrieve capacity of segment
-    size_t capacity(void) const override {
-      return capacity_;
-    }
-
-    //! Fetch data from segment (with own buffer)
-    size_t fetch(size_t offset, void *buf, size_t len) const override {
-      if (ailego_unlikely(offset + len > segment_->meta()->data_size)) {
-        auto meta = segment_->meta();
-        if (offset > meta->data_size) {
-          offset = meta->data_size;
-        }
-        len = meta->data_size - offset;
-      }
-      memmove(buf, (const uint8_t *)(owner_->get_buffer(offset, len, segment_id_)) + offset,
-              len);
-      return len;
-    }
-
-    //! Read data from segment
-    size_t read(size_t offset, const void **data, size_t len) override {
-      if (ailego_unlikely(offset + len > segment_->meta()->data_size)) {
-        auto meta = segment_->meta();
-        if (offset > meta->data_size) {
-          offset = meta->data_size;
-        }
-        len = meta->data_size - offset;
-      }
-      size_t segment_offset = segment_->meta()->data_index + owner_->get_context_offset();
-      *data = owner_->get_buffer(segment_offset, capacity_, segment_id_) + offset;
-      return len;
-    }
-
-    size_t read(size_t offset, MemoryBlock &data, size_t len) override {
-      if (ailego_unlikely(offset + len > segment_->meta()->data_size)) {
-        auto meta = segment_->meta();
-        if (offset > meta->data_size) {
-          offset = meta->data_size;
-        }
-        len = meta->data_size - offset;
-      }
-      size_t segment_offset = segment_->meta()->data_index + owner_->get_context_offset();
-      data.reset(owner_->buffer_pool_handle_.get(), segment_id_, owner_->get_buffer(segment_offset, capacity_, segment_id_) + offset);
-      // data.reset(owner_->get_buffer(segment_offset, capacity_, segment_id_) + offset);
-      if (data.data()) {
-        return len;
-      } else {
-        LOG_ERROR("read error.");
-        return -1;
-      }
-    }
-
-    //! Write data into the storage with offset
-    size_t write(size_t /*offset*/, const void * /*data*/,
-                 size_t len) override {
-      return len;
-    }
-
-    //! Resize size of data
-    size_t resize(size_t /*size*/) override {
-      return 0;
-    }
-
-    //! Update crc of data
-    void update_data_crc(uint32_t /*crc*/) override {}
-
-    //! Clone the segment
-    IndexStorage::Segment::Pointer clone(void) override {
-      return shared_from_this();
-    }
-
-   private:
-    IndexMapping::Segment *segment_{};
-    Buffer1Storage *owner_{nullptr};
-    size_t segment_id_{};
-    size_t capacity_{};
-  };
-
-  //! Destructor
-  virtual ~Buffer1Storage(void) {
-    this->cleanup();
-  }
-
-  //! Initialize storage
-  int init(const ailego::Params & /*params*/) override {
-    return 0;
-  }
-
-  //! Cleanup storage
-  int cleanup(void) override {
-    this->close_index();
-    return 0;
-  }
-
-  //! Open storage
-  int open(const std::string &path, bool /*create*/) override {
-    LOG_INFO("open buffer storage 1");
-    file_name_ = path;
-    buffer_pool_ = std::make_shared<ailego::VecBufferPool>(path, 10u * 1024 * 1024 * 1024, 2490368 * 2);
-    buffer_pool_handle_ =
-        std::make_shared<ailego::VecBufferPoolHandle>(buffer_pool_->get_handle());
-    int ret = ParseToMapping();
-    LOG_ERROR("segment count: %lu, max_segment_size: %lu", segments_.size(), max_segment_size_);
-    if(ret != 0) {
-      return ret;
-    }
-    return 0;
-  }
-
-  char *get_buffer(size_t offset, size_t length, size_t block_id) {
-    return buffer_pool_handle_->get_block(offset, length, block_id);
-  }
-
-  int get_meta(size_t offset, size_t length, char *out) {
-    return buffer_pool_handle_->get_meta(offset, length, out);
-  }
-
-  int ParseHeader(size_t offset) {
-    char *buffer = new char[sizeof(header_)];
-    get_meta(offset, sizeof(header_), buffer);
-    uint8_t *header_ptr = reinterpret_cast<uint8_t *>(buffer);
-    memcpy(&header_, header_ptr, sizeof(header_));
-    delete[] buffer;
-    if (header_.meta_header_size != sizeof(IndexFormat::MetaHeader)) {
-      LOG_ERROR("Header meta size is invalid.");
-      return IndexError_InvalidLength;
-    }
-    if (ailego::Crc32c::Hash(&header_, sizeof(header_), header_.header_crc) !=
-        header_.header_crc) {
-      LOG_ERROR("Header meta checksum is invalid.");
-      return IndexError_InvalidChecksum;
-    }
-    return 0;
-  }
-
-  int ParseFooter(size_t offset) {
-    char *buffer = new char[sizeof(footer_)];
-    get_meta(offset, sizeof(footer_), buffer);
-    uint8_t *footer_ptr = reinterpret_cast<uint8_t *>(buffer);
-    memcpy(&footer_, footer_ptr, sizeof(footer_));
-    delete[] buffer;
-    if (offset < (size_t)footer_.segments_meta_size) {
-      LOG_ERROR("Footer meta size is invalid.");
-      return IndexError_InvalidLength;
-    }
-    if (ailego::Crc32c::Hash(&footer_, sizeof(footer_), footer_.footer_crc) !=
-        footer_.footer_crc) {
-      LOG_ERROR("Footer meta checksum is invalid.");
-      return IndexError_InvalidChecksum;
-    }
-    return 0;
-  }
-
-  int ParseSegment(size_t offset) {
-    segment_buffer_ = std::make_unique<char[]>(footer_.segments_meta_size);
-    get_meta(offset, footer_.segments_meta_size, segment_buffer_.get());
-    if (ailego::Crc32c::Hash(segment_buffer_.get(), footer_.segments_meta_size, 0u) !=
-        footer_.segments_meta_crc) {
-      LOG_ERROR("Index segments meta checksum is invalid.");
-      return IndexError_InvalidChecksum;
-    }
-    IndexFormat::SegmentMeta *segment_start =
-        reinterpret_cast<IndexFormat::SegmentMeta *>(segment_buffer_.get());
-    uint32_t segment_ids_offset = footer_.segments_meta_size;
-    for (IndexFormat::SegmentMeta *iter = segment_start,
-                                  *end = segment_start + footer_.segment_count;
-         iter != end; ++iter) {
-      if (iter->segment_id_offset > footer_.segments_meta_size) {
-        return IndexError_InvalidValue;
-      }
-      if (iter->data_index > footer_.content_size) {
-        return IndexError_InvalidValue;
-      }
-      if (iter->data_index + iter->data_size > footer_.content_size) {
-        return IndexError_InvalidLength;
-      }
-
-      if (iter->segment_id_offset < segment_ids_offset) {
-        segment_ids_offset = iter->segment_id_offset;
-      }
-      id_hash_.emplace(
-          std::string(reinterpret_cast<const char *>(segment_start) +
-                      iter->segment_id_offset),
-          segments_.size());
-      segments_.emplace(
-          std::string(reinterpret_cast<const char *>(segment_start) +
-                      iter->segment_id_offset),
-          iter);
-      max_segment_size_ = std::max(max_segment_size_, iter->data_size + iter->padding_size);
-      if (sizeof(IndexFormat::SegmentMeta) * footer_.segment_count >
-          footer_.segments_meta_size) {
-        return IndexError_InvalidLength;
-      }
-    }
-    return 0;
-  }
-
-  int ParseToMapping() {
-    ParseHeader(0);
-    // Unpack footer
-    if (header_.meta_footer_size != sizeof(IndexFormat::MetaFooter)) {
-      return IndexError_InvalidLength;
-    }
-    if ((int32_t)header_.meta_footer_offset < 0) {
-      return IndexError_Unsupported;
-    }
-    size_t footer_offset = header_.meta_footer_offset;
-    ParseFooter(footer_offset);
-
-    // Unpack segment table
-    if (sizeof(IndexFormat::SegmentMeta) * footer_.segment_count >
-        footer_.segments_meta_size) {
-      return IndexError_InvalidLength;
-    }
-    const size_t segment_start_offset = footer_offset - footer_.segments_meta_size;
-    ParseSegment(segment_start_offset);
-    return 0;
-  }
-
-  //! Flush storage
-  int flush(void) override {
-    return this->flush_index();
-  }
-
-  //! Close storage
-  int close(void) override {
-    this->close_index();
-    return 0;
-  }
-
-  //! Append a segment into storage
-  int append(const std::string &id, size_t size) override {
-    return this->append_segment(id, size);
-  }
-
-  //! Refresh meta information (checksum, update time, etc.)
-  void refresh(uint64_t chkp) override {
-    this->refresh_index(chkp);
-  }
-
-  //! Retrieve check point of storage
-  uint64_t check_point(void) const override {
-    return footer_.check_point;
-  }
-
-  //! Retrieve a segment by id
-  IndexStorage::Segment::Pointer get(const std::string &id, int) override {
-    IndexMapping::Segment *segment = this->get_segment(id);
-    if (!segment) {
-      return Buffer1Storage::Segment::Pointer();
-    }
-    return std::make_shared<Buffer1Storage::Segment>(this, segment,
-                                                     id_hash_[id]);
-  }
-
-  //! Test if it a segment exists
-  bool has(const std::string &id) const override {
-    return this->has_segment(id);
-  }
-
-  //! Retrieve magic number of index
-  uint32_t magic(void) const override {
-    return header_.magic;
-  }
-
-  uint32_t get_context_offset() {
-    return header_.content_offset;
-  }
-
- protected:
-  //! Initialize index version segment
-  int init_version_segment(void) {
-    size_t data_size = std::strlen(IndexVersion::Details());
-    int error_code =
-        this->append_segment(INDEX_VERSION_SEGMENT_NAME, data_size);
-    if (error_code != 0) {
-      return error_code;
-    }
-
-    IndexMapping::Segment *segment = get_segment(INDEX_VERSION_SEGMENT_NAME);
-    if (!segment) {
-      return IndexError_MMapFile;
-    }
-    auto meta = segment->meta();
-    size_t capacity = static_cast<size_t>(meta->padding_size + meta->data_size);
-    memcpy(segment->data(), IndexVersion::Details(), data_size);
-    segment->set_dirty();
-    meta->data_crc = ailego::Crc32c::Hash(segment->data(), data_size, 0);
-    meta->data_size = data_size;
-    meta->padding_size = capacity - data_size;
-    return 0;
-  }
-
-  //! Initialize index file
-  int init_index(const std::string &path) {
-    // Add index version
-    int error_code = this->init_version_segment();
-    if (error_code != 0) {
-      return error_code;
-    }
-
-    // Refresh mapping
-    this->refresh_index(0);
-    return 0;
-  }
-
-  //! Set the index file as dirty
-  void set_as_dirty(void) {
-    index_dirty_ = true;
-  }
-
-  //! Refresh meta information (checksum, update time, etc.)
-  void refresh_index(uint64_t /*chkp*/) {}
-
-  //! Flush index storage
-  int flush_index(void) {
-    return 0;
-  }
-
-  //! Close index storage
-  void close_index(void) {
-    std::lock_guard<std::mutex> latch(mapping_mutex_);
-    file_name_.clear();
-    segments_.clear();
-    memset(&header_, 0, sizeof(header_));
-    memset(&footer_, 0, sizeof(footer_));
-    segment_buffer_.release();
-  }
-
-  //! Append a segment into storage
-  int append_segment(const std::string & /*id*/, size_t /*size*/) {
-    return 0;
-  }
-
-  //! Test if a segment exists
-  bool has_segment(const std::string &id) const {
-    std::lock_guard<std::mutex> latch(mapping_mutex_);
-    return (segments_.find(id) != segments_.end());
-  }
-
-  //! Get a segment from storage
-  IndexMapping::Segment *get_segment(const std::string &id) {
-    std::lock_guard<std::mutex> latch(mapping_mutex_);
-    auto iter = segments_.find(id);
-    if (iter == segments_.end()) {
-      return nullptr;
-    }
-    IndexMapping::Segment *item = &iter->second;
-    return item;
-  }
-
- private:
-  bool index_dirty_{false};
-  mutable std::mutex mapping_mutex_{};
-
-  // buffer manager
-  std::string file_name_;
-  IndexFormat::MetaHeader header_;
-  IndexFormat::MetaFooter footer_;
-  std::map<std::string, IndexMapping::Segment> segments_{};
-  std::map<std::string, size_t> id_hash_{};
-  size_t max_segment_size_{0};
-  std::unique_ptr<char[]> segment_buffer_{nullptr};
-
-  ailego::VecBufferPool::Pointer buffer_pool_{nullptr};
-  ailego::VecBufferPoolHandle::Pointer buffer_pool_handle_{nullptr};
-};
-
-INDEX_FACTORY_REGISTER_STORAGE_ALIAS(BufferStorage, Buffer1Storage);
-
-}  // namespace core
-}  // namespace zvec
\ No newline at end of file
diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc
index d4b23c87..13aee16a 100644
--- a/src/core/utility/buffer_storage.cc
+++ b/src/core/utility/buffer_storage.cc
@@ -13,13 +13,16 @@
 // limitations under the License.
 
 #include <mutex>
-#include <zvec/ailego/buffer/buffer_manager.h>
+// #include <zvec/ailego/buffer/buffer_manager.h>
+#include <zvec/ailego/buffer/buffer_pool.h>
 #include <zvec/core/framework/index_error.h>
 #include <zvec/core/framework/index_factory.h>
 #include <zvec/core/framework/index_mapping.h>
 #include <zvec/core/framework/index_version.h>
 #include "utility_params.h"
 
+#include <zvec/ailego/utility/time_helper.h>
+
 namespace zvec {
 namespace core {
 
@@ -36,9 +39,10 @@ class BufferStorage : public IndexStorage {
     typedef std::shared_ptr<Segment> Pointer;
 
     //! Constructor
-    Segment(BufferStorage *owner, IndexMapping::Segment *segment)
+    Segment(BufferStorage *owner, IndexMapping::Segment *segment, size_t segment_id)
         : segment_(segment),
           owner_(owner),
+          segment_id_(segment_id),
           capacity_(static_cast<size_t>(segment->meta()->data_size +
                                         segment->meta()->padding_size)) {}
 
@@ -74,9 +78,7 @@ class BufferStorage : public IndexStorage {
         }
         len = meta->data_size - offset;
       }
-      ailego::BufferHandle buffer_handle =
-          owner_->get_buffer_handle(offset, len);
-      memmove(buf, (const uint8_t *)buffer_handle.pin_vector_data() + offset,
+      memmove(buf, (const uint8_t *)(owner_->get_buffer(offset, len, segment_id_)) + offset,
               len);
       return len;
     }
@@ -90,11 +92,8 @@ class BufferStorage : public IndexStorage {
         }
         len = meta->data_size - offset;
       }
-      size_t buffer_offset =
-          segment_->meta()->data_index + owner_->get_context_offset() + offset;
-      ailego::BufferHandle buffer_handle =
-          owner_->get_buffer_handle(buffer_offset, len);
-      *data = buffer_handle.pin_vector_data();
+      size_t segment_offset = segment_->meta()->data_index + owner_->get_context_offset();
+      *data = owner_->get_buffer(segment_offset, capacity_, segment_id_) + offset;
       return len;
     }
 
@@ -106,16 +105,13 @@ class BufferStorage : public IndexStorage {
         }
         len = meta->data_size - offset;
       }
-      size_t buffer_offset =
-          segment_->meta()->data_index + owner_->get_context_offset() + offset;
-      data.reset(owner_->get_buffer_handle_ptr(buffer_offset, len));
+      size_t segment_offset = segment_->meta()->data_index + owner_->get_context_offset();
+      data.reset(owner_->buffer_pool_handle_.get(), segment_id_, owner_->get_buffer(segment_offset, capacity_, segment_id_) + offset);
+      // data.reset(owner_->get_buffer(segment_offset, capacity_, segment_id_) + offset);
       if (data.data()) {
         return len;
       } else {
-        LOG_ERROR(
-            "Buffer handle is null, now used memory: %zu, new: %zu",
-            (size_t)ailego::BufferManager::Instance().total_size_in_bytes(),
-            len);
+        LOG_ERROR("read error.");
         return -1;
       }
     }
@@ -142,6 +138,7 @@ class BufferStorage : public IndexStorage {
    private:
     IndexMapping::Segment *segment_{};
     BufferStorage *owner_{nullptr};
+    size_t segment_id_{};
     size_t capacity_{};
   };
 
@@ -163,29 +160,39 @@ class BufferStorage : public IndexStorage {
 
   //! Open storage
   int open(const std::string &path, bool /*create*/) override {
+    LOG_INFO("open buffer storage 1");
     file_name_ = path;
-    return ParseToMapping();
+    buffer_pool_ = std::make_shared<ailego::VecBufferPool>(path, 20lu * 1024 * 1024 * 1024, 2490368 * 2);
+    buffer_pool_handle_ =
+        std::make_shared<ailego::VecBufferPoolHandle>(buffer_pool_->get_handle());
+    int ret = ParseToMapping();
+    LOG_ERROR("segment count: %lu, max_segment_size: %lu", segments_.size(), max_segment_size_);
+    for(auto iter = segments_.begin(); iter != segments_.end(); iter++) {
+      auto seg = this->get(iter->first, 0);
+      MemoryBlock block;
+      int len = seg->read(0, block, 1);
+      LOG_ERROR("segment %s: %d", iter->first.c_str(), len);
+    }
+    if(ret != 0) {
+      return ret;
+    }
+    return 0;
   }
 
-  ailego::BufferHandle get_buffer_handle(int offset, int length) {
-    ailego::BufferID buffer_id =
-        ailego::BufferID::VectorID(file_name_, offset, length);
-    return ailego::BufferManager::Instance().acquire(buffer_id);
+  char *get_buffer(size_t offset, size_t length, size_t block_id) {
+    return buffer_pool_handle_->get_block(offset, length, block_id);
   }
 
-  ailego::BufferHandle::Pointer get_buffer_handle_ptr(int offset, int length) {
-    ailego::BufferID buffer_id =
-        ailego::BufferID::VectorID(file_name_, offset, length);
-    return ailego::BufferManager::Instance().acquire_ptr(buffer_id);
+  int get_meta(size_t offset, size_t length, char *out) {
+    return buffer_pool_handle_->get_meta(offset, length, out);
   }
 
-  int ParseHeader(int offset) {
-    ailego::BufferHandle header_handle =
-        get_buffer_handle(offset, sizeof(header_));
-    void *buffer = header_handle.pin_vector_data();
+  int ParseHeader(size_t offset) {
+    char *buffer = new char[sizeof(header_)];
+    get_meta(offset, sizeof(header_), buffer);
     uint8_t *header_ptr = reinterpret_cast<uint8_t *>(buffer);
     memcpy(&header_, header_ptr, sizeof(header_));
-    header_handle.unpin_vector_data();
+    delete[] buffer;
     if (header_.meta_header_size != sizeof(IndexFormat::MetaHeader)) {
       LOG_ERROR("Header meta size is invalid.");
       return IndexError_InvalidLength;
@@ -198,14 +205,13 @@ class BufferStorage : public IndexStorage {
     return 0;
   }
 
-  int ParseFooter(int offset) {
-    ailego::BufferHandle footer_handle =
-        get_buffer_handle(offset, sizeof(footer_));
-    void *buffer = footer_handle.pin_vector_data();
+  int ParseFooter(size_t offset) {
+    char *buffer = new char[sizeof(footer_)];
+    get_meta(offset, sizeof(footer_), buffer);
     uint8_t *footer_ptr = reinterpret_cast<uint8_t *>(buffer);
     memcpy(&footer_, footer_ptr, sizeof(footer_));
-    footer_handle.unpin_vector_data();
-    if (offset < (int)footer_.segments_meta_size) {
+    delete[] buffer;
+    if (offset < (size_t)footer_.segments_meta_size) {
       LOG_ERROR("Footer meta size is invalid.");
       return IndexError_InvalidLength;
     }
@@ -217,17 +223,16 @@ class BufferStorage : public IndexStorage {
     return 0;
   }
 
-  int ParseSegment(int offset) {
-    ailego::BufferHandle segment_start_handle =
-        get_buffer_handle(offset, footer_.segments_meta_size);
-    void *segment_buffer = segment_start_handle.pin_vector_data();
-    if (ailego::Crc32c::Hash(segment_buffer, footer_.segments_meta_size, 0u) !=
+  int ParseSegment(size_t offset) {
+    segment_buffer_ = std::make_unique<char[]>(footer_.segments_meta_size);
+    get_meta(offset, footer_.segments_meta_size, segment_buffer_.get());
+    if (ailego::Crc32c::Hash(segment_buffer_.get(), footer_.segments_meta_size, 0u) !=
         footer_.segments_meta_crc) {
       LOG_ERROR("Index segments meta checksum is invalid.");
       return IndexError_InvalidChecksum;
     }
     IndexFormat::SegmentMeta *segment_start =
-        reinterpret_cast<IndexFormat::SegmentMeta *>(segment_buffer);
+        reinterpret_cast<IndexFormat::SegmentMeta *>(segment_buffer_.get());
     uint32_t segment_ids_offset = footer_.segments_meta_size;
     for (IndexFormat::SegmentMeta *iter = segment_start,
                                   *end = segment_start + footer_.segment_count;
@@ -245,10 +250,15 @@ class BufferStorage : public IndexStorage {
       if (iter->segment_id_offset < segment_ids_offset) {
         segment_ids_offset = iter->segment_id_offset;
       }
+      id_hash_.emplace(
+          std::string(reinterpret_cast<const char *>(segment_start) +
+                      iter->segment_id_offset),
+          segments_.size());
       segments_.emplace(
           std::string(reinterpret_cast<const char *>(segment_start) +
                       iter->segment_id_offset),
           iter);
+      max_segment_size_ = std::max(max_segment_size_, iter->data_size + iter->padding_size);
       if (sizeof(IndexFormat::SegmentMeta) * footer_.segment_count >
           footer_.segments_meta_size) {
         return IndexError_InvalidLength;
@@ -259,7 +269,6 @@ class BufferStorage : public IndexStorage {
 
   int ParseToMapping() {
     ParseHeader(0);
-
     // Unpack footer
     if (header_.meta_footer_size != sizeof(IndexFormat::MetaFooter)) {
       return IndexError_InvalidLength;
@@ -275,7 +284,7 @@ class BufferStorage : public IndexStorage {
         footer_.segments_meta_size) {
       return IndexError_InvalidLength;
     }
-    const int segment_start_offset = footer_offset - footer_.segments_meta_size;
+    const size_t segment_start_offset = footer_offset - footer_.segments_meta_size;
     ParseSegment(segment_start_offset);
     return 0;
   }
@@ -312,7 +321,8 @@ class BufferStorage : public IndexStorage {
     if (!segment) {
       return BufferStorage::Segment::Pointer();
     }
-    return std::make_shared<BufferStorage::Segment>(this, segment);
+    return std::make_shared<BufferStorage::Segment>(this, segment,
+                                                     id_hash_[id]);
   }
 
   //! Test if it a segment exists
@@ -355,22 +365,14 @@ class BufferStorage : public IndexStorage {
 
   //! Initialize index file
   int init_index(const std::string &path) {
-    int error_code = mapping_.create(path, segment_meta_capacity_);
-    if (error_code != 0) {
-      return error_code;
-    }
-
     // Add index version
-    error_code = this->init_version_segment();
+    int error_code = this->init_version_segment();
     if (error_code != 0) {
       return error_code;
     }
 
     // Refresh mapping
     this->refresh_index(0);
-
-    // Close mapping
-    mapping_.close();
     return 0;
   }
 
@@ -394,6 +396,7 @@ class BufferStorage : public IndexStorage {
     segments_.clear();
     memset(&header_, 0, sizeof(header_));
     memset(&footer_, 0, sizeof(footer_));
+    segment_buffer_.release();
   }
 
   //! Append a segment into storage
@@ -419,14 +422,7 @@ class BufferStorage : public IndexStorage {
   }
 
  private:
-  // mmap
-  uint32_t segment_meta_capacity_{1024 * 1024};
-  // bool copy_on_write_{false};
-  // bool force_flush_{false};
-  // bool memory_locked_{false};
-  // bool memory_warmup_{false};
   bool index_dirty_{false};
-  mutable IndexMapping mapping_{};
   mutable std::mutex mapping_mutex_{};
 
   // buffer manager
@@ -434,9 +430,15 @@ class BufferStorage : public IndexStorage {
   IndexFormat::MetaHeader header_;
   IndexFormat::MetaFooter footer_;
   std::map<std::string, IndexMapping::Segment> segments_{};
+  std::map<std::string, size_t> id_hash_{};
+  size_t max_segment_size_{0};
+  std::unique_ptr<char[]> segment_buffer_{nullptr};
+
+  ailego::VecBufferPool::Pointer buffer_pool_{nullptr};
+  ailego::VecBufferPoolHandle::Pointer buffer_pool_handle_{nullptr};
 };
 
-// INDEX_FACTORY_REGISTER_STORAGE(BufferStorage);
+INDEX_FACTORY_REGISTER_STORAGE(BufferStorage);
 
 }  // namespace core
-}  // namespace zvec
+}  // namespace zvec
\ No newline at end of file
diff --git a/tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cpp b/tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cc
similarity index 100%
rename from tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cpp
rename to tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cc

From 11a0e475d154a57e54f23ae7791352db08e48d34 Mon Sep 17 00:00:00 2001
From: "yinzefeng.yzf" <yinzefeng.yzf@alibaba-inc.com>
Date: Mon, 9 Feb 2026 19:10:42 +0800
Subject: [PATCH 05/11] clang format

---
 src/ailego/buffer/buffer_pool.cc              |  384 +-
 src/core/utility/buffer_storage.cc            |   38 +-
 src/include/zvec/ailego/buffer/buffer_pool.h  |   40 +-
 .../zvec/ailego/buffer/concurrentqueue.h      | 7693 +++++++++--------
 4 files changed, 4418 insertions(+), 3737 deletions(-)

diff --git a/src/ailego/buffer/buffer_pool.cc b/src/ailego/buffer/buffer_pool.cc
index 3ed461c1..81ed92bf 100644
--- a/src/ailego/buffer/buffer_pool.cc
+++ b/src/ailego/buffer/buffer_pool.cc
@@ -5,227 +5,233 @@ namespace zvec {
 namespace ailego {
 
 int LRUCache::init(size_t block_size) {
-	block_size_ = block_size;
-	for(size_t i = 0; i < CATCH_QUEUE_NUM; i++) {
-		queues_.push_back(ConcurrentQueue(block_size));
-	}
-	return 0;
+  block_size_ = block_size;
+  for (size_t i = 0; i < CATCH_QUEUE_NUM; i++) {
+    queues_.push_back(ConcurrentQueue(block_size));
+  }
+  return 0;
 }
 
 bool LRUCache::evict_single_block(BlockType &item) {
-	bool found = false;
-	for(size_t i = 0; i < CATCH_QUEUE_NUM; i++) {
-		found = queues_[i].try_dequeue(item);
-		if(found) {
-			break;
-		}
-	}
-	return found;
-}
-
-bool LRUCache::add_single_block(const LPMap *lp_map, const BlockType &block, int block_type) {
-	bool ok = queues_[block_type].try_enqueue(block);
-	evict_queue_insertions_.fetch_add(1, std::memory_order_relaxed);
-	if(evict_queue_insertions_ % block_size_ == 0) {
-		this->clear_dead_node(lp_map);
-	}
-	return ok;
+  bool found = false;
+  for (size_t i = 0; i < CATCH_QUEUE_NUM; i++) {
+    found = queues_[i].try_dequeue(item);
+    if (found) {
+      break;
+    }
+  }
+  return found;
+}
+
+bool LRUCache::add_single_block(const LPMap *lp_map, const BlockType &block,
+                                int block_type) {
+  bool ok = queues_[block_type].try_enqueue(block);
+  evict_queue_insertions_.fetch_add(1, std::memory_order_relaxed);
+  if (evict_queue_insertions_ % block_size_ == 0) {
+    this->clear_dead_node(lp_map);
+  }
+  return ok;
 }
 
 void LRUCache::clear_dead_node(const LPMap *lp_map) {
-	for(int i = 0; i < CATCH_QUEUE_NUM; i++) {
-		int clear_size = block_size_ * 2;
-		if (queues_[i].size_approx() < clear_size * 4) {
-			continue;
-		}
-		int clear_count = 0;
-		ConcurrentQueue tmp(block_size_);
-		BlockType item;
-		while(queues_[i].try_dequeue(item) && (clear_count++ < clear_size)) {
-			if(!lp_map->isDeadBlock(item)) {
-				tmp.try_enqueue(item);
-			}
-		}
-		while(tmp.try_dequeue(item)) {
-			if(!lp_map->isDeadBlock(item)) {
-				queues_[i].try_enqueue(item);
-			}
-		}
-	}
+  for (int i = 0; i < CATCH_QUEUE_NUM; i++) {
+    int clear_size = block_size_ * 2;
+    if (queues_[i].size_approx() < clear_size * 4) {
+      continue;
+    }
+    int clear_count = 0;
+    ConcurrentQueue tmp(block_size_);
+    BlockType item;
+    while (queues_[i].try_dequeue(item) && (clear_count++ < clear_size)) {
+      if (!lp_map->isDeadBlock(item)) {
+        tmp.try_enqueue(item);
+      }
+    }
+    while (tmp.try_dequeue(item)) {
+      if (!lp_map->isDeadBlock(item)) {
+        queues_[i].try_enqueue(item);
+      }
+    }
+  }
 }
 
 void LPMap::init(size_t entry_num) {
-	if (entries_) {
-		delete[] entries_;
-	}
-	entry_num_ = entry_num;
-	entries_ = new Entry[entry_num_];
-	for (size_t i = 0; i < entry_num_; i++) {
-		entries_[i].ref_count.store(std::numeric_limits<int>::min());
-		entries_[i].load_count.store(0);
-		entries_[i].buffer = nullptr;
-	}
-	cache_.init(entry_num);
-}
-
-char* LPMap::acquire_block(block_id_t block_id) {
-	assert(block_id < entry_num_);
-	Entry &entry = entries_[block_id];
-	if (entry.ref_count.load(std::memory_order_relaxed) == 0) {
-		entry.load_count.fetch_add(1, std::memory_order_relaxed);
-	}
-	entry.ref_count.fetch_add(1, std::memory_order_relaxed);
-	if (entry.ref_count.load(std::memory_order_relaxed) < 0) {
-		return nullptr;
-	}
-	return entry.buffer;
+  if (entries_) {
+    delete[] entries_;
+  }
+  entry_num_ = entry_num;
+  entries_ = new Entry[entry_num_];
+  for (size_t i = 0; i < entry_num_; i++) {
+    entries_[i].ref_count.store(std::numeric_limits<int>::min());
+    entries_[i].load_count.store(0);
+    entries_[i].buffer = nullptr;
+  }
+  cache_.init(entry_num);
+}
+
+char *LPMap::acquire_block(block_id_t block_id) {
+  assert(block_id < entry_num_);
+  Entry &entry = entries_[block_id];
+  if (entry.ref_count.load(std::memory_order_relaxed) == 0) {
+    entry.load_count.fetch_add(1, std::memory_order_relaxed);
+  }
+  entry.ref_count.fetch_add(1, std::memory_order_relaxed);
+  if (entry.ref_count.load(std::memory_order_relaxed) < 0) {
+    return nullptr;
+  }
+  return entry.buffer;
 }
 
 void LPMap::release_block(block_id_t block_id) {
-	assert(block_id < entry_num_);
-	Entry &entry = entries_[block_id];
-
-	if (entry.ref_count.fetch_sub(1, std::memory_order_release) == 1) {
-		std::atomic_thread_fence(std::memory_order_acquire);
-		LRUCache::BlockType block;
-		block.first = block_id;
-		block.second = entry.load_count.load();
-		cache_.add_single_block(this, block, 0);
-	}
-}
-
-char* LPMap::evict_block(block_id_t block_id) {
-	assert(block_id < entry_num_);
-	Entry &entry = entries_[block_id];
-	int expected = 0;
-	if (entry.ref_count.compare_exchange_strong(
-					expected, std::numeric_limits<int>::min())) {
-		char *buffer = entry.buffer;
-		entry.buffer = nullptr;
-		return buffer;
-	} else {
-		return nullptr;
-	}
-}
-
-char* LPMap::set_block_acquired(block_id_t block_id, char *buffer) {
-	assert(block_id < entry_num_);
-	Entry &entry = entries_[block_id];
-	if (entry.ref_count.load(std::memory_order_relaxed) >= 0) {
-		entry.ref_count.fetch_add(1, std::memory_order_relaxed);
-		return entry.buffer;
-	}
-	entry.buffer = buffer;
-	entry.ref_count.store(1, std::memory_order_relaxed);
-	entry.load_count.fetch_add(1, std::memory_order_relaxed);
-	return buffer;
+  assert(block_id < entry_num_);
+  Entry &entry = entries_[block_id];
+
+  if (entry.ref_count.fetch_sub(1, std::memory_order_release) == 1) {
+    std::atomic_thread_fence(std::memory_order_acquire);
+    LRUCache::BlockType block;
+    block.first = block_id;
+    block.second = entry.load_count.load();
+    cache_.add_single_block(this, block, 0);
+  }
+}
+
+char *LPMap::evict_block(block_id_t block_id) {
+  assert(block_id < entry_num_);
+  Entry &entry = entries_[block_id];
+  int expected = 0;
+  if (entry.ref_count.compare_exchange_strong(
+          expected, std::numeric_limits<int>::min())) {
+    char *buffer = entry.buffer;
+    entry.buffer = nullptr;
+    return buffer;
+  } else {
+    return nullptr;
+  }
+}
+
+char *LPMap::set_block_acquired(block_id_t block_id, char *buffer) {
+  assert(block_id < entry_num_);
+  Entry &entry = entries_[block_id];
+  if (entry.ref_count.load(std::memory_order_relaxed) >= 0) {
+    entry.ref_count.fetch_add(1, std::memory_order_relaxed);
+    return entry.buffer;
+  }
+  entry.buffer = buffer;
+  entry.ref_count.store(1, std::memory_order_relaxed);
+  entry.load_count.fetch_add(1, std::memory_order_relaxed);
+  return buffer;
 }
 
 void LPMap::recycle(moodycamel::ConcurrentQueue<char *> &free_buffers) {
-	LRUCache::BlockType block;
-	do {
-		bool ok = cache_.evict_single_block(block);
-		if(!ok) {
-			return;
-		}
-	} while(isDeadBlock(block));
-	char *buffer = evict_block(block.first);
-	if (buffer) {
-		free_buffers.try_enqueue(buffer);
-	}
-}
-
-VecBufferPool::VecBufferPool(const std::string &filename, size_t pool_capacity, size_t block_size)
-		: pool_capacity_(pool_capacity) {
-	fd_ = open(filename.c_str(), O_RDONLY);
-	if (fd_ < 0) {
-		throw std::runtime_error("Failed to open file: " + filename);
-	}
-	struct stat st;
-	if (fstat(fd_, &st) < 0) {
-		throw std::runtime_error("Failed to stat file: " + filename);
-	}
-	file_size_ = st.st_size;
-
-	size_t buffer_num = pool_capacity_ / block_size;
-	size_t block_num = file_size_ / block_size + 500;
-	lp_map_.init(block_num);
-	for (size_t i = 0; i < buffer_num; i++) {
-		char *buffer = (char *)aligned_alloc(64, block_size);
-		if (buffer != nullptr) {
-			bool ok = free_buffers_.try_enqueue(buffer);
-		}
-	}
-	LOG_DEBUG("Buffer pool num: %zu, entry num: %zu", buffer_num, lp_map_.entry_num());
+  LRUCache::BlockType block;
+  do {
+    bool ok = cache_.evict_single_block(block);
+    if (!ok) {
+      return;
+    }
+  } while (isDeadBlock(block));
+  char *buffer = evict_block(block.first);
+  if (buffer) {
+    free_buffers.try_enqueue(buffer);
+  }
+}
+
+VecBufferPool::VecBufferPool(const std::string &filename, size_t pool_capacity,
+                             size_t block_size)
+    : pool_capacity_(pool_capacity) {
+  fd_ = open(filename.c_str(), O_RDONLY);
+  if (fd_ < 0) {
+    throw std::runtime_error("Failed to open file: " + filename);
+  }
+  struct stat st;
+  if (fstat(fd_, &st) < 0) {
+    throw std::runtime_error("Failed to stat file: " + filename);
+  }
+  file_size_ = st.st_size;
+
+  size_t buffer_num = pool_capacity_ / block_size;
+  size_t block_num = file_size_ / block_size + 500;
+  lp_map_.init(block_num);
+  for (size_t i = 0; i < buffer_num; i++) {
+    char *buffer = (char *)aligned_alloc(64, block_size);
+    if (buffer != nullptr) {
+      bool ok = free_buffers_.try_enqueue(buffer);
+    }
+  }
+  LOG_DEBUG("Buffer pool num: %zu, entry num: %zu", buffer_num,
+            lp_map_.entry_num());
 }
 
 VecBufferPoolHandle VecBufferPool::get_handle() {
-	return VecBufferPoolHandle(*this);
-}
-
-char* VecBufferPool::acquire_buffer(block_id_t block_id, size_t offset, size_t size, int retry) {
-	char *buffer = lp_map_.acquire_block(block_id);
-	if (buffer) {
-		return buffer;
-	}
-	{
-		bool found = free_buffers_.try_dequeue(buffer);
-		if (!found) {
-			for (int i = 0; i < retry; i++) {
-				lp_map_.recycle(free_buffers_);
-				found = free_buffers_.try_dequeue(buffer);
-				if (found) {
-					break;
-				}
-			}
-		}
-		if (!found) {
-			LOG_ERROR("Buffer pool failed to get free buffer");
-			return nullptr;
-		}
-	}
-
-	ssize_t read_bytes = pread(fd_, buffer, size, offset);
-	if (read_bytes != static_cast<ssize_t>(size)) {
-		LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset);
-		return nullptr;
-	}
-	char *placed_buffer = nullptr;
-	{
-		std::lock_guard<std::mutex> lock(mutex_);
-		placed_buffer = lp_map_.set_block_acquired(block_id, buffer);
-	}
-	if (placed_buffer != buffer) {
-		// another thread has set the block
-		free_buffers_.try_enqueue(buffer);
-	}
-	return placed_buffer;
+  return VecBufferPoolHandle(*this);
+}
+
+char *VecBufferPool::acquire_buffer(block_id_t block_id, size_t offset,
+                                    size_t size, int retry) {
+  char *buffer = lp_map_.acquire_block(block_id);
+  if (buffer) {
+    return buffer;
+  }
+  {
+    bool found = free_buffers_.try_dequeue(buffer);
+    if (!found) {
+      for (int i = 0; i < retry; i++) {
+        lp_map_.recycle(free_buffers_);
+        found = free_buffers_.try_dequeue(buffer);
+        if (found) {
+          break;
+        }
+      }
+    }
+    if (!found) {
+      LOG_ERROR("Buffer pool failed to get free buffer");
+      return nullptr;
+    }
+  }
+
+  ssize_t read_bytes = pread(fd_, buffer, size, offset);
+  if (read_bytes != static_cast<ssize_t>(size)) {
+    LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset);
+    return nullptr;
+  }
+  char *placed_buffer = nullptr;
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    placed_buffer = lp_map_.set_block_acquired(block_id, buffer);
+  }
+  if (placed_buffer != buffer) {
+    // another thread has set the block
+    free_buffers_.try_enqueue(buffer);
+  }
+  return placed_buffer;
 }
 
 int VecBufferPool::get_meta(size_t offset, size_t length, char *buffer) {
-	ssize_t read_bytes = pread(fd_, buffer, length, offset);
-	if (read_bytes != static_cast<ssize_t>(length)) {
-		LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset);LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset);
-		return -1;
-	}
-	return 0;
+  ssize_t read_bytes = pread(fd_, buffer, length, offset);
+  if (read_bytes != static_cast<ssize_t>(length)) {
+    LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset);
+    LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset);
+    return -1;
+  }
+  return 0;
 }
 
-char* VecBufferPoolHandle::get_block(size_t offset, size_t size, size_t block_id) {
-	char *buffer = pool.acquire_buffer(block_id, offset, size, 5);
-	return buffer;
+char *VecBufferPoolHandle::get_block(size_t offset, size_t size,
+                                     size_t block_id) {
+  char *buffer = pool.acquire_buffer(block_id, offset, size, 5);
+  return buffer;
 }
 
 int VecBufferPoolHandle::get_meta(size_t offset, size_t length, char *buffer) {
-	return pool.get_meta(offset, length, buffer);
+  return pool.get_meta(offset, length, buffer);
 }
 
 void VecBufferPoolHandle::release_one(block_id_t block_id) {
-	pool.lp_map_.release_block(block_id);
+  pool.lp_map_.release_block(block_id);
 }
 
 void VecBufferPoolHandle::acquire_one(block_id_t block_id) {
-	pool.lp_map_.acquire_block(block_id);
+  pool.lp_map_.acquire_block(block_id);
 }
 
 }  // namespace ailego
diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc
index 3765fd15..dcdb13d3 100644
--- a/src/core/utility/buffer_storage.cc
+++ b/src/core/utility/buffer_storage.cc
@@ -15,14 +15,13 @@
 #include <mutex>
 // #include <zvec/ailego/buffer/buffer_manager.h>
 #include <zvec/ailego/buffer/buffer_pool.h>
+#include <zvec/ailego/utility/time_helper.h>
 #include <zvec/core/framework/index_error.h>
 #include <zvec/core/framework/index_factory.h>
 #include <zvec/core/framework/index_mapping.h>
 #include <zvec/core/framework/index_version.h>
 #include "utility_params.h"
 
-#include <zvec/ailego/utility/time_helper.h>
-
 namespace zvec {
 namespace core {
 
@@ -81,7 +80,9 @@ class BufferStorage : public IndexStorage {
         }
         len = meta->data_size - offset;
       }
-      memmove(buf, (const uint8_t *)(owner_->get_buffer(offset, len, segment_id_)) + offset,
+      memmove(buf,
+              (const uint8_t *)(owner_->get_buffer(offset, len, segment_id_)) +
+                  offset,
               len);
       return len;
     }
@@ -98,7 +99,8 @@ class BufferStorage : public IndexStorage {
       size_t buffer_offset = segment_header_start_offset_ +
                              segment_header_->content_offset +
                              segment_->meta()->data_index + offset;
-      *data = owner_->get_buffer(buffer_offset, capacity_, segment_id_) + offset;
+      *data =
+          owner_->get_buffer(buffer_offset, capacity_, segment_id_) + offset;
       return len;
     }
 
@@ -113,8 +115,11 @@ class BufferStorage : public IndexStorage {
       size_t buffer_offset = segment_header_start_offset_ +
                              segment_header_->content_offset +
                              segment_->meta()->data_index + offset;
-      data.reset(owner_->buffer_pool_handle_.get(), segment_id_, owner_->get_buffer(buffer_offset, capacity_, segment_id_) + offset);
-      // data.reset(owner_->get_buffer(buffer_offset, capacity_, segment_id_) + offset);
+      data.reset(
+          owner_->buffer_pool_handle_.get(), segment_id_,
+          owner_->get_buffer(buffer_offset, capacity_, segment_id_) + offset);
+      // data.reset(owner_->get_buffer(buffer_offset, capacity_, segment_id_) +
+      // offset);
       if (data.data()) {
         return len;
       } else {
@@ -174,18 +179,20 @@ class BufferStorage : public IndexStorage {
   int open(const std::string &path, bool /*create*/) override {
     LOG_INFO("open buffer storage 1");
     file_name_ = path;
-    buffer_pool_ = std::make_shared<ailego::VecBufferPool>(path, 20lu * 1024 * 1024 * 1024, 2490368 * 2);
-    buffer_pool_handle_ =
-        std::make_shared<ailego::VecBufferPoolHandle>(buffer_pool_->get_handle());
+    buffer_pool_ = std::make_shared<ailego::VecBufferPool>(
+        path, 20lu * 1024 * 1024 * 1024, 2490368 * 2);
+    buffer_pool_handle_ = std::make_shared<ailego::VecBufferPoolHandle>(
+        buffer_pool_->get_handle());
     int ret = ParseToMapping();
-    LOG_ERROR("segment count: %lu, max_segment_size: %lu", segments_.size(), max_segment_size_);
-    for(auto iter = segments_.begin(); iter != segments_.end(); iter++) {
+    LOG_ERROR("segment count: %lu, max_segment_size: %lu", segments_.size(),
+              max_segment_size_);
+    for (auto iter = segments_.begin(); iter != segments_.end(); iter++) {
       auto seg = this->get(iter->first, 0);
       MemoryBlock block;
       int len = seg->read(0, block, 1);
       LOG_ERROR("segment %s: %d", iter->first.c_str(), len);
     }
-    if(ret != 0) {
+    if (ret != 0) {
       return ret;
     }
     return 0;
@@ -238,8 +245,8 @@ class BufferStorage : public IndexStorage {
   int ParseSegment(size_t offset) {
     segment_buffer_ = std::make_unique<char[]>(footer_.segments_meta_size);
     get_meta(offset, footer_.segments_meta_size, segment_buffer_.get());
-    if (ailego::Crc32c::Hash(segment_buffer_.get(), footer_.segments_meta_size, 0u) !=
-        footer_.segments_meta_crc) {
+    if (ailego::Crc32c::Hash(segment_buffer_.get(), footer_.segments_meta_size,
+                             0u) != footer_.segments_meta_crc) {
       LOG_ERROR("Index segments meta checksum is invalid.");
       return IndexError_InvalidChecksum;
     }
@@ -271,7 +278,8 @@ class BufferStorage : public IndexStorage {
                       iter->segment_id_offset),
           IndexMapping::SegmentInfo{IndexMapping::Segment{iter},
                                     current_header_start_offset_, &header_});
-      max_segment_size_ = std::max(max_segment_size_, iter->data_size + iter->padding_size);
+      max_segment_size_ =
+          std::max(max_segment_size_, iter->data_size + iter->padding_size);
       if (sizeof(IndexFormat::SegmentMeta) * footer_.segment_count >
           footer_.segments_meta_size) {
         return IndexError_InvalidLength;
diff --git a/src/include/zvec/ailego/buffer/buffer_pool.h b/src/include/zvec/ailego/buffer/buffer_pool.h
index 34c69d51..f1a0149c 100644
--- a/src/include/zvec/ailego/buffer/buffer_pool.h
+++ b/src/include/zvec/ailego/buffer/buffer_pool.h
@@ -11,12 +11,12 @@
 #include <iostream>
 #include <limits>
 #include <map>
+#include <memory>
 #include <mutex>
 #include <queue>
 #include <stdexcept>
 #include <string>
 #include <unordered_map>
-#include <memory>
 #include "concurrentqueue.h"
 
 namespace zvec {
@@ -28,23 +28,24 @@ using version_t = size_t;
 class LPMap;
 
 class LRUCache {
-  public:
-    typedef std::pair<block_id_t, version_t> BlockType;
-    typedef moodycamel::ConcurrentQueue<BlockType> ConcurrentQueue;
+ public:
+  typedef std::pair<block_id_t, version_t> BlockType;
+  typedef moodycamel::ConcurrentQueue<BlockType> ConcurrentQueue;
 
-    int init(size_t block_size);
+  int init(size_t block_size);
 
-    bool evict_single_block(BlockType &item);
+  bool evict_single_block(BlockType &item);
 
-    bool add_single_block(const LPMap *lp_map, const BlockType &block, int block_type);
+  bool add_single_block(const LPMap *lp_map, const BlockType &block,
+                        int block_type);
 
-    void clear_dead_node(const LPMap *lp_map);
+  void clear_dead_node(const LPMap *lp_map);
 
-  private:
-    constexpr static size_t CATCH_QUEUE_NUM = 3;
-    int block_size_;
-    std::vector<ConcurrentQueue> queues_;
-    alignas(64) std::atomic<size_t> evict_queue_insertions_{0};
+ private:
+  constexpr static size_t CATCH_QUEUE_NUM = 3;
+  int block_size_;
+  std::vector<ConcurrentQueue> queues_;
+  alignas(64) std::atomic<size_t> evict_queue_insertions_{0};
 };
 
 class LPMap {
@@ -95,15 +96,17 @@ class VecBufferPoolHandle;
 class VecBufferPool {
  public:
   typedef std::shared_ptr<VecBufferPool> Pointer;
-  
-  VecBufferPool(const std::string &filename, size_t pool_capacity, size_t block_size);
+
+  VecBufferPool(const std::string &filename, size_t pool_capacity,
+                size_t block_size);
   ~VecBufferPool() {
     close(fd_);
   }
 
   VecBufferPoolHandle get_handle();
 
-  char *acquire_buffer(block_id_t block_id, size_t offset, size_t size, int retry = 0);
+  char *acquire_buffer(block_id_t block_id, size_t offset, size_t size,
+                       int retry = 0);
 
   int get_meta(size_t offset, size_t length, char *buffer);
 
@@ -127,11 +130,10 @@ class VecBufferPool {
 struct VecBufferPoolHandle {
   VecBufferPoolHandle(VecBufferPool &pool) : pool(pool), hit_num_(0) {};
   VecBufferPoolHandle(VecBufferPoolHandle &&other)
-      : pool(other.pool),
-        hit_num_(other.hit_num_) {
+      : pool(other.pool), hit_num_(other.hit_num_) {
     other.hit_num_ = 0;
   }
-    
+
   ~VecBufferPoolHandle() = default;
 
   typedef std::shared_ptr<VecBufferPoolHandle> Pointer;
diff --git a/src/include/zvec/ailego/buffer/concurrentqueue.h b/src/include/zvec/ailego/buffer/concurrentqueue.h
index db4835b1..90edaf97 100644
--- a/src/include/zvec/ailego/buffer/concurrentqueue.h
+++ b/src/include/zvec/ailego/buffer/concurrentqueue.h
@@ -1,5 +1,5 @@
-// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue.
-// An overview, including benchmark results, is provided here:
+// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free
+// queue. An overview, including benchmark results, is provided here:
 //     http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++
 // The full design is also described in excruciating detail at:
 //    http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue
@@ -8,24 +8,26 @@
 // Copyright (c) 2013-2020, Cameron Desrochers.
 // All rights reserved.
 //
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
 //
-// - Redistributions of source code must retain the above copyright notice, this list of
-// conditions and the following disclaimer.
-// - Redistributions in binary form must reproduce the above copyright notice, this list of
-// conditions and the following disclaimer in the documentation and/or other materials
-// provided with the distribution.
+// - Redistributions of source code must retain the above copyright notice, this
+// list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
 //
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
-// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
-// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
-// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
-// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
 
 // Also dual-licensed under the Boost Software License (see LICENSE.md)
 
@@ -33,8 +35,8 @@
 
 #if defined(__GNUC__) && !defined(__INTEL_COMPILER)
 // Disable -Wconversion warnings (spuriously triggered when Traits::size_t and
-// Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings
-// upon assigning any computed values)
+// Traits::index_t are set to < 32 bits, causing integer promotion, causing
+// warnings upon assigning any computed values)
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wconversion"
 
@@ -44,10 +46,11 @@
 #endif
 
 #if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17)
-// VS2019 with /W4 warns about constant conditional expressions but unless /std=c++17 or higher
-// does not support `if constexpr`, so we have no choice but to simply disable the warning
+// VS2019 with /W4 warns about constant conditional expressions but unless
+// /std=c++17 or higher does not support `if constexpr`, so we have no choice
+// but to simply disable the warning
 #pragma warning(push)
-#pragma warning(disable: 4127)  // conditional expression is constant
+#pragma warning(disable : 4127)  // conditional expression is constant
 #endif
 
 #if defined(__APPLE__)
@@ -57,92 +60,128 @@
 #ifdef MCDBGQ_USE_RELACY
 #include "relacy/relacy_std.hpp"
 #include "relacy_shims.h"
-// We only use malloc/free anyway, and the delete macro messes up `= delete` method declarations.
-// We'll override the default trait malloc ourselves without a macro.
+// We only use malloc/free anyway, and the delete macro messes up `= delete`
+// method declarations. We'll override the default trait malloc ourselves
+// without a macro.
 #undef new
 #undef delete
 #undef malloc
 #undef free
 #else
-#include <atomic>		// Requires C++11. Sorry VS2010.
+#include <atomic>  // Requires C++11. Sorry VS2010.
 #include <cassert>
 #endif
-#include <cstddef>              // for max_align_t
+#include <algorithm>
+#include <array>
+#include <climits>  // for CHAR_BIT
+#include <cstddef>  // for max_align_t
 #include <cstdint>
 #include <cstdlib>
+#include <limits>
+#include <mutex>  // used for thread exit synchronization
+#include <thread>  // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading
 #include <type_traits>
-#include <algorithm>
 #include <utility>
-#include <limits>
-#include <climits>		// for CHAR_BIT
-#include <array>
-#include <thread>		// partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading
-#include <mutex>        // used for thread exit synchronization
-
-// Platform-specific definitions of a numeric thread ID type and an invalid value
-namespace moodycamel { namespace details {
-	template<typename thread_id_t> struct thread_id_converter {
-		typedef thread_id_t thread_id_numeric_size_t;
-		typedef thread_id_t thread_id_hash_t;
-		static thread_id_hash_t prehash(thread_id_t const& x) { return x; }
-	};
-} }
+
+// Platform-specific definitions of a numeric thread ID type and an invalid
+// value
+namespace moodycamel {
+namespace details {
+template <typename thread_id_t>
+struct thread_id_converter {
+  typedef thread_id_t thread_id_numeric_size_t;
+  typedef thread_id_t thread_id_hash_t;
+  static thread_id_hash_t prehash(thread_id_t const &x) {
+    return x;
+  }
+};
+}  // namespace details
+}  // namespace moodycamel
 #if defined(MCDBGQ_USE_RELACY)
-namespace moodycamel { namespace details {
-	typedef std::uint32_t thread_id_t;
-	static const thread_id_t invalid_thread_id  = 0xFFFFFFFFU;
-	static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU;
-	static inline thread_id_t thread_id() { return rl::thread_index(); }
-} }
+namespace moodycamel {
+namespace details {
+typedef std::uint32_t thread_id_t;
+static const thread_id_t invalid_thread_id = 0xFFFFFFFFU;
+static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU;
+static inline thread_id_t thread_id() {
+  return rl::thread_index();
+}
+}  // namespace details
+}  // namespace moodycamel
 #elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__)
-// No sense pulling in windows.h in a header, we'll manually declare the function
-// we use and rely on backwards-compatibility for this not to break
-extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void);
-namespace moodycamel { namespace details {
-	static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows");
-	typedef std::uint32_t thread_id_t;
-	static const thread_id_t invalid_thread_id  = 0;			// See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx
-	static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU;	// Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4.
-	static inline thread_id_t thread_id() { return static_cast<thread_id_t>(::GetCurrentThreadId()); }
-} }
-#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) || defined(__MVS__) || defined(MOODYCAMEL_NO_THREAD_LOCAL)
-namespace moodycamel { namespace details {
-	static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, "std::thread::id is expected to be either 4 or 8 bytes");
-	
-	typedef std::thread::id thread_id_t;
-	static const thread_id_t invalid_thread_id;         // Default ctor creates invalid ID
-
-	// Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's
-	// only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't
-	// be.
-	static inline thread_id_t thread_id() { return std::this_thread::get_id(); }
-
-	template<std::size_t> struct thread_id_size { };
-	template<> struct thread_id_size<4> { typedef std::uint32_t numeric_t; };
-	template<> struct thread_id_size<8> { typedef std::uint64_t numeric_t; };
-
-	template<> struct thread_id_converter<thread_id_t> {
-		typedef thread_id_size<sizeof(thread_id_t)>::numeric_t thread_id_numeric_size_t;
+// No sense pulling in windows.h in a header, we'll manually declare the
+// function we use and rely on backwards-compatibility for this not to break
+extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(
+    void);
+namespace moodycamel {
+namespace details {
+static_assert(sizeof(unsigned long) == sizeof(std::uint32_t),
+              "Expected size of unsigned long to be 32 bits on Windows");
+typedef std::uint32_t thread_id_t;
+static const thread_id_t invalid_thread_id =
+    0;  // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx
+static const thread_id_t invalid_thread_id2 =
+    0xFFFFFFFFU;  // Not technically guaranteed to be invalid, but is never used
+                  // in practice. Note that all Win32 thread IDs are presently
+                  // multiples of 4.
+static inline thread_id_t thread_id() {
+  return static_cast<thread_id_t>(::GetCurrentThreadId());
+}
+}  // namespace details
+}  // namespace moodycamel
+#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \
+    (defined(__APPLE__) && TARGET_OS_IPHONE) || defined(__MVS__) ||  \
+    defined(MOODYCAMEL_NO_THREAD_LOCAL)
+namespace moodycamel {
+namespace details {
+static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8,
+              "std::thread::id is expected to be either 4 or 8 bytes");
+
+typedef std::thread::id thread_id_t;
+static const thread_id_t invalid_thread_id;  // Default ctor creates invalid ID
+
+// Note we don't define a invalid_thread_id2 since std::thread::id doesn't have
+// one; it's only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined
+// anyway, which it won't be.
+static inline thread_id_t thread_id() {
+  return std::this_thread::get_id();
+}
+
+template <std::size_t>
+struct thread_id_size {};
+template <>
+struct thread_id_size<4> {
+  typedef std::uint32_t numeric_t;
+};
+template <>
+struct thread_id_size<8> {
+  typedef std::uint64_t numeric_t;
+};
+
+template <>
+struct thread_id_converter<thread_id_t> {
+  typedef thread_id_size<sizeof(thread_id_t)>::numeric_t
+      thread_id_numeric_size_t;
 #ifndef __APPLE__
-		typedef std::size_t thread_id_hash_t;
+  typedef std::size_t thread_id_hash_t;
 #else
-		typedef thread_id_numeric_size_t thread_id_hash_t;
+  typedef thread_id_numeric_size_t thread_id_hash_t;
 #endif
 
-		static thread_id_hash_t prehash(thread_id_t const& x)
-		{
+  static thread_id_hash_t prehash(thread_id_t const &x) {
 #ifndef __APPLE__
-			return std::hash<std::thread::id>()(x);
+    return std::hash<std::thread::id>()(x);
 #else
-			return *reinterpret_cast<thread_id_hash_t const*>(&x);
+    return *reinterpret_cast<thread_id_hash_t const *>(&x);
 #endif
-		}
-	};
-} }
+  }
+};
+}
+}
 #else
 // Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475
-// In order to get a numeric thread ID in a platform-independent way, we use a thread-local
-// static variable's address as a thread identifier :-)
+// In order to get a numeric thread ID in a platform-independent way, we use a
+// thread-local static variable's address as a thread identifier :-)
 #if defined(__GNUC__) || defined(__INTEL_COMPILER)
 #define MOODYCAMEL_THREADLOCAL __thread
 #elif defined(_MSC_VER)
@@ -151,17 +190,25 @@ namespace moodycamel { namespace details {
 // Assume C++11 compliant compiler
 #define MOODYCAMEL_THREADLOCAL thread_local
 #endif
-namespace moodycamel { namespace details {
-	typedef std::uintptr_t thread_id_t;
-	static const thread_id_t invalid_thread_id  = 0;		// Address can't be nullptr
-	static const thread_id_t invalid_thread_id2 = 1;		// Member accesses off a null pointer are also generally invalid. Plus it's not aligned.
-	inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast<thread_id_t>(&x); }
-} }
+namespace moodycamel {
+namespace details {
+typedef std::uintptr_t thread_id_t;
+static const thread_id_t invalid_thread_id = 0;  // Address can't be nullptr
+static const thread_id_t invalid_thread_id2 =
+    1;  // Member accesses off a null pointer are also generally invalid. Plus
+        // it's not aligned.
+inline thread_id_t thread_id() {
+  static MOODYCAMEL_THREADLOCAL int x;
+  return reinterpret_cast<thread_id_t>(&x);
+}
+}
+}
 #endif
 
 // Constexpr if
 #ifndef MOODYCAMEL_CONSTEXPR_IF
-#if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || __cplusplus > 201402L
+#if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || \
+    __cplusplus > 201402L
 #define MOODYCAMEL_CONSTEXPR_IF if constexpr
 #define MOODYCAMEL_MAYBE_UNUSED [[maybe_unused]]
 #else
@@ -172,18 +219,20 @@ namespace moodycamel { namespace details {
 
 // Exceptions
 #ifndef MOODYCAMEL_EXCEPTIONS_ENABLED
-#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__))
+#if (defined(_MSC_VER) && defined(_CPPUNWIND)) ||   \
+    (defined(__GNUC__) && defined(__EXCEPTIONS)) || \
+    (!defined(_MSC_VER) && !defined(__GNUC__))
 #define MOODYCAMEL_EXCEPTIONS_ENABLED
 #endif
 #endif
 #ifdef MOODYCAMEL_EXCEPTIONS_ENABLED
 #define MOODYCAMEL_TRY try
-#define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__)
+#define MOODYCAMEL_CATCH(...) catch (__VA_ARGS__)
 #define MOODYCAMEL_RETHROW throw
-#define MOODYCAMEL_THROW(expr) throw (expr)
+#define MOODYCAMEL_THROW(expr) throw(expr)
 #else
-#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF (true)
-#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF (false)
+#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF(true)
+#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF(false)
 #define MOODYCAMEL_RETHROW
 #define MOODYCAMEL_THROW(expr)
 #endif
@@ -194,15 +243,40 @@ namespace moodycamel { namespace details {
 #define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true
 #define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true
 #elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800
-// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-(
-// We have to assume *all* non-trivial constructors may throw on VS2012!
+// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when
+// it shouldn't :-( We have to assume *all* non-trivial constructors may throw
+// on VS2012!
 #define MOODYCAMEL_NOEXCEPT _NOEXCEPT
-#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference<valueType>::value && std::is_move_constructible<type>::value ? std::is_trivially_move_constructible<type>::value : std::is_trivially_copy_constructible<type>::value)
-#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference<valueType>::value && std::is_move_assignable<type>::value ? std::is_trivially_move_assignable<type>::value || std::is_nothrow_move_assignable<type>::value : std::is_trivially_copy_assignable<type>::value || std::is_nothrow_copy_assignable<type>::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr))
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)    \
+  (std::is_rvalue_reference<valueType>::value &&           \
+           std::is_move_constructible<type>::value         \
+       ? std::is_trivially_move_constructible<type>::value \
+       : std::is_trivially_copy_constructible<type>::value)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr)      \
+  ((std::is_rvalue_reference<valueType>::value &&              \
+            std::is_move_assignable<type>::value               \
+        ? std::is_trivially_move_assignable<type>::value ||    \
+              std::is_nothrow_move_assignable<type>::value     \
+        : std::is_trivially_copy_assignable<type>::value ||    \
+              std::is_nothrow_copy_assignable<type>::value) && \
+   MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr))
 #elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900
 #define MOODYCAMEL_NOEXCEPT _NOEXCEPT
-#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference<valueType>::value && std::is_move_constructible<type>::value ? std::is_trivially_move_constructible<type>::value || std::is_nothrow_move_constructible<type>::value : std::is_trivially_copy_constructible<type>::value || std::is_nothrow_copy_constructible<type>::value)
-#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference<valueType>::value && std::is_move_assignable<type>::value ? std::is_trivially_move_assignable<type>::value || std::is_nothrow_move_assignable<type>::value : std::is_trivially_copy_assignable<type>::value || std::is_nothrow_copy_assignable<type>::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr))
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)       \
+  (std::is_rvalue_reference<valueType>::value &&              \
+           std::is_move_constructible<type>::value            \
+       ? std::is_trivially_move_constructible<type>::value || \
+             std::is_nothrow_move_constructible<type>::value  \
+       : std::is_trivially_copy_constructible<type>::value || \
+             std::is_nothrow_copy_constructible<type>::value)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr)      \
+  ((std::is_rvalue_reference<valueType>::value &&              \
+            std::is_move_assignable<type>::value               \
+        ? std::is_trivially_move_assignable<type>::value ||    \
+              std::is_nothrow_move_assignable<type>::value     \
+        : std::is_trivially_copy_assignable<type>::value ||    \
+              std::is_nothrow_copy_assignable<type>::value) && \
+   MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr))
 #else
 #define MOODYCAMEL_NOEXCEPT noexcept
 #define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr)
@@ -214,18 +288,31 @@ namespace moodycamel { namespace details {
 #ifdef MCDBGQ_USE_RELACY
 #define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
 #else
-// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445
-// g++ <=4.7 doesn't support thread_local either.
-// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work
-#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(__MVS__)
-// Assume `thread_local` is fully supported in all other C++11 compilers/platforms
-#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED    // tentatively enabled for now; years ago several users report having problems with it on
+// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a
+// crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445
+// g++ <=4.7 doesn't support thread_local either.
+// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to
+// compile but it's unconfirmed to actually work
+#if (!defined(_MSC_VER) || _MSC_VER >= 1900) &&                        \
+    (!defined(__MINGW32__) && !defined(__MINGW64__) ||                 \
+     !defined(__WINPTHREADS_VERSION)) &&                               \
+    (!defined(__GNUC__) || __GNUC__ > 4 ||                             \
+     (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) &&                        \
+    (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && \
+    !defined(_M_ARM) && !defined(__aarch64__) && !defined(__MVS__)
+// Assume `thread_local` is fully supported in all other C++11
+// compilers/platforms
+#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED  // tentatively enabled for now;
+                                                 // years ago several users
+                                                 // report having problems with
+                                                 // it on
 #endif
 #endif
 #endif
 
-// VS2012 doesn't support deleted functions. 
-// In this case, we declare the function normally but don't define it. A link error will be generated if the function is called.
+// VS2012 doesn't support deleted functions.
+// In this case, we declare the function normally but don't define it. A link
+// error will be generated if the function is called.
 #ifndef MOODYCAMEL_DELETE_FUNCTION
 #if defined(_MSC_VER) && _MSC_VER < 1800
 #define MOODYCAMEL_DELETE_FUNCTION
@@ -234,54 +321,101 @@ namespace moodycamel { namespace details {
 #endif
 #endif
 
-namespace moodycamel { namespace details {
+namespace moodycamel {
+namespace details {
 #ifndef MOODYCAMEL_ALIGNAS
-// VS2013 doesn't support alignas or alignof, and align() requires a constant literal
+// VS2013 doesn't support alignas or alignof, and align() requires a constant
+// literal
 #if defined(_MSC_VER) && _MSC_VER <= 1800
 #define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment))
 #define MOODYCAMEL_ALIGNOF(obj) __alignof(obj)
-#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) typename details::Vs2013Aligned<std::alignment_of<obj>::value, T>::type
-	template<int Align, typename T> struct Vs2013Aligned { };  // default, unsupported alignment
-	template<typename T> struct Vs2013Aligned<1, T> { typedef __declspec(align(1)) T type; };
-	template<typename T> struct Vs2013Aligned<2, T> { typedef __declspec(align(2)) T type; };
-	template<typename T> struct Vs2013Aligned<4, T> { typedef __declspec(align(4)) T type; };
-	template<typename T> struct Vs2013Aligned<8, T> { typedef __declspec(align(8)) T type; };
-	template<typename T> struct Vs2013Aligned<16, T> { typedef __declspec(align(16)) T type; };
-	template<typename T> struct Vs2013Aligned<32, T> { typedef __declspec(align(32)) T type; };
-	template<typename T> struct Vs2013Aligned<64, T> { typedef __declspec(align(64)) T type; };
-	template<typename T> struct Vs2013Aligned<128, T> { typedef __declspec(align(128)) T type; };
-	template<typename T> struct Vs2013Aligned<256, T> { typedef __declspec(align(256)) T type; };
+#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) \
+  typename details::Vs2013Aligned<std::alignment_of<obj>::value, T>::type
+template <int Align, typename T>
+struct Vs2013Aligned {};  // default, unsupported alignment
+template <typename T>
+struct Vs2013Aligned<1, T> {
+  typedef __declspec(align(1)) T type;
+};
+template <typename T>
+struct Vs2013Aligned<2, T> {
+  typedef __declspec(align(2)) T type;
+};
+template <typename T>
+struct Vs2013Aligned<4, T> {
+  typedef __declspec(align(4)) T type;
+};
+template <typename T>
+struct Vs2013Aligned<8, T> {
+  typedef __declspec(align(8)) T type;
+};
+template <typename T>
+struct Vs2013Aligned<16, T> {
+  typedef __declspec(align(16)) T type;
+};
+template <typename T>
+struct Vs2013Aligned<32, T> {
+  typedef __declspec(align(32)) T type;
+};
+template <typename T>
+struct Vs2013Aligned<64, T> {
+  typedef __declspec(align(64)) T type;
+};
+template <typename T>
+struct Vs2013Aligned<128, T> {
+  typedef __declspec(align(128)) T type;
+};
+template <typename T>
+struct Vs2013Aligned<256, T> {
+  typedef __declspec(align(256)) T type;
+};
 #else
-	template<typename T> struct identity { typedef T type; };
+template <typename T>
+struct identity {
+  typedef T type;
+};
 #define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment)
 #define MOODYCAMEL_ALIGNOF(obj) alignof(obj)
-#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) alignas(alignof(obj)) typename details::identity<T>::type
+#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) \
+  alignas(alignof(obj)) typename details::identity<T>::type
 #endif
 #endif
-} }
+}  // namespace details
+}  // namespace moodycamel
 
 
-// TSAN can false report races in lock-free code.  To enable TSAN to be used from projects that use this one,
-// we can apply per-function compile-time suppression.
-// See https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer
+// TSAN can falsely report races in lock-free code.  To enable TSAN to be used
+// from projects that use this one, we can apply per-function compile-time
+// suppression. See
+// https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer
 #define MOODYCAMEL_NO_TSAN
 #if defined(__has_feature)
- #if __has_feature(thread_sanitizer)
-  #undef MOODYCAMEL_NO_TSAN
-  #define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread")))
- #endif // TSAN
-#endif // TSAN
+#if __has_feature(thread_sanitizer)
+#undef MOODYCAMEL_NO_TSAN
+#define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread")))
+#endif  // TSAN
+#endif  // TSAN
 
 // Compiler-specific likely/unlikely hints
-namespace moodycamel { namespace details {
+namespace moodycamel {
+namespace details {
 #if defined(__GNUC__)
-	static inline bool (likely)(bool x) { return __builtin_expect((x), true); }
-	static inline bool (unlikely)(bool x) { return __builtin_expect((x), false); }
+static inline bool(likely)(bool x) {
+  return __builtin_expect((x), true);
+}
+static inline bool(unlikely)(bool x) {
+  return __builtin_expect((x), false);
+}
 #else
-	static inline bool (likely)(bool x) { return x; }
-	static inline bool (unlikely)(bool x) { return x; }
+static inline bool(likely)(bool x) {
+  return x;
+}
+static inline bool(unlikely)(bool x) {
+  return x;
+}
 #endif
-} }
+}  // namespace details
+}  // namespace moodycamel
 
 #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
 #include "internal/concurrentqueue_internal_debug.h"
@@ -289,28 +423,34 @@ namespace moodycamel { namespace details {
 
 namespace moodycamel {
 namespace details {
-	template<typename T>
-	struct const_numeric_max {
-		static_assert(std::is_integral<T>::value, "const_numeric_max can only be used with integers");
-		static const T value = std::numeric_limits<T>::is_signed
-			? (static_cast<T>(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast<T>(1)
-			: static_cast<T>(-1);
-	};
+template <typename T>
+struct const_numeric_max {
+  static_assert(std::is_integral<T>::value,
+                "const_numeric_max can only be used with integers");
+  static const T value =
+      std::numeric_limits<T>::is_signed
+          ? (static_cast<T>(1) << (sizeof(T) * CHAR_BIT - 1)) -
+                static_cast<T>(1)
+          : static_cast<T>(-1);
+};
 
 #if defined(__GLIBCXX__)
-	typedef ::max_align_t std_max_align_t;      // libstdc++ forgot to add it to std:: for a while
+typedef ::max_align_t
+    std_max_align_t;  // libstdc++ forgot to add it to std:: for a while
 #else
-	typedef std::max_align_t std_max_align_t;   // Others (e.g. MSVC) insist it can *only* be accessed via std::
+typedef std::max_align_t std_max_align_t;  // Others (e.g. MSVC) insist it can
+                                           // *only* be accessed via std::
 #endif
 
-	// Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting
-	// 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64.
-	typedef union {
-		std_max_align_t x;
-		long long y;
-		void* z;
-	} max_align_t;
-}
+// Some platforms have incorrectly set max_align_t to a type with <8 bytes
+// alignment even while supporting 8-byte aligned scalar values (*cough* 32-bit
+// iOS). Work around this with our own union. See issue #64.
+typedef union {
+  std_max_align_t x;
+  long long y;
+  void *z;
+} max_align_t;
+}  // namespace details
 
 // Default traits for the ConcurrentQueue. To change some of the
 // traits without re-implementing all of them, inherit from this
@@ -318,95 +458,117 @@ namespace details {
 // since the traits are used as a template type parameter, the
 // shadowed declarations will be used where defined, and the defaults
 // otherwise.
-struct ConcurrentQueueDefaultTraits
-{
-	// General-purpose size type. std::size_t is strongly recommended.
-	typedef std::size_t size_t;
-	
-	// The type used for the enqueue and dequeue indices. Must be at least as
-	// large as size_t. Should be significantly larger than the number of elements
-	// you expect to hold at once, especially if you have a high turnover rate;
-	// for example, on 32-bit x86, if you expect to have over a hundred million
-	// elements or pump several million elements through your queue in a very
-	// short space of time, using a 32-bit type *may* trigger a race condition.
-	// A 64-bit int type is recommended in that case, and in practice will
-	// prevent a race condition no matter the usage of the queue. Note that
-	// whether the queue is lock-free with a 64-int type depends on the whether
-	// std::atomic<std::uint64_t> is lock-free, which is platform-specific.
-	typedef std::size_t index_t;
-	
-	// Internally, all elements are enqueued and dequeued from multi-element
-	// blocks; this is the smallest controllable unit. If you expect few elements
-	// but many producers, a smaller block size should be favoured. For few producers
-	// and/or many elements, a larger block size is preferred. A sane default
-	// is provided. Must be a power of 2.
-	static const size_t BLOCK_SIZE = 32;
-	
-	// For explicit producers (i.e. when using a producer token), the block is
-	// checked for being empty by iterating through a list of flags, one per element.
-	// For large block sizes, this is too inefficient, and switching to an atomic
-	// counter-based approach is faster. The switch is made for block sizes strictly
-	// larger than this threshold.
-	static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32;
-	
-	// How many full blocks can be expected for a single explicit producer? This should
-	// reflect that number's maximum for optimal performance. Must be a power of 2.
-	static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32;
-	
-	// How many full blocks can be expected for a single implicit producer? This should
-	// reflect that number's maximum for optimal performance. Must be a power of 2.
-	static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32;
-	
-	// The initial size of the hash table mapping thread IDs to implicit producers.
-	// Note that the hash is resized every time it becomes half full.
-	// Must be a power of two, and either 0 or at least 1. If 0, implicit production
-	// (using the enqueue methods without an explicit producer token) is disabled.
-	static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32;
-	
-	// Controls the number of items that an explicit consumer (i.e. one with a token)
-	// must consume before it causes all consumers to rotate and move on to the next
-	// internal queue.
-	static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256;
-	
-	// The maximum number of elements (inclusive) that can be enqueued to a sub-queue.
-	// Enqueue operations that would cause this limit to be surpassed will fail. Note
-	// that this limit is enforced at the block level (for performance reasons), i.e.
-	// it's rounded up to the nearest block size.
-	static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max<size_t>::value;
-
-	// The number of times to spin before sleeping when waiting on a semaphore.
-	// Recommended values are on the order of 1000-10000 unless the number of
-	// consumer threads exceeds the number of idle cores (in which case try 0-100).
-	// Only affects instances of the BlockingConcurrentQueue.
-	static const int MAX_SEMA_SPINS = 10000;
-
-	// Whether to recycle dynamically-allocated blocks into an internal free list or
-	// not. If false, only pre-allocated blocks (controlled by the constructor
-	// arguments) will be recycled, and all others will be `free`d back to the heap.
-	// Note that blocks consumed by explicit producers are only freed on destruction
-	// of the queue (not following destruction of the token) regardless of this trait.
-	static const bool RECYCLE_ALLOCATED_BLOCKS = false;
-
-	
+struct ConcurrentQueueDefaultTraits {
+  // General-purpose size type. std::size_t is strongly recommended.
+  typedef std::size_t size_t;
+
+  // The type used for the enqueue and dequeue indices. Must be at least as
+  // large as size_t. Should be significantly larger than the number of elements
+  // you expect to hold at once, especially if you have a high turnover rate;
+  // for example, on 32-bit x86, if you expect to have over a hundred million
+  // elements or pump several million elements through your queue in a very
+  // short space of time, using a 32-bit type *may* trigger a race condition.
+  // A 64-bit int type is recommended in that case, and in practice will
+  // prevent a race condition no matter the usage of the queue. Note that
+  // whether the queue is lock-free with a 64-bit int type depends on whether
+  // std::atomic<std::uint64_t> is lock-free, which is platform-specific.
+  typedef std::size_t index_t;
+
+  // Internally, all elements are enqueued and dequeued from multi-element
+  // blocks; this is the smallest controllable unit. If you expect few elements
+  // but many producers, a smaller block size should be favoured. For few
+  // producers and/or many elements, a larger block size is preferred. A sane
+  // default is provided. Must be a power of 2.
+  static const size_t BLOCK_SIZE = 32;
+
+  // For explicit producers (i.e. when using a producer token), the block is
+  // checked for being empty by iterating through a list of flags, one per
+  // element. For large block sizes, this is too inefficient, and switching to
+  // an atomic counter-based approach is faster. The switch is made for block
+  // sizes strictly larger than this threshold.
+  static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32;
+
+  // How many full blocks can be expected for a single explicit producer? This
+  // should reflect that number's maximum for optimal performance. Must be a
+  // power of 2.
+  static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32;
+
+  // How many full blocks can be expected for a single implicit producer? This
+  // should reflect that number's maximum for optimal performance. Must be a
+  // power of 2.
+  static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32;
+
+  // The initial size of the hash table mapping thread IDs to implicit
+  // producers. Note that the hash is resized every time it becomes half full.
+  // Must be a power of two, and either 0 or at least 1. If 0, implicit
+  // production (using the enqueue methods without an explicit producer token)
+  // is disabled.
+  static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32;
+
+  // Controls the number of items that an explicit consumer (i.e. one with a
+  // token) must consume before it causes all consumers to rotate and move on to
+  // the next internal queue.
+  static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE =
+      256;
+
+  // The maximum number of elements (inclusive) that can be enqueued to a
+  // sub-queue. Enqueue operations that would cause this limit to be surpassed
+  // will fail. Note that this limit is enforced at the block level (for
+  // performance reasons), i.e. it's rounded up to the nearest block size.
+  static const size_t MAX_SUBQUEUE_SIZE =
+      details::const_numeric_max<size_t>::value;
+
+  // The number of times to spin before sleeping when waiting on a semaphore.
+  // Recommended values are on the order of 1000-10000 unless the number of
+  // consumer threads exceeds the number of idle cores (in which case try
+  // 0-100). Only affects instances of the BlockingConcurrentQueue.
+  static const int MAX_SEMA_SPINS = 10000;
+
+  // Whether to recycle dynamically-allocated blocks into an internal free list
+  // or not. If false, only pre-allocated blocks (controlled by the constructor
+  // arguments) will be recycled, and all others will be `free`d back to the
+  // heap. Note that blocks consumed by explicit producers are only freed on
+  // destruction of the queue (not following destruction of the token)
+  // regardless of this trait.
+  static const bool RECYCLE_ALLOCATED_BLOCKS = false;
+
+
 #ifndef MCDBGQ_USE_RELACY
-	// Memory allocation can be customized if needed.
-	// malloc should return nullptr on failure, and handle alignment like std::malloc.
+  // Memory allocation can be customized if needed.
+  // malloc should return nullptr on failure, and handle alignment like
+  // std::malloc.
 #if defined(malloc) || defined(free)
-	// Gah, this is 2015, stop defining macros that break standard code already!
-	// Work around malloc/free being special macros:
-	static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); }
-	static inline void WORKAROUND_free(void* ptr) { return free(ptr); }
-	static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); }
-	static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); }
+  // Gah, this is 2015, stop defining macros that break standard code already!
+  // Work around malloc/free being special macros:
+  static inline void *WORKAROUND_malloc(size_t size) {
+    return malloc(size);
+  }
+  static inline void WORKAROUND_free(void *ptr) {
+    return free(ptr);
+  }
+  static inline void *(malloc)(size_t size) {
+    return WORKAROUND_malloc(size);
+  }
+  static inline void(free)(void *ptr) {
+    return WORKAROUND_free(ptr);
+  }
 #else
-	static inline void* malloc(size_t size) { return std::malloc(size); }
-	static inline void free(void* ptr) { return std::free(ptr); }
+  static inline void *malloc(size_t size) {
+    return std::malloc(size);
+  }
+  static inline void free(void *ptr) {
+    return std::free(ptr);
+  }
 #endif
 #else
-	// Debug versions when running under the Relacy race detector (ignore
-	// these in user code)
-	static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); }
-	static inline void free(void* ptr) { return rl::rl_free(ptr, $); }
+  // Debug versions when running under the Relacy race detector (ignore
+  // these in user code)
+  static inline void *malloc(size_t size) {
+    return rl::rl_malloc(size, $);
+  }
+  static inline void free(void *ptr) {
+    return rl::rl_free(ptr, $);
+  }
 #endif
 };
 
@@ -421,3322 +583,3825 @@ struct ConcurrentQueueDefaultTraits
 struct ProducerToken;
 struct ConsumerToken;
 
-template<typename T, typename Traits> class ConcurrentQueue;
-template<typename T, typename Traits> class BlockingConcurrentQueue;
+template <typename T, typename Traits>
+class ConcurrentQueue;
+template <typename T, typename Traits>
+class BlockingConcurrentQueue;
 class ConcurrentQueueTests;
 
 
-namespace details
-{
-	struct ConcurrentQueueProducerTypelessBase
-	{
-		ConcurrentQueueProducerTypelessBase* next;
-		std::atomic<bool> inactive;
-		ProducerToken* token;
-		
-		ConcurrentQueueProducerTypelessBase()
-			: next(nullptr), inactive(false), token(nullptr)
-		{
-		}
-	};
-	
-	template<bool use32> struct _hash_32_or_64 {
-		static inline std::uint32_t hash(std::uint32_t h)
-		{
-			// MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
-			// Since the thread ID is already unique, all we really want to do is propagate that
-			// uniqueness evenly across all the bits, so that we can use a subset of the bits while
-			// reducing collisions significantly
-			h ^= h >> 16;
-			h *= 0x85ebca6b;
-			h ^= h >> 13;
-			h *= 0xc2b2ae35;
-			return h ^ (h >> 16);
-		}
-	};
-	template<> struct _hash_32_or_64<1> {
-		static inline std::uint64_t hash(std::uint64_t h)
-		{
-			h ^= h >> 33;
-			h *= 0xff51afd7ed558ccd;
-			h ^= h >> 33;
-			h *= 0xc4ceb9fe1a85ec53;
-			return h ^ (h >> 33);
-		}
-	};
-	template<std::size_t size> struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> {  };
-	
-	static inline size_t hash_thread_id(thread_id_t id)
-	{
-		static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values");
-		return static_cast<size_t>(hash_32_or_64<sizeof(thread_id_converter<thread_id_t>::thread_id_hash_t)>::hash(
-			thread_id_converter<thread_id_t>::prehash(id)));
-	}
-	
-	template<typename T>
-	static inline bool circular_less_than(T a, T b)
-	{
-		static_assert(std::is_integral<T>::value && !std::numeric_limits<T>::is_signed, "circular_less_than is intended to be used only with unsigned integer types");
-		return static_cast<T>(a - b) > static_cast<T>(static_cast<T>(1) << (static_cast<T>(sizeof(T) * CHAR_BIT - 1)));
-		// Note: extra parens around rhs of operator<< is MSVC bug: https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931
-		//       silencing the bug requires #pragma warning(disable: 4554) around the calling code and has no effect when done here.
-	}
-	
-	template<typename U>
-	static inline char* align_for(char* ptr)
-	{
-		const std::size_t alignment = std::alignment_of<U>::value;
-		return ptr + (alignment - (reinterpret_cast<std::uintptr_t>(ptr) % alignment)) % alignment;
-	}
-
-	template<typename T>
-	static inline T ceil_to_pow_2(T x)
-	{
-		static_assert(std::is_integral<T>::value && !std::numeric_limits<T>::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types");
-
-		// Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
-		--x;
-		x |= x >> 1;
-		x |= x >> 2;
-		x |= x >> 4;
-		for (std::size_t i = 1; i < sizeof(T); i <<= 1) {
-			x |= x >> (i << 3);
-		}
-		++x;
-		return x;
-	}
-	
-	template<typename T>
-	static inline void swap_relaxed(std::atomic<T>& left, std::atomic<T>& right)
-	{
-		T temp = left.load(std::memory_order_relaxed);
-		left.store(right.load(std::memory_order_relaxed), std::memory_order_relaxed);
-		right.store(temp, std::memory_order_relaxed);
-	}
-	
-	template<typename T>
-	static inline T const& nomove(T const& x)
-	{
-		return x;
-	}
-	
-	template<bool Enable>
-	struct nomove_if
-	{
-		template<typename T>
-		static inline T const& eval(T const& x)
-		{
-			return x;
-		}
-	};
-	
-	template<>
-	struct nomove_if<false>
-	{
-		template<typename U>
-		static inline auto eval(U&& x)
-			-> decltype(std::forward<U>(x))
-		{
-			return std::forward<U>(x);
-		}
-	};
-	
-	template<typename It>
-	static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it)
-	{
-		return *it;
-	}
-	
-#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
-	template<typename T> struct is_trivially_destructible : std::is_trivially_destructible<T> { };
+namespace details {
+struct ConcurrentQueueProducerTypelessBase {
+  ConcurrentQueueProducerTypelessBase *next;
+  std::atomic<bool> inactive;
+  ProducerToken *token;
+
+  ConcurrentQueueProducerTypelessBase()
+      : next(nullptr), inactive(false), token(nullptr) {}
+};
+
+template <bool use32>
+struct _hash_32_or_64 {
+  static inline std::uint32_t hash(std::uint32_t h) {
+    // MurmurHash3 finalizer -- see
+    // https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
+    // Since the thread ID is already unique, all we really want to do is
+    // propagate that uniqueness evenly across all the bits, so that we can use
+    // a subset of the bits while reducing collisions significantly
+    h ^= h >> 16;
+    h *= 0x85ebca6b;
+    h ^= h >> 13;
+    h *= 0xc2b2ae35;
+    return h ^ (h >> 16);
+  }
+};
+template <>
+struct _hash_32_or_64<1> {
+  static inline std::uint64_t hash(std::uint64_t h) {
+    h ^= h >> 33;
+    h *= 0xff51afd7ed558ccd;
+    h ^= h >> 33;
+    h *= 0xc4ceb9fe1a85ec53;
+    return h ^ (h >> 33);
+  }
+};
+template <std::size_t size>
+struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> {};
+
+static inline size_t hash_thread_id(thread_id_t id) {
+  static_assert(
+      sizeof(thread_id_t) <= 8,
+      "Expected a platform where thread IDs are at most 64-bit values");
+  return static_cast<size_t>(
+      hash_32_or_64<sizeof(
+          thread_id_converter<thread_id_t>::thread_id_hash_t)>::
+          hash(thread_id_converter<thread_id_t>::prehash(id)));
+}
+
+template <typename T>
+static inline bool circular_less_than(T a, T b) {
+  static_assert(
+      std::is_integral<T>::value && !std::numeric_limits<T>::is_signed,
+      "circular_less_than is intended to be used only with unsigned integer "
+      "types");
+  return static_cast<T>(a - b) >
+         static_cast<T>(static_cast<T>(1)
+                        << (static_cast<T>(sizeof(T) * CHAR_BIT - 1)));
+  // Note: the extra parens around the rhs of operator<< are due to an MSVC bug:
+  // https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931
+  //       Silencing that warning requires #pragma warning(disable: 4554) around
+  //       the calling code; the pragma has no effect when placed here.
+}
+
+template <typename U>
+static inline char *align_for(char *ptr) {
+  const std::size_t alignment = std::alignment_of<U>::value;
+  return ptr +
+         (alignment - (reinterpret_cast<std::uintptr_t>(ptr) % alignment)) %
+             alignment;
+}
+
+template <typename T>
+static inline T ceil_to_pow_2(T x) {
+  static_assert(
+      std::is_integral<T>::value && !std::numeric_limits<T>::is_signed,
+      "ceil_to_pow_2 is intended to be used only with unsigned integer types");
+
+  // Adapted from
+  // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+  --x;
+  x |= x >> 1;
+  x |= x >> 2;
+  x |= x >> 4;
+  for (std::size_t i = 1; i < sizeof(T); i <<= 1) {
+    x |= x >> (i << 3);
+  }
+  ++x;
+  return x;
+}
+
+template <typename T>
+static inline void swap_relaxed(std::atomic<T> &left, std::atomic<T> &right) {
+  T temp = left.load(std::memory_order_relaxed);
+  left.store(right.load(std::memory_order_relaxed), std::memory_order_relaxed);
+  right.store(temp, std::memory_order_relaxed);
+}
+
+template <typename T>
+static inline T const &nomove(T const &x) {
+  return x;
+}
+
+template <bool Enable>
+struct nomove_if {
+  template <typename T>
+  static inline T const &eval(T const &x) {
+    return x;
+  }
+};
+
+template <>
+struct nomove_if<false> {
+  template <typename U>
+  static inline auto eval(U &&x) -> decltype(std::forward<U>(x)) {
+    return std::forward<U>(x);
+  }
+};
+
+template <typename It>
+static inline auto deref_noexcept(It &it) MOODYCAMEL_NOEXCEPT -> decltype(*it) {
+  return *it;
+}
+
+#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || \
+    (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+template <typename T>
+struct is_trivially_destructible : std::is_trivially_destructible<T> {};
 #else
-	template<typename T> struct is_trivially_destructible : std::has_trivial_destructor<T> { };
+template <typename T>
+struct is_trivially_destructible : std::has_trivial_destructor<T> {};
 #endif
-	
+
 #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
 #ifdef MCDBGQ_USE_RELACY
-	typedef RelacyThreadExitListener ThreadExitListener;
-	typedef RelacyThreadExitNotifier ThreadExitNotifier;
+typedef RelacyThreadExitListener ThreadExitListener;
+typedef RelacyThreadExitNotifier ThreadExitNotifier;
 #else
-	class ThreadExitNotifier;
-
-	struct ThreadExitListener
-	{
-		typedef void (*callback_t)(void*);
-		callback_t callback;
-		void* userData;
-		
-		ThreadExitListener* next;		// reserved for use by the ThreadExitNotifier
-		ThreadExitNotifier* chain;		// reserved for use by the ThreadExitNotifier
-	};
-
-	class ThreadExitNotifier
-	{
-	public:
-		static void subscribe(ThreadExitListener* listener)
-		{
-			auto& tlsInst = instance();
-			std::lock_guard<std::mutex> guard(mutex());
-			listener->next = tlsInst.tail;
-			listener->chain = &tlsInst;
-			tlsInst.tail = listener;
-		}
-		
-		static void unsubscribe(ThreadExitListener* listener)
-		{
-			std::lock_guard<std::mutex> guard(mutex());
-			if (!listener->chain) {
-				return;  // race with ~ThreadExitNotifier
-			}
-			auto& tlsInst = *listener->chain;
-			listener->chain = nullptr;
-			ThreadExitListener** prev = &tlsInst.tail;
-			for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) {
-				if (ptr == listener) {
-					*prev = ptr->next;
-					break;
-				}
-				prev = &ptr->next;
-			}
-		}
-		
-	private:
-		ThreadExitNotifier() : tail(nullptr) { }
-		ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION;
-		ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION;
-		
-		~ThreadExitNotifier()
-		{
-			// This thread is about to exit, let everyone know!
-			assert(this == &instance() && "If this assert fails, you likely have a buggy compiler! Change the preprocessor conditions such that MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined.");
-			std::lock_guard<std::mutex> guard(mutex());
-			for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) {
-				ptr->chain = nullptr;
-				ptr->callback(ptr->userData);
-			}
-		}
-		
-		// Thread-local
-		static inline ThreadExitNotifier& instance()
-		{
-			static thread_local ThreadExitNotifier notifier;
-			return notifier;
-		}
-
-		static inline std::mutex& mutex()
-		{
-			// Must be static because the ThreadExitNotifier could be destroyed while unsubscribe is called
-			static std::mutex mutex;
-			return mutex;
-		}
-		
-	private:
-		ThreadExitListener* tail;
-	};
-#endif
-#endif
-	
-	template<typename T> struct static_is_lock_free_num { enum { value = 0 }; };
-	template<> struct static_is_lock_free_num<signed char> { enum { value = ATOMIC_CHAR_LOCK_FREE }; };
-	template<> struct static_is_lock_free_num<short> { enum { value = ATOMIC_SHORT_LOCK_FREE }; };
-	template<> struct static_is_lock_free_num<int> { enum { value = ATOMIC_INT_LOCK_FREE }; };
-	template<> struct static_is_lock_free_num<long> { enum { value = ATOMIC_LONG_LOCK_FREE }; };
-	template<> struct static_is_lock_free_num<long long> { enum { value = ATOMIC_LLONG_LOCK_FREE }; };
-	template<typename T> struct static_is_lock_free : static_is_lock_free_num<typename std::make_signed<T>::type> {  };
-	template<> struct static_is_lock_free<bool> { enum { value = ATOMIC_BOOL_LOCK_FREE }; };
-	template<typename U> struct static_is_lock_free<U*> { enum { value = ATOMIC_POINTER_LOCK_FREE }; };
-}
+class ThreadExitNotifier;
+
+struct ThreadExitListener {
+  typedef void (*callback_t)(void *);
+  callback_t callback;
+  void *userData;
+
+  ThreadExitListener *next;   // reserved for use by the ThreadExitNotifier
+  ThreadExitNotifier *chain;  // reserved for use by the ThreadExitNotifier
+};
 
+class ThreadExitNotifier {
+ public:
+  static void subscribe(ThreadExitListener *listener) {
+    auto &tlsInst = instance();
+    std::lock_guard<std::mutex> guard(mutex());
+    listener->next = tlsInst.tail;
+    listener->chain = &tlsInst;
+    tlsInst.tail = listener;
+  }
 
-struct ProducerToken
-{
-	template<typename T, typename Traits>
-	explicit ProducerToken(ConcurrentQueue<T, Traits>& queue);
-	
-	template<typename T, typename Traits>
-	explicit ProducerToken(BlockingConcurrentQueue<T, Traits>& queue);
-	
-	ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT
-		: producer(other.producer)
-	{
-		other.producer = nullptr;
-		if (producer != nullptr) {
-			producer->token = this;
-		}
-	}
-	
-	inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT
-	{
-		swap(other);
-		return *this;
-	}
-	
-	void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT
-	{
-		std::swap(producer, other.producer);
-		if (producer != nullptr) {
-			producer->token = this;
-		}
-		if (other.producer != nullptr) {
-			other.producer->token = &other;
-		}
-	}
-	
-	// A token is always valid unless:
-	//     1) Memory allocation failed during construction
-	//     2) It was moved via the move constructor
-	//        (Note: assignment does a swap, leaving both potentially valid)
-	//     3) The associated queue was destroyed
-	// Note that if valid() returns true, that only indicates
-	// that the token is valid for use with a specific queue,
-	// but not which one; that's up to the user to track.
-	inline bool valid() const { return producer != nullptr; }
-	
-	~ProducerToken()
-	{
-		if (producer != nullptr) {
-			producer->token = nullptr;
-			producer->inactive.store(true, std::memory_order_release);
-		}
-	}
-	
-	// Disable copying and assignment
-	ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION;
-	ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION;
-	
-private:
-	template<typename T, typename Traits> friend class ConcurrentQueue;
-	friend class ConcurrentQueueTests;
-	
-protected:
-	details::ConcurrentQueueProducerTypelessBase* producer;
+  static void unsubscribe(ThreadExitListener *listener) {
+    std::lock_guard<std::mutex> guard(mutex());
+    if (!listener->chain) {
+      return;  // race with ~ThreadExitNotifier
+    }
+    auto &tlsInst = *listener->chain;
+    listener->chain = nullptr;
+    ThreadExitListener **prev = &tlsInst.tail;
+    for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) {
+      if (ptr == listener) {
+        *prev = ptr->next;
+        break;
+      }
+      prev = &ptr->next;
+    }
+  }
+
+ private:
+  ThreadExitNotifier() : tail(nullptr) {}
+  ThreadExitNotifier(ThreadExitNotifier const &) MOODYCAMEL_DELETE_FUNCTION;
+  ThreadExitNotifier &operator=(ThreadExitNotifier const &)
+      MOODYCAMEL_DELETE_FUNCTION;
+
+  ~ThreadExitNotifier() {
+    // This thread is about to exit, let everyone know!
+    assert(this == &instance() &&
+           "If this assert fails, you likely have a buggy compiler! Change the "
+           "preprocessor conditions such that "
+           "MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined.");
+    std::lock_guard<std::mutex> guard(mutex());
+    for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) {
+      ptr->chain = nullptr;
+      ptr->callback(ptr->userData);
+    }
+  }
+
+  // Thread-local
+  static inline ThreadExitNotifier &instance() {
+    static thread_local ThreadExitNotifier notifier;
+    return notifier;
+  }
+
+  static inline std::mutex &mutex() {
+    // Must be static because the ThreadExitNotifier could be destroyed while
+    // unsubscribe is called
+    static std::mutex mutex;
+    return mutex;
+  }
+
+ private:
+  ThreadExitListener *tail;
+};
+#endif
+#endif
+
+template <typename T>
+struct static_is_lock_free_num {
+  enum { value = 0 };
+};
+template <>
+struct static_is_lock_free_num<signed char> {
+  enum { value = ATOMIC_CHAR_LOCK_FREE };
+};
+template <>
+struct static_is_lock_free_num<short> {
+  enum { value = ATOMIC_SHORT_LOCK_FREE };
+};
+template <>
+struct static_is_lock_free_num<int> {
+  enum { value = ATOMIC_INT_LOCK_FREE };
+};
+template <>
+struct static_is_lock_free_num<long> {
+  enum { value = ATOMIC_LONG_LOCK_FREE };
+};
+template <>
+struct static_is_lock_free_num<long long> {
+  enum { value = ATOMIC_LLONG_LOCK_FREE };
+};
+template <typename T>
+struct static_is_lock_free
+    : static_is_lock_free_num<typename std::make_signed<T>::type> {};
+template <>
+struct static_is_lock_free<bool> {
+  enum { value = ATOMIC_BOOL_LOCK_FREE };
+};
+template <typename U>
+struct static_is_lock_free<U *> {
+  enum { value = ATOMIC_POINTER_LOCK_FREE };
 };
+}  // namespace details
+
+
+struct ProducerToken {
+  template <typename T, typename Traits>
+  explicit ProducerToken(ConcurrentQueue<T, Traits> &queue);
+
+  template <typename T, typename Traits>
+  explicit ProducerToken(BlockingConcurrentQueue<T, Traits> &queue);
+
+  ProducerToken(ProducerToken &&other) MOODYCAMEL_NOEXCEPT
+      : producer(other.producer) {
+    other.producer = nullptr;
+    if (producer != nullptr) {
+      producer->token = this;
+    }
+  }
+
+  inline ProducerToken &operator=(ProducerToken &&other) MOODYCAMEL_NOEXCEPT {
+    swap(other);
+    return *this;
+  }
+
+  void swap(ProducerToken &other) MOODYCAMEL_NOEXCEPT {
+    std::swap(producer, other.producer);
+    if (producer != nullptr) {
+      producer->token = this;
+    }
+    if (other.producer != nullptr) {
+      other.producer->token = &other;
+    }
+  }
+
+  // A token is always valid unless:
+  //     1) Memory allocation failed during construction
+  //     2) It was moved via the move constructor
+  //        (Note: assignment does a swap, leaving both potentially valid)
+  //     3) The associated queue was destroyed
+  // Note that if valid() returns true, that only indicates
+  // that the token is valid for use with a specific queue,
+  // but not which one; that's up to the user to track.
+  inline bool valid() const {
+    return producer != nullptr;
+  }
+
+  ~ProducerToken() {
+    if (producer != nullptr) {
+      producer->token = nullptr;
+      producer->inactive.store(true, std::memory_order_release);
+    }
+  }
+
+  // Disable copying and assignment
+  ProducerToken(ProducerToken const &) MOODYCAMEL_DELETE_FUNCTION;
+  ProducerToken &operator=(ProducerToken const &) MOODYCAMEL_DELETE_FUNCTION;
+
+ private:
+  template <typename T, typename Traits>
+  friend class ConcurrentQueue;
+  friend class ConcurrentQueueTests;
+
+ protected:
+  details::ConcurrentQueueProducerTypelessBase *producer;
+};
+
+
+struct ConsumerToken {
+  template <typename T, typename Traits>
+  explicit ConsumerToken(ConcurrentQueue<T, Traits> &q);
+
+  template <typename T, typename Traits>
+  explicit ConsumerToken(BlockingConcurrentQueue<T, Traits> &q);
+
+  ConsumerToken(ConsumerToken &&other) MOODYCAMEL_NOEXCEPT
+      : initialOffset(other.initialOffset),
+        lastKnownGlobalOffset(other.lastKnownGlobalOffset),
+        itemsConsumedFromCurrent(other.itemsConsumedFromCurrent),
+        currentProducer(other.currentProducer),
+        desiredProducer(other.desiredProducer) {}
 
+  inline ConsumerToken &operator=(ConsumerToken &&other) MOODYCAMEL_NOEXCEPT {
+    swap(other);
+    return *this;
+  }
 
-struct ConsumerToken
-{
-	template<typename T, typename Traits>
-	explicit ConsumerToken(ConcurrentQueue<T, Traits>& q);
-	
-	template<typename T, typename Traits>
-	explicit ConsumerToken(BlockingConcurrentQueue<T, Traits>& q);
-	
-	ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT
-		: initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer)
-	{
-	}
-	
-	inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT
-	{
-		swap(other);
-		return *this;
-	}
-	
-	void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT
-	{
-		std::swap(initialOffset, other.initialOffset);
-		std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset);
-		std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent);
-		std::swap(currentProducer, other.currentProducer);
-		std::swap(desiredProducer, other.desiredProducer);
-	}
-	
-	// Disable copying and assignment
-	ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION;
-	ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION;
-
-private:
-	template<typename T, typename Traits> friend class ConcurrentQueue;
-	friend class ConcurrentQueueTests;
-	
-private: // but shared with ConcurrentQueue
-	std::uint32_t initialOffset;
-	std::uint32_t lastKnownGlobalOffset;
-	std::uint32_t itemsConsumedFromCurrent;
-	details::ConcurrentQueueProducerTypelessBase* currentProducer;
-	details::ConcurrentQueueProducerTypelessBase* desiredProducer;
+  void swap(ConsumerToken &other) MOODYCAMEL_NOEXCEPT {
+    std::swap(initialOffset, other.initialOffset);
+    std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset);
+    std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent);
+    std::swap(currentProducer, other.currentProducer);
+    std::swap(desiredProducer, other.desiredProducer);
+  }
+
+  // Disable copying and assignment
+  ConsumerToken(ConsumerToken const &) MOODYCAMEL_DELETE_FUNCTION;
+  ConsumerToken &operator=(ConsumerToken const &) MOODYCAMEL_DELETE_FUNCTION;
+
+ private:
+  template <typename T, typename Traits>
+  friend class ConcurrentQueue;
+  friend class ConcurrentQueueTests;
+
+ private:  // but shared with ConcurrentQueue
+  std::uint32_t initialOffset;
+  std::uint32_t lastKnownGlobalOffset;
+  std::uint32_t itemsConsumedFromCurrent;
+  details::ConcurrentQueueProducerTypelessBase *currentProducer;
+  details::ConcurrentQueueProducerTypelessBase *desiredProducer;
 };
 
 // Need to forward-declare this swap because it's in a namespace.
-// See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces
-template<typename T, typename Traits>
-inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a, typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT;
-
-
-template<typename T, typename Traits = ConcurrentQueueDefaultTraits>
-class ConcurrentQueue
-{
-public:
-	typedef ::moodycamel::ProducerToken producer_token_t;
-	typedef ::moodycamel::ConsumerToken consumer_token_t;
-	
-	typedef typename Traits::index_t index_t;
-	typedef typename Traits::size_t size_t;
-	
-	static const size_t BLOCK_SIZE = static_cast<size_t>(Traits::BLOCK_SIZE);
-	static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast<size_t>(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD);
-	static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::EXPLICIT_INITIAL_INDEX_SIZE);
-	static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::IMPLICIT_INITIAL_INDEX_SIZE);
-	static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast<size_t>(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE);
-	static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast<std::uint32_t>(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE);
+// See
+// http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces
+template <typename T, typename Traits>
+inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP &a,
+                 typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP &b)
+    MOODYCAMEL_NOEXCEPT;
+
+
+template <typename T, typename Traits = ConcurrentQueueDefaultTraits>
+class ConcurrentQueue {
+ public:
+  typedef ::moodycamel::ProducerToken producer_token_t;
+  typedef ::moodycamel::ConsumerToken consumer_token_t;
+
+  typedef typename Traits::index_t index_t;
+  typedef typename Traits::size_t size_t;
+
+  static const size_t BLOCK_SIZE = static_cast<size_t>(Traits::BLOCK_SIZE);
+  static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD =
+      static_cast<size_t>(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD);
+  static const size_t EXPLICIT_INITIAL_INDEX_SIZE =
+      static_cast<size_t>(Traits::EXPLICIT_INITIAL_INDEX_SIZE);
+  static const size_t IMPLICIT_INITIAL_INDEX_SIZE =
+      static_cast<size_t>(Traits::IMPLICIT_INITIAL_INDEX_SIZE);
+  static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE =
+      static_cast<size_t>(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE);
+  static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE =
+      static_cast<std::uint32_t>(
+          Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE);
 #ifdef _MSC_VER
 #pragma warning(push)
-#pragma warning(disable: 4307)		// + integral constant overflow (that's what the ternary expression is for!)
-#pragma warning(disable: 4309)		// static_cast: Truncation of constant value
+#pragma warning(disable : 4307)  // + integral constant overflow (that's what
+                                 // the ternary expression is for!)
+#pragma warning(disable : 4309)  // static_cast: Truncation of constant value
 #endif
-	static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max<size_t>::value - static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? details::const_numeric_max<size_t>::value : ((static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE);
+  static const size_t MAX_SUBQUEUE_SIZE =
+      (details::const_numeric_max<size_t>::value -
+           static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) <
+       BLOCK_SIZE)
+          ? details::const_numeric_max<size_t>::value
+          : ((static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) +
+              (BLOCK_SIZE - 1)) /
+             BLOCK_SIZE * BLOCK_SIZE);
 #ifdef _MSC_VER
 #pragma warning(pop)
 #endif
 
-	static_assert(!std::numeric_limits<size_t>::is_signed && std::is_integral<size_t>::value, "Traits::size_t must be an unsigned integral type");
-	static_assert(!std::numeric_limits<index_t>::is_signed && std::is_integral<index_t>::value, "Traits::index_t must be an unsigned integral type");
-	static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t");
-	static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)");
-	static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)");
-	static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)");
-	static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)");
-	static_assert((INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2");
-	static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)");
-
-public:
-	// Creates a queue with at least `capacity` element slots; note that the
-	// actual number of elements that can be inserted without additional memory
-	// allocation depends on the number of producers and the block size (e.g. if
-	// the block size is equal to `capacity`, only a single block will be allocated
-	// up-front, which means only a single producer will be able to enqueue elements
-	// without an extra allocation -- blocks aren't shared between producers).
-	// This method is not thread safe -- it is up to the user to ensure that the
-	// queue is fully constructed before it starts being used by other threads (this
-	// includes making the memory effects of construction visible, possibly with a
-	// memory barrier).
-	explicit ConcurrentQueue(size_t capacity = 32 * BLOCK_SIZE)
-		: producerListTail(nullptr),
-		producerCount(0),
-		initialBlockPoolIndex(0),
-		nextExplicitConsumerId(0),
-		globalExplicitConsumerOffset(0)
-	{
-		implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
-		populate_initial_implicit_producer_hash();
-		populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1));
-		
-#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
-		// Track all the producers using a fully-resolved typed list for
-		// each kind; this makes it possible to debug them starting from
-		// the root queue object (otherwise wacky casts are needed that
-		// don't compile in the debugger's expression evaluator).
-		explicitProducers.store(nullptr, std::memory_order_relaxed);
-		implicitProducers.store(nullptr, std::memory_order_relaxed);
-#endif
-	}
-	
-	// Computes the correct amount of pre-allocated blocks for you based
-	// on the minimum number of elements you want available at any given
-	// time, and the maximum concurrent number of each type of producer.
-	ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers)
-		: producerListTail(nullptr),
-		producerCount(0),
-		initialBlockPoolIndex(0),
-		nextExplicitConsumerId(0),
-		globalExplicitConsumerOffset(0)
-	{
-		implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
-		populate_initial_implicit_producer_hash();
-		size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers);
-		populate_initial_block_list(blocks);
-		
-#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
-		explicitProducers.store(nullptr, std::memory_order_relaxed);
-		implicitProducers.store(nullptr, std::memory_order_relaxed);
-#endif
-	}
-	
-	// Note: The queue should not be accessed concurrently while it's
-	// being deleted. It's up to the user to synchronize this.
-	// This method is not thread safe.
-	~ConcurrentQueue()
-	{
-		// Destroy producers
-		auto ptr = producerListTail.load(std::memory_order_relaxed);
-		while (ptr != nullptr) {
-			auto next = ptr->next_prod();
-			if (ptr->token != nullptr) {
-				ptr->token->producer = nullptr;
-			}
-			destroy(ptr);
-			ptr = next;
-		}
-		
-		// Destroy implicit producer hash tables
-		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) {
-			auto hash = implicitProducerHash.load(std::memory_order_relaxed);
-			while (hash != nullptr) {
-				auto prev = hash->prev;
-				if (prev != nullptr) {		// The last hash is part of this object and was not allocated dynamically
-					for (size_t i = 0; i != hash->capacity; ++i) {
-						hash->entries[i].~ImplicitProducerKVP();
-					}
-					hash->~ImplicitProducerHash();
-					(Traits::free)(hash);
-				}
-				hash = prev;
-			}
-		}
-		
-		// Destroy global free list
-		auto block = freeList.head_unsafe();
-		while (block != nullptr) {
-			auto next = block->freeListNext.load(std::memory_order_relaxed);
-			if (block->dynamicallyAllocated) {
-				destroy(block);
-			}
-			block = next;
-		}
-		
-		// Destroy initial free list
-		destroy_array(initialBlockPool, initialBlockPoolSize);
-	}
-
-	// Disable copying and copy assignment
-	ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION;
-	ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION;
-	
-	// Moving is supported, but note that it is *not* a thread-safe operation.
-	// Nobody can use the queue while it's being moved, and the memory effects
-	// of that move must be propagated to other threads before they can use it.
-	// Note: When a queue is moved, its tokens are still valid but can only be
-	// used with the destination queue (i.e. semantically they are moved along
-	// with the queue itself).
-	ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT
-		: producerListTail(other.producerListTail.load(std::memory_order_relaxed)),
-		producerCount(other.producerCount.load(std::memory_order_relaxed)),
-		initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)),
-		initialBlockPool(other.initialBlockPool),
-		initialBlockPoolSize(other.initialBlockPoolSize),
-		freeList(std::move(other.freeList)),
-		nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)),
-		globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed))
-	{
-		// Move the other one into this, and leave the other one as an empty queue
-		implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
-		populate_initial_implicit_producer_hash();
-		swap_implicit_producer_hashes(other);
-		
-		other.producerListTail.store(nullptr, std::memory_order_relaxed);
-		other.producerCount.store(0, std::memory_order_relaxed);
-		other.nextExplicitConsumerId.store(0, std::memory_order_relaxed);
-		other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed);
-		
-#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
-		explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed);
-		other.explicitProducers.store(nullptr, std::memory_order_relaxed);
-		implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed);
-		other.implicitProducers.store(nullptr, std::memory_order_relaxed);
-#endif
-		
-		other.initialBlockPoolIndex.store(0, std::memory_order_relaxed);
-		other.initialBlockPoolSize = 0;
-		other.initialBlockPool = nullptr;
-		
-		reown_producers();
-	}
-	
-	inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT
-	{
-		return swap_internal(other);
-	}
-	
-	// Swaps this queue's state with the other's. Not thread-safe.
-	// Swapping two queues does not invalidate their tokens, however
-	// the tokens that were created for one queue must be used with
-	// only the swapped queue (i.e. the tokens are tied to the
-	// queue's movable state, not the object itself).
-	inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT
-	{
-		swap_internal(other);
-	}
-	
-private:
-	ConcurrentQueue& swap_internal(ConcurrentQueue& other)
-	{
-		if (this == &other) {
-			return *this;
-		}
-		
-		details::swap_relaxed(producerListTail, other.producerListTail);
-		details::swap_relaxed(producerCount, other.producerCount);
-		details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex);
-		std::swap(initialBlockPool, other.initialBlockPool);
-		std::swap(initialBlockPoolSize, other.initialBlockPoolSize);
-		freeList.swap(other.freeList);
-		details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId);
-		details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset);
-		
-		swap_implicit_producer_hashes(other);
-		
-		reown_producers();
-		other.reown_producers();
-		
-#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
-		details::swap_relaxed(explicitProducers, other.explicitProducers);
-		details::swap_relaxed(implicitProducers, other.implicitProducers);
-#endif
-		
-		return *this;
-	}
-	
-public:
-	// Enqueues a single item (by copying it).
-	// Allocates memory if required. Only fails if memory allocation fails (or implicit
-	// production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0,
-	// or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
-	// Thread-safe.
-	inline bool enqueue(T const& item)
-	{
-		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;
-		else return inner_enqueue<CanAlloc>(item);
-	}
-	
-	// Enqueues a single item (by moving it, if possible).
-	// Allocates memory if required. Only fails if memory allocation fails (or implicit
-	// production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0,
-	// or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
-	// Thread-safe.
-	inline bool enqueue(T&& item)
-	{
-		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;
-		else return inner_enqueue<CanAlloc>(std::move(item));
-	}
-	
-	// Enqueues a single item (by copying it) using an explicit producer token.
-	// Allocates memory if required. Only fails if memory allocation fails (or
-	// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
-	// Thread-safe.
-	inline bool enqueue(producer_token_t const& token, T const& item)
-	{
-		return inner_enqueue<CanAlloc>(token, item);
-	}
-	
-	// Enqueues a single item (by moving it, if possible) using an explicit producer token.
-	// Allocates memory if required. Only fails if memory allocation fails (or
-	// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
-	// Thread-safe.
-	inline bool enqueue(producer_token_t const& token, T&& item)
-	{
-		return inner_enqueue<CanAlloc>(token, std::move(item));
-	}
-	
-	// Enqueues several items.
-	// Allocates memory if required. Only fails if memory allocation fails (or
-	// implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE
-	// is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
-	// Note: Use std::make_move_iterator if the elements should be moved instead of copied.
-	// Thread-safe.
-	template<typename It>
-	bool enqueue_bulk(It itemFirst, size_t count)
-	{
-		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;
-		else return inner_enqueue_bulk<CanAlloc>(itemFirst, count);
-	}
-	
-	// Enqueues several items using an explicit producer token.
-	// Allocates memory if required. Only fails if memory allocation fails
-	// (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
-	// Note: Use std::make_move_iterator if the elements should be moved
-	// instead of copied.
-	// Thread-safe.
-	template<typename It>
-	bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
-	{
-		return inner_enqueue_bulk<CanAlloc>(token, itemFirst, count);
-	}
-	
-	// Enqueues a single item (by copying it).
-	// Does not allocate memory. Fails if not enough room to enqueue (or implicit
-	// production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE
-	// is 0).
-	// Thread-safe.
-	inline bool try_enqueue(T const& item)
-	{
-		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;
-		else return inner_enqueue<CannotAlloc>(item);
-	}
-	
-	// Enqueues a single item (by moving it, if possible).
-	// Does not allocate memory (except for one-time implicit producer).
-	// Fails if not enough room to enqueue (or implicit production is
-	// disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
-	// Thread-safe.
-	inline bool try_enqueue(T&& item)
-	{
-		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;
-		else return inner_enqueue<CannotAlloc>(std::move(item));
-	}
-	
-	// Enqueues a single item (by copying it) using an explicit producer token.
-	// Does not allocate memory. Fails if not enough room to enqueue.
-	// Thread-safe.
-	inline bool try_enqueue(producer_token_t const& token, T const& item)
-	{
-		return inner_enqueue<CannotAlloc>(token, item);
-	}
-	
-	// Enqueues a single item (by moving it, if possible) using an explicit producer token.
-	// Does not allocate memory. Fails if not enough room to enqueue.
-	// Thread-safe.
-	inline bool try_enqueue(producer_token_t const& token, T&& item)
-	{
-		return inner_enqueue<CannotAlloc>(token, std::move(item));
-	}
-	
-	// Enqueues several items.
-	// Does not allocate memory (except for one-time implicit producer).
-	// Fails if not enough room to enqueue (or implicit production is
-	// disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
-	// Note: Use std::make_move_iterator if the elements should be moved
-	// instead of copied.
-	// Thread-safe.
-	template<typename It>
-	bool try_enqueue_bulk(It itemFirst, size_t count)
-	{
-		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;
-		else return inner_enqueue_bulk<CannotAlloc>(itemFirst, count);
-	}
-	
-	// Enqueues several items using an explicit producer token.
-	// Does not allocate memory. Fails if not enough room to enqueue.
-	// Note: Use std::make_move_iterator if the elements should be moved
-	// instead of copied.
-	// Thread-safe.
-	template<typename It>
-	bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
-	{
-		return inner_enqueue_bulk<CannotAlloc>(token, itemFirst, count);
-	}
-	
-	
-	
-	// Attempts to dequeue from the queue.
-	// Returns false if all producer streams appeared empty at the time they
-	// were checked (so, the queue is likely but not guaranteed to be empty).
-	// Never allocates. Thread-safe.
-	template<typename U>
-	bool try_dequeue(U& item)
-	{
-		// Instead of simply trying each producer in turn (which could cause needless contention on the first
-		// producer), we score them heuristically.
-		size_t nonEmptyCount = 0;
-		ProducerBase* best = nullptr;
-		size_t bestSize = 0;
-		for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) {
-			auto size = ptr->size_approx();
-			if (size > 0) {
-				if (size > bestSize) {
-					bestSize = size;
-					best = ptr;
-				}
-				++nonEmptyCount;
-			}
-		}
-		
-		// If there was at least one non-empty queue but it appears empty at the time
-		// we try to dequeue from it, we need to make sure every queue's been tried
-		if (nonEmptyCount > 0) {
-			if ((details::likely)(best->dequeue(item))) {
-				return true;
-			}
-			for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
-				if (ptr != best && ptr->dequeue(item)) {
-					return true;
-				}
-			}
-		}
-		return false;
-	}
-	
-	// Attempts to dequeue from the queue.
-	// Returns false if all producer streams appeared empty at the time they
-	// were checked (so, the queue is likely but not guaranteed to be empty).
-	// This differs from the try_dequeue(item) method in that this one does
-	// not attempt to reduce contention by interleaving the order that producer
-	// streams are dequeued from. So, using this method can reduce overall throughput
-	// under contention, but will give more predictable results in single-threaded
-	// consumer scenarios. This is mostly only useful for internal unit tests.
-	// Never allocates. Thread-safe.
-	template<typename U>
-	bool try_dequeue_non_interleaved(U& item)
-	{
-		for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
-			if (ptr->dequeue(item)) {
-				return true;
-			}
-		}
-		return false;
-	}
-	
-	// Attempts to dequeue from the queue using an explicit consumer token.
-	// Returns false if all producer streams appeared empty at the time they
-	// were checked (so, the queue is likely but not guaranteed to be empty).
-	// Never allocates. Thread-safe.
-	template<typename U>
-	bool try_dequeue(consumer_token_t& token, U& item)
-	{
-		// The idea is roughly as follows:
-		// Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less
-		// If you see that the global offset has changed, you must reset your consumption counter and move to your designated place
-		// If there's no items where you're supposed to be, keep moving until you find a producer with some items
-		// If the global offset has not changed but you've run out of items to consume, move over from your current position until you find an producer with something in it
-		
-		if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {
-			if (!update_current_producer_after_rotation(token)) {
-				return false;
-			}
-		}
-		
-		// If there was at least one non-empty queue but it appears empty at the time
-		// we try to dequeue from it, we need to make sure every queue's been tried
-		if (static_cast<ProducerBase*>(token.currentProducer)->dequeue(item)) {
-			if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {
-				globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);
-			}
-			return true;
-		}
-		
-		auto tail = producerListTail.load(std::memory_order_acquire);
-		auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod();
-		if (ptr == nullptr) {
-			ptr = tail;
-		}
-		while (ptr != static_cast<ProducerBase*>(token.currentProducer)) {
-			if (ptr->dequeue(item)) {
-				token.currentProducer = ptr;
-				token.itemsConsumedFromCurrent = 1;
-				return true;
-			}
-			ptr = ptr->next_prod();
-			if (ptr == nullptr) {
-				ptr = tail;
-			}
-		}
-		return false;
-	}
-	
-	// Attempts to dequeue several elements from the queue.
-	// Returns the number of items actually dequeued.
-	// Returns 0 if all producer streams appeared empty at the time they
-	// were checked (so, the queue is likely but not guaranteed to be empty).
-	// Never allocates. Thread-safe.
-	template<typename It>
-	size_t try_dequeue_bulk(It itemFirst, size_t max)
-	{
-		size_t count = 0;
-		for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
-			count += ptr->dequeue_bulk(itemFirst, max - count);
-			if (count == max) {
-				break;
-			}
-		}
-		return count;
-	}
-	
-	// Attempts to dequeue several elements from the queue using an explicit consumer token.
-	// Returns the number of items actually dequeued.
-	// Returns 0 if all producer streams appeared empty at the time they
-	// were checked (so, the queue is likely but not guaranteed to be empty).
-	// Never allocates. Thread-safe.
-	template<typename It>
-	size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max)
-	{
-		if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {
-			if (!update_current_producer_after_rotation(token)) {
-				return 0;
-			}
-		}
-		
-		size_t count = static_cast<ProducerBase*>(token.currentProducer)->dequeue_bulk(itemFirst, max);
-		if (count == max) {
-			if ((token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {
-				globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);
-			}
-			return max;
-		}
-		token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(count);
-		max -= count;
-		
-		auto tail = producerListTail.load(std::memory_order_acquire);
-		auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod();
-		if (ptr == nullptr) {
-			ptr = tail;
-		}
-		while (ptr != static_cast<ProducerBase*>(token.currentProducer)) {
-			auto dequeued = ptr->dequeue_bulk(itemFirst, max);
-			count += dequeued;
-			if (dequeued != 0) {
-				token.currentProducer = ptr;
-				token.itemsConsumedFromCurrent = static_cast<std::uint32_t>(dequeued);
-			}
-			if (dequeued == max) {
-				break;
-			}
-			max -= dequeued;
-			ptr = ptr->next_prod();
-			if (ptr == nullptr) {
-				ptr = tail;
-			}
-		}
-		return count;
-	}
-	
-	
-	
-	// Attempts to dequeue from a specific producer's inner queue.
-	// If you happen to know which producer you want to dequeue from, this
-	// is significantly faster than using the general-case try_dequeue methods.
-	// Returns false if the producer's queue appeared empty at the time it
-	// was checked (so, the queue is likely but not guaranteed to be empty).
-	// Never allocates. Thread-safe.
-	template<typename U>
-	inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item)
-	{
-		return static_cast<ExplicitProducer*>(producer.producer)->dequeue(item);
-	}
-	
-	// Attempts to dequeue several elements from a specific producer's inner queue.
-	// Returns the number of items actually dequeued.
-	// If you happen to know which producer you want to dequeue from, this
-	// is significantly faster than using the general-case try_dequeue methods.
-	// Returns 0 if the producer's queue appeared empty at the time it
-	// was checked (so, the queue is likely but not guaranteed to be empty).
-	// Never allocates. Thread-safe.
-	template<typename It>
-	inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max)
-	{
-		return static_cast<ExplicitProducer*>(producer.producer)->dequeue_bulk(itemFirst, max);
-	}
-	
-	
-	// Returns an estimate of the total number of elements currently in the queue. This
-	// estimate is only accurate if the queue has completely stabilized before it is called
-	// (i.e. all enqueue and dequeue operations have completed and their memory effects are
-	// visible on the calling thread, and no further operations start while this method is
-	// being called).
-	// Thread-safe.
-	size_t size_approx() const
-	{
-		size_t size = 0;
-		for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
-			size += ptr->size_approx();
-		}
-		return size;
-	}
-	
-	
-	// Returns true if the underlying atomic variables used by
-	// the queue are lock-free (they should be on most platforms).
-	// Thread-safe.
-	static constexpr bool is_lock_free()
-	{
-		return
-			details::static_is_lock_free<bool>::value == 2 &&
-			details::static_is_lock_free<size_t>::value == 2 &&
-			details::static_is_lock_free<std::uint32_t>::value == 2 &&
-			details::static_is_lock_free<index_t>::value == 2 &&
-			details::static_is_lock_free<void*>::value == 2 &&
-			details::static_is_lock_free<typename details::thread_id_converter<details::thread_id_t>::thread_id_numeric_size_t>::value == 2;
-	}
-
-
-private:
-	friend struct ProducerToken;
-	friend struct ConsumerToken;
-	struct ExplicitProducer;
-	friend struct ExplicitProducer;
-	struct ImplicitProducer;
-	friend struct ImplicitProducer;
-	friend class ConcurrentQueueTests;
-		
-	enum AllocationMode { CanAlloc, CannotAlloc };
-	
-	
-	///////////////////////////////
-	// Queue methods
-	///////////////////////////////
-	
-	template<AllocationMode canAlloc, typename U>
-	inline bool inner_enqueue(producer_token_t const& token, U&& element)
-	{
-		return static_cast<ExplicitProducer*>(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue<canAlloc>(std::forward<U>(element));
-	}
-	
-	template<AllocationMode canAlloc, typename U>
-	inline bool inner_enqueue(U&& element)
-	{
-		auto producer = get_or_add_implicit_producer();
-		return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue<canAlloc>(std::forward<U>(element));
-	}
-	
-	template<AllocationMode canAlloc, typename It>
-	inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
-	{
-		return static_cast<ExplicitProducer*>(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue_bulk<canAlloc>(itemFirst, count);
-	}
-	
-	template<AllocationMode canAlloc, typename It>
-	inline bool inner_enqueue_bulk(It itemFirst, size_t count)
-	{
-		auto producer = get_or_add_implicit_producer();
-		return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk<canAlloc>(itemFirst, count);
-	}
-	
-	inline bool update_current_producer_after_rotation(consumer_token_t& token)
-	{
-		// Ah, there's been a rotation, figure out where we should be!
-		auto tail = producerListTail.load(std::memory_order_acquire);
-		if (token.desiredProducer == nullptr && tail == nullptr) {
-			return false;
-		}
-		auto prodCount = producerCount.load(std::memory_order_relaxed);
-		auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed);
-		if ((details::unlikely)(token.desiredProducer == nullptr)) {
-			// Aha, first time we're dequeueing anything.
-			// Figure out our local position
-			// Note: offset is from start, not end, but we're traversing from end -- subtract from count first
-			std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount);
-			token.desiredProducer = tail;
-			for (std::uint32_t i = 0; i != offset; ++i) {
-				token.desiredProducer = static_cast<ProducerBase*>(token.desiredProducer)->next_prod();
-				if (token.desiredProducer == nullptr) {
-					token.desiredProducer = tail;
-				}
-			}
-		}
-		
-		std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset;
-		if (delta >= prodCount) {
-			delta = delta % prodCount;
-		}
-		for (std::uint32_t i = 0; i != delta; ++i) {
-			token.desiredProducer = static_cast<ProducerBase*>(token.desiredProducer)->next_prod();
-			if (token.desiredProducer == nullptr) {
-				token.desiredProducer = tail;
-			}
-		}
-		
-		token.lastKnownGlobalOffset = globalOffset;
-		token.currentProducer = token.desiredProducer;
-		token.itemsConsumedFromCurrent = 0;
-		return true;
-	}
-	
-	
-	///////////////////////////
-	// Free list
-	///////////////////////////
-	
-	template <typename N>
-	struct FreeListNode
-	{
-		FreeListNode() : freeListRefs(0), freeListNext(nullptr) { }
-		
-		std::atomic<std::uint32_t> freeListRefs;
-		std::atomic<N*> freeListNext;
-	};
-	
-	// A simple CAS-based lock-free free list. Not the fastest thing in the world under heavy contention, but
-	// simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly
-	// speedy under low contention.
-	template<typename N>		// N must inherit FreeListNode or have the same fields (and initialization of them)
-	struct FreeList
-	{
-		FreeList() : freeListHead(nullptr) { }
-		FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); }
-		void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); }
-		
-		FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION;
-		FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION;
-		
-		inline void add(N* node)
-		{
-#ifdef MCDBGQ_NOLOCKFREE_FREELIST
-			debug::DebugLock lock(mutex);
-#endif		
-			// We know that the should-be-on-freelist bit is 0 at this point, so it's safe to
-			// set it using a fetch_add
-			if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) {
-				// Oh look! We were the last ones referencing this node, and we know
-				// we want to add it to the free list, so let's do it!
-		 		add_knowing_refcount_is_zero(node);
-			}
-		}
-		
-		inline N* try_get()
-		{
-#ifdef MCDBGQ_NOLOCKFREE_FREELIST
-			debug::DebugLock lock(mutex);
-#endif		
-			auto head = freeListHead.load(std::memory_order_acquire);
-			while (head != nullptr) {
-				auto prevHead = head;
-				auto refs = head->freeListRefs.load(std::memory_order_relaxed);
-				if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire)) {
-					head = freeListHead.load(std::memory_order_acquire);
-					continue;
-				}
-				
-				// Good, reference count has been incremented (it wasn't at zero), which means we can read the
-				// next and not worry about it changing between now and the time we do the CAS
-				auto next = head->freeListNext.load(std::memory_order_relaxed);
-				if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) {
-					// Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no
-					// matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on).
-					assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0);
-					
-					// Decrease refcount twice, once for our ref, and once for the list's ref
-					head->freeListRefs.fetch_sub(2, std::memory_order_release);
-					return head;
-				}
-				
-				// OK, the head must have changed on us, but we still need to decrease the refcount we increased.
-				// Note that we don't need to release any memory effects, but we do need to ensure that the reference
-				// count decrement happens-after the CAS on the head.
-				refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel);
-				if (refs == SHOULD_BE_ON_FREELIST + 1) {
-					add_knowing_refcount_is_zero(prevHead);
-				}
-			}
-			
-			return nullptr;
-		}
-		
-		// Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes)
-		N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); }
-		
-	private:
-		inline void add_knowing_refcount_is_zero(N* node)
-		{
-			// Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run
-			// only one copy of this method per node at a time, i.e. the single thread case), then we know
-			// we can safely change the next pointer of the node; however, once the refcount is back above
-			// zero, then other threads could increase it (happens under heavy contention, when the refcount
-			// goes to zero in between a load and a refcount increment of a node in try_get, then back up to
-			// something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS
-			// to add the node to the actual list fails, decrease the refcount and leave the add operation to
-			// the next thread who puts the refcount back at zero (which could be us, hence the loop).
-			auto head = freeListHead.load(std::memory_order_relaxed);
-			while (true) {
-				node->freeListNext.store(head, std::memory_order_relaxed);
-				node->freeListRefs.store(1, std::memory_order_release);
-				if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) {
-					// Hmm, the add failed, but we can only try again when the refcount goes back to zero
-					if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_acq_rel) == 1) {
-						continue;
-					}
-				}
-				return;
-			}
-		}
-		
-	private:
-		// Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention)
-		std::atomic<N*> freeListHead;
-	
-	static const std::uint32_t REFS_MASK = 0x7FFFFFFF;
-	static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000;
-		
-#ifdef MCDBGQ_NOLOCKFREE_FREELIST
-		debug::DebugMutex mutex;
-#endif
-	};
-	
-	
-	///////////////////////////
-	// Block
-	///////////////////////////
-	
-	enum InnerQueueContext { implicit_context = 0, explicit_context = 1 };
-	
-	struct Block
-	{
-		Block()
-			: next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), dynamicallyAllocated(true)
-		{
-#ifdef MCDBGQ_TRACKMEM
-			owner = nullptr;
-#endif
-		}
-		
-		template<InnerQueueContext context>
-		inline bool is_empty() const
-		{
-			MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
-				// Check flags
-				for (size_t i = 0; i < BLOCK_SIZE; ++i) {
-					if (!emptyFlags[i].load(std::memory_order_relaxed)) {
-						return false;
-					}
-				}
-				
-				// Aha, empty; make sure we have all other memory effects that happened before the empty flags were set
-				std::atomic_thread_fence(std::memory_order_acquire);
-				return true;
-			}
-			else {
-				// Check counter
-				if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) {
-					std::atomic_thread_fence(std::memory_order_acquire);
-					return true;
-				}
-				assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE);
-				return false;
-			}
-		}
-		
-		// Returns true if the block is now empty (does not apply in explicit context)
-		template<InnerQueueContext context>
-		inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i)
-		{
-			MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
-				// Set flag
-				assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1))].load(std::memory_order_relaxed));
-				emptyFlags[BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1))].store(true, std::memory_order_release);
-				return false;
-			}
-			else {
-				// Increment counter
-				auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_acq_rel);
-				assert(prevVal < BLOCK_SIZE);
-				return prevVal == BLOCK_SIZE - 1;
-			}
-		}
-		
-		// Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0).
-		// Returns true if the block is now empty (does not apply in explicit context).
-		template<InnerQueueContext context>
-		inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, size_t count)
-		{
-			MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
-				// Set flags
-				std::atomic_thread_fence(std::memory_order_release);
-				i = BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1)) - count + 1;
-				for (size_t j = 0; j != count; ++j) {
-					assert(!emptyFlags[i + j].load(std::memory_order_relaxed));
-					emptyFlags[i + j].store(true, std::memory_order_relaxed);
-				}
-				return false;
-			}
-			else {
-				// Increment counter
-				auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_acq_rel);
-				assert(prevVal + count <= BLOCK_SIZE);
-				return prevVal + count == BLOCK_SIZE;
-			}
-		}
-		
-		template<InnerQueueContext context>
-		inline void set_all_empty()
-		{
-			MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
-				// Set all flags
-				for (size_t i = 0; i != BLOCK_SIZE; ++i) {
-					emptyFlags[i].store(true, std::memory_order_relaxed);
-				}
-			}
-			else {
-				// Reset counter
-				elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed);
-			}
-		}
-		
-		template<InnerQueueContext context>
-		inline void reset_empty()
-		{
-			MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
-				// Reset flags
-				for (size_t i = 0; i != BLOCK_SIZE; ++i) {
-					emptyFlags[i].store(false, std::memory_order_relaxed);
-				}
-			}
-			else {
-				// Reset counter
-				elementsCompletelyDequeued.store(0, std::memory_order_relaxed);
-			}
-		}
-		
-		inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast<T*>(static_cast<void*>(elements)) + static_cast<size_t>(idx & static_cast<index_t>(BLOCK_SIZE - 1)); }
-		inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast<T const*>(static_cast<void const*>(elements)) + static_cast<size_t>(idx & static_cast<index_t>(BLOCK_SIZE - 1)); }
-		
-	private:
-		static_assert(std::alignment_of<T>::value <= sizeof(T), "The queue does not support types with an alignment greater than their size at this time");
-		MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements;
-	public:
-		Block* next;
-		std::atomic<size_t> elementsCompletelyDequeued;
-		std::atomic<bool> emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? BLOCK_SIZE : 1];
-	public:
-		std::atomic<std::uint32_t> freeListRefs;
-		std::atomic<Block*> freeListNext;
-		bool dynamicallyAllocated;		// Perhaps a better name for this would be 'isNotPartOfInitialBlockPool'
-		
-#ifdef MCDBGQ_TRACKMEM
-		void* owner;
-#endif
-	};
-	static_assert(std::alignment_of<Block>::value >= std::alignment_of<T>::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping");
+  static_assert(!std::numeric_limits<size_t>::is_signed &&
+                    std::is_integral<size_t>::value,
+                "Traits::size_t must be an unsigned integral type");
+  static_assert(!std::numeric_limits<index_t>::is_signed &&
+                    std::is_integral<index_t>::value,
+                "Traits::index_t must be an unsigned integral type");
+  static_assert(sizeof(index_t) >= sizeof(size_t),
+                "Traits::index_t must be at least as wide as Traits::size_t");
+  static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)),
+                "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)");
+  static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) &&
+                    !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD &
+                      (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)),
+                "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a "
+                "power of 2 (and greater than 1)");
+  static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) &&
+                    !(EXPLICIT_INITIAL_INDEX_SIZE &
+                      (EXPLICIT_INITIAL_INDEX_SIZE - 1)),
+                "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and "
+                "greater than 1)");
+  static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) &&
+                    !(IMPLICIT_INITIAL_INDEX_SIZE &
+                      (IMPLICIT_INITIAL_INDEX_SIZE - 1)),
+                "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and "
+                "greater than 1)");
+  static_assert(
+      (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) ||
+          !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE &
+            (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)),
+      "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2");
+  static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 ||
+                    INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1,
+                "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least "
+                "1 (or 0 to disable implicit enqueueing)");
 
+ public:
+  // Creates a queue with at least `capacity` element slots; note that the
+  // actual number of elements that can be inserted without additional memory
+  // allocation depends on the number of producers and the block size (e.g. if
+  // the block size is equal to `capacity`, only a single block will be
+  // allocated up-front, which means only a single producer will be able to
+  // enqueue elements without an extra allocation -- blocks aren't shared
+  // between producers). This method is not thread safe -- it is up to the user
+  // to ensure that the queue is fully constructed before it starts being used
+  // by other threads (this includes making the memory effects of construction
+  // visible, possibly with a memory barrier).
+  explicit ConcurrentQueue(size_t capacity = 32 * BLOCK_SIZE)
+      : producerListTail(nullptr),
+        producerCount(0),
+        initialBlockPoolIndex(0),
+        nextExplicitConsumerId(0),
+        globalExplicitConsumerOffset(0) {
+    implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
+    populate_initial_implicit_producer_hash();
+    populate_initial_block_list(capacity / BLOCK_SIZE +
+                                ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1));
 
-#ifdef MCDBGQ_TRACKMEM
-public:
-	struct MemStats;
-private:
-#endif
-	
-	///////////////////////////
-	// Producer base
-	///////////////////////////
-	
-	struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase
-	{
-		ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) :
-			tailIndex(0),
-			headIndex(0),
-			dequeueOptimisticCount(0),
-			dequeueOvercommit(0),
-			tailBlock(nullptr),
-			isExplicit(isExplicit_),
-			parent(parent_)
-		{
-		}
-		
-		virtual ~ProducerBase() { }
-		
-		template<typename U>
-		inline bool dequeue(U& element)
-		{
-			if (isExplicit) {
-				return static_cast<ExplicitProducer*>(this)->dequeue(element);
-			}
-			else {
-				return static_cast<ImplicitProducer*>(this)->dequeue(element);
-			}
-		}
-		
-		template<typename It>
-		inline size_t dequeue_bulk(It& itemFirst, size_t max)
-		{
-			if (isExplicit) {
-				return static_cast<ExplicitProducer*>(this)->dequeue_bulk(itemFirst, max);
-			}
-			else {
-				return static_cast<ImplicitProducer*>(this)->dequeue_bulk(itemFirst, max);
-			}
-		}
-		
-		inline ProducerBase* next_prod() const { return static_cast<ProducerBase*>(next); }
-		
-		inline size_t size_approx() const
-		{
-			auto tail = tailIndex.load(std::memory_order_relaxed);
-			auto head = headIndex.load(std::memory_order_relaxed);
-			return details::circular_less_than(head, tail) ? static_cast<size_t>(tail - head) : 0;
-		}
-		
-		inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); }
-	protected:
-		std::atomic<index_t> tailIndex;		// Where to enqueue to next
-		std::atomic<index_t> headIndex;		// Where to dequeue from next
-		
-		std::atomic<index_t> dequeueOptimisticCount;
-		std::atomic<index_t> dequeueOvercommit;
-		
-		Block* tailBlock;
-		
-	public:
-		bool isExplicit;
-		ConcurrentQueue* parent;
-		
-	protected:
-#ifdef MCDBGQ_TRACKMEM
-		friend struct MemStats;
-#endif
-	};
-	
-	
-	///////////////////////////
-	// Explicit queue
-	///////////////////////////
-		
-	struct ExplicitProducer : public ProducerBase
-	{
-		explicit ExplicitProducer(ConcurrentQueue* parent_) :
-			ProducerBase(parent_, true),
-			blockIndex(nullptr),
-			pr_blockIndexSlotsUsed(0),
-			pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1),
-			pr_blockIndexFront(0),
-			pr_blockIndexEntries(nullptr),
-			pr_blockIndexRaw(nullptr)
-		{
-			size_t poolBasedIndexSize = details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1;
-			if (poolBasedIndexSize > pr_blockIndexSize) {
-				pr_blockIndexSize = poolBasedIndexSize;
-			}
-			
-			new_block_index(0);		// This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE
-		}
-		
-		~ExplicitProducer()
-		{
-			// Destruct any elements not yet dequeued.
-			// Since we're in the destructor, we can assume all elements
-			// are either completely dequeued or completely not (no halfways).
-			if (this->tailBlock != nullptr) {		// Note this means there must be a block index too
-				// First find the block that's partially dequeued, if any
-				Block* halfDequeuedBlock = nullptr;
-				if ((this->headIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1)) != 0) {
-					// The head's not on a block boundary, meaning a block somewhere is partially dequeued
-					// (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary)
-					size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1);
-					while (details::circular_less_than<index_t>(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) {
-						i = (i + 1) & (pr_blockIndexSize - 1);
-					}
-					assert(details::circular_less_than<index_t>(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed)));
-					halfDequeuedBlock = pr_blockIndexEntries[i].block;
-				}
-				
-				// Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration)
-				auto block = this->tailBlock;
-				do {
-					block = block->next;
-					if (block->ConcurrentQueue::Block::template is_empty<explicit_context>()) {
-						continue;
-					}
-					
-					size_t i = 0;	// Offset into block
-					if (block == halfDequeuedBlock) {
-						i = static_cast<size_t>(this->headIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1));
-					}
-					
-					// Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index
-					auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 ? BLOCK_SIZE : static_cast<size_t>(this->tailIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1));
-					while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) {
-						(*block)[i++]->~T();
-					}
-				} while (block != this->tailBlock);
-			}
-			
-			// Destroy all blocks that we own
-			if (this->tailBlock != nullptr) {
-				auto block = this->tailBlock;
-				do {
-					auto nextBlock = block->next;
-					this->parent->add_block_to_free_list(block);
-					block = nextBlock;
-				} while (block != this->tailBlock);
-			}
-			
-			// Destroy the block indices
-			auto header = static_cast<BlockIndexHeader*>(pr_blockIndexRaw);
-			while (header != nullptr) {
-				auto prev = static_cast<BlockIndexHeader*>(header->prev);
-				header->~BlockIndexHeader();
-				(Traits::free)(header);
-				header = prev;
-			}
-		}
-		
-		template<AllocationMode allocMode, typename U>
-		inline bool enqueue(U&& element)
-		{
-			index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed);
-			index_t newTailIndex = 1 + currentTailIndex;
-			if ((currentTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
-				// We reached the end of a block, start a new one
-				auto startBlock = this->tailBlock;
-				auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed;
-				if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty<explicit_context>()) {
-					// We can re-use the block ahead of us, it's empty!					
-					this->tailBlock = this->tailBlock->next;
-					this->tailBlock->ConcurrentQueue::Block::template reset_empty<explicit_context>();
-					
-					// We'll put the block on the block index (guaranteed to be room since we're conceptually removing the
-					// last block from it first -- except instead of removing then adding, we can just overwrite).
-					// Note that there must be a valid block index here, since even if allocation failed in the ctor,
-					// it would have been re-attempted when adding the first block to the queue; since there is such
-					// a block, a block index must have been successfully allocated.
-				}
-				else {
-					// Whatever head value we see here is >= the last value we saw here (relatively),
-					// and <= its current value. Since we have the most recent tail, the head must be
-					// <= to it.
-					auto head = this->headIndex.load(std::memory_order_relaxed);
-					assert(!details::circular_less_than<index_t>(currentTailIndex, head));
-					if (!details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE)
-						|| (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) {
-						// We can't enqueue in another block because there's not enough leeway -- the
-						// tail could surpass the head by the time the block fills up! (Or we'll exceed
-						// the size limit, if the second part of the condition was true.)
-						return false;
-					}
-					// We're going to need a new block; check that the block index has room
-					if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) {
-						// Hmm, the circular block index is already full -- we'll need
-						// to allocate a new index. Note pr_blockIndexRaw can only be nullptr if
-						// the initial allocation failed in the constructor.
-						
-						MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) {
-							return false;
-						}
-						else if (!new_block_index(pr_blockIndexSlotsUsed)) {
-							return false;
-						}
-					}
-					
-					// Insert a new block in the circular linked list
-					auto newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>();
-					if (newBlock == nullptr) {
-						return false;
-					}
-#ifdef MCDBGQ_TRACKMEM
-					newBlock->owner = this;
-#endif
-					newBlock->ConcurrentQueue::Block::template reset_empty<explicit_context>();
-					if (this->tailBlock == nullptr) {
-						newBlock->next = newBlock;
-					}
-					else {
-						newBlock->next = this->tailBlock->next;
-						this->tailBlock->next = newBlock;
-					}
-					this->tailBlock = newBlock;
-					++pr_blockIndexSlotsUsed;
-				}
-
-				MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) {
-					// The constructor may throw. We want the element not to appear in the queue in
-					// that case (without corrupting the queue):
-					MOODYCAMEL_TRY {
-						new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
-					}
-					MOODYCAMEL_CATCH (...) {
-						// Revert change to the current block, but leave the new block available
-						// for next time
-						pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
-						this->tailBlock = startBlock == nullptr ? this->tailBlock : startBlock;
-						MOODYCAMEL_RETHROW;
-					}
-				}
-				else {
-					(void)startBlock;
-					(void)originalBlockIndexSlotsUsed;
-				}
-				
-				// Add block to block index
-				auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront];
-				entry.base = currentTailIndex;
-				entry.block = this->tailBlock;
-				blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release);
-				pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
-				
-				MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) {
-					this->tailIndex.store(newTailIndex, std::memory_order_release);
-					return true;
-				}
-			}
-			
-			// Enqueue
-			new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
-			
-			this->tailIndex.store(newTailIndex, std::memory_order_release);
-			return true;
-		}
-		
-		template<typename U>
-		bool dequeue(U& element)
-		{
-			auto tail = this->tailIndex.load(std::memory_order_relaxed);
-			auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
-			if (details::circular_less_than<index_t>(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) {
-				// Might be something to dequeue, let's give it a try
-				
-				// Note that this if is purely for performance purposes in the common case when the queue is
-				// empty and the values are eventually consistent -- we may enter here spuriously.
-				
-				// Note that whatever the values of overcommit and tail are, they are not going to change (unless we
-				// change them) and must be the same value at this point (inside the if) as when the if condition was
-				// evaluated.
-
-				// We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below.
-				// This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptisticCount in
-				// the fetch_add below will result in a value at least as recent as that (and therefore at least as large).
-				// Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all
-				// read-modify-write operations are guaranteed to work on the latest value in the modification order), but
-				// unfortunately that can't be shown to be correct using only the C++11 standard.
-				// See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case
-				std::atomic_thread_fence(std::memory_order_acquire);
-				
-				// Increment optimistic counter, then check if it went over the boundary
-				auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed);
-				
-				// Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever
-				// incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now
-				// have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon
-				// incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount.
-				// However, we can't assert this since both dequeueOptimisticCount and dequeueOvercommit may (independently)
-				// overflow; in such a case, though, the logic still holds since the difference between the two is maintained.
-				
-				// Note that we reload tail here in case it changed; it will be the same value as before or greater, since
-				// this load is sequenced after (happens after) the earlier load above. This is supported by read-read
-				// coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order
-				tail = this->tailIndex.load(std::memory_order_acquire);
-				if ((details::likely)(details::circular_less_than<index_t>(myDequeueCount - overcommit, tail))) {
-					// Guaranteed to be at least one element to dequeue!
-					
-					// Get the index. Note that since there's guaranteed to be at least one element, this
-					// will never exceed tail. We need to do an acquire-release fence here since it's possible
-					// that whatever condition got us to this point was for an earlier enqueued element (that
-					// we already see the memory effects for), but that by the time we increment somebody else
-					// has incremented it, and we need to see the memory effects for *that* element, which is
-					// in such a case is necessarily visible on the thread that incremented it in the first
-					// place with the more current condition (they must have acquired a tail that is at least
-					// as recent).
-					auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel);
-					
-					
-					// Determine which block the element is in
-					
-					auto localBlockIndex = blockIndex.load(std::memory_order_acquire);
-					auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire);
-					
-					// We need to be careful here about subtracting and dividing because of index wrap-around.
-					// When an index wraps, we need to preserve the sign of the offset when dividing it by the
-					// block size (in order to get a correct signed block count offset in all cases):
-					auto headBase = localBlockIndex->entries[localBlockIndexHead].base;
-					auto blockBaseIndex = index & ~static_cast<index_t>(BLOCK_SIZE - 1);
-					auto offset = static_cast<size_t>(static_cast<typename std::make_signed<index_t>::type>(blockBaseIndex - headBase) / static_cast<typename std::make_signed<index_t>::type>(BLOCK_SIZE));
-					auto block = localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block;
-					
-					// Dequeue
-					auto& el = *((*block)[index]);
-					if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) {
-						// Make sure the element is still fully dequeued and destroyed even if the assignment
-						// throws
-						struct Guard {
-							Block* block;
-							index_t index;
-							
-							~Guard()
-							{
-								(*block)[index]->~T();
-								block->ConcurrentQueue::Block::template set_empty<explicit_context>(index);
-							}
-						} guard = { block, index };
-
-						element = std::move(el); // NOLINT
-					}
-					else {
-						element = std::move(el); // NOLINT
-						el.~T(); // NOLINT
-						block->ConcurrentQueue::Block::template set_empty<explicit_context>(index);
-					}
-					
-					return true;
-				}
-				else {
-					// Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent
-					this->dequeueOvercommit.fetch_add(1, std::memory_order_release);		// Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write
-				}
-			}
-		
-			return false;
-		}
-		
-		template<AllocationMode allocMode, typename It>
-		bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count)
-		{
-			// First, we need to make sure we have enough room to enqueue all of the elements;
-			// this means pre-allocating blocks and putting them in the block index (but only if
-			// all the allocations succeeded).
-			index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed);
-			auto startBlock = this->tailBlock;
-			auto originalBlockIndexFront = pr_blockIndexFront;
-			auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed;
-			
-			Block* firstAllocatedBlock = nullptr;
-			
-			// Figure out how many blocks we'll need to allocate, and do so
-			size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1));
-			index_t currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
-			if (blockBaseDiff > 0) {
-				// Allocate as many blocks as possible from ahead
-				while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock && this->tailBlock->next->ConcurrentQueue::Block::template is_empty<explicit_context>()) {
-					blockBaseDiff -= static_cast<index_t>(BLOCK_SIZE);
-					currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
-					
-					this->tailBlock = this->tailBlock->next;
-					firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock;
-					
-					auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront];
-					entry.base = currentTailIndex;
-					entry.block = this->tailBlock;
-					pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
-				}
-				
-				// Now allocate as many blocks as necessary from the block pool
-				while (blockBaseDiff > 0) {
-					blockBaseDiff -= static_cast<index_t>(BLOCK_SIZE);
-					currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
-					
-					auto head = this->headIndex.load(std::memory_order_relaxed);
-					assert(!details::circular_less_than<index_t>(currentTailIndex, head));
-					bool full = !details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head));
-					if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) {
-						MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) {
-							// Failed to allocate, undo changes (but keep injected blocks)
-							pr_blockIndexFront = originalBlockIndexFront;
-							pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
-							this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
-							return false;
-						}
-						else if (full || !new_block_index(originalBlockIndexSlotsUsed)) {
-							// Failed to allocate, undo changes (but keep injected blocks)
-							pr_blockIndexFront = originalBlockIndexFront;
-							pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
-							this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
-							return false;
-						}
-						
-						// pr_blockIndexFront is updated inside new_block_index, so we need to
-						// update our fallback value too (since we keep the new index even if we
-						// later fail)
-						originalBlockIndexFront = originalBlockIndexSlotsUsed;
-					}
-					
-					// Insert a new block in the circular linked list
-					auto newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>();
-					if (newBlock == nullptr) {
-						pr_blockIndexFront = originalBlockIndexFront;
-						pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
-						this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
-						return false;
-					}
-					
-#ifdef MCDBGQ_TRACKMEM
-					newBlock->owner = this;
-#endif
-					newBlock->ConcurrentQueue::Block::template set_all_empty<explicit_context>();
-					if (this->tailBlock == nullptr) {
-						newBlock->next = newBlock;
-					}
-					else {
-						newBlock->next = this->tailBlock->next;
-						this->tailBlock->next = newBlock;
-					}
-					this->tailBlock = newBlock;
-					firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock;
-					
-					++pr_blockIndexSlotsUsed;
-					
-					auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront];
-					entry.base = currentTailIndex;
-					entry.block = this->tailBlock;
-					pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
-				}
-				
-				// Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and
-				// publish the new block index front
-				auto block = firstAllocatedBlock;
-				while (true) {
-					block->ConcurrentQueue::Block::template reset_empty<explicit_context>();
-					if (block == this->tailBlock) {
-						break;
-					}
-					block = block->next;
-				}
-				
-				MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))) {
-					blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release);
-				}
-			}
-			
-			// Enqueue, one block at a time
-			index_t newTailIndex = startTailIndex + static_cast<index_t>(count);
-			currentTailIndex = startTailIndex;
-			auto endBlock = this->tailBlock;
-			this->tailBlock = startBlock;
-			assert((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0);
-			if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) {
-				this->tailBlock = firstAllocatedBlock;
-			}
-			while (true) {
-				index_t stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
-				if (details::circular_less_than<index_t>(newTailIndex, stopIndex)) {
-					stopIndex = newTailIndex;
-				}
-				MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))) {
-					while (currentTailIndex != stopIndex) {
-						new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++);
-					}
-				}
-				else {
-					MOODYCAMEL_TRY {
-						while (currentTailIndex != stopIndex) {
-							// Must use copy constructor even if move constructor is available
-							// because we may have to revert if there's an exception.
-							// Sorry about the horrible templated next line, but it was the only way
-							// to disable moving *at compile time*, which is important because a type
-							// may only define a (noexcept) move constructor, and so calls to the
-							// cctor will not compile, even if they are in an if branch that will never
-							// be executed
-							new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if<!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst));
-							++currentTailIndex;
-							++itemFirst;
-						}
-					}
-					MOODYCAMEL_CATCH (...) {
-						// Oh dear, an exception's been thrown -- destroy the elements that
-						// were enqueued so far and revert the entire bulk operation (we'll keep
-						// any allocated blocks in our linked list for later, though).
-						auto constructedStopIndex = currentTailIndex;
-						auto lastBlockEnqueued = this->tailBlock;
-						
-						pr_blockIndexFront = originalBlockIndexFront;
-						pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
-						this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
-						
-						if (!details::is_trivially_destructible<T>::value) {
-							auto block = startBlock;
-							if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
-								block = firstAllocatedBlock;
-							}
-							currentTailIndex = startTailIndex;
-							while (true) {
-								stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
-								if (details::circular_less_than<index_t>(constructedStopIndex, stopIndex)) {
-									stopIndex = constructedStopIndex;
-								}
-								while (currentTailIndex != stopIndex) {
-									(*block)[currentTailIndex++]->~T();
-								}
-								if (block == lastBlockEnqueued) {
-									break;
-								}
-								block = block->next;
-							}
-						}
-						MOODYCAMEL_RETHROW;
-					}
-				}
-				
-				if (this->tailBlock == endBlock) {
-					assert(currentTailIndex == newTailIndex);
-					break;
-				}
-				this->tailBlock = this->tailBlock->next;
-			}
-			
-			MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))) {
-				if (firstAllocatedBlock != nullptr)
-					blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release);
-			}
-			
-			this->tailIndex.store(newTailIndex, std::memory_order_release);
-			return true;
-		}
-		
-		template<typename It>
-		size_t dequeue_bulk(It& itemFirst, size_t max)
-		{
-			auto tail = this->tailIndex.load(std::memory_order_relaxed);
-			auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
-			auto desiredCount = static_cast<size_t>(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit));
-			if (details::circular_less_than<size_t>(0, desiredCount)) {
-				desiredCount = desiredCount < max ? desiredCount : max;
-				std::atomic_thread_fence(std::memory_order_acquire);
-				
-				auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed);
-				
-				tail = this->tailIndex.load(std::memory_order_acquire);
-				auto actualCount = static_cast<size_t>(tail - (myDequeueCount - overcommit));
-				if (details::circular_less_than<size_t>(0, actualCount)) {
-					actualCount = desiredCount < actualCount ? desiredCount : actualCount;
-					if (actualCount < desiredCount) {
-						this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release);
-					}
-					
-					// Get the first index. Note that since there's guaranteed to be at least actualCount elements, this
-					// will never exceed tail.
-					auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel);
-					
-					// Determine which block the first element is in
-					auto localBlockIndex = blockIndex.load(std::memory_order_acquire);
-					auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire);
-					
-					auto headBase = localBlockIndex->entries[localBlockIndexHead].base;
-					auto firstBlockBaseIndex = firstIndex & ~static_cast<index_t>(BLOCK_SIZE - 1);
-					auto offset = static_cast<size_t>(static_cast<typename std::make_signed<index_t>::type>(firstBlockBaseIndex - headBase) / static_cast<typename std::make_signed<index_t>::type>(BLOCK_SIZE));
-					auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1);
-					
-					// Iterate the blocks and dequeue
-					auto index = firstIndex;
-					do {
-						auto firstIndexInBlock = index;
-						index_t endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
-						endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex;
-						auto block = localBlockIndex->entries[indexIndex].block;
-						if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) {
-							while (index != endIndex) {
-								auto& el = *((*block)[index]);
-								*itemFirst++ = std::move(el);
-								el.~T();
-								++index;
-							}
-						}
-						else {
-							MOODYCAMEL_TRY {
-								while (index != endIndex) {
-									auto& el = *((*block)[index]);
-									*itemFirst = std::move(el);
-									++itemFirst;
-									el.~T();
-									++index;
-								}
-							}
-							MOODYCAMEL_CATCH (...) {
-								// It's too late to revert the dequeue, but we can make sure that all
-								// the dequeued objects are properly destroyed and the block index
-								// (and empty count) are properly updated before we propagate the exception
-								do {
-									block = localBlockIndex->entries[indexIndex].block;
-									while (index != endIndex) {
-										(*block)[index++]->~T();
-									}
-									block->ConcurrentQueue::Block::template set_many_empty<explicit_context>(firstIndexInBlock, static_cast<size_t>(endIndex - firstIndexInBlock));
-									indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1);
-									
-									firstIndexInBlock = index;
-									endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
-									endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex;
-								} while (index != firstIndex + actualCount);
-								
-								MOODYCAMEL_RETHROW;
-							}
-						}
-						block->ConcurrentQueue::Block::template set_many_empty<explicit_context>(firstIndexInBlock, static_cast<size_t>(endIndex - firstIndexInBlock));
-						indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1);
-					} while (index != firstIndex + actualCount);
-					
-					return actualCount;
-				}
-				else {
-					// Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent
-					this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release);
-				}
-			}
-			
-			return 0;
-		}
-		
-	private:
-		struct BlockIndexEntry
-		{
-			index_t base;
-			Block* block;
-		};
-		
-		struct BlockIndexHeader
-		{
-			size_t size;
-			std::atomic<size_t> front;		// Current slot (not next, like pr_blockIndexFront)
-			BlockIndexEntry* entries;
-			void* prev;
-		};
-		
-		
-		bool new_block_index(size_t numberOfFilledSlotsToExpose)
-		{
-			auto prevBlockSizeMask = pr_blockIndexSize - 1;
-			
-			// Create the new block
-			pr_blockIndexSize <<= 1;
-			auto newRawPtr = static_cast<char*>((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of<BlockIndexEntry>::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize));
-			if (newRawPtr == nullptr) {
-				pr_blockIndexSize >>= 1;		// Reset to allow graceful retry
-				return false;
-			}
-			
-			auto newBlockIndexEntries = reinterpret_cast<BlockIndexEntry*>(details::align_for<BlockIndexEntry>(newRawPtr + sizeof(BlockIndexHeader)));
-			
-			// Copy in all the old indices, if any
-			size_t j = 0;
-			if (pr_blockIndexSlotsUsed != 0) {
-				auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask;
-				do {
-					newBlockIndexEntries[j++] = pr_blockIndexEntries[i];
-					i = (i + 1) & prevBlockSizeMask;
-				} while (i != pr_blockIndexFront);
-			}
-			
-			// Update everything
-			auto header = new (newRawPtr) BlockIndexHeader;
-			header->size = pr_blockIndexSize;
-			header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed);
-			header->entries = newBlockIndexEntries;
-			header->prev = pr_blockIndexRaw;		// we link the new block to the old one so we can free it later
-			
-			pr_blockIndexFront = j;
-			pr_blockIndexEntries = newBlockIndexEntries;
-			pr_blockIndexRaw = newRawPtr;
-			blockIndex.store(header, std::memory_order_release);
-			
-			return true;
-		}
-		
-	private:
-		std::atomic<BlockIndexHeader*> blockIndex;
-		
-		// To be used by producer only -- consumer must use the ones in referenced by blockIndex
-		size_t pr_blockIndexSlotsUsed;
-		size_t pr_blockIndexSize;
-		size_t pr_blockIndexFront;		// Next slot (not current)
-		BlockIndexEntry* pr_blockIndexEntries;
-		void* pr_blockIndexRaw;
-		
 #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
-	public:
-		ExplicitProducer* nextExplicitProducer;
-	private:
-#endif
-		
-#ifdef MCDBGQ_TRACKMEM
-		friend struct MemStats;
-#endif
-	};
-	
-	
-	//////////////////////////////////
-	// Implicit queue
-	//////////////////////////////////
-	
-	struct ImplicitProducer : public ProducerBase
-	{			
-		ImplicitProducer(ConcurrentQueue* parent_) :
-			ProducerBase(parent_, false),
-			nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE),
-			blockIndex(nullptr)
-		{
-			new_block_index();
-		}
-		
-		~ImplicitProducer()
-		{
-			// Note that since we're in the destructor we can assume that all enqueue/dequeue operations
-			// completed already; this means that all undequeued elements are placed contiguously across
-			// contiguous blocks, and that only the first and last remaining blocks can be only partially
-			// empty (all other remaining blocks must be completely full).
-			
-#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
-			// Unregister ourselves for thread termination notification
-			if (!this->inactive.load(std::memory_order_relaxed)) {
-				details::ThreadExitNotifier::unsubscribe(&threadExitListener);
-			}
-#endif
-			
-			// Destroy all remaining elements!
-			auto tail = this->tailIndex.load(std::memory_order_relaxed);
-			auto index = this->headIndex.load(std::memory_order_relaxed);
-			Block* block = nullptr;
-			assert(index == tail || details::circular_less_than(index, tail));
-			bool forceFreeLastBlock = index != tail;		// If we enter the loop, then the last (tail) block will not be freed
-			while (index != tail) {
-				if ((index & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 || block == nullptr) {
-					if (block != nullptr) {
-						// Free the old block
-						this->parent->add_block_to_free_list(block);
-					}
-					
-					block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed);
-				}
-				
-				((*block)[index])->~T();
-				++index;
-			}
-			// Even if the queue is empty, there's still one block that's not on the free list
-			// (unless the head index reached the end of it, in which case the tail will be poised
-			// to create a new block).
-			if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast<index_t>(BLOCK_SIZE - 1)) != 0)) {
-				this->parent->add_block_to_free_list(this->tailBlock);
-			}
-			
-			// Destroy block index
-			auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);
-			if (localBlockIndex != nullptr) {
-				for (size_t i = 0; i != localBlockIndex->capacity; ++i) {
-					localBlockIndex->index[i]->~BlockIndexEntry();
-				}
-				do {
-					auto prev = localBlockIndex->prev;
-					localBlockIndex->~BlockIndexHeader();
-					(Traits::free)(localBlockIndex);
-					localBlockIndex = prev;
-				} while (localBlockIndex != nullptr);
-			}
-		}
-		
-		template<AllocationMode allocMode, typename U>
-		inline bool enqueue(U&& element)
-		{
-			index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed);
-			index_t newTailIndex = 1 + currentTailIndex;
-			if ((currentTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
-				// We reached the end of a block, start a new one
-				auto head = this->headIndex.load(std::memory_order_relaxed);
-				assert(!details::circular_less_than<index_t>(currentTailIndex, head));
-				if (!details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) {
-					return false;
-				}
-#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
-				debug::DebugLock lock(mutex);
-#endif
-				// Find out where we'll be inserting this block in the block index
-				BlockIndexEntry* idxEntry;
-				if (!insert_block_index_entry<allocMode>(idxEntry, currentTailIndex)) {
-					return false;
-				}
-				
-				// Get ahold of a new block
-				auto newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>();
-				if (newBlock == nullptr) {
-					rewind_block_index_tail();
-					idxEntry->value.store(nullptr, std::memory_order_relaxed);
-					return false;
-				}
-#ifdef MCDBGQ_TRACKMEM
-				newBlock->owner = this;
-#endif
-				newBlock->ConcurrentQueue::Block::template reset_empty<implicit_context>();
-
-				MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) {
-					// May throw, try to insert now before we publish the fact that we have this new block
-					MOODYCAMEL_TRY {
-						new ((*newBlock)[currentTailIndex]) T(std::forward<U>(element));
-					}
-					MOODYCAMEL_CATCH (...) {
-						rewind_block_index_tail();
-						idxEntry->value.store(nullptr, std::memory_order_relaxed);
-						this->parent->add_block_to_free_list(newBlock);
-						MOODYCAMEL_RETHROW;
-					}
-				}
-				
-				// Insert the new block into the index
-				idxEntry->value.store(newBlock, std::memory_order_relaxed);
-				
-				this->tailBlock = newBlock;
-				
-				MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) {
-					this->tailIndex.store(newTailIndex, std::memory_order_release);
-					return true;
-				}
-			}
-			
-			// Enqueue
-			new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
-			
-			this->tailIndex.store(newTailIndex, std::memory_order_release);
-			return true;
-		}
-		
-		template<typename U>
-		bool dequeue(U& element)
-		{
-			// See ExplicitProducer::dequeue for rationale and explanation
-			index_t tail = this->tailIndex.load(std::memory_order_relaxed);
-			index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
-			if (details::circular_less_than<index_t>(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) {
-				std::atomic_thread_fence(std::memory_order_acquire);
-				
-				index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed);
-				tail = this->tailIndex.load(std::memory_order_acquire);
-				if ((details::likely)(details::circular_less_than<index_t>(myDequeueCount - overcommit, tail))) {
-					index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel);
-					
-					// Determine which block the element is in
-					auto entry = get_block_index_entry_for_index(index);
-					
-					// Dequeue
-					auto block = entry->value.load(std::memory_order_relaxed);
-					auto& el = *((*block)[index]);
-					
-					if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) {
-#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
-						// Note: Acquiring the mutex with every dequeue instead of only when a block
-						// is released is very sub-optimal, but it is, after all, purely debug code.
-						debug::DebugLock lock(producer->mutex);
-#endif
-						struct Guard {
-							Block* block;
-							index_t index;
-							BlockIndexEntry* entry;
-							ConcurrentQueue* parent;
-							
-							~Guard()
-							{
-								(*block)[index]->~T();
-								if (block->ConcurrentQueue::Block::template set_empty<implicit_context>(index)) {
-									entry->value.store(nullptr, std::memory_order_relaxed);
-									parent->add_block_to_free_list(block);
-								}
-							}
-						} guard = { block, index, entry, this->parent };
-
-						element = std::move(el); // NOLINT
-					}
-					else {
-						element = std::move(el); // NOLINT
-						el.~T(); // NOLINT
-
-						if (block->ConcurrentQueue::Block::template set_empty<implicit_context>(index)) {
-							{
-#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
-								debug::DebugLock lock(mutex);
-#endif
-								// Add the block back into the global free pool (and remove from block index)
-								entry->value.store(nullptr, std::memory_order_relaxed);
-							}
-							this->parent->add_block_to_free_list(block);		// releases the above store
-						}
-					}
-					
-					return true;
-				}
-				else {
-					this->dequeueOvercommit.fetch_add(1, std::memory_order_release);
-				}
-			}
-		
-			return false;
-		}
-		
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable: 4706)  // assignment within conditional expression
-#endif
-		template<AllocationMode allocMode, typename It>
-		bool enqueue_bulk(It itemFirst, size_t count)
-		{
-			// First, we need to make sure we have enough room to enqueue all of the elements;
-			// this means pre-allocating blocks and putting them in the block index (but only if
-			// all the allocations succeeded).
-			
-			// Note that the tailBlock we start off with may not be owned by us any more;
-			// this happens if it was filled up exactly to the top (setting tailIndex to
-			// the first index of the next block which is not yet allocated), then dequeued
-			// completely (putting it on the free list) before we enqueue again.
-			
-			index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed);
-			auto startBlock = this->tailBlock;
-			Block* firstAllocatedBlock = nullptr;
-			auto endBlock = this->tailBlock;
-			
-			// Figure out how many blocks we'll need to allocate, and do so
-			size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1));
-			index_t currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
-			if (blockBaseDiff > 0) {
-#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
-				debug::DebugLock lock(mutex);
-#endif
-				do {
-					blockBaseDiff -= static_cast<index_t>(BLOCK_SIZE);
-					currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
-					
-					// Find out where we'll be inserting this block in the block index
-					BlockIndexEntry* idxEntry = nullptr;  // initialization here unnecessary but compiler can't always tell
-					Block* newBlock;
-					bool indexInserted = false;
-					auto head = this->headIndex.load(std::memory_order_relaxed);
-					assert(!details::circular_less_than<index_t>(currentTailIndex, head));
-					bool full = !details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head));
-
-					if (full || !(indexInserted = insert_block_index_entry<allocMode>(idxEntry, currentTailIndex)) || (newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>()) == nullptr) {
-						// Index allocation or block allocation failed; revert any other allocations
-						// and index insertions done so far for this operation
-						if (indexInserted) {
-							rewind_block_index_tail();
-							idxEntry->value.store(nullptr, std::memory_order_relaxed);
-						}
-						currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
-						for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) {
-							currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
-							idxEntry = get_block_index_entry_for_index(currentTailIndex);
-							idxEntry->value.store(nullptr, std::memory_order_relaxed);
-							rewind_block_index_tail();
-						}
-						this->parent->add_blocks_to_free_list(firstAllocatedBlock);
-						this->tailBlock = startBlock;
-						
-						return false;
-					}
-					
-#ifdef MCDBGQ_TRACKMEM
-					newBlock->owner = this;
-#endif
-					newBlock->ConcurrentQueue::Block::template reset_empty<implicit_context>();
-					newBlock->next = nullptr;
-					
-					// Insert the new block into the index
-					idxEntry->value.store(newBlock, std::memory_order_relaxed);
-					
-					// Store the chain of blocks so that we can undo if later allocations fail,
-					// and so that we can find the blocks when we do the actual enqueueing
-					if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) {
-						assert(this->tailBlock != nullptr);
-						this->tailBlock->next = newBlock;
-					}
-					this->tailBlock = newBlock;
-					endBlock = newBlock;
-					firstAllocatedBlock = firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock;
-				} while (blockBaseDiff > 0);
-			}
-			
-			// Enqueue, one block at a time
-			index_t newTailIndex = startTailIndex + static_cast<index_t>(count);
-			currentTailIndex = startTailIndex;
-			this->tailBlock = startBlock;
-			assert((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0);
-			if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) {
-				this->tailBlock = firstAllocatedBlock;
-			}
-			while (true) {
-				index_t stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
-				if (details::circular_less_than<index_t>(newTailIndex, stopIndex)) {
-					stopIndex = newTailIndex;
-				}
-				MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))) {
-					while (currentTailIndex != stopIndex) {
-						new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++);
-					}
-				}
-				else {
-					MOODYCAMEL_TRY {
-						while (currentTailIndex != stopIndex) {
-							new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if<!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst));
-							++currentTailIndex;
-							++itemFirst;
-						}
-					}
-					MOODYCAMEL_CATCH (...) {
-						auto constructedStopIndex = currentTailIndex;
-						auto lastBlockEnqueued = this->tailBlock;
-						
-						if (!details::is_trivially_destructible<T>::value) {
-							auto block = startBlock;
-							if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
-								block = firstAllocatedBlock;
-							}
-							currentTailIndex = startTailIndex;
-							while (true) {
-								stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
-								if (details::circular_less_than<index_t>(constructedStopIndex, stopIndex)) {
-									stopIndex = constructedStopIndex;
-								}
-								while (currentTailIndex != stopIndex) {
-									(*block)[currentTailIndex++]->~T();
-								}
-								if (block == lastBlockEnqueued) {
-									break;
-								}
-								block = block->next;
-							}
-						}
-						
-						currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
-						for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) {
-							currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
-							auto idxEntry = get_block_index_entry_for_index(currentTailIndex);
-							idxEntry->value.store(nullptr, std::memory_order_relaxed);
-							rewind_block_index_tail();
-						}
-						this->parent->add_blocks_to_free_list(firstAllocatedBlock);
-						this->tailBlock = startBlock;
-						MOODYCAMEL_RETHROW;
-					}
-				}
-				
-				if (this->tailBlock == endBlock) {
-					assert(currentTailIndex == newTailIndex);
-					break;
-				}
-				this->tailBlock = this->tailBlock->next;
-			}
-			this->tailIndex.store(newTailIndex, std::memory_order_release);
-			return true;
-		}
-#ifdef _MSC_VER
-#pragma warning(pop)
+    // Track all the producers using a fully-resolved typed list for
+    // each kind; this makes it possible to debug them starting from
+    // the root queue object (otherwise wacky casts are needed that
+    // don't compile in the debugger's expression evaluator).
+    explicitProducers.store(nullptr, std::memory_order_relaxed);
+    implicitProducers.store(nullptr, std::memory_order_relaxed);
 #endif
-		
-		template<typename It>
-		size_t dequeue_bulk(It& itemFirst, size_t max)
-		{
-			auto tail = this->tailIndex.load(std::memory_order_relaxed);
-			auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
-			auto desiredCount = static_cast<size_t>(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit));
-			if (details::circular_less_than<size_t>(0, desiredCount)) {
-				desiredCount = desiredCount < max ? desiredCount : max;
-				std::atomic_thread_fence(std::memory_order_acquire);
-				
-				auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed);
-				
-				tail = this->tailIndex.load(std::memory_order_acquire);
-				auto actualCount = static_cast<size_t>(tail - (myDequeueCount - overcommit));
-				if (details::circular_less_than<size_t>(0, actualCount)) {
-					actualCount = desiredCount < actualCount ? desiredCount : actualCount;
-					if (actualCount < desiredCount) {
-						this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release);
-					}
-					
-					// Get the first index. Note that since there's guaranteed to be at least actualCount elements, this
-					// will never exceed tail.
-					auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel);
-					
-					// Iterate the blocks and dequeue
-					auto index = firstIndex;
-					BlockIndexHeader* localBlockIndex;
-					auto indexIndex = get_block_index_index_for_index(index, localBlockIndex);
-					do {
-						auto blockStartIndex = index;
-						index_t endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
-						endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex;
-						
-						auto entry = localBlockIndex->index[indexIndex];
-						auto block = entry->value.load(std::memory_order_relaxed);
-						if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) {
-							while (index != endIndex) {
-								auto& el = *((*block)[index]);
-								*itemFirst++ = std::move(el);
-								el.~T();
-								++index;
-							}
-						}
-						else {
-							MOODYCAMEL_TRY {
-								while (index != endIndex) {
-									auto& el = *((*block)[index]);
-									*itemFirst = std::move(el);
-									++itemFirst;
-									el.~T();
-									++index;
-								}
-							}
-							MOODYCAMEL_CATCH (...) {
-								do {
-									entry = localBlockIndex->index[indexIndex];
-									block = entry->value.load(std::memory_order_relaxed);
-									while (index != endIndex) {
-										(*block)[index++]->~T();
-									}
-									
-									if (block->ConcurrentQueue::Block::template set_many_empty<implicit_context>(blockStartIndex, static_cast<size_t>(endIndex - blockStartIndex))) {
-#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
-										debug::DebugLock lock(mutex);
-#endif
-										entry->value.store(nullptr, std::memory_order_relaxed);
-										this->parent->add_block_to_free_list(block);
-									}
-									indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1);
-									
-									blockStartIndex = index;
-									endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
-									endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex;
-								} while (index != firstIndex + actualCount);
-								
-								MOODYCAMEL_RETHROW;
-							}
-						}
-						if (block->ConcurrentQueue::Block::template set_many_empty<implicit_context>(blockStartIndex, static_cast<size_t>(endIndex - blockStartIndex))) {
-							{
-#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
-								debug::DebugLock lock(mutex);
-#endif
-								// Note that the set_many_empty above did a release, meaning that anybody who acquires the block
-								// we're about to free can use it safely since our writes (and reads!) will have happened-before then.
-								entry->value.store(nullptr, std::memory_order_relaxed);
-							}
-							this->parent->add_block_to_free_list(block);		// releases the above store
-						}
-						indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1);
-					} while (index != firstIndex + actualCount);
-					
-					return actualCount;
-				}
-				else {
-					this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release);
-				}
-			}
-			
-			return 0;
-		}
-		
-	private:
-		// The block size must be > 1, so any number with the low bit set is an invalid block base index
-		static const index_t INVALID_BLOCK_BASE = 1;
-		
-		struct BlockIndexEntry
-		{
-			std::atomic<index_t> key;
-			std::atomic<Block*> value;
-		};
-		
-		struct BlockIndexHeader
-		{
-			size_t capacity;
-			std::atomic<size_t> tail;
-			BlockIndexEntry* entries;
-			BlockIndexEntry** index;
-			BlockIndexHeader* prev;
-		};
-		
-		template<AllocationMode allocMode>
-		inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex)
-		{
-			auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);		// We're the only writer thread, relaxed is OK
-			if (localBlockIndex == nullptr) {
-				return false;  // this can happen if new_block_index failed in the constructor
-			}
-			size_t newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1);
-			idxEntry = localBlockIndex->index[newTail];
-			if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE ||
-				idxEntry->value.load(std::memory_order_relaxed) == nullptr) {
-				
-				idxEntry->key.store(blockStartIndex, std::memory_order_relaxed);
-				localBlockIndex->tail.store(newTail, std::memory_order_release);
-				return true;
-			}
-			
-			// No room in the old block index, try to allocate another one!
-			MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) {
-				return false;
-			}
-			else if (!new_block_index()) {
-				return false;
-			}
-			else {
-				localBlockIndex = blockIndex.load(std::memory_order_relaxed);
-				newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1);
-				idxEntry = localBlockIndex->index[newTail];
-				assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE);
-				idxEntry->key.store(blockStartIndex, std::memory_order_relaxed);
-				localBlockIndex->tail.store(newTail, std::memory_order_release);
-				return true;
-			}
-		}
-		
-		inline void rewind_block_index_tail()
-		{
-			auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);
-			localBlockIndex->tail.store((localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), std::memory_order_relaxed);
-		}
-		
-		inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const
-		{
-			BlockIndexHeader* localBlockIndex;
-			auto idx = get_block_index_index_for_index(index, localBlockIndex);
-			return localBlockIndex->index[idx];
-		}
-		
-		inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const
-		{
-#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
-			debug::DebugLock lock(mutex);
-#endif
-			index &= ~static_cast<index_t>(BLOCK_SIZE - 1);
-			localBlockIndex = blockIndex.load(std::memory_order_acquire);
-			auto tail = localBlockIndex->tail.load(std::memory_order_acquire);
-			auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed);
-			assert(tailBase != INVALID_BLOCK_BASE);
-			// Note: Must use division instead of shift because the index may wrap around, causing a negative
-			// offset, whose negativity we want to preserve
-			auto offset = static_cast<size_t>(static_cast<typename std::make_signed<index_t>::type>(index - tailBase) / static_cast<typename std::make_signed<index_t>::type>(BLOCK_SIZE));
-			size_t idx = (tail + offset) & (localBlockIndex->capacity - 1);
-			assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr);
-			return idx;
-		}
-		
-		bool new_block_index()
-		{
-			auto prev = blockIndex.load(std::memory_order_relaxed);
-			size_t prevCapacity = prev == nullptr ? 0 : prev->capacity;
-			auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity;
-			auto raw = static_cast<char*>((Traits::malloc)(
-				sizeof(BlockIndexHeader) +
-				std::alignment_of<BlockIndexEntry>::value - 1 + sizeof(BlockIndexEntry) * entryCount +
-				std::alignment_of<BlockIndexEntry*>::value - 1 + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity));
-			if (raw == nullptr) {
-				return false;
-			}
-			
-			auto header = new (raw) BlockIndexHeader;
-			auto entries = reinterpret_cast<BlockIndexEntry*>(details::align_for<BlockIndexEntry>(raw + sizeof(BlockIndexHeader)));
-			auto index = reinterpret_cast<BlockIndexEntry**>(details::align_for<BlockIndexEntry*>(reinterpret_cast<char*>(entries) + sizeof(BlockIndexEntry) * entryCount));
-			if (prev != nullptr) {
-				auto prevTail = prev->tail.load(std::memory_order_relaxed);
-				auto prevPos = prevTail;
-				size_t i = 0;
-				do {
-					prevPos = (prevPos + 1) & (prev->capacity - 1);
-					index[i++] = prev->index[prevPos];
-				} while (prevPos != prevTail);
-				assert(i == prevCapacity);
-			}
-			for (size_t i = 0; i != entryCount; ++i) {
-				new (entries + i) BlockIndexEntry;
-				entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed);
-				index[prevCapacity + i] = entries + i;
-			}
-			header->prev = prev;
-			header->entries = entries;
-			header->index = index;
-			header->capacity = nextBlockIndexCapacity;
-			header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed);
-			
-			blockIndex.store(header, std::memory_order_release);
-			
-			nextBlockIndexCapacity <<= 1;
-			
-			return true;
-		}
-		
-	private:
-		size_t nextBlockIndexCapacity;
-		std::atomic<BlockIndexHeader*> blockIndex;
+  }
+
+  // Computes the correct amount of pre-allocated blocks for you based
+  // on the minimum number of elements you want available at any given
+  // time, and the maximum concurrent number of each type of producer.
+  ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers,
+                  size_t maxImplicitProducers)
+      : producerListTail(nullptr),
+        producerCount(0),
+        initialBlockPoolIndex(0),
+        nextExplicitConsumerId(0),
+        globalExplicitConsumerOffset(0) {
+    implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
+    populate_initial_implicit_producer_hash();
+    size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) *
+                        (maxExplicitProducers + 1) +
+                    2 * (maxExplicitProducers + maxImplicitProducers);
+    populate_initial_block_list(blocks);
 
-#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
-	public:
-		details::ThreadExitListener threadExitListener;
-	private:
-#endif
-		
 #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
-	public:
-		ImplicitProducer* nextImplicitProducer;
-	private:
+    explicitProducers.store(nullptr, std::memory_order_relaxed);
+    implicitProducers.store(nullptr, std::memory_order_relaxed);
 #endif
+  }
 
-#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
-		mutable debug::DebugMutex mutex;
-#endif
-#ifdef MCDBGQ_TRACKMEM
-		friend struct MemStats;
-#endif
-	};
-	
-	
-	//////////////////////////////////
-	// Block pool manipulation
-	//////////////////////////////////
-	
-	void populate_initial_block_list(size_t blockCount)
-	{
-		initialBlockPoolSize = blockCount;
-		if (initialBlockPoolSize == 0) {
-			initialBlockPool = nullptr;
-			return;
-		}
-		
-		initialBlockPool = create_array<Block>(blockCount);
-		if (initialBlockPool == nullptr) {
-			initialBlockPoolSize = 0;
-		}
-		for (size_t i = 0; i < initialBlockPoolSize; ++i) {
-			initialBlockPool[i].dynamicallyAllocated = false;
-		}
-	}
-	
-	inline Block* try_get_block_from_initial_pool()
-	{
-		if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) {
-			return nullptr;
-		}
-		
-		auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed);
-		
-		return index < initialBlockPoolSize ? (initialBlockPool + index) : nullptr;
-	}
-	
-	inline void add_block_to_free_list(Block* block)
-	{
-#ifdef MCDBGQ_TRACKMEM
-		block->owner = nullptr;
-#endif
-		if (!Traits::RECYCLE_ALLOCATED_BLOCKS && block->dynamicallyAllocated) {
-			destroy(block);
-		}
-		else {
-			freeList.add(block);
-		}
-	}
-	
-	inline void add_blocks_to_free_list(Block* block)
-	{
-		while (block != nullptr) {
-			auto next = block->next;
-			add_block_to_free_list(block);
-			block = next;
-		}
-	}
-	
-	inline Block* try_get_block_from_free_list()
-	{
-		return freeList.try_get();
-	}
-	
-	// Gets a free block from one of the memory pools, or allocates a new one (if applicable)
-	template<AllocationMode canAlloc>
-	Block* requisition_block()
-	{
-		auto block = try_get_block_from_initial_pool();
-		if (block != nullptr) {
-			return block;
-		}
-		
-		block = try_get_block_from_free_list();
-		if (block != nullptr) {
-			return block;
-		}
-		
-		MOODYCAMEL_CONSTEXPR_IF (canAlloc == CanAlloc) {
-			return create<Block>();
-		}
-		else {
-			return nullptr;
-		}
-	}
-	
+  // Tears down the producers, the dynamically allocated implicit producer
+  // hash tables, the free-list blocks and the initial block pool. Not thread
+  // safe: the user must ensure nothing else accesses the queue meanwhile.
+  ~ConcurrentQueue() {
+    // Destroy producers, detaching any token still bound to one of them
+    auto ptr = producerListTail.load(std::memory_order_relaxed);
+    while (ptr != nullptr) {
+      auto next = ptr->next_prod();
+      if (ptr->token != nullptr) {
+        ptr->token->producer = nullptr;
+      }
+      destroy(ptr);
+      ptr = next;
+    }
+
+    // Destroy implicit producer hash tables, walking the prev chain
+    MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) {
+      auto hash = implicitProducerHash.load(std::memory_order_relaxed);
+      while (hash != nullptr) {
+        auto prev = hash->prev;
+        if (prev != nullptr) {  // The last hash is part of this object and was
+                                // not allocated dynamically, so it is skipped
+          for (size_t i = 0; i != hash->capacity; ++i) {
+            hash->entries[i].~ImplicitProducerKVP();
+          }
+          hash->~ImplicitProducerHash();
+          (Traits::free)(hash);
+        }
+        hash = prev;
+      }
+    }
+
+    // Destroy the dynamically allocated blocks left on the global free list
+    auto block = freeList.head_unsafe();
+    while (block != nullptr) {
+      auto next = block->freeListNext.load(std::memory_order_relaxed);
+      if (block->dynamicallyAllocated) {
+        destroy(block);
+      }
+      block = next;
+    }
+
+    // Destroy the initial block pool (an array of blocks, not a free list)
+    destroy_array(initialBlockPool, initialBlockPoolSize);
+  }
+
+  // Non-copyable: copy construction and copy assignment are both disabled
+  ConcurrentQueue(ConcurrentQueue const &) MOODYCAMEL_DELETE_FUNCTION;
+  ConcurrentQueue &operator=(ConcurrentQueue const &)
+      MOODYCAMEL_DELETE_FUNCTION;
+
+  // Moving is supported, but note that it is *not* a thread-safe operation.
+  // Nobody can use the queue while it's being moved, and the memory effects
+  // of that move must be propagated to other threads before they can use it.
+  // Note: When a queue is moved, its tokens are still valid but can only be
+  // used with the destination queue (i.e. semantically they are moved along
+  // with the queue itself).
+  ConcurrentQueue(ConcurrentQueue &&other) MOODYCAMEL_NOEXCEPT
+      : producerListTail(
+            other.producerListTail.load(std::memory_order_relaxed)),
+        producerCount(other.producerCount.load(std::memory_order_relaxed)),
+        initialBlockPoolIndex(
+            other.initialBlockPoolIndex.load(std::memory_order_relaxed)),
+        initialBlockPool(other.initialBlockPool),
+        initialBlockPoolSize(other.initialBlockPoolSize),
+        freeList(std::move(other.freeList)),
+        nextExplicitConsumerId(
+            other.nextExplicitConsumerId.load(std::memory_order_relaxed)),
+        globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(
+            std::memory_order_relaxed)) {
+    // Take over other's state and leave other behind as an empty queue
+    implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
+    populate_initial_implicit_producer_hash();  // start from our own empty hash
+    swap_implicit_producer_hashes(other);  // other now holds that empty hash
+
+    other.producerListTail.store(nullptr, std::memory_order_relaxed);
+    other.producerCount.store(0, std::memory_order_relaxed);
+    other.nextExplicitConsumerId.store(0, std::memory_order_relaxed);
+    other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed);
 
-#ifdef MCDBGQ_TRACKMEM
-	public:
-		struct MemStats {
-			size_t allocatedBlocks;
-			size_t usedBlocks;
-			size_t freeBlocks;
-			size_t ownedBlocksExplicit;
-			size_t ownedBlocksImplicit;
-			size_t implicitProducers;
-			size_t explicitProducers;
-			size_t elementsEnqueued;
-			size_t blockClassBytes;
-			size_t queueClassBytes;
-			size_t implicitBlockIndexBytes;
-			size_t explicitBlockIndexBytes;
-			
-			friend class ConcurrentQueue;
-			
-		private:
-			static MemStats getFor(ConcurrentQueue* q)
-			{
-				MemStats stats = { 0 };
-				
-				stats.elementsEnqueued = q->size_approx();
-			
-				auto block = q->freeList.head_unsafe();
-				while (block != nullptr) {
-					++stats.allocatedBlocks;
-					++stats.freeBlocks;
-					block = block->freeListNext.load(std::memory_order_relaxed);
-				}
-				
-				for (auto ptr = q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
-					bool implicit = dynamic_cast<ImplicitProducer*>(ptr) != nullptr;
-					stats.implicitProducers += implicit ? 1 : 0;
-					stats.explicitProducers += implicit ? 0 : 1;
-					
-					if (implicit) {
-						auto prod = static_cast<ImplicitProducer*>(ptr);
-						stats.queueClassBytes += sizeof(ImplicitProducer);
-						auto head = prod->headIndex.load(std::memory_order_relaxed);
-						auto tail = prod->tailIndex.load(std::memory_order_relaxed);
-						auto hash = prod->blockIndex.load(std::memory_order_relaxed);
-						if (hash != nullptr) {
-							for (size_t i = 0; i != hash->capacity; ++i) {
-								if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) {
-									++stats.allocatedBlocks;
-									++stats.ownedBlocksImplicit;
-								}
-							}
-							stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry);
-							for (; hash != nullptr; hash = hash->prev) {
-								stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*);
-							}
-						}
-						for (; details::circular_less_than<index_t>(head, tail); head += BLOCK_SIZE) {
-							//auto block = prod->get_block_index_entry_for_index(head);
-							++stats.usedBlocks;
-						}
-					}
-					else {
-						auto prod = static_cast<ExplicitProducer*>(ptr);
-						stats.queueClassBytes += sizeof(ExplicitProducer);
-						auto tailBlock = prod->tailBlock;
-						bool wasNonEmpty = false;
-						if (tailBlock != nullptr) {
-							auto block = tailBlock;
-							do {
-								++stats.allocatedBlocks;
-								if (!block->ConcurrentQueue::Block::template is_empty<explicit_context>() || wasNonEmpty) {
-									++stats.usedBlocks;
-									wasNonEmpty = wasNonEmpty || block != tailBlock;
-								}
-								++stats.ownedBlocksExplicit;
-								block = block->next;
-							} while (block != tailBlock);
-						}
-						auto index = prod->blockIndex.load(std::memory_order_relaxed);
-						while (index != nullptr) {
-							stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry);
-							index = static_cast<typename ExplicitProducer::BlockIndexHeader*>(index->prev);
-						}
-					}
-				}
-				
-				auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 0 : q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed);
-				stats.allocatedBlocks += freeOnInitialPool;
-				stats.freeBlocks += freeOnInitialPool;
-				
-				stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks;
-				stats.queueClassBytes += sizeof(ConcurrentQueue);
-				
-				return stats;
-			}
-		};
-		
-		// For debugging only. Not thread-safe.
-		MemStats getMemStats()
-		{
-			return MemStats::getFor(this);
-		}
-	private:
-		friend struct MemStats;
-#endif
-	
-	
-	//////////////////////////////////
-	// Producer list manipulation
-	//////////////////////////////////	
-	
-	ProducerBase* recycle_or_create_producer(bool isExplicit)
-	{
-#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
-		debug::DebugLock lock(implicitProdMutex);
-#endif
-		// Try to re-use one first
-		for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
-			if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) {
-				bool expected = true;
-				if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) {
-					// We caught one! It's been marked as activated, the caller can have it
-					return ptr;
-				}
-			}
-		}
-
-		return add_producer(isExplicit ? static_cast<ProducerBase*>(create<ExplicitProducer>(this)) : create<ImplicitProducer>(this));
-	}
-	
-	ProducerBase* add_producer(ProducerBase* producer)
-	{
-		// Handle failed memory allocation
-		if (producer == nullptr) {
-			return nullptr;
-		}
-		
-		producerCount.fetch_add(1, std::memory_order_relaxed);
-		
-		// Add it to the lock-free list
-		auto prevTail = producerListTail.load(std::memory_order_relaxed);
-		do {
-			producer->next = prevTail;
-		} while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed));
-		
 #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
-		if (producer->isExplicit) {
-			auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed);
-			do {
-				static_cast<ExplicitProducer*>(producer)->nextExplicitProducer = prevTailExplicit;
-			} while (!explicitProducers.compare_exchange_weak(prevTailExplicit, static_cast<ExplicitProducer*>(producer), std::memory_order_release, std::memory_order_relaxed));
-		}
-		else {
-			auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed);
-			do {
-				static_cast<ImplicitProducer*>(producer)->nextImplicitProducer = prevTailImplicit;
-			} while (!implicitProducers.compare_exchange_weak(prevTailImplicit, static_cast<ImplicitProducer*>(producer), std::memory_order_release, std::memory_order_relaxed));
-		}
-#endif
-		
-		return producer;
-	}
-	
-	void reown_producers()
-	{
-		// After another instance is moved-into/swapped-with this one, all the
-		// producers we stole still think their parents are the other queue.
-		// So fix them up!
-		for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) {
-			ptr->parent = this;
-		}
-	}
-	
-	
-	//////////////////////////////////
-	// Implicit producer hash
-	//////////////////////////////////
-	
-	struct ImplicitProducerKVP
-	{
-		std::atomic<details::thread_id_t> key;
-		ImplicitProducer* value;		// No need for atomicity since it's only read by the thread that sets it in the first place
-		
-		ImplicitProducerKVP() : value(nullptr) { }
-		
-		ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT
-		{
-			key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed);
-			value = other.value;
-		}
-		
-		inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT
-		{
-			swap(other);
-			return *this;
-		}
-		
-		inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT
-		{
-			if (this != &other) {
-				details::swap_relaxed(key, other.key);
-				std::swap(value, other.value);
-			}
-		}
-	};
-	
-	template<typename XT, typename XTraits>
-	friend void moodycamel::swap(typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP&, typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT;
-	
-	struct ImplicitProducerHash
-	{
-		size_t capacity;
-		ImplicitProducerKVP* entries;
-		ImplicitProducerHash* prev;
-	};
-	
-	inline void populate_initial_implicit_producer_hash()
-	{
-		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) {
-			return;
-		}
-		else {
-			implicitProducerHashCount.store(0, std::memory_order_relaxed);
-			auto hash = &initialImplicitProducerHash;
-			hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE;
-			hash->entries = &initialImplicitProducerHashEntries[0];
-			for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) {
-				initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed);
-			}
-			hash->prev = nullptr;
-			implicitProducerHash.store(hash, std::memory_order_relaxed);
-		}
-	}
-	
-	void swap_implicit_producer_hashes(ConcurrentQueue& other)
-	{
-		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) {
-			return;
-		}
-		else {
-			// Swap (assumes our implicit producer hash is initialized)
-			initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries);
-			initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0];
-			other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0];
-			
-			details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount);
-			
-			details::swap_relaxed(implicitProducerHash, other.implicitProducerHash);
-			if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) {
-				implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed);
-			}
-			else {
-				ImplicitProducerHash* hash;
-				for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) {
-					continue;
-				}
-				hash->prev = &initialImplicitProducerHash;
-			}
-			if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) {
-				other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed);
-			}
-			else {
-				ImplicitProducerHash* hash;
-				for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) {
-					continue;
-				}
-				hash->prev = &other.initialImplicitProducerHash;
-			}
-		}
-	}
-	
-	// Only fails (returns nullptr) if memory allocation fails
-	ImplicitProducer* get_or_add_implicit_producer()
-	{
-		// Note that since the data is essentially thread-local (key is thread ID),
-		// there's a reduced need for fences (memory ordering is already consistent
-		// for any individual thread), except for the current table itself.
-		
-		// Start by looking for the thread ID in the current and all previous hash tables.
-		// If it's not found, it must not be in there yet, since this same thread would
-		// have added it previously to one of the tables that we traversed.
-		
-		// Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table
-		
-#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
-		debug::DebugLock lock(implicitProdMutex);
-#endif
-		
-		auto id = details::thread_id();
-		auto hashedId = details::hash_thread_id(id);
-		
-		auto mainHash = implicitProducerHash.load(std::memory_order_acquire);
-		assert(mainHash != nullptr);  // silence clang-tidy and MSVC warnings (hash cannot be null)
-		for (auto hash = mainHash; hash != nullptr; hash = hash->prev) {
-			// Look for the id in this hash
-			auto index = hashedId;
-			while (true) {		// Not an infinite loop because at least one slot is free in the hash table
-				index &= hash->capacity - 1u;
-				
-				auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed);
-				if (probedKey == id) {
-					// Found it! If we had to search several hashes deep, though, we should lazily add it
-					// to the current main hash table to avoid the extended search next time.
-					// Note there's guaranteed to be room in the current hash table since every subsequent
-					// table implicitly reserves space for all previous tables (there's only one
-					// implicitProducerHashCount).
-					auto value = hash->entries[index].value;
-					if (hash != mainHash) {
-						index = hashedId;
-						while (true) {
-							index &= mainHash->capacity - 1u;
-							auto empty = details::invalid_thread_id;
-#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
-							auto reusable = details::invalid_thread_id2;
-							if (mainHash->entries[index].key.compare_exchange_strong(empty,    id, std::memory_order_seq_cst, std::memory_order_relaxed) ||
-								mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) {
-#else
-							if (mainHash->entries[index].key.compare_exchange_strong(empty,    id, std::memory_order_seq_cst, std::memory_order_relaxed)) {
-#endif
-								mainHash->entries[index].value = value;
-								break;
-							}
-							++index;
-						}
-					}
-					
-					return value;
-				}
-				if (probedKey == details::invalid_thread_id) {
-					break;		// Not in this hash table
-				}
-				++index;
-			}
-		}
-		
-		// Insert!
-		auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed);
-		while (true) {
-			// NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
-			if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) {
-				// We've acquired the resize lock, try to allocate a bigger hash table.
-				// Note the acquire fence synchronizes with the release fence at the end of this block, and hence when
-				// we reload implicitProducerHash it must be the most recent version (it only gets changed within this
-				// locked block).
-				mainHash = implicitProducerHash.load(std::memory_order_acquire);
-				if (newCount >= (mainHash->capacity >> 1)) {
-					size_t newCapacity = mainHash->capacity << 1;
-					while (newCount >= (newCapacity >> 1)) {
-						newCapacity <<= 1;
-					}
-					auto raw = static_cast<char*>((Traits::malloc)(sizeof(ImplicitProducerHash) + std::alignment_of<ImplicitProducerKVP>::value - 1 + sizeof(ImplicitProducerKVP) * newCapacity));
-					if (raw == nullptr) {
-						// Allocation failed
-						implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed);
-						implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
-						return nullptr;
-					}
-					
-					auto newHash = new (raw) ImplicitProducerHash;
-					newHash->capacity = static_cast<size_t>(newCapacity);
-					newHash->entries = reinterpret_cast<ImplicitProducerKVP*>(details::align_for<ImplicitProducerKVP>(raw + sizeof(ImplicitProducerHash)));
-					for (size_t i = 0; i != newCapacity; ++i) {
-						new (newHash->entries + i) ImplicitProducerKVP;
-						newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed);
-					}
-					newHash->prev = mainHash;
-					implicitProducerHash.store(newHash, std::memory_order_release);
-					implicitProducerHashResizeInProgress.clear(std::memory_order_release);
-					mainHash = newHash;
-				}
-				else {
-					implicitProducerHashResizeInProgress.clear(std::memory_order_release);
-				}
-			}
-			
-			// If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table
-			// to finish being allocated by another thread (and if we just finished allocating above, the condition will
-			// always be true)
-			if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) {
-				auto producer = static_cast<ImplicitProducer*>(recycle_or_create_producer(false));
-				if (producer == nullptr) {
-					implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed);
-					return nullptr;
-				}
-				
-#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
-				producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback;
-				producer->threadExitListener.userData = producer;
-				details::ThreadExitNotifier::subscribe(&producer->threadExitListener);
-#endif
-				
-				auto index = hashedId;
-				while (true) {
-					index &= mainHash->capacity - 1u;
-					auto empty = details::invalid_thread_id;
-#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
-					auto reusable = details::invalid_thread_id2;
-					if (mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) {
-						implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed);  // already counted as a used slot
-						mainHash->entries[index].value = producer;
-						break;
-					}
-#endif
-					if (mainHash->entries[index].key.compare_exchange_strong(empty,    id, std::memory_order_seq_cst, std::memory_order_relaxed)) {
-						mainHash->entries[index].value = producer;
-						break;
-					}
-					++index;
-				}
-				return producer;
-			}
-			
-			// Hmm, the old hash is quite full and somebody else is busy allocating a new one.
-			// We need to wait for the allocating thread to finish (if it succeeds, we add, if not,
-			// we try to allocate ourselves).
-			mainHash = implicitProducerHash.load(std::memory_order_acquire);
-		}
-	}
-	
-#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
-	void implicit_producer_thread_exited(ImplicitProducer* producer)
-	{
-		// Remove from hash
-#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
-		debug::DebugLock lock(implicitProdMutex);
-#endif
-		auto hash = implicitProducerHash.load(std::memory_order_acquire);
-		assert(hash != nullptr);		// The thread exit listener is only registered if we were added to a hash in the first place
-		auto id = details::thread_id();
-		auto hashedId = details::hash_thread_id(id);
-		details::thread_id_t probedKey;
-		
-		// We need to traverse all the hashes just in case other threads aren't on the current one yet and are
-		// trying to add an entry thinking there's a free slot (because they reused a producer)
-		for (; hash != nullptr; hash = hash->prev) {
-			auto index = hashedId;
-			do {
-				index &= hash->capacity - 1u;
-				probedKey = id;
-				if (hash->entries[index].key.compare_exchange_strong(probedKey, details::invalid_thread_id2, std::memory_order_seq_cst, std::memory_order_relaxed)) {
-					break;
-				}
-				++index;
-			} while (probedKey != details::invalid_thread_id);		// Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place
-		}
-		
-		// Mark the queue as being recyclable
-		producer->inactive.store(true, std::memory_order_release);
-	}
-	
-	static void implicit_producer_thread_exited_callback(void* userData)
-	{
-		auto producer = static_cast<ImplicitProducer*>(userData);
-		auto queue = producer->parent;
-		queue->implicit_producer_thread_exited(producer);
-	}
-#endif
-	
-	//////////////////////////////////
-	// Utility functions
-	//////////////////////////////////
-
-	template<typename TAlign>
-	static inline void* aligned_malloc(size_t size)
-	{
-		MOODYCAMEL_CONSTEXPR_IF (std::alignment_of<TAlign>::value <= std::alignment_of<details::max_align_t>::value)
-			return (Traits::malloc)(size);
-		else {
-			size_t alignment = std::alignment_of<TAlign>::value;
-			void* raw = (Traits::malloc)(size + alignment - 1 + sizeof(void*));
-			if (!raw)
-				return nullptr;
-			char* ptr = details::align_for<TAlign>(reinterpret_cast<char*>(raw) + sizeof(void*));
-			*(reinterpret_cast<void**>(ptr) - 1) = raw;
-			return ptr;
-		}
-	}
-
-	template<typename TAlign>
-	static inline void aligned_free(void* ptr)
-	{
-		MOODYCAMEL_CONSTEXPR_IF (std::alignment_of<TAlign>::value <= std::alignment_of<details::max_align_t>::value)
-			return (Traits::free)(ptr);
-		else
-			(Traits::free)(ptr ? *(reinterpret_cast<void**>(ptr) - 1) : nullptr);
-	}
-
-	template<typename U>
-	static inline U* create_array(size_t count)
-	{
-		assert(count > 0);
-		U* p = static_cast<U*>(aligned_malloc<U>(sizeof(U) * count));
-		if (p == nullptr)
-			return nullptr;
-
-		for (size_t i = 0; i != count; ++i)
-			new (p + i) U();
-		return p;
-	}
-
-	template<typename U>
-	static inline void destroy_array(U* p, size_t count)
-	{
-		if (p != nullptr) {
-			assert(count > 0);
-			for (size_t i = count; i != 0; )
-				(p + --i)->~U();
-		}
-		aligned_free<U>(p);
-	}
-
-	template<typename U>
-	static inline U* create()
-	{
-		void* p = aligned_malloc<U>(sizeof(U));
-		return p != nullptr ? new (p) U : nullptr;
-	}
-
-	template<typename U, typename A1>
-	static inline U* create(A1&& a1)
-	{
-		void* p = aligned_malloc<U>(sizeof(U));
-		return p != nullptr ? new (p) U(std::forward<A1>(a1)) : nullptr;
-	}
-
-	template<typename U>
-	static inline void destroy(U* p)
-	{
-		if (p != nullptr)
-			p->~U();
-		aligned_free<U>(p);
-	}
-
-private:
-	std::atomic<ProducerBase*> producerListTail;
-	std::atomic<std::uint32_t> producerCount;
-	
-	std::atomic<size_t> initialBlockPoolIndex;
-	Block* initialBlockPool;
-	size_t initialBlockPoolSize;
-	
-#ifndef MCDBGQ_USEDEBUGFREELIST
-	FreeList<Block> freeList;
-#else
-	debug::DebugFreeList<Block> freeList;
-#endif
-	
-	std::atomic<ImplicitProducerHash*> implicitProducerHash;
-	std::atomic<size_t> implicitProducerHashCount;		// Number of slots logically used
-	ImplicitProducerHash initialImplicitProducerHash;
-	std::array<ImplicitProducerKVP, INITIAL_IMPLICIT_PRODUCER_HASH_SIZE> initialImplicitProducerHashEntries;
-	std::atomic_flag implicitProducerHashResizeInProgress;
-	
-	std::atomic<std::uint32_t> nextExplicitConsumerId;
-	std::atomic<std::uint32_t> globalExplicitConsumerOffset;
-	
-#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
-	debug::DebugMutex implicitProdMutex;
+    explicitProducers.store(
+        other.explicitProducers.load(std::memory_order_relaxed),
+        std::memory_order_relaxed);
+    other.explicitProducers.store(nullptr, std::memory_order_relaxed);
+    implicitProducers.store(
+        other.implicitProducers.load(std::memory_order_relaxed),
+        std::memory_order_relaxed);
+    other.implicitProducers.store(nullptr, std::memory_order_relaxed);
 #endif
-	
+
+    other.initialBlockPoolIndex.store(0, std::memory_order_relaxed);
+    other.initialBlockPoolSize = 0;
+    other.initialBlockPool = nullptr;
+
+    reown_producers();
+  }
+
+  inline ConcurrentQueue &operator=(ConcurrentQueue &&other)
+      MOODYCAMEL_NOEXCEPT {
+    return swap_internal(other);  // move-assign is a swap: other gets our state
+  }
+
+  // Swaps this queue's state with the other's (a self-swap does nothing).
+  // Not thread-safe. Swapping two queues does not invalidate their tokens;
+  // however, tokens created for one queue must afterwards be used only
+  // with the queue its state was swapped into (i.e. tokens are tied to
+  // the queue's movable state, not to the object itself).
+  inline void swap(ConcurrentQueue &other) MOODYCAMEL_NOEXCEPT {
+    swap_internal(other);  // the returned reference is not needed here
+  }
+
+ private:
+  ConcurrentQueue &swap_internal(ConcurrentQueue &other) {  // swap + move-assign
+    if (this == &other) {  // self-swap: nothing to exchange
+      return *this;
+    }
+
+    details::swap_relaxed(producerListTail, other.producerListTail);
+    details::swap_relaxed(producerCount, other.producerCount);
+    details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex);
+    std::swap(initialBlockPool, other.initialBlockPool);
+    std::swap(initialBlockPoolSize, other.initialBlockPoolSize);
+    freeList.swap(other.freeList);
+    details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId);
+    details::swap_relaxed(globalExplicitConsumerOffset,
+                          other.globalExplicitConsumerOffset);
+
+    swap_implicit_producer_hashes(other);
+
+    reown_producers();  // the producer lists changed hands above
+    other.reown_producers();
+
 #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
-	std::atomic<ExplicitProducer*> explicitProducers;
-	std::atomic<ImplicitProducer*> implicitProducers;
+    details::swap_relaxed(explicitProducers, other.explicitProducers);
+    details::swap_relaxed(implicitProducers, other.implicitProducers);
 #endif
-};
 
+    return *this;
+  }
 
-template<typename T, typename Traits>
-ProducerToken::ProducerToken(ConcurrentQueue<T, Traits>& queue)
-	: producer(queue.recycle_or_create_producer(true))
-{
-	if (producer != nullptr) {
-		producer->token = this;
-	}
-}
+ public:
+  // Enqueues a single item (by copying it).
+  // Allocates memory if required. Only fails if memory allocation fails (or
+  // implicit production is disabled because
+  // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or
+  // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+  // Thread-safe.
+  inline bool enqueue(T const &item) {
+    MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
+    return false;  // implicit production is disabled: always fails
+    else return inner_enqueue<CanAlloc>(item);
+  }
 
-template<typename T, typename Traits>
-ProducerToken::ProducerToken(BlockingConcurrentQueue<T, Traits>& queue)
-	: producer(reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->recycle_or_create_producer(true))
-{
-	if (producer != nullptr) {
-		producer->token = this;
-	}
-}
+  // Enqueues a single item (by moving it, if possible).
+  // Allocates memory if required. Only fails if memory allocation fails (or
+  // implicit production is disabled because
+  // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or
+  // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+  // Thread-safe.
+  inline bool enqueue(T &&item) {
+    MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
+    return false;  // implicit production is disabled: always fails
+    else return inner_enqueue<CanAlloc>(std::move(item));
+  }
 
-template<typename T, typename Traits>
-ConsumerToken::ConsumerToken(ConcurrentQueue<T, Traits>& queue)
-	: itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
-{
-	initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
-	lastKnownGlobalOffset = static_cast<std::uint32_t>(-1);
-}
+  // Enqueues a single item (by copying it) using an explicit producer token.
+  // Allocates memory if required. Only fails if memory allocation fails (or
+  // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+  // Thread-safe.
+  inline bool enqueue(producer_token_t const &token, T const &item) {
+    return inner_enqueue<CanAlloc>(token, item);  // CanAlloc: may allocate
+  }
 
-template<typename T, typename Traits>
-ConsumerToken::ConsumerToken(BlockingConcurrentQueue<T, Traits>& queue)
-	: itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
-{
-	initialOffset = reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
-	lastKnownGlobalOffset = static_cast<std::uint32_t>(-1);
-}
+  // Enqueues a single item (by moving it, if possible) using an explicit
+  // producer token. Allocates memory if required. Only fails if memory
+  // allocation fails (or Traits::MAX_SUBQUEUE_SIZE has been defined and would
+  // be surpassed). Thread-safe.
+  inline bool enqueue(producer_token_t const &token, T &&item) {
+    return inner_enqueue<CanAlloc>(token, std::move(item));  // may allocate
+  }
 
-template<typename T, typename Traits>
-inline void swap(ConcurrentQueue<T, Traits>& a, ConcurrentQueue<T, Traits>& b) MOODYCAMEL_NOEXCEPT
-{
-	a.swap(b);
-}
+  // Enqueues several items.
+  // Allocates memory if required. Only fails if memory allocation fails (or
+  // implicit production is disabled because
+  // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or
+  // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). Note:
+  // Use std::make_move_iterator if the elements should be moved instead of
+  // copied. Thread-safe.
+  template <typename It>
+  bool enqueue_bulk(It itemFirst, size_t count) {
+    MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
+    return false;  // implicit production is disabled: always fails
+    else return inner_enqueue_bulk<CanAlloc>(itemFirst, count);
+  }
 
-inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT
-{
-	a.swap(b);
-}
+  // Enqueues several items using an explicit producer token.
+  // Allocates memory if required. Only fails if memory allocation fails
+  // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+  // Note: Use std::make_move_iterator if the elements should be moved
+  // instead of copied.
+  // Thread-safe.
+  template <typename It>
+  bool enqueue_bulk(producer_token_t const &token, It itemFirst, size_t count) {
+    return inner_enqueue_bulk<CanAlloc>(token, itemFirst, count);
+  }
 
-inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT
-{
-	a.swap(b);
-}
+  // Enqueues a single item (by copying it).
+  // Does not allocate memory (except for one-time implicit producer). Fails if
+  // not enough room to enqueue (or implicit production is disabled because
+  // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+  // Thread-safe.
+  inline bool try_enqueue(T const &item) {
+    MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
+    return false;
+    else return inner_enqueue<CannotAlloc>(item);
+  }
 
-template<typename T, typename Traits>
-inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a, typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT
-{
-	a.swap(b);
-}
+  // Enqueues a single item (by moving it, if possible).
+  // Does not allocate memory (except for one-time implicit producer).
+  // Fails if not enough room to enqueue (or implicit production is
+  // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+  // Thread-safe.
+  inline bool try_enqueue(T &&item) {
+    MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
+    return false;
+    else return inner_enqueue<CannotAlloc>(std::move(item));
+  }
 
-}
+  // Enqueues a single item (by copying it) using an explicit producer token.
+  // Does not allocate memory. Fails if not enough room to enqueue.
+  // Thread-safe.
+  inline bool try_enqueue(producer_token_t const &token, T const &item) {
+    return inner_enqueue<CannotAlloc>(token, item);
+  }
+
+  // Enqueues a single item (by moving it, if possible) using an explicit
+  // producer token. Does not allocate memory. Fails if not enough room to
+  // enqueue. Thread-safe.
+  inline bool try_enqueue(producer_token_t const &token, T &&item) {
+    return inner_enqueue<CannotAlloc>(token, std::move(item));
+  }
+
+  // Enqueues several items.
+  // Does not allocate memory (except for one-time implicit producer).
+  // Fails if not enough room to enqueue (or implicit production is
+  // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+  // Note: Use std::make_move_iterator if the elements should be moved
+  // instead of copied.
+  // Thread-safe.
+  template <typename It>
+  bool try_enqueue_bulk(It itemFirst, size_t count) {
+    MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
+    return false;
+    else return inner_enqueue_bulk<CannotAlloc>(itemFirst, count);
+  }
+
+  // Enqueues several items using an explicit producer token.
+  // Does not allocate memory. Fails if not enough room to enqueue.
+  // Note: Use std::make_move_iterator if the elements should be moved
+  // instead of copied.
+  // Thread-safe.
+  template <typename It>
+  bool try_enqueue_bulk(producer_token_t const &token, It itemFirst,
+                        size_t count) {
+    return inner_enqueue_bulk<CannotAlloc>(token, itemFirst, count);
+  }
+
+
+  // Attempts to dequeue from the queue.
+  // Returns false if all producer streams appeared empty at the time they
+  // were checked (so, the queue is likely but not guaranteed to be empty).
+  // Never allocates. Thread-safe.
+  template <typename U>
+  bool try_dequeue(U &item) {
+    // Instead of simply trying each producer in turn (which could cause
+    // needless contention on the first producer), we score them heuristically.
+    size_t nonEmptyCount = 0;
+    ProducerBase *best = nullptr;
+    size_t bestSize = 0;
+    for (auto ptr = producerListTail.load(std::memory_order_acquire);
+         nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) {
+      auto size = ptr->size_approx();
+      if (size > 0) {
+        if (size > bestSize) {
+          bestSize = size;
+          best = ptr;
+        }
+        ++nonEmptyCount;
+      }
+    }
+
+    // If there was at least one non-empty queue but it appears empty at the
+    // time we try to dequeue from it, we need to make sure every queue's been
+    // tried
+    if (nonEmptyCount > 0) {
+      if ((details::likely)(best->dequeue(item))) {
+        return true;
+      }
+      for (auto ptr = producerListTail.load(std::memory_order_acquire);
+           ptr != nullptr; ptr = ptr->next_prod()) {
+        if (ptr != best && ptr->dequeue(item)) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  // Attempts to dequeue from the queue.
+  // Returns false if all producer streams appeared empty at the time they
+  // were checked (so, the queue is likely but not guaranteed to be empty).
+  // This differs from the try_dequeue(item) method in that this one does
+  // not attempt to reduce contention by interleaving the order that producer
+  // streams are dequeued from. So, using this method can reduce overall
+  // throughput under contention, but will give more predictable results in
+  // single-threaded consumer scenarios. This is mostly only useful for internal
+  // unit tests. Never allocates. Thread-safe.
+  template <typename U>
+  bool try_dequeue_non_interleaved(U &item) {
+    for (auto ptr = producerListTail.load(std::memory_order_acquire);
+         ptr != nullptr; ptr = ptr->next_prod()) {
+      if (ptr->dequeue(item)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Attempts to dequeue from the queue using an explicit consumer token.
+  // Returns false if all producer streams appeared empty at the time they
+  // were checked (so, the queue is likely but not guaranteed to be empty).
+  // Never allocates. Thread-safe.
+  template <typename U>
+  bool try_dequeue(consumer_token_t &token, U &item) {
+    // The idea is roughly as follows:
+    // - Every EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE items taken
+    //   from one producer, make everyone rotate (bump the global offset), so
+    //   the fastest consumer more or less dictates everyone's rotation speed.
+    // - If the global offset has changed, reset the consumption counter and
+    //   move to the designated producer.
+    // - If there are no items there, keep moving until a producer with some
+    //   items is found.
+    // - If the global offset has not changed but the current producer has run
+    //   dry, move on from the current position to the next non-empty producer.
+
+    if (token.desiredProducer == nullptr ||
+        token.lastKnownGlobalOffset !=
+            globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {
+      if (!update_current_producer_after_rotation(token)) {
+        return false;
+      }
+    }
+
+    // Try the current producer first. Hitting the per-producer quota bumps the
+    // global offset so that consumers rotate; if the current producer is empty
+    // instead, fall through and scan the other producers (wrapping to tail).
+    if (static_cast<ProducerBase *>(token.currentProducer)->dequeue(item)) {
+      if (++token.itemsConsumedFromCurrent ==
+          EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {
+        globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);
+      }
+      return true;
+    }
+
+    auto tail = producerListTail.load(std::memory_order_acquire);
+    auto ptr = static_cast<ProducerBase *>(token.currentProducer)->next_prod();
+    if (ptr == nullptr) {
+      ptr = tail;
+    }
+    while (ptr != static_cast<ProducerBase *>(token.currentProducer)) {
+      if (ptr->dequeue(item)) {
+        token.currentProducer = ptr;
+        token.itemsConsumedFromCurrent = 1;
+        return true;
+      }
+      ptr = ptr->next_prod();
+      if (ptr == nullptr) {
+        ptr = tail;
+      }
+    }
+    return false;
+  }
+
+  // Attempts to dequeue several elements from the queue.
+  // Returns the number of items actually dequeued.
+  // Returns 0 if all producer streams appeared empty at the time they
+  // were checked (so, the queue is likely but not guaranteed to be empty).
+  // Never allocates. Thread-safe.
+  template <typename It>
+  size_t try_dequeue_bulk(It itemFirst, size_t max) {
+    size_t count = 0;
+    for (auto ptr = producerListTail.load(std::memory_order_acquire);
+         ptr != nullptr; ptr = ptr->next_prod()) {
+      count += ptr->dequeue_bulk(itemFirst, max - count);
+      if (count == max) {
+        break;
+      }
+    }
+    return count;
+  }
+
+  // Attempts to dequeue several elements from the queue using an explicit
+  // consumer token. Returns the number of items actually dequeued. Returns 0 if
+  // all producer streams appeared empty at the time they were checked (so, the
+  // queue is likely but not guaranteed to be empty). Never allocates.
+  // Thread-safe.
+  template <typename It>
+  size_t try_dequeue_bulk(consumer_token_t &token, It itemFirst, size_t max) {
+    if (token.desiredProducer == nullptr ||
+        token.lastKnownGlobalOffset !=
+            globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {
+      if (!update_current_producer_after_rotation(token)) {
+        return 0;
+      }
+    }
+
+    size_t count = static_cast<ProducerBase *>(token.currentProducer)
+                       ->dequeue_bulk(itemFirst, max);
+    if (count == max) {
+      if ((token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(max)) >=
+          EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {
+        globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);
+      }
+      return max;
+    }
+    token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(count);
+    max -= count;
+
+    auto tail = producerListTail.load(std::memory_order_acquire);
+    auto ptr = static_cast<ProducerBase *>(token.currentProducer)->next_prod();
+    if (ptr == nullptr) {
+      ptr = tail;
+    }
+    while (ptr != static_cast<ProducerBase *>(token.currentProducer)) {
+      auto dequeued = ptr->dequeue_bulk(itemFirst, max);
+      count += dequeued;
+      if (dequeued != 0) {
+        token.currentProducer = ptr;
+        token.itemsConsumedFromCurrent = static_cast<std::uint32_t>(dequeued);
+      }
+      if (dequeued == max) {
+        break;
+      }
+      max -= dequeued;
+      ptr = ptr->next_prod();
+      if (ptr == nullptr) {
+        ptr = tail;
+      }
+    }
+    return count;
+  }
+
+
+  // Attempts to dequeue from a specific producer's inner queue.
+  // If you happen to know which producer you want to dequeue from, this
+  // is significantly faster than using the general-case try_dequeue methods.
+  // Returns false if the producer's queue appeared empty at the time it
+  // was checked (so, the queue is likely but not guaranteed to be empty).
+  // Never allocates. Thread-safe.
+  template <typename U>
+  inline bool try_dequeue_from_producer(producer_token_t const &producer,
+                                        U &item) {
+    return static_cast<ExplicitProducer *>(producer.producer)->dequeue(item);
+  }
+
+  // Attempts to dequeue several elements from a specific producer's inner
+  // queue. Returns the number of items actually dequeued. If you happen to know
+  // which producer you want to dequeue from, this is significantly faster than
+  // using the general-case try_dequeue methods. Returns 0 if the producer's
+  // queue appeared empty at the time it was checked (so, the queue is likely
+  // but not guaranteed to be empty). Never allocates. Thread-safe.
+  template <typename It>
+  inline size_t try_dequeue_bulk_from_producer(producer_token_t const &producer,
+                                               It itemFirst, size_t max) {
+    return static_cast<ExplicitProducer *>(producer.producer)
+        ->dequeue_bulk(itemFirst, max);
+  }
+
+
+  // Returns an estimate of the total number of elements currently in the queue.
+  // This estimate is only accurate if the queue has completely stabilized
+  // before it is called (i.e. all enqueue and dequeue operations have completed
+  // and their memory effects are visible on the calling thread, and no further
+  // operations start while this method is being called). Thread-safe.
+  size_t size_approx() const {
+    size_t size = 0;
+    for (auto ptr = producerListTail.load(std::memory_order_acquire);
+         ptr != nullptr; ptr = ptr->next_prod()) {
+      size += ptr->size_approx();
+    }
+    return size;
+  }
+
+
+  // Returns true if the underlying atomic variables used by
+  // the queue are lock-free (they should be on most platforms).
+  // Thread-safe.
+  static constexpr bool is_lock_free() {
+    return details::static_is_lock_free<bool>::value == 2 &&
+           details::static_is_lock_free<size_t>::value == 2 &&
+           details::static_is_lock_free<std::uint32_t>::value == 2 &&
+           details::static_is_lock_free<index_t>::value == 2 &&
+           details::static_is_lock_free<void *>::value == 2 &&
+           details::static_is_lock_free<typename details::thread_id_converter<
+               details::thread_id_t>::thread_id_numeric_size_t>::value == 2;
+  }
+
+
+ private:
+  friend struct ProducerToken;
+  friend struct ConsumerToken;
+  struct ExplicitProducer;
+  friend struct ExplicitProducer;
+  struct ImplicitProducer;
+  friend struct ImplicitProducer;
+  friend class ConcurrentQueueTests;
+
+  enum AllocationMode { CanAlloc, CannotAlloc };
+
+
+  ///////////////////////////////
+  // Queue methods
+  ///////////////////////////////
+
+  template <AllocationMode canAlloc, typename U>
+  inline bool inner_enqueue(producer_token_t const &token, U &&element) {
+    return static_cast<ExplicitProducer *>(token.producer)
+        ->ConcurrentQueue::ExplicitProducer::template enqueue<canAlloc>(
+            std::forward<U>(element));
+  }
+
+  template <AllocationMode canAlloc, typename U>
+  inline bool inner_enqueue(U &&element) {
+    auto producer = get_or_add_implicit_producer();
+    return producer == nullptr
+               ? false
+               : producer->ConcurrentQueue::ImplicitProducer::template enqueue<
+                     canAlloc>(std::forward<U>(element));
+  }
+
+  template <AllocationMode canAlloc, typename It>
+  inline bool inner_enqueue_bulk(producer_token_t const &token, It itemFirst,
+                                 size_t count) {
+    return static_cast<ExplicitProducer *>(token.producer)
+        ->ConcurrentQueue::ExplicitProducer::template enqueue_bulk<canAlloc>(
+            itemFirst, count);
+  }
+
+  template <AllocationMode canAlloc, typename It>
+  inline bool inner_enqueue_bulk(It itemFirst, size_t count) {
+    auto producer = get_or_add_implicit_producer();
+    return producer == nullptr
+               ? false
+               : producer->ConcurrentQueue::ImplicitProducer::
+                     template enqueue_bulk<canAlloc>(itemFirst, count);
+  }
+
+  inline bool update_current_producer_after_rotation(consumer_token_t &token) {
+    // Ah, there's been a rotation, figure out where we should be!
+    auto tail = producerListTail.load(std::memory_order_acquire);
+    if (token.desiredProducer == nullptr && tail == nullptr) {
+      return false;
+    }
+    auto prodCount = producerCount.load(std::memory_order_relaxed);
+    auto globalOffset =
+        globalExplicitConsumerOffset.load(std::memory_order_relaxed);
+    if ((details::unlikely)(token.desiredProducer == nullptr)) {
+      // Aha, first time we're dequeueing anything.
+      // Figure out our local position
+      // Note: offset is from start, not end, but we're traversing from end --
+      // subtract from count first
+      std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount);
+      token.desiredProducer = tail;
+      for (std::uint32_t i = 0; i != offset; ++i) {
+        token.desiredProducer =
+            static_cast<ProducerBase *>(token.desiredProducer)->next_prod();
+        if (token.desiredProducer == nullptr) {
+          token.desiredProducer = tail;
+        }
+      }
+    }
+
+    std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset;
+    if (delta >= prodCount) {
+      delta = delta % prodCount;
+    }
+    for (std::uint32_t i = 0; i != delta; ++i) {
+      token.desiredProducer =
+          static_cast<ProducerBase *>(token.desiredProducer)->next_prod();
+      if (token.desiredProducer == nullptr) {
+        token.desiredProducer = tail;
+      }
+    }
+
+    token.lastKnownGlobalOffset = globalOffset;
+    token.currentProducer = token.desiredProducer;
+    token.itemsConsumedFromCurrent = 0;
+    return true;
+  }
+
+
+  ///////////////////////////
+  // Free list
+  ///////////////////////////
+
+  template <typename N>
+  struct FreeListNode {
+    FreeListNode() : freeListRefs(0), freeListNext(nullptr) {}
+
+    std::atomic<std::uint32_t> freeListRefs;
+    std::atomic<N *> freeListNext;
+  };
+
+  // A simple CAS-based lock-free free list. Not the fastest thing in the world
+  // under heavy contention, but simple and correct (assuming nodes are never
+  // freed until after the free list is destroyed), and fairly speedy under low
+  // contention.
+  template <typename N>  // N must inherit FreeListNode or have the same fields
+                         // (and initialization of them)
+                         struct FreeList {
+    FreeList() : freeListHead(nullptr) {}
+    FreeList(FreeList &&other)
+        : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) {
+      other.freeListHead.store(nullptr, std::memory_order_relaxed);
+    }
+    void swap(FreeList &other) {
+      details::swap_relaxed(freeListHead, other.freeListHead);
+    }
+
+    FreeList(FreeList const &) MOODYCAMEL_DELETE_FUNCTION;
+    FreeList &operator=(FreeList const &) MOODYCAMEL_DELETE_FUNCTION;
+
+    inline void add(N *node) {
+#ifdef MCDBGQ_NOLOCKFREE_FREELIST
+      debug::DebugLock lock(mutex);
+#endif
+      // We know that the should-be-on-freelist bit is 0 at this point, so it's
+      // safe to set it using a fetch_add
+      if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST,
+                                       std::memory_order_acq_rel) == 0) {
+        // Oh look! We were the last ones referencing this node, and we know
+        // we want to add it to the free list, so let's do it!
+        add_knowing_refcount_is_zero(node);
+      }
+    }
+
+    inline N *try_get() {
+#ifdef MCDBGQ_NOLOCKFREE_FREELIST
+      debug::DebugLock lock(mutex);
+#endif
+      auto head = freeListHead.load(std::memory_order_acquire);
+      while (head != nullptr) {
+        auto prevHead = head;
+        auto refs = head->freeListRefs.load(std::memory_order_relaxed);
+        if ((refs & REFS_MASK) == 0 ||
+            !head->freeListRefs.compare_exchange_strong(
+                refs, refs + 1, std::memory_order_acquire)) {
+          head = freeListHead.load(std::memory_order_acquire);
+          continue;
+        }
+
+        // Good, reference count has been incremented (it wasn't at zero), which
+        // means we can read the next and not worry about it changing between
+        // now and the time we do the CAS
+        auto next = head->freeListNext.load(std::memory_order_relaxed);
+        if (freeListHead.compare_exchange_strong(head, next,
+                                                 std::memory_order_acquire,
+                                                 std::memory_order_relaxed)) {
+          // Yay, got the node. This means it was on the list, which means
+          // shouldBeOnFreeList must be false no matter the refcount (because
+          // nobody else knows it's been taken off yet, it can't have been put
+          // back on).
+          assert((head->freeListRefs.load(std::memory_order_relaxed) &
+                  SHOULD_BE_ON_FREELIST) == 0);
+
+          // Decrease refcount twice, once for our ref, and once for the list's
+          // ref
+          head->freeListRefs.fetch_sub(2, std::memory_order_release);
+          return head;
+        }
+
+        // OK, the head must have changed on us, but we still need to decrease
+        // the refcount we increased. Note that we don't need to release any
+        // memory effects, but we do need to ensure that the reference count
+        // decrement happens-after the CAS on the head.
+        refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel);
+        if (refs == SHOULD_BE_ON_FREELIST + 1) {
+          add_knowing_refcount_is_zero(prevHead);
+        }
+      }
+
+      return nullptr;
+    }
+
+    // Useful for traversing the list when there's no contention (e.g. to
+    // destroy remaining nodes)
+    N *head_unsafe() const {
+      return freeListHead.load(std::memory_order_relaxed);
+    }
+
+   private:
+    inline void add_knowing_refcount_is_zero(N *node) {
+      // Since the refcount is zero, and nobody can increase it once it's zero
+      // (except us, and we run only one copy of this method per node at a time,
+      // i.e. the single thread case), then we know we can safely change the
+      // next pointer of the node; however, once the refcount is back above
+      // zero, then other threads could increase it (happens under heavy
+      // contention, when the refcount goes to zero in between a load and a
+      // refcount increment of a node in try_get, then back up to something
+      // non-zero, then the refcount increment is done by the other thread) --
+      // so, if the CAS to add the node to the actual list fails, decrease the
+      // refcount and leave the add operation to the next thread who puts the
+      // refcount back at zero (which could be us, hence the loop).
+      auto head = freeListHead.load(std::memory_order_relaxed);
+      while (true) {
+        node->freeListNext.store(head, std::memory_order_relaxed);
+        node->freeListRefs.store(1, std::memory_order_release);
+        if (!freeListHead.compare_exchange_strong(head, node,
+                                                  std::memory_order_release,
+                                                  std::memory_order_relaxed)) {
+          // Hmm, the add failed, but we can only try again when the refcount
+          // goes back to zero
+          if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1,
+                                           std::memory_order_acq_rel) == 1) {
+            continue;
+          }
+        }
+        return;
+      }
+    }
+
+   private:
+    // Implemented like a stack, but where node order doesn't matter (nodes are
+    // inserted out of order under contention)
+    std::atomic<N *> freeListHead;
+
+    static const std::uint32_t REFS_MASK = 0x7FFFFFFF;
+    static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000;
+
+#ifdef MCDBGQ_NOLOCKFREE_FREELIST
+    debug::DebugMutex mutex;
+#endif
+  };
+
+
+  ///////////////////////////
+  // Block
+  ///////////////////////////
+
+  enum InnerQueueContext { implicit_context = 0, explicit_context = 1 };
+
+  struct Block {
+    Block()
+        : next(nullptr),
+          elementsCompletelyDequeued(0),
+          freeListRefs(0),
+          freeListNext(nullptr),
+          dynamicallyAllocated(true) {
+#ifdef MCDBGQ_TRACKMEM
+      owner = nullptr;
+#endif
+    }
+
+    template <InnerQueueContext context>
+    inline bool is_empty() const {
+      MOODYCAMEL_CONSTEXPR_IF(context == explicit_context &&
+                              BLOCK_SIZE <=
+                                  EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+        // Check flags
+        for (size_t i = 0; i < BLOCK_SIZE; ++i) {
+          if (!emptyFlags[i].load(std::memory_order_relaxed)) {
+            return false;
+          }
+        }
+
+        // Aha, empty; make sure we have all other memory effects that happened
+        // before the empty flags were set
+        std::atomic_thread_fence(std::memory_order_acquire);
+        return true;
+      }
+      else {
+        // Check counter
+        if (elementsCompletelyDequeued.load(std::memory_order_relaxed) ==
+            BLOCK_SIZE) {
+          std::atomic_thread_fence(std::memory_order_acquire);
+          return true;
+        }
+        assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <=
+               BLOCK_SIZE);
+        return false;
+      }
+    }
+
+    // Returns true if the block is now empty (does not apply in explicit
+    // context)
+    template <InnerQueueContext context>
+    inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) {
+      MOODYCAMEL_CONSTEXPR_IF(context == explicit_context &&
+                              BLOCK_SIZE <=
+                                  EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+        // Set flag
+        assert(!emptyFlags[BLOCK_SIZE - 1 -
+                           static_cast<size_t>(
+                               i & static_cast<index_t>(BLOCK_SIZE - 1))]
+                    .load(std::memory_order_relaxed));
+        emptyFlags[BLOCK_SIZE - 1 -
+                   static_cast<size_t>(i &
+                                       static_cast<index_t>(BLOCK_SIZE - 1))]
+            .store(true, std::memory_order_release);
+        return false;
+      }
+      else {
+        // Increment counter
+        auto prevVal =
+            elementsCompletelyDequeued.fetch_add(1, std::memory_order_acq_rel);
+        assert(prevVal < BLOCK_SIZE);
+        return prevVal == BLOCK_SIZE - 1;
+      }
+    }
+
+    // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping
+    // and count > 0). Returns true if the block is now empty (does not apply in
+    // explicit context).
+    template <InnerQueueContext context>
+    inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i,
+                               size_t count) {
+      MOODYCAMEL_CONSTEXPR_IF(context == explicit_context &&
+                              BLOCK_SIZE <=
+                                  EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+        // Set flags
+        std::atomic_thread_fence(std::memory_order_release);
+        i = BLOCK_SIZE - 1 -
+            static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1)) -
+            count + 1;
+        for (size_t j = 0; j != count; ++j) {
+          assert(!emptyFlags[i + j].load(std::memory_order_relaxed));
+          emptyFlags[i + j].store(true, std::memory_order_relaxed);
+        }
+        return false;
+      }
+      else {
+        // Increment counter
+        auto prevVal = elementsCompletelyDequeued.fetch_add(
+            count, std::memory_order_acq_rel);
+        assert(prevVal + count <= BLOCK_SIZE);
+        return prevVal + count == BLOCK_SIZE;
+      }
+    }
+
+    template <InnerQueueContext context>
+    inline void set_all_empty() {
+      MOODYCAMEL_CONSTEXPR_IF(context == explicit_context &&
+                              BLOCK_SIZE <=
+                                  EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+        // Set all flags
+        for (size_t i = 0; i != BLOCK_SIZE; ++i) {
+          emptyFlags[i].store(true, std::memory_order_relaxed);
+        }
+      }
+      else {
+        // Reset counter
+        elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed);
+      }
+    }
+
+    template <InnerQueueContext context>
+    inline void reset_empty() {
+      MOODYCAMEL_CONSTEXPR_IF(context == explicit_context &&
+                              BLOCK_SIZE <=
+                                  EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+        // Reset flags
+        for (size_t i = 0; i != BLOCK_SIZE; ++i) {
+          emptyFlags[i].store(false, std::memory_order_relaxed);
+        }
+      }
+      else {
+        // Reset counter
+        elementsCompletelyDequeued.store(0, std::memory_order_relaxed);
+      }
+    }
+
+    inline T *operator[](index_t idx) MOODYCAMEL_NOEXCEPT {
+      return static_cast<T *>(static_cast<void *>(elements)) +
+             static_cast<size_t>(idx & static_cast<index_t>(BLOCK_SIZE - 1));
+    }
+    inline T const *operator[](index_t idx) const MOODYCAMEL_NOEXCEPT {
+      return static_cast<T const *>(static_cast<void const *>(elements)) +
+             static_cast<size_t>(idx & static_cast<index_t>(BLOCK_SIZE - 1));
+    }
+
+   private:
+    static_assert(std::alignment_of<T>::value <= sizeof(T),
+                  "The queue does not support types with an alignment greater "
+                  "than their size at this time");
+    MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements;
+
+   public:
+    Block *next;
+    std::atomic<size_t> elementsCompletelyDequeued;
+    std::atomic<bool> emptyFlags
+        [BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? BLOCK_SIZE : 1];
+
+   public:
+    std::atomic<std::uint32_t> freeListRefs;
+    std::atomic<Block *> freeListNext;
+    bool dynamicallyAllocated;  // Perhaps a better name for this would be
+                                // 'isNotPartOfInitialBlockPool'
+
+#ifdef MCDBGQ_TRACKMEM
+    void *owner;
+#endif
+  };
+  static_assert(std::alignment_of<Block>::value >= std::alignment_of<T>::value,
+                "Internal error: Blocks must be at least as aligned as the "
+                "type they are wrapping");
+
+
+#ifdef MCDBGQ_TRACKMEM
+ public:
+  struct MemStats;
+
+ private:
+#endif
+
+  ///////////////////////////
+  // Producer base
+  ///////////////////////////
+
+  struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase {
+    ProducerBase(ConcurrentQueue *parent_, bool isExplicit_)
+        : tailIndex(0),
+          headIndex(0),
+          dequeueOptimisticCount(0),
+          dequeueOvercommit(0),
+          tailBlock(nullptr),
+          isExplicit(isExplicit_),
+          parent(parent_) {}
+
+    virtual ~ProducerBase() {}
+
+    template <typename U>
+    inline bool dequeue(U &element) {
+      if (isExplicit) {
+        return static_cast<ExplicitProducer *>(this)->dequeue(element);
+      } else {
+        return static_cast<ImplicitProducer *>(this)->dequeue(element);
+      }
+    }
+
+    template <typename It>
+    inline size_t dequeue_bulk(It &itemFirst, size_t max) {
+      if (isExplicit) {
+        return static_cast<ExplicitProducer *>(this)->dequeue_bulk(itemFirst,
+                                                                   max);
+      } else {
+        return static_cast<ImplicitProducer *>(this)->dequeue_bulk(itemFirst,
+                                                                   max);
+      }
+    }
+
+    inline ProducerBase *next_prod() const {
+      return static_cast<ProducerBase *>(next);
+    }
+
+    inline size_t size_approx() const {
+      auto tail = tailIndex.load(std::memory_order_relaxed);
+      auto head = headIndex.load(std::memory_order_relaxed);
+      return details::circular_less_than(head, tail)
+                 ? static_cast<size_t>(tail - head)
+                 : 0;
+    }
+
+    inline index_t getTail() const {
+      return tailIndex.load(std::memory_order_relaxed);
+    }
+
+   protected:
+    std::atomic<index_t> tailIndex;  // Where to enqueue to next
+    std::atomic<index_t> headIndex;  // Where to dequeue from next
+
+    std::atomic<index_t> dequeueOptimisticCount;
+    std::atomic<index_t> dequeueOvercommit;
+
+    Block *tailBlock;
+
+   public:
+    bool isExplicit;
+    ConcurrentQueue *parent;
+
+   protected:
+#ifdef MCDBGQ_TRACKMEM
+    friend struct MemStats;
+#endif
+  };
+
+
+  ///////////////////////////
+  // Explicit queue
+  ///////////////////////////
+
+  struct ExplicitProducer : public ProducerBase {
+    explicit ExplicitProducer(ConcurrentQueue *parent_)
+        : ProducerBase(parent_, true),
+          blockIndex(nullptr),
+          pr_blockIndexSlotsUsed(0),
+          pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1),
+          pr_blockIndexFront(0),
+          pr_blockIndexEntries(nullptr),
+          pr_blockIndexRaw(nullptr) {
+      // The index size starts at half its intended value; take the larger of
+      // that default and half the pool size rounded up to a power of two.
+      const size_t halfPoolPow2 =
+          details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1;
+      if (pr_blockIndexSize < halfPoolPow2) pr_blockIndexSize = halfPoolPow2;
+      // Creating the index doubles the entry count stored above, which
+      // yields EXPLICIT_INITIAL_INDEX_SIZE entries in the default case.
+      new_block_index(0);
+    }
+
+    ~ExplicitProducer() {
+      // Three steps: destruct the elements not yet dequeued, hand every block
+      // to the parent's free list, then free the chain of block indices. In a
+      // destructor each element is fully dequeued or fully not (no halfways).
+      if (this->tailBlock !=
+          nullptr) {  // Note this means there must be a block index too
+        // First find the block that's partially dequeued, if any
+        Block *halfDequeuedBlock = nullptr;
+        if ((this->headIndex.load(std::memory_order_relaxed) &
+             static_cast<index_t>(BLOCK_SIZE - 1)) != 0) {
+          // The head's not on a block boundary, meaning a block somewhere is
+          // partially dequeued (or the head block is the tail block and was
+          // fully dequeued, but the head/tail are still not on a boundary)
+          size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) &
+                     (pr_blockIndexSize - 1);  // front minus used, wrapped
+          while (details::circular_less_than<index_t>(
+              pr_blockIndexEntries[i].base + BLOCK_SIZE,
+              this->headIndex.load(std::memory_order_relaxed))) {
+            i = (i + 1) & (pr_blockIndexSize - 1);  // next slot, wrapping
+          }
+          assert(details::circular_less_than<index_t>(
+              pr_blockIndexEntries[i].base,
+              this->headIndex.load(std::memory_order_relaxed)));
+          halfDequeuedBlock = pr_blockIndexEntries[i].block;
+        }
+
+        // Start at the head block (note the first line in the loop gives us the
+        // head from the tail on the first iteration)
+        auto block = this->tailBlock;
+        do {
+          block = block->next;
+          if (block->ConcurrentQueue::Block::template is_empty<
+                  explicit_context>()) {
+            continue;
+          }
+
+          size_t i = 0;  // Offset into block
+          if (block == halfDequeuedBlock) {
+            i = static_cast<size_t>(
+                this->headIndex.load(std::memory_order_relaxed) &
+                static_cast<index_t>(BLOCK_SIZE - 1));
+          }
+
+          // Walk through all the items in the block; if this is the tail block,
+          // we need to stop when we reach the tail index
+          auto lastValidIndex =
+              (this->tailIndex.load(std::memory_order_relaxed) &
+               static_cast<index_t>(BLOCK_SIZE - 1)) == 0
+                  ? BLOCK_SIZE
+                  : static_cast<size_t>(
+                        this->tailIndex.load(std::memory_order_relaxed) &
+                        static_cast<index_t>(BLOCK_SIZE - 1));
+          while (i != BLOCK_SIZE &&
+                 (block != this->tailBlock || i != lastValidIndex)) {
+            (*block)[i++]->~T();
+          }
+        } while (block != this->tailBlock);
+      }
+
+      // Return all blocks that we own to the parent's free list
+      if (this->tailBlock != nullptr) {
+        auto block = this->tailBlock;
+        do {
+          auto nextBlock = block->next;  // read the link before releasing
+          this->parent->add_block_to_free_list(block);
+          block = nextBlock;
+        } while (block != this->tailBlock);
+      }
+
+      // Destroy the block indices, following each header's prev link
+      auto header = static_cast<BlockIndexHeader *>(pr_blockIndexRaw);
+      while (header != nullptr) {
+        auto prev = static_cast<BlockIndexHeader *>(header->prev);
+        header->~BlockIndexHeader();
+        (Traits::free)(header);
+        header = prev;
+      }
+    }
+
+    template <AllocationMode allocMode, typename U>
+    inline bool enqueue(U &&element) {  // true iff element was enqueued
+      index_t currentTailIndex =
+          this->tailIndex.load(std::memory_order_relaxed);
+      index_t newTailIndex = 1 + currentTailIndex;
+      if ((currentTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+        // We reached the end of a block, start a new one
+        auto startBlock = this->tailBlock;
+        auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed;
+        if (this->tailBlock != nullptr &&
+            this->tailBlock->next->ConcurrentQueue::Block::template is_empty<
+                explicit_context>()) {
+          // We can re-use the block ahead of us, it's empty!
+          this->tailBlock = this->tailBlock->next;
+          this->tailBlock->ConcurrentQueue::Block::template reset_empty<
+              explicit_context>();
+
+          // We'll put the block on the block index (guaranteed to be room since
+          // we're conceptually removing the last block from it first -- except
+          // instead of removing then adding, we can just overwrite). Note that
+          // there must be a valid block index here, since even if allocation
+          // failed in the ctor, it would have been re-attempted when adding the
+          // first block to the queue; since there is such a block, a block
+          // index must have been successfully allocated.
+        } else {
+          // Whatever head value we see here is >= the last value we saw here
+          // (relatively), and <= its current value. Since we have the most
+          // recent tail, the head must be <= to it, which the assert below
+          // checks.
+          auto head = this->headIndex.load(std::memory_order_relaxed);
+          assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+          if (!details::circular_less_than<index_t>(
+                  head, currentTailIndex + BLOCK_SIZE) ||
+              (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value &&
+               (MAX_SUBQUEUE_SIZE == 0 ||
+                MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) {
+            // We can't enqueue in another block because there's not enough
+            // leeway -- the tail could surpass the head by the time the block
+            // fills up! (Or we'll exceed the size limit, if the second part of
+            // the condition was true.)
+            return false;
+          }
+          // We're going to need a new block; check that the block index has
+          // room
+          if (pr_blockIndexRaw == nullptr ||
+              pr_blockIndexSlotsUsed == pr_blockIndexSize) {
+            // Hmm, the circular block index is already full -- we'll need
+            // to allocate a new index. Note pr_blockIndexRaw can only be
+            // nullptr if the initial allocation failed in the constructor.
+
+            MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) {
+              return false;
+            }
+            else if (!new_block_index(pr_blockIndexSlotsUsed)) {
+              return false;
+            }
+          }
+
+          // Insert a new block in the circular linked list
+          auto newBlock =
+              this->parent
+                  ->ConcurrentQueue::template requisition_block<allocMode>();
+          if (newBlock == nullptr) {
+            return false;
+          }
+#ifdef MCDBGQ_TRACKMEM
+          newBlock->owner = this;
+#endif
+          newBlock->ConcurrentQueue::Block::template reset_empty<
+              explicit_context>();
+          if (this->tailBlock == nullptr) {
+            newBlock->next = newBlock;  // first block: a ring of one
+          } else {
+            newBlock->next = this->tailBlock->next;
+            this->tailBlock->next = newBlock;
+          }
+          this->tailBlock = newBlock;
+          ++pr_blockIndexSlotsUsed;
+        }
+
+        MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR(
+            T, U,
+            new (static_cast<T *>(nullptr)) T(std::forward<U>(element)))) {
+          // The constructor may throw. We want the element not to appear in the
+          // queue in that case (without corrupting the queue):
+          MOODYCAMEL_TRY {
+            new ((*this->tailBlock)[currentTailIndex])
+                T(std::forward<U>(element));
+          }
+          MOODYCAMEL_CATCH(...) {
+            // Revert change to the current block, but leave the new block
+            // available for next time
+            pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+            this->tailBlock =
+                startBlock == nullptr ? this->tailBlock : startBlock;
+            MOODYCAMEL_RETHROW;
+          }
+        }
+        else {
+          (void)startBlock;
+          (void)originalBlockIndexSlotsUsed;
+        }
+
+        // Add block to block index
+        auto &entry = blockIndex.load(std::memory_order_relaxed)
+                          ->entries[pr_blockIndexFront];
+        entry.base = currentTailIndex;
+        entry.block = this->tailBlock;
+        blockIndex.load(std::memory_order_relaxed)
+            ->front.store(pr_blockIndexFront, std::memory_order_release);
+        pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+
+        MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR(
+            T, U,
+            new (static_cast<T *>(nullptr)) T(std::forward<U>(element)))) {
+          this->tailIndex.store(newTailIndex, std::memory_order_release);
+          return true;
+        }
+      }
+
+      // Enqueue (skipped when the element was already constructed above)
+      new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
+
+      this->tailIndex.store(newTailIndex, std::memory_order_release);
+      return true;
+    }
+
+    template <typename U>
+    bool dequeue(U &element) {  // true iff an item was moved into element
+      auto tail = this->tailIndex.load(std::memory_order_relaxed);
+      auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+      if (details::circular_less_than<index_t>(
+              this->dequeueOptimisticCount.load(std::memory_order_relaxed) -
+                  overcommit,
+              tail)) {
+        // Might be something to dequeue, let's give it a try
+
+        // Note that this if is purely for performance purposes in the common
+        // case when the queue is empty and the values are eventually consistent
+        // -- we may enter here spuriously.
+
+        // Note that whatever the values of overcommit and tail are, they are
+        // not going to change (unless we change them) and must be the same
+        // value at this point (inside the if) as when the if condition was
+        // evaluated.
+
+        // We insert an acquire fence here to synchronize-with the release upon
+        // incrementing dequeueOvercommit below. This ensures that whatever the
+        // value we loaded into overcommit, the load of dequeueOptimisticCount
+        // in the fetch_add below will result in a value at least as recent as
+        // that (and therefore at least as large). Note that I believe a
+        // compiler (signal) fence here would be sufficient due to the nature of
+        // fetch_add (all read-modify-write operations are guaranteed to work on
+        // the latest value in the modification order), but unfortunately that
+        // can't be shown to be correct using only the C++11 standard. See
+        // http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case
+        std::atomic_thread_fence(std::memory_order_acquire);
+
+        // Increment optimistic counter, then check if it went over the boundary
+        auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(
+            1, std::memory_order_relaxed);
+
+        // Note that since dequeueOvercommit must be <= dequeueOptimisticCount
+        // (because dequeueOvercommit is only ever incremented after
+        // dequeueOptimisticCount -- this is enforced in the `else` block
+        // below), and since we now have a version of dequeueOptimisticCount
+        // that is at least as recent as overcommit (due to the release upon
+        // incrementing dequeueOvercommit and the acquire above that
+        // synchronizes with it), overcommit <= myDequeueCount. However, we
+        // can't assert this since both dequeueOptimisticCount and
+        // dequeueOvercommit may (independently) overflow; in such a case,
+        // though, the logic still holds since the difference between the two is
+        // maintained.
+
+        // Note that we reload tail here in case it changed; it will be the same
+        // value as before or greater, since this load is sequenced after
+        // (happens after) the earlier load above. This is supported by
+        // read-read coherency (as defined in the standard), explained here:
+        // http://en.cppreference.com/w/cpp/atomic/memory_order
+        tail = this->tailIndex.load(std::memory_order_acquire);
+        if ((details::likely)(details::circular_less_than<index_t>(
+                myDequeueCount - overcommit, tail))) {
+          // Guaranteed to be at least one element to dequeue!
+
+          // Get the index. Note that since there's guaranteed to be at least
+          // one element, this will never exceed tail. We need to do an
+          // acquire-release fence here since it's possible that whatever
+          // condition got us to this point was for an earlier enqueued element
+          // (that we already see the memory effects for), but that by the time
+          // we increment somebody else has incremented it, and we need to see
+          // the memory effects for *that* element, which in such a case is
+          // necessarily visible on the thread that incremented it in the first
+          // place with the more current condition (they must have acquired a
+          // tail that is at least as recent).
+          auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel);
+
+
+          // Determine which block the element is in
+
+          auto localBlockIndex = blockIndex.load(std::memory_order_acquire);
+          auto localBlockIndexHead =
+              localBlockIndex->front.load(std::memory_order_acquire);
+
+          // We need to be careful here about subtracting and dividing because
+          // of index wrap-around. When an index wraps, we need to preserve the
+          // sign of the offset when dividing it by the block size (in order to
+          // get a correct signed block count offset in all cases):
+          auto headBase = localBlockIndex->entries[localBlockIndexHead].base;
+          auto blockBaseIndex = index & ~static_cast<index_t>(BLOCK_SIZE - 1);
+          auto offset = static_cast<size_t>(
+              static_cast<typename std::make_signed<index_t>::type>(
+                  blockBaseIndex - headBase) /
+              static_cast<typename std::make_signed<index_t>::type>(
+                  BLOCK_SIZE));
+          auto block = localBlockIndex
+                           ->entries[(localBlockIndexHead + offset) &
+                                     (localBlockIndex->size - 1)]
+                           .block;
+
+          // Dequeue
+          auto &el = *((*block)[index]);
+          if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T &&, element = std::move(el))) {
+            // Make sure the element is still fully dequeued and destroyed even
+            // if the assignment throws
+            struct Guard {
+              Block *block;
+              index_t index;
+
+              ~Guard() {
+                (*block)[index]->~T();
+                block->ConcurrentQueue::Block::template set_empty<
+                    explicit_context>(index);
+              }
+            } guard = {block, index};
+
+            element = std::move(el);  // NOLINT
+          } else {
+            element = std::move(el);  // NOLINT
+            el.~T();                  // NOLINT
+            block->ConcurrentQueue::Block::template set_empty<explicit_context>(
+                index);
+          }
+
+          return true;
+        } else {
+          // Wasn't anything to dequeue after all; make the effective dequeue
+          // count eventually consistent
+          this->dequeueOvercommit.fetch_add(
+              1, std::memory_order_release);  // Release so that the fetch_add
+                                              // on dequeueOptimisticCount is
+                                              // guaranteed to happen before
+                                              // this write
+        }
+      }
+
+      return false;
+    }
+
+    template <AllocationMode allocMode, typename It>
+    bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) {
+      // All-or-nothing enqueue of `count` items read from itemFirst. First make
+      // sure there is room for all of them: pre-allocate blocks and put them in
+      // the block index (but only if all the allocations succeeded).
+      index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+      auto startBlock = this->tailBlock;
+      auto originalBlockIndexFront = pr_blockIndexFront;
+      auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed;
+
+      Block *firstAllocatedBlock = nullptr;
+
+      // Figure out how many blocks we'll need to allocate, and do so
+      size_t blockBaseDiff =
+          ((startTailIndex + count - 1) &
+           ~static_cast<index_t>(BLOCK_SIZE - 1)) -
+          ((startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1));
+      index_t currentTailIndex =
+          (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+      if (blockBaseDiff > 0) {
+        // Allocate as many blocks as possible from ahead
+        while (blockBaseDiff > 0 && this->tailBlock != nullptr &&
+               this->tailBlock->next != firstAllocatedBlock &&
+               this->tailBlock->next->ConcurrentQueue::Block::template is_empty<
+                   explicit_context>()) {
+          blockBaseDiff -= static_cast<index_t>(BLOCK_SIZE);
+          currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+
+          this->tailBlock = this->tailBlock->next;
+          firstAllocatedBlock = firstAllocatedBlock == nullptr
+                                    ? this->tailBlock
+                                    : firstAllocatedBlock;
+
+          auto &entry = blockIndex.load(std::memory_order_relaxed)
+                            ->entries[pr_blockIndexFront];
+          entry.base = currentTailIndex;
+          entry.block = this->tailBlock;
+          pr_blockIndexFront =
+              (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+        }
+
+        // Now allocate as many blocks as necessary from the block pool
+        while (blockBaseDiff > 0) {
+          blockBaseDiff -= static_cast<index_t>(BLOCK_SIZE);
+          currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+
+          auto head = this->headIndex.load(std::memory_order_relaxed);
+          assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+          bool full =
+              !details::circular_less_than<index_t>(
+                  head, currentTailIndex + BLOCK_SIZE) ||
+              (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value &&
+               (MAX_SUBQUEUE_SIZE == 0 ||
+                MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head));
+          if (pr_blockIndexRaw == nullptr ||
+              pr_blockIndexSlotsUsed == pr_blockIndexSize || full) {
+            MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) {
+              // Failed to allocate, undo changes (but keep injected blocks)
+              pr_blockIndexFront = originalBlockIndexFront;
+              pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+              this->tailBlock =
+                  startBlock == nullptr ? firstAllocatedBlock : startBlock;
+              return false;
+            }
+            else if (full || !new_block_index(originalBlockIndexSlotsUsed)) {
+              // Failed to allocate, undo changes (but keep injected blocks)
+              pr_blockIndexFront = originalBlockIndexFront;
+              pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+              this->tailBlock =
+                  startBlock == nullptr ? firstAllocatedBlock : startBlock;
+              return false;
+            }
+
+            // pr_blockIndexFront is updated inside new_block_index, so we need
+            // to update our fallback value too (since we keep the new index
+            // even if we later fail)
+            originalBlockIndexFront = originalBlockIndexSlotsUsed;
+          }
+
+          // Insert a new block in the circular linked list
+          auto newBlock =
+              this->parent
+                  ->ConcurrentQueue::template requisition_block<allocMode>();
+          if (newBlock == nullptr) {  // no block available: undo and fail
+            pr_blockIndexFront = originalBlockIndexFront;
+            pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+            this->tailBlock =
+                startBlock == nullptr ? firstAllocatedBlock : startBlock;
+            return false;
+          }
+
+#ifdef MCDBGQ_TRACKMEM
+          newBlock->owner = this;
+#endif
+          newBlock->ConcurrentQueue::Block::template set_all_empty<
+              explicit_context>();
+          if (this->tailBlock == nullptr) {
+            newBlock->next = newBlock;  // first block: a ring of one
+          } else {
+            newBlock->next = this->tailBlock->next;
+            this->tailBlock->next = newBlock;
+          }
+          this->tailBlock = newBlock;
+          firstAllocatedBlock = firstAllocatedBlock == nullptr
+                                    ? this->tailBlock
+                                    : firstAllocatedBlock;
+
+          ++pr_blockIndexSlotsUsed;
+
+          auto &entry = blockIndex.load(std::memory_order_relaxed)
+                            ->entries[pr_blockIndexFront];
+          entry.base = currentTailIndex;
+          entry.block = this->tailBlock;
+          pr_blockIndexFront =
+              (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+        }
+
+        // Excellent, all allocations succeeded. Reset each block's emptiness
+        // before we fill them up, and publish the new block index front
+        auto block = firstAllocatedBlock;
+        while (true) {
+          block->ConcurrentQueue::Block::template reset_empty<
+              explicit_context>();
+          if (block == this->tailBlock) {
+            break;
+          }
+          block = block->next;
+        }
+
+        MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR(
+            T, decltype(*itemFirst),
+            new (static_cast<T *>(nullptr))
+                T(details::deref_noexcept(itemFirst)))) {
+          blockIndex.load(std::memory_order_relaxed)
+              ->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1),
+                            std::memory_order_release);
+        }
+      }
+
+      // Enqueue, one block at a time
+      index_t newTailIndex = startTailIndex + static_cast<index_t>(count);
+      currentTailIndex = startTailIndex;
+      auto endBlock = this->tailBlock;
+      this->tailBlock = startBlock;  // rewind; the loop below walks to endBlock
+      assert((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) != 0 ||
+             firstAllocatedBlock != nullptr || count == 0);
+      if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 &&
+          firstAllocatedBlock != nullptr) {
+        this->tailBlock = firstAllocatedBlock;
+      }
+      while (true) {
+        index_t stopIndex =
+            (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) +
+            static_cast<index_t>(BLOCK_SIZE);
+        if (details::circular_less_than<index_t>(newTailIndex, stopIndex)) {
+          stopIndex = newTailIndex;
+        }
+        MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR(
+            T, decltype(*itemFirst),
+            new (static_cast<T *>(nullptr))
+                T(details::deref_noexcept(itemFirst)))) {
+          while (currentTailIndex != stopIndex) {
+            new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++);
+          }
+        }
+        else {
+          MOODYCAMEL_TRY {
+            while (currentTailIndex != stopIndex) {
+              // Must use copy constructor even if move constructor is available
+              // because we may have to revert if there's an exception.
+              // Sorry about the horrible templated next line, but it was the
+              // only way to disable moving *at compile time*, which is
+              // important because a type may only define a (noexcept) move
+              // constructor, and so calls to the cctor will not compile, even
+              // if they are in an if branch that will never be executed
+              new ((*this->tailBlock)[currentTailIndex]) T(
+                  details::nomove_if<!MOODYCAMEL_NOEXCEPT_CTOR(
+                      T, decltype(*itemFirst),
+                      new (static_cast<T *>(nullptr)) T(details::deref_noexcept(
+                          itemFirst)))>::eval(*itemFirst));
+              ++currentTailIndex;
+              ++itemFirst;
+            }
+          }
+          MOODYCAMEL_CATCH(...) {
+            // Oh dear, an exception's been thrown -- destroy the elements that
+            // were enqueued so far and revert the entire bulk operation (we'll
+            // keep any allocated blocks in our linked list for later, though).
+            auto constructedStopIndex = currentTailIndex;
+            auto lastBlockEnqueued = this->tailBlock;
+
+            pr_blockIndexFront = originalBlockIndexFront;
+            pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+            this->tailBlock =
+                startBlock == nullptr ? firstAllocatedBlock : startBlock;
+
+            if (!details::is_trivially_destructible<T>::value) {
+              auto block = startBlock;
+              if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) ==
+                  0) {
+                block = firstAllocatedBlock;
+              }
+              currentTailIndex = startTailIndex;
+              while (true) {
+                stopIndex =
+                    (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) +
+                    static_cast<index_t>(BLOCK_SIZE);
+                if (details::circular_less_than<index_t>(constructedStopIndex,
+                                                         stopIndex)) {
+                  stopIndex = constructedStopIndex;
+                }
+                while (currentTailIndex != stopIndex) {
+                  (*block)[currentTailIndex++]->~T();
+                }
+                if (block == lastBlockEnqueued) {
+                  break;
+                }
+                block = block->next;
+              }
+            }
+            MOODYCAMEL_RETHROW;
+          }
+        }
+
+        if (this->tailBlock == endBlock) {
+          assert(currentTailIndex == newTailIndex);
+          break;
+        }
+        this->tailBlock = this->tailBlock->next;
+      }
+
+      MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR(
+          T, decltype(*itemFirst),
+          new (static_cast<T *>(nullptr))
+              T(details::deref_noexcept(itemFirst)))) {
+        if (firstAllocatedBlock != nullptr)
+          blockIndex.load(std::memory_order_relaxed)
+              ->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1),
+                            std::memory_order_release);
+      }
+
+      this->tailIndex.store(newTailIndex, std::memory_order_release);
+      return true;
+    }
+
+    template <typename It>
+    size_t dequeue_bulk(It &itemFirst, size_t max) {  // returns count dequeued
+      auto tail = this->tailIndex.load(std::memory_order_relaxed);
+      auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+      auto desiredCount = static_cast<size_t>(
+          tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) -
+                  overcommit));
+      if (details::circular_less_than<size_t>(0, desiredCount)) {
+        desiredCount = desiredCount < max ? desiredCount : max;  // cap at max
+        std::atomic_thread_fence(std::memory_order_acquire);
+
+        auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(
+            desiredCount, std::memory_order_relaxed);
+
+        tail = this->tailIndex.load(std::memory_order_acquire);
+        auto actualCount =
+            static_cast<size_t>(tail - (myDequeueCount - overcommit));
+        if (details::circular_less_than<size_t>(0, actualCount)) {
+          actualCount = desiredCount < actualCount ? desiredCount : actualCount;
+          if (actualCount < desiredCount) {  // record the part we over-claimed
+            this->dequeueOvercommit.fetch_add(desiredCount - actualCount,
+                                              std::memory_order_release);
+          }
+
+          // Get the first index. Note that since there's guaranteed to be at
+          // least actualCount elements, this will never exceed tail.
+          auto firstIndex =
+              this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel);
+
+          // Determine which block the first element is in
+          auto localBlockIndex = blockIndex.load(std::memory_order_acquire);
+          auto localBlockIndexHead =
+              localBlockIndex->front.load(std::memory_order_acquire);
+
+          auto headBase = localBlockIndex->entries[localBlockIndexHead].base;
+          auto firstBlockBaseIndex =
+              firstIndex & ~static_cast<index_t>(BLOCK_SIZE - 1);
+          auto offset = static_cast<size_t>(
+              static_cast<typename std::make_signed<index_t>::type>(
+                  firstBlockBaseIndex - headBase) /
+              static_cast<typename std::make_signed<index_t>::type>(
+                  BLOCK_SIZE));
+          auto indexIndex =
+              (localBlockIndexHead + offset) & (localBlockIndex->size - 1);
+
+          // Iterate the blocks and dequeue
+          auto index = firstIndex;
+          do {
+            auto firstIndexInBlock = index;
+            index_t endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) +
+                               static_cast<index_t>(BLOCK_SIZE);
+            endIndex =
+                details::circular_less_than<index_t>(
+                    firstIndex + static_cast<index_t>(actualCount), endIndex)
+                    ? firstIndex + static_cast<index_t>(actualCount)
+                    : endIndex;
+            auto block = localBlockIndex->entries[indexIndex].block;
+            if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T &&,
+                                           details::deref_noexcept(itemFirst) =
+                                               std::move((*(*block)[index])))) {
+              while (index != endIndex) {
+                auto &el = *((*block)[index]);
+                *itemFirst++ = std::move(el);
+                el.~T();
+                ++index;
+              }
+            } else {
+              MOODYCAMEL_TRY {
+                while (index != endIndex) {
+                  auto &el = *((*block)[index]);
+                  *itemFirst = std::move(el);
+                  ++itemFirst;
+                  el.~T();
+                  ++index;
+                }
+              }
+              MOODYCAMEL_CATCH(...) {
+                // It's too late to revert the dequeue, but we can make sure
+                // that all the dequeued objects are properly destroyed and the
+                // block index (and empty count) are properly updated before we
+                // propagate the exception
+                do {
+                  block = localBlockIndex->entries[indexIndex].block;
+                  while (index != endIndex) {
+                    (*block)[index++]->~T();
+                  }
+                  block->ConcurrentQueue::Block::template set_many_empty<
+                      explicit_context>(
+                      firstIndexInBlock,
+                      static_cast<size_t>(endIndex - firstIndexInBlock));
+                  indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1);
+
+                  firstIndexInBlock = index;
+                  endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) +
+                             static_cast<index_t>(BLOCK_SIZE);
+                  endIndex =
+                      details::circular_less_than<index_t>(
+                          firstIndex + static_cast<index_t>(actualCount),
+                          endIndex)
+                          ? firstIndex + static_cast<index_t>(actualCount)
+                          : endIndex;
+                } while (index != firstIndex + actualCount);
+
+                MOODYCAMEL_RETHROW;
+              }
+            }
+            block->ConcurrentQueue::Block::template set_many_empty<
+                explicit_context>(
+                firstIndexInBlock,
+                static_cast<size_t>(endIndex - firstIndexInBlock));
+            indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1);
+          } while (index != firstIndex + actualCount);
+
+          return actualCount;
+        } else {
+          // Wasn't anything to dequeue after all; make the effective dequeue
+          // count eventually consistent
+          this->dequeueOvercommit.fetch_add(desiredCount,
+                                            std::memory_order_release);
+        }
+      }
+
+      return 0;
+    }
+
+   private:
+    struct BlockIndexEntry {  // One slot of this producer's block index
+      index_t base;  // NOTE(review): presumably the first element index held by `block` -- confirm where entries are filled in
+      Block *block;  // Block referenced by this slot (read as entries[i].block when dequeuing)
+    };
+    // Header constructed at the start of each allocation made by new_block_index().
+    struct BlockIndexHeader {
+      size_t size;  // Number of entries; used as a wrap-around mask (size - 1)
+      std::atomic<size_t>
+          front;  // Current slot (not next, like pr_blockIndexFront)
+      BlockIndexEntry *entries;  // Aligned entry array that follows the header
+      void *prev;  // Raw pointer to the previous index allocation, kept so it can be freed later
+    };
+    // Doubles pr_blockIndexSize, copies the in-use entries into a newly
+    // allocated index and publishes it; returns false if the allocation fails.
+    bool new_block_index(size_t numberOfFilledSlotsToExpose) {
+      auto prevBlockSizeMask = pr_blockIndexSize - 1;
+
+      // Allocate header + alignment slack + the (doubled) entry array at once
+      pr_blockIndexSize <<= 1;
+      auto newRawPtr = static_cast<char *>(
+          (Traits::malloc)(sizeof(BlockIndexHeader) +
+                           std::alignment_of<BlockIndexEntry>::value - 1 +
+                           sizeof(BlockIndexEntry) * pr_blockIndexSize));
+      if (newRawPtr == nullptr) {
+        pr_blockIndexSize >>= 1;  // Reset to allow graceful retry
+        return false;
+      }
+
+      auto newBlockIndexEntries = reinterpret_cast<BlockIndexEntry *>(
+          details::align_for<BlockIndexEntry>(newRawPtr +
+                                              sizeof(BlockIndexHeader)));
+
+      // Copy in all the old indices, if any, so that they start at slot 0
+      size_t j = 0;
+      if (pr_blockIndexSlotsUsed != 0) {
+        auto i =
+            (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask;
+        do {
+          newBlockIndexEntries[j++] = pr_blockIndexEntries[i];
+          i = (i + 1) & prevBlockSizeMask;
+        } while (i != pr_blockIndexFront);
+      }
+
+      // Fill in the header, then switch the producer-side state over to it
+      auto header = new (newRawPtr) BlockIndexHeader;
+      header->size = pr_blockIndexSize;
+      header->front.store(numberOfFilledSlotsToExpose - 1,
+                          std::memory_order_relaxed);
+      header->entries = newBlockIndexEntries;
+      header->prev = pr_blockIndexRaw;  // we link the new block to the old one
+                                        // so we can free it later
+
+      pr_blockIndexFront = j;
+      pr_blockIndexEntries = newBlockIndexEntries;
+      pr_blockIndexRaw = newRawPtr;
+      blockIndex.store(header, std::memory_order_release);  // Publish last, after the header is fully written
+
+      return true;
+    }
+
+   private:
+    std::atomic<BlockIndexHeader *> blockIndex;
+
+    // To be used by the producer only -- the consumer must use the ones
+    // referenced by blockIndex
+    size_t pr_blockIndexSlotsUsed;
+    size_t pr_blockIndexSize;
+    size_t pr_blockIndexFront;  // Next slot (not current)
+    BlockIndexEntry *pr_blockIndexEntries;
+    void *pr_blockIndexRaw;
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+   public:
+    ExplicitProducer *nextExplicitProducer;
+
+   private:
+#endif
+
+#ifdef MCDBGQ_TRACKMEM
+    friend struct MemStats;
+#endif
+  };
+
+
+  //////////////////////////////////
+  // Implicit queue
+  //////////////////////////////////
+
+  struct ImplicitProducer : public ProducerBase {
+    ImplicitProducer(ConcurrentQueue *parent_)  // Builds the producer and tries to allocate its initial block index
+        : ProducerBase(parent_, false),
+          nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE),
+          blockIndex(nullptr) {
+      new_block_index();  // Result ignored: on failure blockIndex stays nullptr and insert_block_index_entry() returns false
+    }
+    // Destroys leftover elements, hands blocks back to the parent's free list and frees the chain of block indexes.
+    ~ImplicitProducer() {
+      // Note that since we're in the destructor we can assume that all
+      // enqueue/dequeue operations completed already; this means that all
+      // undequeued elements are placed contiguously across contiguous blocks,
+      // and that only the first and last remaining blocks can be only partially
+      // empty (all other remaining blocks must be completely full).
+
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+      // Unregister ourselves for thread termination notification
+      if (!this->inactive.load(std::memory_order_relaxed)) {
+        details::ThreadExitNotifier::unsubscribe(&threadExitListener);
+      }
+#endif
+
+      // Destroy all remaining elements!
+      auto tail = this->tailIndex.load(std::memory_order_relaxed);
+      auto index = this->headIndex.load(std::memory_order_relaxed);
+      Block *block = nullptr;
+      assert(index == tail || details::circular_less_than(index, tail));
+      bool forceFreeLastBlock =
+          index != tail;  // If we enter the loop, then the last (tail) block
+                          // will not be freed
+      while (index != tail) {
+        if ((index & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 ||
+            block == nullptr) {  // Crossed into a new block, or first iteration
+          if (block != nullptr) {
+            // Free the old block
+            this->parent->add_block_to_free_list(block);
+          }
+
+          block = get_block_index_entry_for_index(index)->value.load(
+              std::memory_order_relaxed);
+        }
+
+        ((*block)[index])->~T();
+        ++index;
+      }
+      // Even if the queue is empty, there's still one block that's not on the
+      // free list (unless the head index reached the end of it, in which case
+      // the tail will be poised to create a new block).
+      if (this->tailBlock != nullptr &&
+          (forceFreeLastBlock ||
+           (tail & static_cast<index_t>(BLOCK_SIZE - 1)) != 0)) {
+        this->parent->add_block_to_free_list(this->tailBlock);
+      }
+
+      // Destroy block index
+      auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);
+      if (localBlockIndex != nullptr) {
+        for (size_t i = 0; i != localBlockIndex->capacity; ++i) {  // The newest index[] also points at entries owned by older allocations
+          localBlockIndex->index[i]->~BlockIndexEntry();
+        }
+        do {  // Walk the prev chain, freeing each index allocation
+          auto prev = localBlockIndex->prev;
+          localBlockIndex->~BlockIndexHeader();
+          (Traits::free)(localBlockIndex);
+          localBlockIndex = prev;
+        } while (localBlockIndex != nullptr);
+      }
+    }
+    // Enqueues one element at the tail; returns false if the sub-queue is full or no index slot / block can be obtained.
+    template <AllocationMode allocMode, typename U>
+    inline bool enqueue(U &&element) {
+      index_t currentTailIndex =
+          this->tailIndex.load(std::memory_order_relaxed);
+      index_t newTailIndex = 1 + currentTailIndex;
+      if ((currentTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+        // We reached the end of a block, start a new one
+        auto head = this->headIndex.load(std::memory_order_relaxed);
+        assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+        if (!details::circular_less_than<index_t>(
+                head, currentTailIndex + BLOCK_SIZE) ||
+            (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value &&
+             (MAX_SUBQUEUE_SIZE == 0 ||
+              MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) {
+          return false;  // Index space exhausted or MAX_SUBQUEUE_SIZE would be exceeded
+        }
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+        debug::DebugLock lock(mutex);
+#endif
+        // Find out where we'll be inserting this block in the block index
+        BlockIndexEntry *idxEntry;
+        if (!insert_block_index_entry<allocMode>(idxEntry, currentTailIndex)) {
+          return false;
+        }
+
+        // Get ahold of a new block
+        auto newBlock =
+            this->parent
+                ->ConcurrentQueue::template requisition_block<allocMode>();
+        if (newBlock == nullptr) {
+          rewind_block_index_tail();  // Undo the index insertion made above
+          idxEntry->value.store(nullptr, std::memory_order_relaxed);
+          return false;
+        }
+#ifdef MCDBGQ_TRACKMEM
+        newBlock->owner = this;
+#endif
+        newBlock
+            ->ConcurrentQueue::Block::template reset_empty<implicit_context>();
+
+        MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR(
+            T, U,
+            new (static_cast<T *>(nullptr)) T(std::forward<U>(element)))) {
+          // May throw, try to insert now before we publish the fact that we
+          // have this new block
+          MOODYCAMEL_TRY {
+            new ((*newBlock)[currentTailIndex]) T(std::forward<U>(element));
+          }
+          MOODYCAMEL_CATCH(...) {
+            rewind_block_index_tail();
+            idxEntry->value.store(nullptr, std::memory_order_relaxed);
+            this->parent->add_block_to_free_list(newBlock);
+            MOODYCAMEL_RETHROW;
+          }
+        }
+
+        // Insert the new block into the index
+        idxEntry->value.store(newBlock, std::memory_order_relaxed);
+
+        this->tailBlock = newBlock;
+        // May-throw case: the element was already constructed above, so only publish the new tail
+        MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR(
+            T, U,
+            new (static_cast<T *>(nullptr)) T(std::forward<U>(element)))) {
+          this->tailIndex.store(newTailIndex, std::memory_order_release);
+          return true;
+        }
+      }
+
+      // Enqueue
+      new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
+
+      this->tailIndex.store(newTailIndex, std::memory_order_release);
+      return true;
+    }
+    // Moves one element into `element`; returns false if no element could be claimed.
+    template <typename U>
+    bool dequeue(U &element) {
+      // See ExplicitProducer::dequeue for rationale and explanation
+      index_t tail = this->tailIndex.load(std::memory_order_relaxed);
+      index_t overcommit =
+          this->dequeueOvercommit.load(std::memory_order_relaxed);
+      if (details::circular_less_than<index_t>(
+              this->dequeueOptimisticCount.load(std::memory_order_relaxed) -
+                  overcommit,
+              tail)) {
+        std::atomic_thread_fence(std::memory_order_acquire);
+
+        index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(
+            1, std::memory_order_relaxed);
+        tail = this->tailIndex.load(std::memory_order_acquire);
+        if ((details::likely)(details::circular_less_than<index_t>(
+                myDequeueCount - overcommit, tail))) {
+          index_t index =
+              this->headIndex.fetch_add(1, std::memory_order_acq_rel);
+
+          // Determine which block the element is in
+          auto entry = get_block_index_entry_for_index(index);
+
+          // Dequeue
+          auto block = entry->value.load(std::memory_order_relaxed);
+          auto &el = *((*block)[index]);
+
+          if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T &&, element = std::move(el))) {
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+            // Note: Acquiring the mutex with every dequeue instead of only when
+            // a block is released is very sub-optimal, but it is, after all,
+            // purely debug code.
+            debug::DebugLock lock(mutex);
+#endif
+            struct Guard {  // Destroys the slot and releases the block even if the move assignment throws
+              Block *block;
+              index_t index;
+              BlockIndexEntry *entry;
+              ConcurrentQueue *parent;
+
+              ~Guard() {
+                (*block)[index]->~T();
+                if (block->ConcurrentQueue::Block::template set_empty<
+                        implicit_context>(index)) {
+                  entry->value.store(nullptr, std::memory_order_relaxed);
+                  parent->add_block_to_free_list(block);
+                }
+              }
+            } guard = {block, index, entry, this->parent};
+
+            element = std::move(el);  // NOLINT
+          } else {
+            element = std::move(el);  // NOLINT
+            el.~T();                  // NOLINT
+
+            if (block->ConcurrentQueue::Block::template set_empty<
+                    implicit_context>(index)) {
+              {
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+                debug::DebugLock lock(mutex);
+#endif
+                // Add the block back into the global free pool (and remove from
+                // block index)
+                entry->value.store(nullptr, std::memory_order_relaxed);
+              }
+              this->parent->add_block_to_free_list(
+                  block);  // releases the above store
+            }
+          }
+
+          return true;
+        } else {
+          this->dequeueOvercommit.fetch_add(1, std::memory_order_release);  // Over-claimed: compensate the optimistic count
+        }
+      }
+
+      return false;
+    }
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4706)  // assignment within conditional expression
+#endif
+    template <AllocationMode allocMode, typename It>  // Enqueues `count` items read from itemFirst
+    bool enqueue_bulk(It itemFirst, size_t count) {
+      // First, we need to make sure we have enough room to enqueue all of the
+      // elements; this means pre-allocating blocks and putting them in the
+      // block index (but only if all the allocations succeeded).
+
+      // Note that the tailBlock we start off with may not be owned by us any
+      // more; this happens if it was filled up exactly to the top (setting
+      // tailIndex to the first index of the next block which is not yet
+      // allocated), then dequeued completely (putting it on the free list)
+      // before we enqueue again.
+      // Returns false, after undoing its index insertions and block requisitions, if room cannot be made.
+      index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+      auto startBlock = this->tailBlock;
+      Block *firstAllocatedBlock = nullptr;
+      auto endBlock = this->tailBlock;
+
+      // Figure out how many blocks we'll need to allocate, and do so
+      size_t blockBaseDiff =
+          ((startTailIndex + count - 1) &
+           ~static_cast<index_t>(BLOCK_SIZE - 1)) -
+          ((startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1));
+      index_t currentTailIndex =
+          (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+      if (blockBaseDiff > 0) {
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+        debug::DebugLock lock(mutex);
+#endif
+        do {
+          blockBaseDiff -= static_cast<index_t>(BLOCK_SIZE);
+          currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+
+          // Find out where we'll be inserting this block in the block index
+          BlockIndexEntry *idxEntry =
+              nullptr;  // initialization here unnecessary but compiler can't
+                        // always tell
+          Block *newBlock;
+          bool indexInserted = false;
+          auto head = this->headIndex.load(std::memory_order_relaxed);
+          assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+          bool full =
+              !details::circular_less_than<index_t>(
+                  head, currentTailIndex + BLOCK_SIZE) ||
+              (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value &&
+               (MAX_SUBQUEUE_SIZE == 0 ||
+                MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head));
+
+          if (full ||
+              !(indexInserted = insert_block_index_entry<allocMode>(
+                    idxEntry, currentTailIndex)) ||
+              (newBlock =
+                   this->parent->ConcurrentQueue::template requisition_block<
+                       allocMode>()) == nullptr) {
+            // Index allocation or block allocation failed; revert any other
+            // allocations and index insertions done so far for this operation
+            if (indexInserted) {
+              rewind_block_index_tail();
+              idxEntry->value.store(nullptr, std::memory_order_relaxed);
+            }
+            currentTailIndex =
+                (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+            for (auto block = firstAllocatedBlock; block != nullptr;
+                 block = block->next) {
+              currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+              idxEntry = get_block_index_entry_for_index(currentTailIndex);
+              idxEntry->value.store(nullptr, std::memory_order_relaxed);
+              rewind_block_index_tail();
+            }
+            this->parent->add_blocks_to_free_list(firstAllocatedBlock);
+            this->tailBlock = startBlock;
+
+            return false;
+          }
+
+#ifdef MCDBGQ_TRACKMEM
+          newBlock->owner = this;
+#endif
+          newBlock->ConcurrentQueue::Block::template reset_empty<
+              implicit_context>();
+          newBlock->next = nullptr;
+
+          // Insert the new block into the index
+          idxEntry->value.store(newBlock, std::memory_order_relaxed);
+
+          // Store the chain of blocks so that we can undo if later allocations
+          // fail, and so that we can find the blocks when we do the actual
+          // enqueueing
+          if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) != 0 ||
+              firstAllocatedBlock != nullptr) {
+            assert(this->tailBlock != nullptr);
+            this->tailBlock->next = newBlock;
+          }
+          this->tailBlock = newBlock;
+          endBlock = newBlock;
+          firstAllocatedBlock =
+              firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock;
+        } while (blockBaseDiff > 0);
+      }
+
+      // Enqueue, one block at a time
+      index_t newTailIndex = startTailIndex + static_cast<index_t>(count);
+      currentTailIndex = startTailIndex;
+      this->tailBlock = startBlock;
+      assert((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) != 0 ||
+             firstAllocatedBlock != nullptr || count == 0);
+      if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 &&
+          firstAllocatedBlock != nullptr) {
+        this->tailBlock = firstAllocatedBlock;  // Starting exactly on a block boundary: begin in the first new block
+      }
+      while (true) {
+        index_t stopIndex =
+            (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) +
+            static_cast<index_t>(BLOCK_SIZE);
+        if (details::circular_less_than<index_t>(newTailIndex, stopIndex)) {
+          stopIndex = newTailIndex;
+        }
+        MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR(
+            T, decltype(*itemFirst),
+            new (static_cast<T *>(nullptr))
+                T(details::deref_noexcept(itemFirst)))) {
+          while (currentTailIndex != stopIndex) {
+            new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++);
+          }
+        }
+        else {
+          MOODYCAMEL_TRY {
+            while (currentTailIndex != stopIndex) {
+              new ((*this->tailBlock)[currentTailIndex]) T(
+                  details::nomove_if<!MOODYCAMEL_NOEXCEPT_CTOR(
+                      T, decltype(*itemFirst),
+                      new (static_cast<T *>(nullptr)) T(details::deref_noexcept(
+                          itemFirst)))>::eval(*itemFirst));
+              ++currentTailIndex;
+              ++itemFirst;
+            }
+          }
+          MOODYCAMEL_CATCH(...) {
+            auto constructedStopIndex = currentTailIndex;
+            auto lastBlockEnqueued = this->tailBlock;
+            // Destroy every element constructed so far (skipped when ~T() is trivial)
+            if (!details::is_trivially_destructible<T>::value) {
+              auto block = startBlock;
+              if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) ==
+                  0) {
+                block = firstAllocatedBlock;
+              }
+              currentTailIndex = startTailIndex;
+              while (true) {
+                stopIndex =
+                    (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) +
+                    static_cast<index_t>(BLOCK_SIZE);
+                if (details::circular_less_than<index_t>(constructedStopIndex,
+                                                         stopIndex)) {
+                  stopIndex = constructedStopIndex;
+                }
+                while (currentTailIndex != stopIndex) {
+                  (*block)[currentTailIndex++]->~T();
+                }
+                if (block == lastBlockEnqueued) {
+                  break;
+                }
+                block = block->next;
+              }
+            }
+            // Undo the index insertions and give the newly requisitioned blocks back
+            currentTailIndex =
+                (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+            for (auto block = firstAllocatedBlock; block != nullptr;
+                 block = block->next) {
+              currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+              auto idxEntry = get_block_index_entry_for_index(currentTailIndex);
+              idxEntry->value.store(nullptr, std::memory_order_relaxed);
+              rewind_block_index_tail();
+            }
+            this->parent->add_blocks_to_free_list(firstAllocatedBlock);
+            this->tailBlock = startBlock;
+            MOODYCAMEL_RETHROW;
+          }
+        }
+
+        if (this->tailBlock == endBlock) {
+          assert(currentTailIndex == newTailIndex);
+          break;
+        }
+        this->tailBlock = this->tailBlock->next;
+      }
+      this->tailIndex.store(newTailIndex, std::memory_order_release);  // Publish all new elements at once
+      return true;
+    }
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+    // Moves up to `max` elements out through itemFirst; returns how many were dequeued (0 if none could be claimed).
+    template <typename It>
+    size_t dequeue_bulk(It &itemFirst, size_t max) {
+      auto tail = this->tailIndex.load(std::memory_order_relaxed);
+      auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+      auto desiredCount = static_cast<size_t>(
+          tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) -
+                  overcommit));
+      if (details::circular_less_than<size_t>(0, desiredCount)) {
+        desiredCount = desiredCount < max ? desiredCount : max;
+        std::atomic_thread_fence(std::memory_order_acquire);
+
+        auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(
+            desiredCount, std::memory_order_relaxed);
+
+        tail = this->tailIndex.load(std::memory_order_acquire);
+        auto actualCount =
+            static_cast<size_t>(tail - (myDequeueCount - overcommit));
+        if (details::circular_less_than<size_t>(0, actualCount)) {
+          actualCount = desiredCount < actualCount ? desiredCount : actualCount;
+          if (actualCount < desiredCount) {
+            this->dequeueOvercommit.fetch_add(desiredCount - actualCount,
+                                              std::memory_order_release);
+          }
+
+          // Get the first index. Note that since there's guaranteed to be at
+          // least actualCount elements, this will never exceed tail.
+          auto firstIndex =
+              this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel);
+
+          // Iterate the blocks and dequeue
+          auto index = firstIndex;
+          BlockIndexHeader *localBlockIndex;
+          auto indexIndex =
+              get_block_index_index_for_index(index, localBlockIndex);
+          do {
+            auto blockStartIndex = index;
+            index_t endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) +
+                               static_cast<index_t>(BLOCK_SIZE);
+            endIndex =
+                details::circular_less_than<index_t>(
+                    firstIndex + static_cast<index_t>(actualCount), endIndex)
+                    ? firstIndex + static_cast<index_t>(actualCount)
+                    : endIndex;
+            // endIndex is now the end of this block or of the claimed range, whichever comes first
+            auto entry = localBlockIndex->index[indexIndex];
+            auto block = entry->value.load(std::memory_order_relaxed);
+            if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T &&,
+                                           details::deref_noexcept(itemFirst) =
+                                               std::move((*(*block)[index])))) {
+              while (index != endIndex) {
+                auto &el = *((*block)[index]);
+                *itemFirst++ = std::move(el);
+                el.~T();
+                ++index;
+              }
+            } else {
+              MOODYCAMEL_TRY {
+                while (index != endIndex) {
+                  auto &el = *((*block)[index]);
+                  *itemFirst = std::move(el);
+                  ++itemFirst;
+                  el.~T();
+                  ++index;
+                }
+              }
+              MOODYCAMEL_CATCH(...) {
+                do {  // Destroy the rest of the claimed range and release its blocks before rethrowing
+                  entry = localBlockIndex->index[indexIndex];
+                  block = entry->value.load(std::memory_order_relaxed);
+                  while (index != endIndex) {
+                    (*block)[index++]->~T();
+                  }
+
+                  if (block->ConcurrentQueue::Block::template set_many_empty<
+                          implicit_context>(
+                          blockStartIndex,
+                          static_cast<size_t>(endIndex - blockStartIndex))) {
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+                    debug::DebugLock lock(mutex);
+#endif
+                    entry->value.store(nullptr, std::memory_order_relaxed);
+                    this->parent->add_block_to_free_list(block);
+                  }
+                  indexIndex =
+                      (indexIndex + 1) & (localBlockIndex->capacity - 1);
+
+                  blockStartIndex = index;
+                  endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) +
+                             static_cast<index_t>(BLOCK_SIZE);
+                  endIndex =
+                      details::circular_less_than<index_t>(
+                          firstIndex + static_cast<index_t>(actualCount),
+                          endIndex)
+                          ? firstIndex + static_cast<index_t>(actualCount)
+                          : endIndex;
+                } while (index != firstIndex + actualCount);
+
+                MOODYCAMEL_RETHROW;
+              }
+            }
+            if (block->ConcurrentQueue::Block::template set_many_empty<
+                    implicit_context>(
+                    blockStartIndex,
+                    static_cast<size_t>(endIndex - blockStartIndex))) {
+              {
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+                debug::DebugLock lock(mutex);
+#endif
+                // Note that the set_many_empty above did a release, meaning
+                // that anybody who acquires the block we're about to free can
+                // use it safely since our writes (and reads!) will have
+                // happened-before then.
+                entry->value.store(nullptr, std::memory_order_relaxed);
+              }
+              this->parent->add_block_to_free_list(
+                  block);  // releases the above store
+            }
+            indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1);
+          } while (index != firstIndex + actualCount);
+
+          return actualCount;
+        } else {
+          this->dequeueOvercommit.fetch_add(desiredCount,
+                                            std::memory_order_release);  // Nothing to take after all: compensate the optimistic count
+        }
+      }
+
+      return 0;
+    }
+
+   private:
+    // The block size must be > 1, so any number with the low bit set is an
+    // invalid block base index
+    static const index_t INVALID_BLOCK_BASE = 1;
+    // Index slot: `key` is a block's base index (INVALID_BLOCK_BASE until first used), `value` the block or nullptr.
+    struct BlockIndexEntry {
+      std::atomic<index_t> key;
+      std::atomic<Block *> value;
+    };
+    // Header constructed at the start of each allocation made by new_block_index().
+    struct BlockIndexHeader {
+      size_t capacity;  // Number of slots in `index`; used as a wrap-around mask (capacity - 1)
+      std::atomic<size_t> tail;  // Slot of the most recently inserted entry
+      BlockIndexEntry *entries;  // Entries allocated together with this header
+      BlockIndexEntry **index;  // Circular array of entry pointers, including entries carried over from `prev`
+      BlockIndexHeader *prev;  // Previous index allocation; the chain is freed by the destructor
+    };
+    // Claims the slot after the tail for blockStartIndex (growing the index if allowed); returns it through idxEntry.
+    template <AllocationMode allocMode>
+    inline bool insert_block_index_entry(BlockIndexEntry *&idxEntry,
+                                         index_t blockStartIndex) {
+      auto localBlockIndex =
+          blockIndex.load(std::memory_order_relaxed);  // We're the only writer
+                                                       // thread, relaxed is OK
+      if (localBlockIndex == nullptr) {
+        return false;  // this can happen if new_block_index failed in the
+                       // constructor
+      }
+      size_t newTail =
+          (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) &
+          (localBlockIndex->capacity - 1);
+      idxEntry = localBlockIndex->index[newTail];
+      if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE ||
+          idxEntry->value.load(std::memory_order_relaxed) == nullptr) {  // Slot never used, or its block was already released
+        idxEntry->key.store(blockStartIndex, std::memory_order_relaxed);
+        localBlockIndex->tail.store(newTail, std::memory_order_release);
+        return true;
+      }
+
+      // No room in the old block index, try to allocate another one!
+      MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) {
+        return false;
+      }
+      else if (!new_block_index()) {
+        return false;
+      }
+      else {
+        localBlockIndex = blockIndex.load(std::memory_order_relaxed);  // Re-read: new_block_index() installed a new header
+        newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) &
+                  (localBlockIndex->capacity - 1);
+        idxEntry = localBlockIndex->index[newTail];
+        assert(idxEntry->key.load(std::memory_order_relaxed) ==
+               INVALID_BLOCK_BASE);
+        idxEntry->key.store(blockStartIndex, std::memory_order_relaxed);
+        localBlockIndex->tail.store(newTail, std::memory_order_release);
+        return true;
+      }
+    }
+    // Moves the current block index's tail slot back by one, wrapping
+    // around the index capacity.
+    inline void rewind_block_index_tail() {
+      auto *header = blockIndex.load(std::memory_order_relaxed);
+      const size_t current = header->tail.load(std::memory_order_relaxed);
+      const size_t mask = header->capacity - 1;
+      header->tail.store((current - 1) & mask, std::memory_order_relaxed);
+    }
+    // Returns the block-index entry for the block that contains `index`.
+    inline BlockIndexEntry *get_block_index_entry_for_index(
+        index_t index) const {
+      BlockIndexHeader *header;
+      const size_t slot = get_block_index_index_for_index(index, header);
+      return header->index[slot];
+    }
+    // Maps `index` to its slot in the current block index; the header that was consulted is returned via localBlockIndex.
+    inline size_t get_block_index_index_for_index(
+        index_t index, BlockIndexHeader *&localBlockIndex) const {
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+      debug::DebugLock lock(mutex);
+#endif
+      index &= ~static_cast<index_t>(BLOCK_SIZE - 1);  // Clear the low bits to get the block's base index
+      localBlockIndex = blockIndex.load(std::memory_order_acquire);
+      auto tail = localBlockIndex->tail.load(std::memory_order_acquire);
+      auto tailBase =
+          localBlockIndex->index[tail]->key.load(std::memory_order_relaxed);
+      assert(tailBase != INVALID_BLOCK_BASE);
+      // Note: Must use division instead of shift because the index may wrap
+      // around, causing a negative offset, whose negativity we want to preserve
+      auto offset = static_cast<size_t>(
+          static_cast<typename std::make_signed<index_t>::type>(index -
+                                                                tailBase) /
+          static_cast<typename std::make_signed<index_t>::type>(BLOCK_SIZE));
+      size_t idx = (tail + offset) & (localBlockIndex->capacity - 1);  // Signed block distance from the tail slot, wrapped
+      assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) ==
+                 index &&
+             localBlockIndex->index[idx]->value.load(
+                 std::memory_order_relaxed) != nullptr);
+      return idx;
+    }
+
+    bool new_block_index() {  // Allocates and publishes a block index of nextBlockIndexCapacity slots; returns false if malloc fails.
+      auto prev = blockIndex.load(std::memory_order_relaxed);
+      size_t prevCapacity = prev == nullptr ? 0 : prev->capacity;
+      auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity;  // number of entries constructed by this call
+      auto raw = static_cast<char *>(
+          (Traits::malloc)(sizeof(BlockIndexHeader) +
+                           std::alignment_of<BlockIndexEntry>::value - 1 +
+                           sizeof(BlockIndexEntry) * entryCount +
+                           std::alignment_of<BlockIndexEntry *>::value - 1 +
+                           sizeof(BlockIndexEntry *) * nextBlockIndexCapacity));  // one allocation: header, entries, pointer array (plus alignment slack)
+      if (raw == nullptr) {
+        return false;
+      }
+
+      auto header = new (raw) BlockIndexHeader;
+      auto entries = reinterpret_cast<BlockIndexEntry *>(
+          details::align_for<BlockIndexEntry>(raw + sizeof(BlockIndexHeader)));
+      auto index = reinterpret_cast<BlockIndexEntry **>(
+          details::align_for<BlockIndexEntry *>(
+              reinterpret_cast<char *>(entries) +
+              sizeof(BlockIndexEntry) * entryCount));
+      if (prev != nullptr) {  // carry over the previous index's entry pointers
+        auto prevTail = prev->tail.load(std::memory_order_relaxed);
+        auto prevPos = prevTail;
+        size_t i = 0;
+        do {
+          prevPos = (prevPos + 1) & (prev->capacity - 1);  // start one past the old tail ...
+          index[i++] = prev->index[prevPos];
+        } while (prevPos != prevTail);  // ... and finish on the old tail, which lands in slot prevCapacity - 1
+        assert(i == prevCapacity);
+      }
+      for (size_t i = 0; i != entryCount; ++i) {
+        new (entries + i) BlockIndexEntry;
+        entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed);  // mark the new entry as unused
+        index[prevCapacity + i] = entries + i;  // new entries follow the carried-over pointers
+      }
+      header->prev = prev;
+      header->entries = entries;
+      header->index = index;
+      header->capacity = nextBlockIndexCapacity;
+      header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1),  // with no previous index this wraps to capacity - 1
+                         std::memory_order_relaxed);
+
+      blockIndex.store(header, std::memory_order_release);  // publish the fully initialised header
+
+      nextBlockIndexCapacity <<= 1;  // the next call allocates twice as many slots
+
+      return true;
+    }
+
+   private:
+    size_t nextBlockIndexCapacity;
+    std::atomic<BlockIndexHeader *> blockIndex;
+
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+   public:
+    details::ThreadExitListener threadExitListener;
+
+   private:
+#endif
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+   public:
+    ImplicitProducer *nextImplicitProducer;
+
+   private:
+#endif
+
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+    mutable debug::DebugMutex mutex;
+#endif
+#ifdef MCDBGQ_TRACKMEM
+    friend struct MemStats;
+#endif
+  };
+
+
+  //////////////////////////////////
+  // Block pool manipulation
+  //////////////////////////////////
+
+  void populate_initial_block_list(size_t blockCount) {  // Sets up the initial block pool with `blockCount` blocks (no pool when 0).
+    initialBlockPoolSize = blockCount;
+    if (initialBlockPoolSize == 0) {
+      initialBlockPool = nullptr;
+      return;
+    }
+
+    initialBlockPool = create_array<Block>(blockCount);
+    if (initialBlockPool == nullptr) {
+      initialBlockPoolSize = 0;  // allocation failed: record an empty pool, which also skips the loop below
+    }
+    for (size_t i = 0; i < initialBlockPoolSize; ++i) {
+      initialBlockPool[i].dynamicallyAllocated = false;  // this flag is consulted by add_block_to_free_list()
+    }
+  }
+
+  inline Block *try_get_block_from_initial_pool() {  // Claims the next initial-pool block, or returns nullptr when the pool is used up.
+    if (initialBlockPoolIndex.load(std::memory_order_relaxed) >=
+        initialBlockPoolSize) {
+      return nullptr;  // early out without incrementing the counter
+    }
+
+    auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed);  // claim a slot; `index` is the value before the increment
+
+    return index < initialBlockPoolSize ? (initialBlockPool + index) : nullptr;  // re-check the index that was actually claimed
+  }
+
+  inline void add_block_to_free_list(Block *block) {  // Either destroys `block` or adds it to freeList, per the condition below.
+#ifdef MCDBGQ_TRACKMEM
+    block->owner = nullptr;
+#endif
+    if (!Traits::RECYCLE_ALLOCATED_BLOCKS && block->dynamicallyAllocated) {
+      destroy(block);  // dynamically allocated and recycling is disabled
+    } else {
+      freeList.add(block);
+    }
+  }
+
+  inline void add_blocks_to_free_list(Block *block) {
+    // Read each successor first: the hand-off below may destroy `block`.
+    for (Block *successor; block != nullptr; block = successor) {
+      successor = block->next;
+      add_block_to_free_list(block);
+    }
+  }
+
+  inline Block *try_get_block_from_free_list() {  // Forwards to freeList.try_get().
+    return freeList.try_get();
+  }
+
+  // Returns a block from the initial pool, else from the free list, else (only
+  // when canAlloc == CanAlloc) a newly created one; nullptr if none is obtained
+  template <AllocationMode canAlloc>
+  Block *requisition_block() {
+    auto block = try_get_block_from_initial_pool();  // first choice: initial pool
+    if (block != nullptr) {
+      return block;
+    }
+
+    block = try_get_block_from_free_list();  // second choice: free list
+    if (block != nullptr) {
+      return block;
+    }
+
+    MOODYCAMEL_CONSTEXPR_IF(canAlloc == CanAlloc) {
+      return create<Block>();  // may itself be nullptr if allocation fails
+    }
+    else {
+      return nullptr;
+    }
+  }
+
+
+#ifdef MCDBGQ_TRACKMEM
+ public:
+  struct MemStats {  // Counters filled in by getFor(); only compiled when MCDBGQ_TRACKMEM is defined.
+    size_t allocatedBlocks;
+    size_t usedBlocks;
+    size_t freeBlocks;
+    size_t ownedBlocksExplicit;
+    size_t ownedBlocksImplicit;
+    size_t implicitProducers;
+    size_t explicitProducers;
+    size_t elementsEnqueued;
+    size_t blockClassBytes;
+    size_t queueClassBytes;
+    size_t implicitBlockIndexBytes;
+    size_t explicitBlockIndexBytes;
+
+    friend class ConcurrentQueue;
+
+   private:
+    static MemStats getFor(ConcurrentQueue *q) {  // Walks q's free list, producers and initial pool to fill in the counters.
+      MemStats stats = {0};
+
+      stats.elementsEnqueued = q->size_approx();
+
+      auto block = q->freeList.head_unsafe();
+      while (block != nullptr) {  // every block on the free list counts as allocated and free
+        ++stats.allocatedBlocks;
+        ++stats.freeBlocks;
+        block = block->freeListNext.load(std::memory_order_relaxed);
+      }
+
+      for (auto ptr = q->producerListTail.load(std::memory_order_acquire);
+           ptr != nullptr; ptr = ptr->next_prod()) {
+        bool implicit = dynamic_cast<ImplicitProducer *>(ptr) != nullptr;  // classify the producer by its dynamic type
+        stats.implicitProducers += implicit ? 1 : 0;
+        stats.explicitProducers += implicit ? 0 : 1;
+
+        if (implicit) {
+          auto prod = static_cast<ImplicitProducer *>(ptr);
+          stats.queueClassBytes += sizeof(ImplicitProducer);
+          auto head = prod->headIndex.load(std::memory_order_relaxed);
+          auto tail = prod->tailIndex.load(std::memory_order_relaxed);
+          auto hash = prod->blockIndex.load(std::memory_order_relaxed);
+          if (hash != nullptr) {
+            for (size_t i = 0; i != hash->capacity; ++i) {  // count slots that hold a valid key and a non-null block
+              if (hash->index[i]->key.load(std::memory_order_relaxed) !=
+                      ImplicitProducer::INVALID_BLOCK_BASE &&
+                  hash->index[i]->value.load(std::memory_order_relaxed) !=
+                      nullptr) {
+                ++stats.allocatedBlocks;
+                ++stats.ownedBlocksImplicit;
+              }
+            }
+            stats.implicitBlockIndexBytes +=
+                hash->capacity *
+                sizeof(typename ImplicitProducer::BlockIndexEntry);
+            for (; hash != nullptr; hash = hash->prev) {  // add header + pointer-array bytes for this and every earlier index
+              stats.implicitBlockIndexBytes +=
+                  sizeof(typename ImplicitProducer::BlockIndexHeader) +
+                  hash->capacity *
+                      sizeof(typename ImplicitProducer::BlockIndexEntry *);
+            }
+          }
+          for (; details::circular_less_than<index_t>(head, tail);
+               head += BLOCK_SIZE) {  // one used block per BLOCK_SIZE step from head up to tail
+            // auto block = prod->get_block_index_entry_for_index(head);
+            ++stats.usedBlocks;
+          }
+        } else {
+          auto prod = static_cast<ExplicitProducer *>(ptr);
+          stats.queueClassBytes += sizeof(ExplicitProducer);
+          auto tailBlock = prod->tailBlock;
+          bool wasNonEmpty = false;
+          if (tailBlock != nullptr) {
+            auto block = tailBlock;
+            do {  // follow `next` until the walk returns to tailBlock
+              ++stats.allocatedBlocks;
+              if (!block->ConcurrentQueue::Block::template is_empty<
+                      explicit_context>() ||
+                  wasNonEmpty) {
+                ++stats.usedBlocks;
+                wasNonEmpty = wasNonEmpty || block != tailBlock;
+              }
+              ++stats.ownedBlocksExplicit;
+              block = block->next;
+            } while (block != tailBlock);
+          }
+          auto index = prod->blockIndex.load(std::memory_order_relaxed);
+          while (index != nullptr) {  // sum the bytes of this and every earlier block index
+            stats.explicitBlockIndexBytes +=
+                sizeof(typename ExplicitProducer::BlockIndexHeader) +
+                index->size *
+                    sizeof(typename ExplicitProducer::BlockIndexEntry);
+            index = static_cast<typename ExplicitProducer::BlockIndexHeader *>(
+                index->prev);
+          }
+        }
+      }
+
+      auto freeOnInitialPool =
+          q->initialBlockPoolIndex.load(std::memory_order_relaxed) >=
+                  q->initialBlockPoolSize
+              ? 0
+              : q->initialBlockPoolSize -
+                    q->initialBlockPoolIndex.load(std::memory_order_relaxed);  // initial-pool blocks not yet claimed
+      stats.allocatedBlocks += freeOnInitialPool;
+      stats.freeBlocks += freeOnInitialPool;
+
+      stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks;
+      stats.queueClassBytes += sizeof(ConcurrentQueue);
+
+      return stats;
+    }
+  };
+
+  // For debugging only. Not thread-safe. Returns MemStats::getFor(this).
+  MemStats getMemStats() {
+    return MemStats::getFor(this);
+  }
+
+ private:
+  friend struct MemStats;
+#endif
+
+
+  //////////////////////////////////
+  // Producer list manipulation
+  //////////////////////////////////
+
+  ProducerBase *recycle_or_create_producer(bool isExplicit) {  // Reuses an inactive producer of the requested kind, else creates and registers one (nullptr if creation fails).
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+    debug::DebugLock lock(implicitProdMutex);
+#endif
+    // Try to re-use one first
+    for (auto ptr = producerListTail.load(std::memory_order_acquire);
+         ptr != nullptr; ptr = ptr->next_prod()) {
+      if (ptr->inactive.load(std::memory_order_relaxed) &&
+          ptr->isExplicit == isExplicit) {  // relaxed pre-check before attempting the CAS
+        bool expected = true;
+        if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false,
+                                                  std::memory_order_acquire,
+                                                  std::memory_order_relaxed)) {
+          // We caught one! It's been marked as activated, the caller can have
+          // it
+          return ptr;
+        }
+      }
+    }
+
+    return add_producer(
+        isExplicit ? static_cast<ProducerBase *>(create<ExplicitProducer>(this))
+                   : create<ImplicitProducer>(this));  // add_producer() passes a null (failed) creation through
+  }
+
+  ProducerBase *add_producer(ProducerBase *producer) {  // Links `producer` into the producer list and returns it; a null argument is returned unchanged.
+    // Handle failed memory allocation
+    if (producer == nullptr) {
+      return nullptr;
+    }
+
+    producerCount.fetch_add(1, std::memory_order_relaxed);
+
+    // Add it to the lock-free list
+    auto prevTail = producerListTail.load(std::memory_order_relaxed);
+    do {
+      producer->next = prevTail;  // re-linked on each retry; a failed CAS refreshes prevTail
+    } while (!producerListTail.compare_exchange_weak(
+        prevTail, producer, std::memory_order_release,
+        std::memory_order_relaxed));
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+    if (producer->isExplicit) {  // debug builds also keep a separate list per producer kind
+      auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed);
+      do {
+        static_cast<ExplicitProducer *>(producer)->nextExplicitProducer =
+            prevTailExplicit;
+      } while (!explicitProducers.compare_exchange_weak(
+          prevTailExplicit, static_cast<ExplicitProducer *>(producer),
+          std::memory_order_release, std::memory_order_relaxed));
+    } else {
+      auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed);
+      do {
+        static_cast<ImplicitProducer *>(producer)->nextImplicitProducer =
+            prevTailImplicit;
+      } while (!implicitProducers.compare_exchange_weak(
+          prevTailImplicit, static_cast<ImplicitProducer *>(producer),
+          std::memory_order_release, std::memory_order_relaxed));
+    }
+#endif
+
+    return producer;
+  }
+
+  void reown_producers() {
+    // Producers taken over through a move or swap still reference the queue
+    // they came from; walk the whole list and point each one back at us.
+    auto producer = producerListTail.load(std::memory_order_relaxed);
+    while (producer != nullptr) {
+      producer->parent = this;
+      producer = producer->next_prod();
+    }
+  }
+
+
+  //////////////////////////////////
+  // Implicit producer hash
+  //////////////////////////////////
+
+  struct ImplicitProducerKVP {  // One slot of the implicit producer hash: thread id -> producer.
+    std::atomic<details::thread_id_t> key;
+    ImplicitProducer *value;  // No need for atomicity since it's only read by
+                              // the thread that sets it in the first place
+
+    ImplicitProducerKVP() : value(nullptr) {}  // note: `key` is not set here; the code that builds a table stores it
+
+    ImplicitProducerKVP(ImplicitProducerKVP &&other) MOODYCAMEL_NOEXCEPT {
+      key.store(other.key.load(std::memory_order_relaxed),
+                std::memory_order_relaxed);  // copies key and value; `other` is left unchanged
+      value = other.value;
+    }
+
+    inline ImplicitProducerKVP &operator=(ImplicitProducerKVP &&other)
+        MOODYCAMEL_NOEXCEPT {
+      swap(other);  // move-assignment is implemented as a swap
+      return *this;
+    }
+
+    inline void swap(ImplicitProducerKVP &other) MOODYCAMEL_NOEXCEPT {
+      if (this != &other) {  // self-swap is a no-op
+        details::swap_relaxed(key, other.key);
+        std::swap(value, other.value);
+      }
+    }
+  };
+
+  template <typename XT, typename XTraits>
+  friend void moodycamel::swap(
+      typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP &,
+      typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP &)
+      MOODYCAMEL_NOEXCEPT;
+
+  struct ImplicitProducerHash {  // One table in the chain of implicit producer hash tables.
+    size_t capacity;  // number of slots in `entries`; lookups mask with capacity - 1
+    ImplicitProducerKVP *entries;
+    ImplicitProducerHash *prev;  // table this one replaced (nullptr for the initial table)
+  };
+
+  inline void populate_initial_implicit_producer_hash() {  // Resets the embedded initial hash table and installs it as the current one.
+    MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) {
+      return;  // nothing to set up when the initial table has size 0
+    }
+    else {
+      implicitProducerHashCount.store(0, std::memory_order_relaxed);
+      auto hash = &initialImplicitProducerHash;
+      hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE;
+      hash->entries = &initialImplicitProducerHashEntries[0];
+      for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) {
+        initialImplicitProducerHashEntries[i].key.store(
+            details::invalid_thread_id, std::memory_order_relaxed);  // mark every slot empty
+      }
+      hash->prev = nullptr;  // the initial table has no predecessor
+      implicitProducerHash.store(hash, std::memory_order_relaxed);
+    }
+  }
+
+  void swap_implicit_producer_hashes(ConcurrentQueue &other) {  // Exchanges the implicit producer hash state of *this and `other`.
+    MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) {
+      return;
+    }
+    else {
+      // Swap (assumes our implicit producer hash is initialized)
+      initialImplicitProducerHashEntries.swap(
+          other.initialImplicitProducerHashEntries);
+      initialImplicitProducerHash.entries =
+          &initialImplicitProducerHashEntries[0];  // re-point each embedded table at its own entry array
+      other.initialImplicitProducerHash.entries =
+          &other.initialImplicitProducerHashEntries[0];
+
+      details::swap_relaxed(implicitProducerHashCount,
+                            other.implicitProducerHashCount);
+
+      details::swap_relaxed(implicitProducerHash, other.implicitProducerHash);
+      if (implicitProducerHash.load(std::memory_order_relaxed) ==
+          &other.initialImplicitProducerHash) {  // our head now points at other's embedded table
+        implicitProducerHash.store(&initialImplicitProducerHash,
+                                   std::memory_order_relaxed);
+      } else {
+        ImplicitProducerHash *hash;
+        for (hash = implicitProducerHash.load(std::memory_order_relaxed);
+             hash->prev != &other.initialImplicitProducerHash;
+             hash = hash->prev) {  // find the table whose prev is other's embedded table
+          continue;
+        }
+        hash->prev = &initialImplicitProducerHash;  // relink that chain end to our embedded table
+      }
+      if (other.implicitProducerHash.load(std::memory_order_relaxed) ==
+          &initialImplicitProducerHash) {  // same fix-up, mirrored for `other`
+        other.implicitProducerHash.store(&other.initialImplicitProducerHash,
+                                         std::memory_order_relaxed);
+      } else {
+        ImplicitProducerHash *hash;
+        for (hash = other.implicitProducerHash.load(std::memory_order_relaxed);
+             hash->prev != &initialImplicitProducerHash; hash = hash->prev) {
+          continue;
+        }
+        hash->prev = &other.initialImplicitProducerHash;
+      }
+    }
+  }
+
+  // Returns the calling thread's implicit producer, registering one in the hash if none is found. Only fails (returns nullptr) if memory allocation fails
+  ImplicitProducer *get_or_add_implicit_producer() {
+    // Note that since the data is essentially thread-local (key is thread ID),
+    // there's a reduced need for fences (memory ordering is already consistent
+    // for any individual thread), except for the current table itself.
+
+    // Start by looking for the thread ID in the current and all previous hash
+    // tables. If it's not found, it must not be in there yet, since this same
+    // thread would have added it previously to one of the tables that we
+    // traversed.
+
+    // Code and algorithm adapted from
+    // http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table
+
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+    debug::DebugLock lock(implicitProdMutex);
+#endif
+
+    auto id = details::thread_id();
+    auto hashedId = details::hash_thread_id(id);
+
+    auto mainHash = implicitProducerHash.load(std::memory_order_acquire);
+    assert(
+        mainHash !=
+        nullptr);  // silence clang-tidy and MSVC warnings (hash cannot be null)
+    for (auto hash = mainHash; hash != nullptr; hash = hash->prev) {
+      // Look for the id in this hash
+      auto index = hashedId;
+      while (true) {  // Not an infinite loop because at least one slot is free
+                      // in the hash table
+        index &= hash->capacity - 1u;
+
+        auto probedKey =
+            hash->entries[index].key.load(std::memory_order_relaxed);
+        if (probedKey == id) {
+          // Found it! If we had to search several hashes deep, though, we
+          // should lazily add it to the current main hash table to avoid the
+          // extended search next time. Note there's guaranteed to be room in
+          // the current hash table since every subsequent table implicitly
+          // reserves space for all previous tables (there's only one
+          // implicitProducerHashCount).
+          auto value = hash->entries[index].value;
+          if (hash != mainHash) {
+            index = hashedId;
+            while (true) {  // probe the main table for an empty (or reusable) slot to claim
+              index &= mainHash->capacity - 1u;
+              auto empty = details::invalid_thread_id;
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+              auto reusable = details::invalid_thread_id2;
+              if (mainHash->entries[index].key.compare_exchange_strong(
+                      empty, id, std::memory_order_seq_cst,
+                      std::memory_order_relaxed) ||
+                  mainHash->entries[index].key.compare_exchange_strong(
+                      reusable, id, std::memory_order_seq_cst,
+                      std::memory_order_relaxed)) {
+#else
+              if (mainHash->entries[index].key.compare_exchange_strong(
+                      empty, id, std::memory_order_seq_cst,
+                      std::memory_order_relaxed)) {
+#endif
+                mainHash->entries[index].value = value;
+                break;
+              }
+              ++index;
+            }
+          }
+
+          return value;
+        }
+        if (probedKey == details::invalid_thread_id) {
+          break;  // Not in this hash table
+        }
+        ++index;
+      }
+    }
+
+    // Insert!
+    auto newCount =
+        1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed);  // slot count including the one being added
+    while (true) {
+      // NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
+      if (newCount >= (mainHash->capacity >> 1) &&
+          !implicitProducerHashResizeInProgress.test_and_set(
+              std::memory_order_acquire)) {
+        // We've acquired the resize lock, try to allocate a bigger hash table.
+        // Note the acquire fence synchronizes with the release fence at the end
+        // of this block, and hence when we reload implicitProducerHash it must
+        // be the most recent version (it only gets changed within this locked
+        // block).
+        mainHash = implicitProducerHash.load(std::memory_order_acquire);
+        if (newCount >= (mainHash->capacity >> 1)) {
+          size_t newCapacity = mainHash->capacity << 1;
+          while (newCount >= (newCapacity >> 1)) {  // keep doubling until newCount is below half the new capacity
+            newCapacity <<= 1;
+          }
+          auto raw = static_cast<char *>(
+              (Traits::malloc)(sizeof(ImplicitProducerHash) +
+                               std::alignment_of<ImplicitProducerKVP>::value -
+                               1 + sizeof(ImplicitProducerKVP) * newCapacity));
+          if (raw == nullptr) {
+            // Allocation failed
+            implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed);  // undo the increment made before the loop
+            implicitProducerHashResizeInProgress.clear(
+                std::memory_order_relaxed);
+            return nullptr;
+          }
+
+          auto newHash = new (raw) ImplicitProducerHash;
+          newHash->capacity = static_cast<size_t>(newCapacity);
+          newHash->entries = reinterpret_cast<ImplicitProducerKVP *>(
+              details::align_for<ImplicitProducerKVP>(
+                  raw + sizeof(ImplicitProducerHash)));
+          for (size_t i = 0; i != newCapacity; ++i) {
+            new (newHash->entries + i) ImplicitProducerKVP;
+            newHash->entries[i].key.store(details::invalid_thread_id,
+                                          std::memory_order_relaxed);
+          }
+          newHash->prev = mainHash;  // older tables stay reachable for the lookup loop above
+          implicitProducerHash.store(newHash, std::memory_order_release);
+          implicitProducerHashResizeInProgress.clear(std::memory_order_release);
+          mainHash = newHash;
+        } else {
+          implicitProducerHashResizeInProgress.clear(std::memory_order_release);
+        }
+      }
+
+      // If it's < three-quarters full, add to the old one anyway so that we
+      // don't have to wait for the next table to finish being allocated by
+      // another thread (and if we just finished allocating above, the condition
+      // will always be true)
+      if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) {
+        auto producer =
+            static_cast<ImplicitProducer *>(recycle_or_create_producer(false));
+        if (producer == nullptr) {
+          implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed);  // undo the increment made before the loop
+          return nullptr;
+        }
+
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+        producer->threadExitListener.callback =
+            &ConcurrentQueue::implicit_producer_thread_exited_callback;
+        producer->threadExitListener.userData = producer;
+        details::ThreadExitNotifier::subscribe(&producer->threadExitListener);
+#endif
+
+        auto index = hashedId;
+        while (true) {
+          index &= mainHash->capacity - 1u;
+          auto empty = details::invalid_thread_id;
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+          auto reusable = details::invalid_thread_id2;
+          if (mainHash->entries[index].key.compare_exchange_strong(
+                  reusable, id, std::memory_order_seq_cst,
+                  std::memory_order_relaxed)) {
+            implicitProducerHashCount.fetch_sub(
+                1,
+                std::memory_order_relaxed);  // already counted as a used slot
+            mainHash->entries[index].value = producer;
+            break;
+          }
+#endif
+          if (mainHash->entries[index].key.compare_exchange_strong(
+                  empty, id, std::memory_order_seq_cst,
+                  std::memory_order_relaxed)) {
+            mainHash->entries[index].value = producer;
+            break;
+          }
+          ++index;
+        }
+        return producer;
+      }
+
+      // Hmm, the old hash is quite full and somebody else is busy allocating a
+      // new one. We need to wait for the allocating thread to finish (if it
+      // succeeds, we add, if not, we try to allocate ourselves).
+      mainHash = implicitProducerHash.load(std::memory_order_acquire);
+    }
+  }
+
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+  void implicit_producer_thread_exited(ImplicitProducer *producer) {  // Releases the calling thread's hash slots and marks `producer` inactive.
+    // Remove from hash
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+    debug::DebugLock lock(implicitProdMutex);
+#endif
+    auto hash = implicitProducerHash.load(std::memory_order_acquire);
+    assert(hash != nullptr);  // The thread exit listener is only registered if
+                              // we were added to a hash in the first place
+    auto id = details::thread_id();
+    auto hashedId = details::hash_thread_id(id);
+    details::thread_id_t probedKey;
+
+    // We need to traverse all the hashes just in case other threads aren't on
+    // the current one yet and are trying to add an entry thinking there's a
+    // free slot (because they reused a producer)
+    for (; hash != nullptr; hash = hash->prev) {
+      auto index = hashedId;
+      do {
+        index &= hash->capacity - 1u;
+        probedKey = id;  // expected value for the CAS; overwritten with the slot's actual key on failure
+        if (hash->entries[index].key.compare_exchange_strong(
+                probedKey, details::invalid_thread_id2,
+                std::memory_order_seq_cst, std::memory_order_relaxed)) {
+          break;  // our key was replaced by the "reusable" marker
+        }
+        ++index;
+      } while (
+          probedKey !=
+          details::invalid_thread_id);  // Can happen if the hash has changed
+                                        // but we weren't put back in it yet, or
+                                        // if we weren't added to this hash in
+                                        // the first place
+    }
+
+    // Mark the producer as recyclable (recycle_or_create_producer() reuses inactive ones)
+    producer->inactive.store(true, std::memory_order_release);
+  }
+
+  static void implicit_producer_thread_exited_callback(void *userData) {  // Static trampoline installed as threadExitListener.callback.
+    auto producer = static_cast<ImplicitProducer *>(userData);  // userData is set to the producer at registration
+    auto queue = producer->parent;
+    queue->implicit_producer_thread_exited(producer);
+  }
+#endif
+
+  //////////////////////////////////
+  // Utility functions
+  //////////////////////////////////
+
+  template <typename TAlign>
+  static inline void *aligned_malloc(size_t size) {  // Allocates `size` bytes via Traits::malloc, aligned for TAlign; nullptr on failure.
+    MOODYCAMEL_CONSTEXPR_IF(std::alignment_of<TAlign>::value <=
+                            std::alignment_of<details::max_align_t>::value)
+    return (Traits::malloc)(size);  // alignment no stricter than max_align_t: plain malloc result is used
+    else {
+      size_t alignment = std::alignment_of<TAlign>::value;
+      void *raw = (Traits::malloc)(size + alignment - 1 + sizeof(void *));  // extra room for alignment plus one stored pointer
+      if (!raw) return nullptr;
+      char *ptr = details::align_for<TAlign>(reinterpret_cast<char *>(raw) +
+                                             sizeof(void *));
+      *(reinterpret_cast<void **>(ptr) - 1) = raw;  // stash the original pointer just below the result; aligned_free reads it back
+      return ptr;
+    }
+  }
+
+  template <typename TAlign>
+  static inline void aligned_free(void *ptr) {  // Releases memory obtained from aligned_malloc<TAlign>.
+    MOODYCAMEL_CONSTEXPR_IF(std::alignment_of<TAlign>::value <=
+                            std::alignment_of<details::max_align_t>::value)
+    return (Traits::free)(ptr);  // mirrors the plain-malloc branch of aligned_malloc
+    else(Traits::free)(ptr ? *(reinterpret_cast<void **>(ptr) - 1) : nullptr);  // recover the stashed original pointer; null is passed through as nullptr
+  }
+
+  template <typename U>
+  static inline U *create_array(size_t count) {
+    // Allocates storage for `count` U's via aligned_malloc<U> and
+    // value-initialises each element in place; nullptr if allocation fails.
+    assert(count > 0);
+    auto *const storage = static_cast<U *>(aligned_malloc<U>(sizeof(U) * count));
+    if (!storage) return nullptr;
+    for (U *slot = storage; slot != storage + count; ++slot) new (slot) U();
+    return storage;
+  }
+
+  template <typename U>
+  static inline void destroy_array(U *p, size_t count) {
+    if (p) {
+      assert(count > 0);
+      for (U *it = p + count; it != p;) (--it)->~U();  // last element first
+    }
+    aligned_free<U>(p);  // also reached when p is null
+  }
+
+  template <typename U>
+  static inline U *create() {
+    if (void *storage = aligned_malloc<U>(sizeof(U))) return new (storage) U;
+    return nullptr;  // aligned_malloc<U> returned null
+  }
+
+  template <typename U, typename A1>
+  static inline U *create(A1 &&a1) {
+    void *const mem = aligned_malloc<U>(sizeof(U));  // null when allocation fails
+    return mem == nullptr ? nullptr : new (mem) U(std::forward<A1>(a1));
+  }
+
+  template <typename U>
+  static inline void destroy(U *p) {  // Runs p's destructor (when non-null), then passes p to aligned_free<U>.
+    if (p != nullptr) p->~U();
+    aligned_free<U>(p);  // called for a null p as well
+  }
+
+ private:
+  std::atomic<ProducerBase *> producerListTail;
+  std::atomic<std::uint32_t> producerCount;
+
+  std::atomic<size_t> initialBlockPoolIndex;
+  Block *initialBlockPool;
+  size_t initialBlockPoolSize;
+
+#ifndef MCDBGQ_USEDEBUGFREELIST
+  FreeList<Block> freeList;
+#else
+  debug::DebugFreeList<Block> freeList;
+#endif
+
+  std::atomic<ImplicitProducerHash *> implicitProducerHash;
+  std::atomic<size_t>
+      implicitProducerHashCount;  // Number of slots logically used
+  ImplicitProducerHash initialImplicitProducerHash;
+  std::array<ImplicitProducerKVP, INITIAL_IMPLICIT_PRODUCER_HASH_SIZE>
+      initialImplicitProducerHashEntries;
+  std::atomic_flag implicitProducerHashResizeInProgress;
+
+  std::atomic<std::uint32_t> nextExplicitConsumerId;
+  std::atomic<std::uint32_t> globalExplicitConsumerOffset;
+
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+  debug::DebugMutex implicitProdMutex;
+#endif
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+  std::atomic<ExplicitProducer *> explicitProducers;
+  std::atomic<ImplicitProducer *> implicitProducers;
+#endif
+};
+
+
+template <typename T, typename Traits>
+ProducerToken::ProducerToken(ConcurrentQueue<T, Traits> &queue)
+    : producer(queue.recycle_or_create_producer(true)) {  // true requests an explicit producer
+  if (producer != nullptr) {  // producer is null when none could be obtained
+    producer->token = this;  // back-link from the producer to this token
+  }
+}
+
+template <typename T, typename Traits>
+ProducerToken::ProducerToken(BlockingConcurrentQueue<T, Traits> &queue)
+    : producer(reinterpret_cast<ConcurrentQueue<T, Traits> *>(&queue)  // NOTE(review): treats the blocking queue's address as a ConcurrentQueue — layout assumption not visible here; confirm
+                   ->recycle_or_create_producer(true)) {
+  if (producer != nullptr) {
+    producer->token = this;  // back-link from the producer to this token
+  }
+}
+
+template <typename T, typename Traits>
+ConsumerToken::ConsumerToken(ConcurrentQueue<T, Traits> &queue)
+    : itemsConsumedFromCurrent(0),
+      currentProducer(nullptr),
+      desiredProducer(nullptr) {
+  initialOffset =
+      queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release);  // takes the counter's value before the increment
+  lastKnownGlobalOffset = static_cast<std::uint32_t>(-1);  // all-ones value; presumably a "nothing seen yet" marker — confirm
+}
+
+template <typename T, typename Traits>
+ConsumerToken::ConsumerToken(BlockingConcurrentQueue<T, Traits> &queue)
+    : itemsConsumedFromCurrent(0),
+      currentProducer(nullptr),
+      desiredProducer(nullptr) {
+  initialOffset =
+      reinterpret_cast<ConcurrentQueue<T, Traits> *>(&queue)  // NOTE(review): same address-reinterpretation as the ProducerToken overload — confirm layout
+          ->nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
+  lastKnownGlobalOffset = static_cast<std::uint32_t>(-1);  // all-ones value; presumably a "nothing seen yet" marker — confirm
+}
+
+template <typename T, typename Traits>
+inline void swap(ConcurrentQueue<T, Traits> &a,
+                 ConcurrentQueue<T, Traits> &b) MOODYCAMEL_NOEXCEPT {  // Free-function swap for queues.
+  a.swap(b);  // forwards to the member swap
+}
+
+inline void swap(ProducerToken &a, ProducerToken &b) MOODYCAMEL_NOEXCEPT {  // Free-function swap for producer tokens.
+  a.swap(b);  // forwards to the member swap
+}
+
+inline void swap(ConsumerToken &a, ConsumerToken &b) MOODYCAMEL_NOEXCEPT {  // Free-function swap for consumer tokens.
+  a.swap(b);  // forwards to the member swap
+}
+
+template <typename T, typename Traits>
+inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP &a,
+                 typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP &b)
+    MOODYCAMEL_NOEXCEPT {  // Free-function swap for implicit producer hash entries.
+  a.swap(b);  // forwards to ImplicitProducerKVP::swap
+}
+
+}  // namespace moodycamel
 
 #if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17)
 #pragma warning(pop)

From 55e6f1b5bc3c33e1eefe0a0ef7f2662925f1257b Mon Sep 17 00:00:00 2001
From: "yinzefeng.yzf" <yinzefeng.yzf@alibaba-inc.com>
Date: Tue, 10 Feb 2026 11:29:48 +0800
Subject: [PATCH 06/11] clang format

---
 src/core/algorithm/hnsw/hnsw_entity.h           |  3 +--
 .../zvec/ailego/buffer/concurrentqueue.h        |  2 +-
 src/include/zvec/core/framework/index_storage.h | 17 +++++++++++------
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/src/core/algorithm/hnsw/hnsw_entity.h b/src/core/algorithm/hnsw/hnsw_entity.h
index d2c06c41..70ea3dcc 100644
--- a/src/core/algorithm/hnsw/hnsw_entity.h
+++ b/src/core/algorithm/hnsw/hnsw_entity.h
@@ -147,8 +147,7 @@ struct Neighbors {
   Neighbors(uint32_t cnt_in, const node_id_t *data_in)
       : cnt{cnt_in}, data{data_in} {}
 
-  Neighbors(IndexStorage::MemoryBlock &mem_block)
-      : neighbor_block{mem_block} {
+  Neighbors(IndexStorage::MemoryBlock &mem_block) : neighbor_block{mem_block} {
     auto hd = reinterpret_cast<const NeighborsHeader *>(neighbor_block.data());
     cnt = hd->neighbor_cnt;
     data = hd->neighbors;
diff --git a/src/include/zvec/ailego/buffer/concurrentqueue.h b/src/include/zvec/ailego/buffer/concurrentqueue.h
index 90edaf97..3b587642 100644
--- a/src/include/zvec/ailego/buffer/concurrentqueue.h
+++ b/src/include/zvec/ailego/buffer/concurrentqueue.h
@@ -1706,7 +1706,7 @@ class ConcurrentQueue {
   // contention.
   template <typename N>  // N must inherit FreeListNode or have the same fields
                          // (and initialization of them)
-                         struct FreeList {
+  struct FreeList {
     FreeList() : freeListHead(nullptr) {}
     FreeList(FreeList &&other)
         : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) {
diff --git a/src/include/zvec/core/framework/index_storage.h b/src/include/zvec/core/framework/index_storage.h
index 346b8da4..920580fe 100644
--- a/src/include/zvec/core/framework/index_storage.h
+++ b/src/include/zvec/core/framework/index_storage.h
@@ -37,7 +37,8 @@ class IndexStorage : public IndexModule {
     };
 
     MemoryBlock() {}
-    MemoryBlock(ailego::VecBufferPoolHandle* buffer_pool_handle, int block_id, void *data)
+    MemoryBlock(ailego::VecBufferPoolHandle *buffer_pool_handle, int block_id,
+                void *data)
         : type_(MemoryBlockType::MBT_BUFFERPOOL) {
       buffer_pool_handle_ = buffer_pool_handle;
       buffer_block_id_ = block_id;
@@ -65,7 +66,8 @@ class IndexStorage : public IndexModule {
           this->reset(std::move(rhs.data_));
           break;
         case MemoryBlockType::MBT_BUFFERPOOL:
-          this->reset(std::move(rhs.buffer_pool_handle_), std::move(rhs.buffer_block_id_), std::move(rhs.data_));
+          this->reset(std::move(rhs.buffer_pool_handle_),
+                      std::move(rhs.buffer_block_id_), std::move(rhs.data_));
           break;
         default:
           break;
@@ -79,7 +81,8 @@ class IndexStorage : public IndexModule {
             this->reset(rhs.data_);
             break;
           case MemoryBlockType::MBT_BUFFERPOOL:
-            this->reset(rhs.buffer_pool_handle_, rhs.buffer_block_id_, rhs.data_);
+            this->reset(rhs.buffer_pool_handle_, rhs.buffer_block_id_,
+                        rhs.data_);
             buffer_pool_handle_->acquire_one(buffer_block_id_);
             break;
           default:
@@ -96,7 +99,8 @@ class IndexStorage : public IndexModule {
             this->reset(std::move(rhs.data_));
             break;
           case MemoryBlockType::MBT_BUFFERPOOL:
-            this->reset(std::move(rhs.buffer_pool_handle_), std::move(rhs.buffer_block_id_), std::move(rhs.data_));
+            this->reset(std::move(rhs.buffer_pool_handle_),
+                        std::move(rhs.buffer_block_id_), std::move(rhs.data_));
             break;
           default:
             break;
@@ -124,7 +128,8 @@ class IndexStorage : public IndexModule {
       return data_;
     }
 
-    void reset(ailego::VecBufferPoolHandle* buffer_pool_handle, int block_id, void *data) {
+    void reset(ailego::VecBufferPoolHandle *buffer_pool_handle, int block_id,
+               void *data) {
       if (type_ == MemoryBlockType::MBT_BUFFERPOOL) {
         buffer_pool_handle->release_one(buffer_block_id_);
       }
@@ -145,7 +150,7 @@ class IndexStorage : public IndexModule {
 
     MemoryBlockType type_{MBT_UNKNOWN};
     void *data_{nullptr};
-    mutable ailego::VecBufferPoolHandle* buffer_pool_handle_;
+    mutable ailego::VecBufferPoolHandle *buffer_pool_handle_;
     int buffer_block_id_{0};
   };
 

From b24d921053dd2597357c524ebb78d65979948a92 Mon Sep 17 00:00:00 2001
From: "yinzefeng.yzf" <yinzefeng.yzf@alibaba-inc.com>
Date: Tue, 10 Feb 2026 11:45:03 +0800
Subject: [PATCH 07/11] clang format

---
 .../zvec/ailego/buffer/concurrentqueue.h      | 20 +++++++++----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/src/include/zvec/ailego/buffer/concurrentqueue.h b/src/include/zvec/ailego/buffer/concurrentqueue.h
index 3b587642..16f297e8 100644
--- a/src/include/zvec/ailego/buffer/concurrentqueue.h
+++ b/src/include/zvec/ailego/buffer/concurrentqueue.h
@@ -709,7 +709,7 @@ struct nomove_if<false> {
 };
 
 template <typename It>
-static inline auto deref_noexcept(It &it) MOODYCAMEL_NOEXCEPT -> decltype(*it) {
+static inline auto deref_noexcept(It &it) MOODYCAMEL_NOEXCEPT->decltype(*it) {
   return *it;
 }
 
@@ -2833,10 +2833,9 @@ class ConcurrentQueue {
 
       // Create the new block
       pr_blockIndexSize <<= 1;
-      auto newRawPtr = static_cast<char *>(
-          (Traits::malloc)(sizeof(BlockIndexHeader) +
-                           std::alignment_of<BlockIndexEntry>::value - 1 +
-                           sizeof(BlockIndexEntry) * pr_blockIndexSize));
+      auto newRawPtr = static_cast<char *>((Traits::malloc)(
+          sizeof(BlockIndexHeader) + std::alignment_of<BlockIndexEntry>::value -
+          1 + sizeof(BlockIndexEntry) * pr_blockIndexSize));
       if (newRawPtr == nullptr) {
         pr_blockIndexSize >>= 1;  // Reset to allow graceful retry
         return false;
@@ -3556,12 +3555,11 @@ class ConcurrentQueue {
       auto prev = blockIndex.load(std::memory_order_relaxed);
       size_t prevCapacity = prev == nullptr ? 0 : prev->capacity;
       auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity;
-      auto raw = static_cast<char *>(
-          (Traits::malloc)(sizeof(BlockIndexHeader) +
-                           std::alignment_of<BlockIndexEntry>::value - 1 +
-                           sizeof(BlockIndexEntry) * entryCount +
-                           std::alignment_of<BlockIndexEntry *>::value - 1 +
-                           sizeof(BlockIndexEntry *) * nextBlockIndexCapacity));
+      auto raw = static_cast<char *>((Traits::malloc)(
+          sizeof(BlockIndexHeader) + std::alignment_of<BlockIndexEntry>::value -
+          1 + sizeof(BlockIndexEntry) * entryCount +
+          std::alignment_of<BlockIndexEntry *>::value - 1 +
+          sizeof(BlockIndexEntry *) * nextBlockIndexCapacity));
       if (raw == nullptr) {
         return false;
       }

From 8916f90025d24f0bada96ac3c3eac9aa1d9efd7a Mon Sep 17 00:00:00 2001
From: "yinzefeng.yzf" <yinzefeng.yzf@alibaba-inc.com>
Date: Tue, 10 Feb 2026 15:53:54 +0800
Subject: [PATCH 08/11] clang format

---
 src/include/zvec/ailego/buffer/concurrentqueue.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/include/zvec/ailego/buffer/concurrentqueue.h b/src/include/zvec/ailego/buffer/concurrentqueue.h
index 16f297e8..f7f3d77e 100644
--- a/src/include/zvec/ailego/buffer/concurrentqueue.h
+++ b/src/include/zvec/ailego/buffer/concurrentqueue.h
@@ -111,8 +111,8 @@ static inline thread_id_t thread_id() {
 #elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__)
 // No sense pulling in windows.h in a header, we'll manually declare the
 // function we use and rely on backwards-compatibility for this not to break
-extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(
-    void);
+extern "C"
+    __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void);
 namespace moodycamel {
 namespace details {
 static_assert(sizeof(unsigned long) == sizeof(std::uint32_t),
@@ -709,7 +709,7 @@ struct nomove_if<false> {
 };
 
 template <typename It>
-static inline auto deref_noexcept(It &it) MOODYCAMEL_NOEXCEPT->decltype(*it) {
+static inline auto deref_noexcept(It &it) MOODYCAMEL_NOEXCEPT -> decltype(*it) {
   return *it;
 }
 

From e3d014ca629bdc7beda6fed6e32d95c0428470e3 Mon Sep 17 00:00:00 2001
From: Zefeng Yin <yinzefeng.yzf@alibaba-inc.com>
Date: Tue, 10 Feb 2026 22:41:21 +0800
Subject: [PATCH 09/11] fix bugs

---
 src/ailego/buffer/buffer_pool.cc               | 15 ++++++++-------
 .../algorithm/flat/flat_streamer_context.h     | 10 +++++++++-
 src/core/algorithm/hnsw/hnsw_context.h         |  4 ++++
 src/core/interface/index.cc                    |  7 +++++--
 src/core/utility/buffer_storage.cc             | 18 ++++++------------
 src/include/zvec/ailego/buffer/buffer_pool.h   |  5 +++--
 .../zvec/core/framework/index_storage.h        |  2 +-
 .../index/column/vector_column_indexer_test.cc |  1 -
 8 files changed, 36 insertions(+), 26 deletions(-)

diff --git a/src/ailego/buffer/buffer_pool.cc b/src/ailego/buffer/buffer_pool.cc
index 81ed92bf..bdbf0a03 100644
--- a/src/ailego/buffer/buffer_pool.cc
+++ b/src/ailego/buffer/buffer_pool.cc
@@ -66,7 +66,7 @@ void LPMap::init(size_t entry_num) {
     entries_[i].load_count.store(0);
     entries_[i].buffer = nullptr;
   }
-  cache_.init(entry_num);
+  cache_.init(entry_num * 4);
 }
 
 char *LPMap::acquire_block(block_id_t block_id) {
@@ -136,9 +136,7 @@ void LPMap::recycle(moodycamel::ConcurrentQueue<char *> &free_buffers) {
   }
 }
 
-VecBufferPool::VecBufferPool(const std::string &filename, size_t pool_capacity,
-                             size_t block_size)
-    : pool_capacity_(pool_capacity) {
+VecBufferPool::VecBufferPool(const std::string &filename) {
   fd_ = open(filename.c_str(), O_RDONLY);
   if (fd_ < 0) {
     throw std::runtime_error("Failed to open file: " + filename);
@@ -148,9 +146,12 @@ VecBufferPool::VecBufferPool(const std::string &filename, size_t pool_capacity,
     throw std::runtime_error("Failed to stat file: " + filename);
   }
   file_size_ = st.st_size;
+}
 
-  size_t buffer_num = pool_capacity_ / block_size;
-  size_t block_num = file_size_ / block_size + 500;
+int VecBufferPool::init(size_t pool_capacity, size_t block_size) {
+  pool_capacity_ = pool_capacity;
+  size_t buffer_num = pool_capacity_ / block_size + 10;
+  size_t block_num = file_size_ / block_size + 10;
   lp_map_.init(block_num);
   for (size_t i = 0; i < buffer_num; i++) {
     char *buffer = (char *)aligned_alloc(64, block_size);
@@ -160,6 +161,7 @@ VecBufferPool::VecBufferPool(const std::string &filename, size_t pool_capacity,
   }
   LOG_DEBUG("Buffer pool num: %zu, entry num: %zu", buffer_num,
             lp_map_.entry_num());
+  return 0;
 }
 
 VecBufferPoolHandle VecBufferPool::get_handle() {
@@ -209,7 +211,6 @@ char *VecBufferPool::acquire_buffer(block_id_t block_id, size_t offset,
 int VecBufferPool::get_meta(size_t offset, size_t length, char *buffer) {
   ssize_t read_bytes = pread(fd_, buffer, length, offset);
   if (read_bytes != static_cast<ssize_t>(length)) {
-    LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset);
     LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset);
     return -1;
   }
diff --git a/src/core/algorithm/flat/flat_streamer_context.h b/src/core/algorithm/flat/flat_streamer_context.h
index 24cfd9e5..22a1106a 100644
--- a/src/core/algorithm/flat/flat_streamer_context.h
+++ b/src/core/algorithm/flat/flat_streamer_context.h
@@ -190,10 +190,18 @@ class FlatStreamerContext : public IndexStreamer::Context {
     group_topk_heaps_.clear();
   }
 
-  void reset() override {}
+  void reset() override {
+    for (auto &it : results_) {
+      it.clear();
+    }
+    for (auto &it : group_results_) {
+      it.clear();
+    }
+  }
 
   //! Reset the context
   void reset(const FlatStreamer<BATCH_SIZE> *owner) {
+    this->reset();
     magic_ = owner->magic();
     feature_size_ = owner->meta().element_size();
 
diff --git a/src/core/algorithm/hnsw/hnsw_context.h b/src/core/algorithm/hnsw/hnsw_context.h
index 22bcfaad..e776b81a 100644
--- a/src/core/algorithm/hnsw/hnsw_context.h
+++ b/src/core/algorithm/hnsw/hnsw_context.h
@@ -335,6 +335,7 @@ class HnswContext : public IndexContext {
 
   //! Reset context
   void reset(void) override {
+    this->clear();
     set_filter(nullptr);
     reset_threshold();
     set_fetch_vector(false);
@@ -422,6 +423,9 @@ class HnswContext : public IndexContext {
     for (auto &it : results_) {
       it.clear();
     }
+    for (auto &it : group_results_) {
+      it.clear();
+    }
   }
 
   uint32_t *mutable_stats_get_neighbors() {
diff --git a/src/core/interface/index.cc b/src/core/interface/index.cc
index 038f67d4..72005bc9 100644
--- a/src/core/interface/index.cc
+++ b/src/core/interface/index.cc
@@ -406,8 +406,9 @@ int Index::Search(const VectorData &vector_data,
   }
 
   // dense support refiner, but sparse doesn't
+  int ret = 0;
   if (search_param->refiner_param == nullptr) {
-    return _dense_search(vector_data, search_param, result, context);
+    ret = _dense_search(vector_data, search_param, result, context);
   } else {
     auto &reference_index = search_param->refiner_param->reference_index;
     if (reference_index == nullptr) {
@@ -441,8 +442,10 @@ int Index::Search(const VectorData &vector_data,
     // TODO: should copy other params?
     flat_search_param->bf_pks = std::make_shared<std::vector<uint64_t>>(keys);
 
-    return reference_index->Search(vector_data, flat_search_param, result);
+    ret = reference_index->Search(vector_data, flat_search_param, result);
   }
+  context->reset();
+  return ret;
 }
 
 
diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc
index dcdb13d3..1fccbe2e 100644
--- a/src/core/utility/buffer_storage.cc
+++ b/src/core/utility/buffer_storage.cc
@@ -98,7 +98,7 @@ class BufferStorage : public IndexStorage {
       }
       size_t buffer_offset = segment_header_start_offset_ +
                              segment_header_->content_offset +
-                             segment_->meta()->data_index + offset;
+                             segment_->meta()->data_index;
       *data =
           owner_->get_buffer(buffer_offset, capacity_, segment_id_) + offset;
       return len;
@@ -114,7 +114,7 @@ class BufferStorage : public IndexStorage {
       }
       size_t buffer_offset = segment_header_start_offset_ +
                              segment_header_->content_offset +
-                             segment_->meta()->data_index + offset;
+                             segment_->meta()->data_index;
       data.reset(
           owner_->buffer_pool_handle_.get(), segment_id_,
           owner_->get_buffer(buffer_offset, capacity_, segment_id_) + offset);
@@ -177,21 +177,15 @@ class BufferStorage : public IndexStorage {
 
   //! Open storage
   int open(const std::string &path, bool /*create*/) override {
-    LOG_INFO("open buffer storage 1");
     file_name_ = path;
-    buffer_pool_ = std::make_shared<ailego::VecBufferPool>(
-        path, 20lu * 1024 * 1024 * 1024, 2490368 * 2);
+    buffer_pool_ = std::make_shared<ailego::VecBufferPool>(path);
     buffer_pool_handle_ = std::make_shared<ailego::VecBufferPoolHandle>(
         buffer_pool_->get_handle());
     int ret = ParseToMapping();
-    LOG_ERROR("segment count: %lu, max_segment_size: %lu", segments_.size(),
-              max_segment_size_);
-    for (auto iter = segments_.begin(); iter != segments_.end(); iter++) {
-      auto seg = this->get(iter->first, 0);
-      MemoryBlock block;
-      int len = seg->read(0, block, 1);
-      LOG_ERROR("segment %s: %d", iter->first.c_str(), len);
+    if (ret != 0) {
+      return ret;
     }
+    ret = buffer_pool_->init(20lu * 1024 * 1024 * 1024, max_segment_size_);
     if (ret != 0) {
       return ret;
     }
diff --git a/src/include/zvec/ailego/buffer/buffer_pool.h b/src/include/zvec/ailego/buffer/buffer_pool.h
index f1a0149c..c27065a2 100644
--- a/src/include/zvec/ailego/buffer/buffer_pool.h
+++ b/src/include/zvec/ailego/buffer/buffer_pool.h
@@ -97,12 +97,13 @@ class VecBufferPool {
  public:
   typedef std::shared_ptr<VecBufferPool> Pointer;
 
-  VecBufferPool(const std::string &filename, size_t pool_capacity,
-                size_t block_size);
+  VecBufferPool(const std::string &filename);
   ~VecBufferPool() {
     close(fd_);
   }
 
+  int init(size_t pool_capacity, size_t block_size);
+
   VecBufferPoolHandle get_handle();
 
   char *acquire_buffer(block_id_t block_id, size_t offset, size_t size,
diff --git a/src/include/zvec/core/framework/index_storage.h b/src/include/zvec/core/framework/index_storage.h
index 920580fe..9173da3e 100644
--- a/src/include/zvec/core/framework/index_storage.h
+++ b/src/include/zvec/core/framework/index_storage.h
@@ -131,7 +131,7 @@ class IndexStorage : public IndexModule {
     void reset(ailego::VecBufferPoolHandle *buffer_pool_handle, int block_id,
                void *data) {
       if (type_ == MemoryBlockType::MBT_BUFFERPOOL) {
-        buffer_pool_handle->release_one(buffer_block_id_);
+        buffer_pool_handle_->release_one(buffer_block_id_);
       }
       type_ = MemoryBlockType::MBT_BUFFERPOOL;
       buffer_pool_handle_ = buffer_pool_handle;
diff --git a/tests/db/index/column/vector_column_indexer_test.cc b/tests/db/index/column/vector_column_indexer_test.cc
index 483efcde..251e5a18 100644
--- a/tests/db/index/column/vector_column_indexer_test.cc
+++ b/tests/db/index/column/vector_column_indexer_test.cc
@@ -2160,7 +2160,6 @@ TEST(VectorColumnIndexerTest, Failure) {
     ASSERT_TRUE(indexer->Flush().ok());
     ASSERT_TRUE(indexer->Close().ok());
     {
-      ailego::BufferManager::Instance().init(10 * 1024 * 1024, 1);
       auto indexer = std::make_shared<VectorColumnIndexer>(
           index_file_path,
           FieldSchema("test", DataType::VECTOR_FP32, 3, false,

From ed6a3f205e9aaec904316cf5df318dbc073207d3 Mon Sep 17 00:00:00 2001
From: Zefeng Yin <yinzefeng.yzf@alibaba-inc.com>
Date: Wed, 11 Feb 2026 10:58:59 +0800
Subject: [PATCH 10/11] fix compile
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/core/utility/buffer_storage.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc
index 1fccbe2e..4db38cb0 100644
--- a/src/core/utility/buffer_storage.cc
+++ b/src/core/utility/buffer_storage.cc
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #include <mutex>
-// #include <zvec/ailego/buffer/buffer_manager.h>
+#include <algorithm>
 #include <zvec/ailego/buffer/buffer_pool.h>
 #include <zvec/ailego/utility/time_helper.h>
 #include <zvec/core/framework/index_error.h>
@@ -476,7 +476,7 @@ class BufferStorage : public IndexStorage {
   IndexFormat::MetaFooter footer_;
   std::map<std::string, IndexMapping::SegmentInfo> segments_{};
   std::map<std::string, size_t> id_hash_{};
-  size_t max_segment_size_{0};
+  uint64_t max_segment_size_{0};
   std::unique_ptr<char[]> segment_buffer_{nullptr};
 
   ailego::VecBufferPool::Pointer buffer_pool_{nullptr};
@@ -487,4 +487,4 @@ class BufferStorage : public IndexStorage {
 INDEX_FACTORY_REGISTER_STORAGE(BufferStorage);
 
 }  // namespace core
-}  // namespace zvec
\ No newline at end of file
+}  // namespace zvec

From 95b1c16dafcd417e15f04d5bda276cc1d8431774 Mon Sep 17 00:00:00 2001
From: "yinzefeng.yzf" <yinzefeng.yzf@alibaba-inc.com>
Date: Wed, 11 Feb 2026 11:31:05 +0800
Subject: [PATCH 11/11] clang format

---
 src/core/utility/buffer_storage.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc
index 4db38cb0..d339553a 100644
--- a/src/core/utility/buffer_storage.cc
+++ b/src/core/utility/buffer_storage.cc
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <mutex>
 #include <algorithm>
+#include <mutex>
 #include <zvec/ailego/buffer/buffer_pool.h>
 #include <zvec/ailego/utility/time_helper.h>
 #include <zvec/core/framework/index_error.h>