diff --git a/src/ailego/buffer/buffer_pool.cc b/src/ailego/buffer/buffer_pool.cc
new file mode 100644
index 00000000..bdbf0a03
--- /dev/null
+++ b/src/ailego/buffer/buffer_pool.cc
@@ -0,0 +1,239 @@
+#include
+#include
+
+namespace zvec {
+namespace ailego {
+
+int LRUCache::init(size_t block_size) {
+  block_size_ = block_size;
+  for (size_t i = 0; i < CATCH_QUEUE_NUM; i++) {
+    queues_.push_back(ConcurrentQueue(block_size));
+  }
+  return 0;
+}
+
+bool LRUCache::evict_single_block(BlockType &item) {
+  bool found = false;
+  for (size_t i = 0; i < CATCH_QUEUE_NUM; i++) {
+    found = queues_[i].try_dequeue(item);
+    if (found) {
+      break;
+    }
+  }
+  return found;
+}
+
+bool LRUCache::add_single_block(const LPMap *lp_map, const BlockType &block,
+                                int block_type) {
+  bool ok = queues_[block_type].try_enqueue(block);
+  evict_queue_insertions_.fetch_add(1, std::memory_order_relaxed);
+  if (evict_queue_insertions_ % block_size_ == 0) {
+    this->clear_dead_node(lp_map);
+  }
+  return ok;
+}
+
+void LRUCache::clear_dead_node(const LPMap *lp_map) {
+  for (int i = 0; i < CATCH_QUEUE_NUM; i++) {
+    int clear_size = block_size_ * 2;
+    if (queues_[i].size_approx() < clear_size * 4) {
+      continue;
+    }
+    int clear_count = 0;
+    ConcurrentQueue tmp(block_size_);
+    BlockType item;
+    while (queues_[i].try_dequeue(item) && (clear_count++ < clear_size)) {
+      if (!lp_map->isDeadBlock(item)) {
+        tmp.try_enqueue(item);
+      }
+    }
+    while (tmp.try_dequeue(item)) {
+      if (!lp_map->isDeadBlock(item)) {
+        queues_[i].try_enqueue(item);
+      }
+    }
+  }
+}
+
+void LPMap::init(size_t entry_num) {
+  if (entries_) {
+    delete[] entries_;
+  }
+  entry_num_ = entry_num;
+  entries_ = new Entry[entry_num_];
+  for (size_t i = 0; i < entry_num_; i++) {
+    entries_[i].ref_count.store(std::numeric_limits<int>::min());
+    entries_[i].load_count.store(0);
+    entries_[i].buffer = nullptr;
+  }
+  cache_.init(entry_num * 4);
+}
+
+char *LPMap::acquire_block(block_id_t block_id) {
+  assert(block_id < entry_num_);
+  Entry &entry = entries_[block_id];
+  if (entry.ref_count.load(std::memory_order_relaxed) == 0) {
+    entry.load_count.fetch_add(1, std::memory_order_relaxed);
+  }
+  entry.ref_count.fetch_add(1, std::memory_order_relaxed);
+  if (entry.ref_count.load(std::memory_order_relaxed) < 0) {
+    return nullptr;
+  }
+  return entry.buffer;
+}
+
+void LPMap::release_block(block_id_t block_id) {
+  assert(block_id < entry_num_);
+  Entry &entry = entries_[block_id];
+
+  if (entry.ref_count.fetch_sub(1, std::memory_order_release) == 1) {
+    std::atomic_thread_fence(std::memory_order_acquire);
+    LRUCache::BlockType block;
+    block.first = block_id;
+    block.second = entry.load_count.load();
+    cache_.add_single_block(this, block, 0);
+  }
+}
+
+char *LPMap::evict_block(block_id_t block_id) {
+  assert(block_id < entry_num_);
+  Entry &entry = entries_[block_id];
+  int expected = 0;
+  if (entry.ref_count.compare_exchange_strong(
+          expected, std::numeric_limits<int>::min())) {
+    char *buffer = entry.buffer;
+    entry.buffer = nullptr;
+    return buffer;
+  } else {
+    return nullptr;
+  }
+}
+
+char *LPMap::set_block_acquired(block_id_t block_id, char *buffer) {
+  assert(block_id < entry_num_);
+  Entry &entry = entries_[block_id];
+  if (entry.ref_count.load(std::memory_order_relaxed) >= 0) {
+    entry.ref_count.fetch_add(1, std::memory_order_relaxed);
+    return entry.buffer;
+  }
+  entry.buffer = buffer;
+  entry.ref_count.store(1, std::memory_order_relaxed);
+  entry.load_count.fetch_add(1, std::memory_order_relaxed);
+  return buffer;
+}
+
+void LPMap::recycle(moodycamel::ConcurrentQueue<char *> &free_buffers) {
+  LRUCache::BlockType block;
+  do {
+    bool ok = cache_.evict_single_block(block);
+    if (!ok) {
+      return;
+    }
+  } while (isDeadBlock(block));
+  char *buffer = evict_block(block.first);
+  if (buffer) {
+    free_buffers.try_enqueue(buffer);
+  }
+}
+
+VecBufferPool::VecBufferPool(const std::string &filename) {
+  fd_ = open(filename.c_str(), O_RDONLY);
+  if (fd_ < 0) {
+    throw std::runtime_error("Failed to open file: " + filename);
+  }
+  struct stat st;
+  if (fstat(fd_, &st) < 0) {
+    throw std::runtime_error("Failed to stat file: " + filename);
+  }
+  file_size_ = st.st_size;
+}
+
+int VecBufferPool::init(size_t pool_capacity, size_t block_size) {
+  pool_capacity_ = pool_capacity;
+  size_t buffer_num = pool_capacity_ / block_size + 10;
+  size_t block_num = file_size_ / block_size + 10;
+  lp_map_.init(block_num);
+  for (size_t i = 0; i < buffer_num; i++) {
+    char *buffer = (char *)aligned_alloc(64, block_size);
+    if (buffer != nullptr) {
+      bool ok = free_buffers_.try_enqueue(buffer);
+    }
+  }
+  LOG_DEBUG("Buffer pool num: %zu, entry num: %zu", buffer_num,
+            lp_map_.entry_num());
+  return 0;
+}
+
+VecBufferPoolHandle VecBufferPool::get_handle() {
+  return VecBufferPoolHandle(*this);
+}
+
+char *VecBufferPool::acquire_buffer(block_id_t block_id, size_t offset,
+                                    size_t size, int retry) {
+  char *buffer = lp_map_.acquire_block(block_id);
+  if (buffer) {
+    return buffer;
+  }
+  {
+    bool found = free_buffers_.try_dequeue(buffer);
+    if (!found) {
+      for (int i = 0; i < retry; i++) {
+        lp_map_.recycle(free_buffers_);
+        found = free_buffers_.try_dequeue(buffer);
+        if (found) {
+          break;
+        }
+      }
+    }
+    if (!found) {
+      LOG_ERROR("Buffer pool failed to get free buffer");
+      return nullptr;
+    }
+  }
+
+  ssize_t read_bytes = pread(fd_, buffer, size, offset);
+  if (read_bytes != static_cast<ssize_t>(size)) {
+    LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset);
+    return nullptr;
+  }
+  char *placed_buffer = nullptr;
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    placed_buffer = lp_map_.set_block_acquired(block_id, buffer);
+  }
+  if (placed_buffer != buffer) {
+    // another thread has set the block
+    free_buffers_.try_enqueue(buffer);
+  }
+  return placed_buffer;
+}
+
+int VecBufferPool::get_meta(size_t offset, size_t length, char *buffer) {
+  ssize_t read_bytes = pread(fd_, buffer, length, offset);
+  if (read_bytes != static_cast<ssize_t>(length)) {
+    LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset);
+    return -1;
+  }
+  return 0;
+}
+
+char *VecBufferPoolHandle::get_block(size_t offset, size_t size,
+                                     size_t block_id) {
+  char *buffer = pool.acquire_buffer(block_id, offset, size, 5);
+  return buffer;
+}
+
+int VecBufferPoolHandle::get_meta(size_t offset, size_t length, char *buffer) {
+  return pool.get_meta(offset, length, buffer);
+}
+
+void VecBufferPoolHandle::release_one(block_id_t block_id) {
+  pool.lp_map_.release_block(block_id);
+}
+
+void VecBufferPoolHandle::acquire_one(block_id_t block_id) {
+  pool.lp_map_.acquire_block(block_id);
+}
+
+} // namespace ailego
+} // namespace zvec
\ No newline at end of file
diff --git a/src/core/algorithm/flat/flat_streamer_context.h b/src/core/algorithm/flat/flat_streamer_context.h
index 24cfd9e5..22a1106a 100644
--- a/src/core/algorithm/flat/flat_streamer_context.h
+++ b/src/core/algorithm/flat/flat_streamer_context.h
@@ -190,10 +190,18 @@ class FlatStreamerContext : public IndexStreamer::Context {
     group_topk_heaps_.clear();
   }
 
-  void reset() override {}
+  void reset() override {
+    for (auto &it : results_) {
+      it.clear();
+    }
+    for (auto &it : group_results_) {
+      it.clear();
+    }
+  }
 
   //! Reset the context
   void reset(const FlatStreamer *owner) {
+    this->reset();
     magic_ = owner->magic();
     feature_size_ = owner->meta().element_size();
diff --git a/src/core/algorithm/hnsw/hnsw_context.h b/src/core/algorithm/hnsw/hnsw_context.h
index 22bcfaad..e776b81a 100644
--- a/src/core/algorithm/hnsw/hnsw_context.h
+++ b/src/core/algorithm/hnsw/hnsw_context.h
@@ -335,6 +335,7 @@ class HnswContext : public IndexContext {
 
   //! Reset context
   void reset(void) override {
+    this->clear();
     set_filter(nullptr);
     reset_threshold();
     set_fetch_vector(false);
@@ -422,6 +423,9 @@ class HnswContext : public IndexContext {
     for (auto &it : results_) {
       it.clear();
     }
+    for (auto &it : group_results_) {
+      it.clear();
+    }
   }
 
   uint32_t *mutable_stats_get_neighbors() {
diff --git a/src/core/algorithm/hnsw/hnsw_entity.h b/src/core/algorithm/hnsw/hnsw_entity.h
index e5f2077f..70ea3dcc 100644
--- a/src/core/algorithm/hnsw/hnsw_entity.h
+++ b/src/core/algorithm/hnsw/hnsw_entity.h
@@ -147,8 +147,7 @@ struct Neighbors {
   Neighbors(uint32_t cnt_in, const node_id_t *data_in)
       : cnt{cnt_in}, data{data_in} {}
 
-  Neighbors(IndexStorage::MemoryBlock &&mem_block)
-      : neighbor_block{std::move(mem_block)} {
+  Neighbors(IndexStorage::MemoryBlock &mem_block) : neighbor_block{mem_block} {
     auto hd = reinterpret_cast(neighbor_block.data());
     cnt = hd->neighbor_cnt;
     data = hd->neighbors;
diff --git a/src/core/algorithm/hnsw/hnsw_streamer_entity.cc b/src/core/algorithm/hnsw/hnsw_streamer_entity.cc
index feafa573..734f11f1 100644
--- a/src/core/algorithm/hnsw/hnsw_streamer_entity.cc
+++ b/src/core/algorithm/hnsw/hnsw_streamer_entity.cc
@@ -127,7 +127,7 @@ const Neighbors HnswStreamerEntity::get_neighbors(level_t level,
     LOG_ERROR("Read neighbor header failed, ret=%zu", size);
     return Neighbors();
   }
-  return Neighbors(std::move(neighbor_block));
+  return Neighbors(neighbor_block);
 }
 
 //! Get vector data by key
diff --git a/src/core/interface/index.cc b/src/core/interface/index.cc
index 038f67d4..72005bc9 100644
--- a/src/core/interface/index.cc
+++ b/src/core/interface/index.cc
@@ -406,8 +406,9 @@ int Index::Search(const VectorData &vector_data,
   }
 
   // dense support refiner, but sparse doesn't
+  int ret = 0;
   if (search_param->refiner_param == nullptr) {
-    return _dense_search(vector_data, search_param, result, context);
+    ret = _dense_search(vector_data, search_param, result, context);
   } else {
     auto &reference_index = search_param->refiner_param->reference_index;
     if (reference_index == nullptr) {
@@ -441,8 +442,10 @@ int Index::Search(const VectorData &vector_data,
     // TODO: should copy other params?
     flat_search_param->bf_pks =
         std::make_shared>(keys);
-    return reference_index->Search(vector_data, flat_search_param, result);
+    ret = reference_index->Search(vector_data, flat_search_param, result);
   }
+  context->reset();
+  return ret;
 }
diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc
index bccf07e2..d339553a 100644
--- a/src/core/utility/buffer_storage.cc
+++ b/src/core/utility/buffer_storage.cc
@@ -12,8 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include
 #include
-#include
+#include
+#include
 #include
 #include
 #include
@@ -38,9 +40,10 @@ class BufferStorage : public IndexStorage {
   //!
Constructor WrappedSegment(BufferStorage *owner, IndexMapping::Segment *segment, uint64_t segment_header_start_offset, - IndexFormat::MetaHeader *segment_header) + IndexFormat::MetaHeader *segment_header, size_t segment_id) : segment_(segment), owner_(owner), + segment_id_(segment_id), capacity_(static_cast(segment->meta()->data_size + segment->meta()->padding_size)), segment_header_start_offset_(segment_header_start_offset), @@ -77,9 +80,9 @@ class BufferStorage : public IndexStorage { } len = meta->data_size - offset; } - ailego::BufferHandle buffer_handle = - owner_->get_buffer_handle(offset, len); - memmove(buf, (const uint8_t *)buffer_handle.pin_vector_data() + offset, + memmove(buf, + (const uint8_t *)(owner_->get_buffer(offset, len, segment_id_)) + + offset, len); return len; } @@ -95,10 +98,9 @@ class BufferStorage : public IndexStorage { } size_t buffer_offset = segment_header_start_offset_ + segment_header_->content_offset + - segment_->meta()->data_index + offset; - ailego::BufferHandle buffer_handle = - owner_->get_buffer_handle(buffer_offset, len); - *data = buffer_handle.pin_vector_data(); + segment_->meta()->data_index; + *data = + owner_->get_buffer(buffer_offset, capacity_, segment_id_) + offset; return len; } @@ -112,15 +114,16 @@ class BufferStorage : public IndexStorage { } size_t buffer_offset = segment_header_start_offset_ + segment_header_->content_offset + - segment_->meta()->data_index + offset; - data.reset(owner_->get_buffer_handle_ptr(buffer_offset, len)); + segment_->meta()->data_index; + data.reset( + owner_->buffer_pool_handle_.get(), segment_id_, + owner_->get_buffer(buffer_offset, capacity_, segment_id_) + offset); + // data.reset(owner_->get_buffer(buffer_offset, capacity_, segment_id_) + + // offset); if (data.data()) { return len; } else { - LOG_ERROR( - "Buffer handle is null, now used memory: %zu, new: %zu", - (size_t)ailego::BufferManager::Instance().total_size_in_bytes(), - len); + LOG_ERROR("read error."); return -1; } } @@ -150,6 +153,7 @@ class BufferStorage : public IndexStorage { private: BufferStorage *owner_{nullptr}; + size_t segment_id_{}; size_t capacity_{}; uint64_t segment_header_start_offset_; IndexFormat::MetaHeader *segment_header_; @@ -174,28 +178,34 @@ class BufferStorage : public IndexStorage { //! 
Open storage int open(const std::string &path, bool /*create*/) override { file_name_ = path; - return ParseToMapping(); + buffer_pool_ = std::make_shared(path); + buffer_pool_handle_ = std::make_shared( + buffer_pool_->get_handle()); + int ret = ParseToMapping(); + if (ret != 0) { + return ret; + } + ret = buffer_pool_->init(20lu * 1024 * 1024 * 1024, max_segment_size_); + if (ret != 0) { + return ret; + } + return 0; } - ailego::BufferHandle get_buffer_handle(int offset, int length) { - ailego::BufferID buffer_id = - ailego::BufferID::VectorID(file_name_, offset, length); - return ailego::BufferManager::Instance().acquire(buffer_id); + char *get_buffer(size_t offset, size_t length, size_t block_id) { + return buffer_pool_handle_->get_block(offset, length, block_id); } - ailego::BufferHandle::Pointer get_buffer_handle_ptr(int offset, int length) { - ailego::BufferID buffer_id = - ailego::BufferID::VectorID(file_name_, offset, length); - return ailego::BufferManager::Instance().acquire_ptr(buffer_id); + int get_meta(size_t offset, size_t length, char *out) { + return buffer_pool_handle_->get_meta(offset, length, out); } - int ParseHeader(int offset) { - ailego::BufferHandle header_handle = - get_buffer_handle(offset, sizeof(header_)); - void *buffer = header_handle.pin_vector_data(); + int ParseHeader(size_t offset) { + char *buffer = new char[sizeof(header_)]; + get_meta(offset, sizeof(header_), buffer); uint8_t *header_ptr = reinterpret_cast(buffer); memcpy(&header_, header_ptr, sizeof(header_)); - header_handle.unpin_vector_data(); + delete[] buffer; if (header_.meta_header_size != sizeof(IndexFormat::MetaHeader)) { LOG_ERROR("Header meta size is invalid."); return IndexError_InvalidLength; @@ -208,14 +218,13 @@ class BufferStorage : public IndexStorage { return 0; } - int ParseFooter(int offset) { - ailego::BufferHandle footer_handle = - get_buffer_handle(offset, sizeof(footer_)); - void *buffer = footer_handle.pin_vector_data(); + int ParseFooter(size_t offset) { + char *buffer = new char[sizeof(footer_)]; + get_meta(offset, sizeof(footer_), buffer); uint8_t *footer_ptr = reinterpret_cast(buffer); memcpy(&footer_, footer_ptr, sizeof(footer_)); - footer_handle.unpin_vector_data(); - if (offset < (int)footer_.segments_meta_size) { + delete[] buffer; + if (offset < (size_t)footer_.segments_meta_size) { LOG_ERROR("Footer meta size is invalid."); return IndexError_InvalidLength; } @@ -227,17 +236,16 @@ class BufferStorage : public IndexStorage { return 0; } - int ParseSegment(int offset) { - ailego::BufferHandle segment_start_handle = - get_buffer_handle(offset, footer_.segments_meta_size); - void *segment_buffer = segment_start_handle.pin_vector_data(); - if (ailego::Crc32c::Hash(segment_buffer, footer_.segments_meta_size, 0u) != - footer_.segments_meta_crc) { + int ParseSegment(size_t offset) { + segment_buffer_ = std::make_unique(footer_.segments_meta_size); + get_meta(offset, footer_.segments_meta_size, segment_buffer_.get()); + if (ailego::Crc32c::Hash(segment_buffer_.get(), footer_.segments_meta_size, + 0u) != footer_.segments_meta_crc) { LOG_ERROR("Index segments meta checksum is invalid."); return IndexError_InvalidChecksum; } IndexFormat::SegmentMeta *segment_start = - reinterpret_cast(segment_buffer); + reinterpret_cast(segment_buffer_.get()); uint32_t segment_ids_offset = footer_.segments_meta_size; for (IndexFormat::SegmentMeta *iter = segment_start, *end = segment_start + footer_.segment_count; @@ -255,11 +263,17 @@ class BufferStorage : public IndexStorage { if 
(iter->segment_id_offset < segment_ids_offset) { segment_ids_offset = iter->segment_id_offset; } + id_hash_.emplace( + std::string(reinterpret_cast(segment_start) + + iter->segment_id_offset), + segments_.size()); segments_.emplace( std::string(reinterpret_cast(segment_start) + iter->segment_id_offset), IndexMapping::SegmentInfo{IndexMapping::Segment{iter}, current_header_start_offset_, &header_}); + max_segment_size_ = + std::max(max_segment_size_, iter->data_size + iter->padding_size); if (sizeof(IndexFormat::SegmentMeta) * footer_.segment_count > footer_.segments_meta_size) { return IndexError_InvalidLength; @@ -358,7 +372,7 @@ class BufferStorage : public IndexStorage { } return std::make_shared( this, &segment_info->segment, segment_info->segment_header_start_offset, - segment_info->segment_header); + segment_info->segment_header, id_hash_[id]); } //! Test if it a segment exists @@ -397,22 +411,14 @@ class BufferStorage : public IndexStorage { //! Initialize index file int init_index(const std::string &path) { - int error_code = mapping_.create(path, segment_meta_capacity_); - if (error_code != 0) { - return error_code; - } - // Add index version - error_code = this->init_version_segment(); + int error_code = this->init_version_segment(); if (error_code != 0) { return error_code; } // Refresh mapping this->refresh_index(0); - - // Close mapping - mapping_.close(); return 0; } @@ -436,6 +442,7 @@ class BufferStorage : public IndexStorage { segments_.clear(); memset(&header_, 0, sizeof(header_)); memset(&footer_, 0, sizeof(footer_)); + segment_buffer_.release(); } //! Append a segment into storage @@ -460,21 +467,20 @@ class BufferStorage : public IndexStorage { } private: - // mmap - uint32_t segment_meta_capacity_{1024 * 1024}; - // bool copy_on_write_{false}; - // bool force_flush_{false}; - // bool memory_locked_{false}; - // bool memory_warmup_{false}; bool index_dirty_{false}; - mutable IndexMapping mapping_{}; mutable std::mutex mapping_mutex_{}; // buffer manager std::string file_name_; - IndexFormat::MetaHeader header_{}; - IndexFormat::MetaFooter footer_{}; + IndexFormat::MetaHeader header_; + IndexFormat::MetaFooter footer_; std::map segments_{}; + std::map id_hash_{}; + uint64_t max_segment_size_{0}; + std::unique_ptr segment_buffer_{nullptr}; + + ailego::VecBufferPool::Pointer buffer_pool_{nullptr}; + ailego::VecBufferPoolHandle::Pointer buffer_pool_handle_{nullptr}; uint64_t current_header_start_offset_{0u}; }; diff --git a/src/include/zvec/ailego/buffer/buffer_pool.h b/src/include/zvec/ailego/buffer/buffer_pool.h new file mode 100644 index 00000000..c27065a2 --- /dev/null +++ b/src/include/zvec/ailego/buffer/buffer_pool.h @@ -0,0 +1,155 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "concurrentqueue.h" + +namespace zvec { +namespace ailego { + +using block_id_t = size_t; +using version_t = size_t; + +class LPMap; + +class LRUCache { + public: + typedef std::pair BlockType; + typedef moodycamel::ConcurrentQueue ConcurrentQueue; + + int init(size_t block_size); + + bool evict_single_block(BlockType &item); + + bool add_single_block(const LPMap *lp_map, const BlockType &block, + int block_type); + + void clear_dead_node(const LPMap *lp_map); + + private: + constexpr static size_t CATCH_QUEUE_NUM = 3; + int block_size_; + std::vector queues_; + alignas(64) std::atomic evict_queue_insertions_{0}; +}; + +class LPMap { + struct 
Entry {
+    alignas(64) std::atomic<int> ref_count;
+    alignas(64) std::atomic<version_t> load_count;
+    char *buffer;
+  };
+
+ public:
+  LPMap() : entry_num_(0), entries_(nullptr) {}
+  ~LPMap() {
+    delete[] entries_;
+  }
+
+  void init(size_t entry_num);
+
+  char *acquire_block(block_id_t block_id);
+
+  void release_block(block_id_t block_id);
+
+  // need be called under lock
+  char *evict_block(block_id_t block_id);
+
+  // need be called under lock
+  char *set_block_acquired(block_id_t block_id, char *buffer);
+
+  // need be called under lock
+  void recycle(moodycamel::ConcurrentQueue<char *> &free_buffers);
+
+  size_t entry_num() const {
+    return entry_num_;
+  }
+
+  bool isDeadBlock(LRUCache::BlockType block) const {
+    Entry &entry = entries_[block.first];
+    return block.second != entry.load_count.load();
+  }
+
+ private:
+  size_t entry_num_{0};
+  Entry *entries_{nullptr};
+  LRUCache cache_;
+};
+
+class VecBufferPoolHandle;
+
+class VecBufferPool {
+ public:
+  typedef std::shared_ptr<VecBufferPool> Pointer;
+
+  VecBufferPool(const std::string &filename);
+  ~VecBufferPool() {
+    close(fd_);
+  }
+
+  int init(size_t pool_capacity, size_t block_size);
+
+  VecBufferPoolHandle get_handle();
+
+  char *acquire_buffer(block_id_t block_id, size_t offset, size_t size,
+                       int retry = 0);
+
+  int get_meta(size_t offset, size_t length, char *buffer);
+
+  size_t file_size() const {
+    return file_size_;
+  }
+
+ private:
+  int fd_;
+  size_t file_size_;
+  size_t pool_capacity_;
+
+ public:
+  LPMap lp_map_;
+
+ private:
+  std::mutex mutex_;
+  moodycamel::ConcurrentQueue<char *> free_buffers_;
+};
+
+struct VecBufferPoolHandle {
+  VecBufferPoolHandle(VecBufferPool &pool) : pool(pool), hit_num_(0) {};
+  VecBufferPoolHandle(VecBufferPoolHandle &&other)
+      : pool(other.pool), hit_num_(other.hit_num_) {
+    other.hit_num_ = 0;
+  }
+
+  ~VecBufferPoolHandle() = default;
+
+  typedef std::shared_ptr<VecBufferPoolHandle> Pointer;
+
+  char *get_block(size_t offset, size_t size, size_t block_id);
+
+  int get_meta(size_t offset, size_t length, char *buffer);
+
+  void release_one(block_id_t block_id);
+
+  void acquire_one(block_id_t block_id);
+
+  VecBufferPool &pool;
+  int hit_num_;
+};
+
+} // namespace ailego
+} // namespace zvec
\ No newline at end of file
diff --git a/src/include/zvec/ailego/buffer/concurrentqueue.h b/src/include/zvec/ailego/buffer/concurrentqueue.h
new file mode 100644
index 00000000..f7f3d77e
--- /dev/null
+++ b/src/include/zvec/ailego/buffer/concurrentqueue.h
@@ -0,0 +1,4410 @@
+// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free
+// queue. An overview, including benchmark results, is provided here:
+// http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++
+// The full design is also described in excruciating detail at:
+// http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue
+
+// Simplified BSD license:
+// Copyright (c) 2013-2020, Cameron Desrochers.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice, this
+// list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +// Also dual-licensed under the Boost Software License (see LICENSE.md) + +#pragma once + +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) +// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and +// Traits::index_t are set to < 32 bits, causing integer promotion, causing +// warnings upon assigning any computed values) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" + +#ifdef MCDBGQ_USE_RELACY +#pragma GCC diagnostic ignored "-Wint-to-pointer-cast" +#endif +#endif + +#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17) +// VS2019 with /W4 warns about constant conditional expressions but unless +// /std=c++17 or higher does not support `if constexpr`, so we have no choice +// but to simply disable the warning +#pragma warning(push) +#pragma warning(disable : 4127) // conditional expression is constant +#endif + +#if defined(__APPLE__) +#include "TargetConditionals.h" +#endif + +#ifdef MCDBGQ_USE_RELACY +#include "relacy/relacy_std.hpp" +#include "relacy_shims.h" +// We only use malloc/free anyway, and the delete macro messes up `= delete` +// method declarations. We'll override the default trait malloc ourselves +// without a macro. +#undef new +#undef delete +#undef malloc +#undef free +#else +#include // Requires C++11. Sorry VS2010. 
+#include +#endif +#include +#include +#include // for CHAR_BIT +#include // for max_align_t +#include +#include +#include +#include // used for thread exit synchronization +#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading +#include +#include + +// Platform-specific definitions of a numeric thread ID type and an invalid +// value +namespace moodycamel { +namespace details { +template +struct thread_id_converter { + typedef thread_id_t thread_id_numeric_size_t; + typedef thread_id_t thread_id_hash_t; + static thread_id_hash_t prehash(thread_id_t const &x) { + return x; + } +}; +} // namespace details +} // namespace moodycamel +#if defined(MCDBGQ_USE_RELACY) +namespace moodycamel { +namespace details { +typedef std::uint32_t thread_id_t; +static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; +static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; +static inline thread_id_t thread_id() { + return rl::thread_index(); +} +} // namespace details +} // namespace moodycamel +#elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) +// No sense pulling in windows.h in a header, we'll manually declare the +// function we use and rely on backwards-compatibility for this not to break +extern "C" + __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void); +namespace moodycamel { +namespace details { +static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), + "Expected size of unsigned long to be 32 bits on Windows"); +typedef std::uint32_t thread_id_t; +static const thread_id_t invalid_thread_id = + 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx +static const thread_id_t invalid_thread_id2 = + 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used + // in practice. Note that all Win32 thread IDs are presently + // multiples of 4. +static inline thread_id_t thread_id() { + return static_cast(::GetCurrentThreadId()); +} +} // namespace details +} // namespace moodycamel +#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \ + (defined(__APPLE__) && TARGET_OS_IPHONE) || defined(__MVS__) || \ + defined(MOODYCAMEL_NO_THREAD_LOCAL) +namespace moodycamel { +namespace details { +static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, + "std::thread::id is expected to be either 4 or 8 bytes"); + +typedef std::thread::id thread_id_t; +static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID + +// Note we don't define a invalid_thread_id2 since std::thread::id doesn't have +// one; it's only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined +// anyway, which it won't be. 
+static inline thread_id_t thread_id() { + return std::this_thread::get_id(); +} + +template +struct thread_id_size {}; +template <> +struct thread_id_size<4> { + typedef std::uint32_t numeric_t; +}; +template <> +struct thread_id_size<8> { + typedef std::uint64_t numeric_t; +}; + +template <> +struct thread_id_converter { + typedef thread_id_size::numeric_t + thread_id_numeric_size_t; +#ifndef __APPLE__ + typedef std::size_t thread_id_hash_t; +#else + typedef thread_id_numeric_size_t thread_id_hash_t; +#endif + + static thread_id_hash_t prehash(thread_id_t const &x) { +#ifndef __APPLE__ + return std::hash()(x); +#else + return *reinterpret_cast(&x); +#endif + } +}; +} +} +#else +// Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475 +// In order to get a numeric thread ID in a platform-independent way, we use a +// thread-local static variable's address as a thread identifier :-) +#if defined(__GNUC__) || defined(__INTEL_COMPILER) +#define MOODYCAMEL_THREADLOCAL __thread +#elif defined(_MSC_VER) +#define MOODYCAMEL_THREADLOCAL __declspec(thread) +#else +// Assume C++11 compliant compiler +#define MOODYCAMEL_THREADLOCAL thread_local +#endif +namespace moodycamel { +namespace details { +typedef std::uintptr_t thread_id_t; +static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr +static const thread_id_t invalid_thread_id2 = + 1; // Member accesses off a null pointer are also generally invalid. Plus + // it's not aligned. +inline thread_id_t thread_id() { + static MOODYCAMEL_THREADLOCAL int x; + return reinterpret_cast(&x); +} +} +} +#endif + +// Constexpr if +#ifndef MOODYCAMEL_CONSTEXPR_IF +#if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || \ + __cplusplus > 201402L +#define MOODYCAMEL_CONSTEXPR_IF if constexpr +#define MOODYCAMEL_MAYBE_UNUSED [[maybe_unused]] +#else +#define MOODYCAMEL_CONSTEXPR_IF if +#define MOODYCAMEL_MAYBE_UNUSED +#endif +#endif + +// Exceptions +#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED +#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || \ + (defined(__GNUC__) && defined(__EXCEPTIONS)) || \ + (!defined(_MSC_VER) && !defined(__GNUC__)) +#define MOODYCAMEL_EXCEPTIONS_ENABLED +#endif +#endif +#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED +#define MOODYCAMEL_TRY try +#define MOODYCAMEL_CATCH(...) catch (__VA_ARGS__) +#define MOODYCAMEL_RETHROW throw +#define MOODYCAMEL_THROW(expr) throw(expr) +#else +#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF(true) +#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF(false) +#define MOODYCAMEL_RETHROW +#define MOODYCAMEL_THROW(expr) +#endif + +#ifndef MOODYCAMEL_NOEXCEPT +#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED) +#define MOODYCAMEL_NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800 +// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when +// it shouldn't :-( We have to assume *all* non-trivial constructors may throw +// on VS2012! +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ + (std::is_rvalue_reference::value && \ + std::is_move_constructible::value \ + ? std::is_trivially_move_constructible::value \ + : std::is_trivially_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ + ((std::is_rvalue_reference::value && \ + std::is_move_assignable::value \ + ? 
std::is_trivially_move_assignable::value || \ + std::is_nothrow_move_assignable::value \ + : std::is_trivially_copy_assignable::value || \ + std::is_nothrow_copy_assignable::value) && \ + MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ + (std::is_rvalue_reference::value && \ + std::is_move_constructible::value \ + ? std::is_trivially_move_constructible::value || \ + std::is_nothrow_move_constructible::value \ + : std::is_trivially_copy_constructible::value || \ + std::is_nothrow_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ + ((std::is_rvalue_reference::value && \ + std::is_move_assignable::value \ + ? std::is_trivially_move_assignable::value || \ + std::is_nothrow_move_assignable::value \ + : std::is_trivially_copy_assignable::value || \ + std::is_nothrow_copy_assignable::value) && \ + MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#else +#define MOODYCAMEL_NOEXCEPT noexcept +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr) +#endif +#endif + +#ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#ifdef MCDBGQ_USE_RELACY +#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#else +// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a +// crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 g++ <=4.7 doesn't +// support thread_local either. Finally, iOS/ARM doesn't have support for it +// either, and g++/ARM allows it to compile but it's unconfirmed to actually +// work +#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && \ + (!defined(__MINGW32__) && !defined(__MINGW64__) || \ + !defined(__WINPTHREADS_VERSION)) && \ + (!defined(__GNUC__) || __GNUC__ > 4 || \ + (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && \ + (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && \ + !defined(_M_ARM) && !defined(__aarch64__) && !defined(__MVS__) +// Assume `thread_local` is fully supported in all other C++11 +// compilers/platforms +#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // tentatively enabled for now; + // years ago several users + // report having problems with + // it on +#endif +#endif +#endif + +// VS2012 doesn't support deleted functions. +// In this case, we declare the function normally but don't define it. A link +// error will be generated if the function is called. 
+#ifndef MOODYCAMEL_DELETE_FUNCTION +#if defined(_MSC_VER) && _MSC_VER < 1800 +#define MOODYCAMEL_DELETE_FUNCTION +#else +#define MOODYCAMEL_DELETE_FUNCTION = delete +#endif +#endif + +namespace moodycamel { +namespace details { +#ifndef MOODYCAMEL_ALIGNAS +// VS2013 doesn't support alignas or alignof, and align() requires a constant +// literal +#if defined(_MSC_VER) && _MSC_VER <= 1800 +#define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment)) +#define MOODYCAMEL_ALIGNOF(obj) __alignof(obj) +#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) \ + typename details::Vs2013Aligned::value, T>::type +template +struct Vs2013Aligned {}; // default, unsupported alignment +template +struct Vs2013Aligned<1, T> { + typedef __declspec(align(1)) T type; +}; +template +struct Vs2013Aligned<2, T> { + typedef __declspec(align(2)) T type; +}; +template +struct Vs2013Aligned<4, T> { + typedef __declspec(align(4)) T type; +}; +template +struct Vs2013Aligned<8, T> { + typedef __declspec(align(8)) T type; +}; +template +struct Vs2013Aligned<16, T> { + typedef __declspec(align(16)) T type; +}; +template +struct Vs2013Aligned<32, T> { + typedef __declspec(align(32)) T type; +}; +template +struct Vs2013Aligned<64, T> { + typedef __declspec(align(64)) T type; +}; +template +struct Vs2013Aligned<128, T> { + typedef __declspec(align(128)) T type; +}; +template +struct Vs2013Aligned<256, T> { + typedef __declspec(align(256)) T type; +}; +#else +template +struct identity { + typedef T type; +}; +#define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment) +#define MOODYCAMEL_ALIGNOF(obj) alignof(obj) +#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) \ + alignas(alignof(obj)) typename details::identity::type +#endif +#endif +} // namespace details +} // namespace moodycamel + + +// TSAN can false report races in lock-free code. To enable TSAN to be used +// from projects that use this one, we can apply per-function compile-time +// suppression. See +// https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer +#define MOODYCAMEL_NO_TSAN +#if defined(__has_feature) +#if __has_feature(thread_sanitizer) +#undef MOODYCAMEL_NO_TSAN +#define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread"))) +#endif // TSAN +#endif // TSAN + +// Compiler-specific likely/unlikely hints +namespace moodycamel { +namespace details { +#if defined(__GNUC__) +static inline bool(likely)(bool x) { + return __builtin_expect((x), true); +} +static inline bool(unlikely)(bool x) { + return __builtin_expect((x), false); +} +#else +static inline bool(likely)(bool x) { + return x; +} +static inline bool(unlikely)(bool x) { + return x; +} +#endif +} // namespace details +} // namespace moodycamel + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG +#include "internal/concurrentqueue_internal_debug.h" +#endif + +namespace moodycamel { +namespace details { +template +struct const_numeric_max { + static_assert(std::is_integral::value, + "const_numeric_max can only be used with integers"); + static const T value = + std::numeric_limits::is_signed + ? (static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - + static_cast(1) + : static_cast(-1); +}; + +#if defined(__GLIBCXX__) +typedef ::max_align_t + std_max_align_t; // libstdc++ forgot to add it to std:: for a while +#else +typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can + // *only* be accessed via std:: +#endif + +// Some platforms have incorrectly set max_align_t to a type with <8 bytes +// alignment even while supporting 8-byte aligned scalar values (*cough* 32-bit +// iOS). 
Work around this with our own union. See issue #64. +typedef union { + std_max_align_t x; + long long y; + void *z; +} max_align_t; +} // namespace details + +// Default traits for the ConcurrentQueue. To change some of the +// traits without re-implementing all of them, inherit from this +// struct and shadow the declarations you wish to be different; +// since the traits are used as a template type parameter, the +// shadowed declarations will be used where defined, and the defaults +// otherwise. +struct ConcurrentQueueDefaultTraits { + // General-purpose size type. std::size_t is strongly recommended. + typedef std::size_t size_t; + + // The type used for the enqueue and dequeue indices. Must be at least as + // large as size_t. Should be significantly larger than the number of elements + // you expect to hold at once, especially if you have a high turnover rate; + // for example, on 32-bit x86, if you expect to have over a hundred million + // elements or pump several million elements through your queue in a very + // short space of time, using a 32-bit type *may* trigger a race condition. + // A 64-bit int type is recommended in that case, and in practice will + // prevent a race condition no matter the usage of the queue. Note that + // whether the queue is lock-free with a 64-int type depends on the whether + // std::atomic is lock-free, which is platform-specific. + typedef std::size_t index_t; + + // Internally, all elements are enqueued and dequeued from multi-element + // blocks; this is the smallest controllable unit. If you expect few elements + // but many producers, a smaller block size should be favoured. For few + // producers and/or many elements, a larger block size is preferred. A sane + // default is provided. Must be a power of 2. + static const size_t BLOCK_SIZE = 32; + + // For explicit producers (i.e. when using a producer token), the block is + // checked for being empty by iterating through a list of flags, one per + // element. For large block sizes, this is too inefficient, and switching to + // an atomic counter-based approach is faster. The switch is made for block + // sizes strictly larger than this threshold. + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; + + // How many full blocks can be expected for a single explicit producer? This + // should reflect that number's maximum for optimal performance. Must be a + // power of 2. + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; + + // How many full blocks can be expected for a single implicit producer? This + // should reflect that number's maximum for optimal performance. Must be a + // power of 2. + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; + + // The initial size of the hash table mapping thread IDs to implicit + // producers. Note that the hash is resized every time it becomes half full. + // Must be a power of two, and either 0 or at least 1. If 0, implicit + // production (using the enqueue methods without an explicit producer token) + // is disabled. + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; + + // Controls the number of items that an explicit consumer (i.e. one with a + // token) must consume before it causes all consumers to rotate and move on to + // the next internal queue. + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = + 256; + + // The maximum number of elements (inclusive) that can be enqueued to a + // sub-queue. Enqueue operations that would cause this limit to be surpassed + // will fail. 
Note that this limit is enforced at the block level (for + // performance reasons), i.e. it's rounded up to the nearest block size. + static const size_t MAX_SUBQUEUE_SIZE = + details::const_numeric_max::value; + + // The number of times to spin before sleeping when waiting on a semaphore. + // Recommended values are on the order of 1000-10000 unless the number of + // consumer threads exceeds the number of idle cores (in which case try + // 0-100). Only affects instances of the BlockingConcurrentQueue. + static const int MAX_SEMA_SPINS = 10000; + + // Whether to recycle dynamically-allocated blocks into an internal free list + // or not. If false, only pre-allocated blocks (controlled by the constructor + // arguments) will be recycled, and all others will be `free`d back to the + // heap. Note that blocks consumed by explicit producers are only freed on + // destruction of the queue (not following destruction of the token) + // regardless of this trait. + static const bool RECYCLE_ALLOCATED_BLOCKS = false; + + +#ifndef MCDBGQ_USE_RELACY + // Memory allocation can be customized if needed. + // malloc should return nullptr on failure, and handle alignment like + // std::malloc. +#if defined(malloc) || defined(free) + // Gah, this is 2015, stop defining macros that break standard code already! + // Work around malloc/free being special macros: + static inline void *WORKAROUND_malloc(size_t size) { + return malloc(size); + } + static inline void WORKAROUND_free(void *ptr) { + return free(ptr); + } + static inline void *(malloc)(size_t size) { + return WORKAROUND_malloc(size); + } + static inline void(free)(void *ptr) { + return WORKAROUND_free(ptr); + } +#else + static inline void *malloc(size_t size) { + return std::malloc(size); + } + static inline void free(void *ptr) { + return std::free(ptr); + } +#endif +#else + // Debug versions when running under the Relacy race detector (ignore + // these in user code) + static inline void *malloc(size_t size) { + return rl::rl_malloc(size, $); + } + static inline void free(void *ptr) { + return rl::rl_free(ptr, $); + } +#endif +}; + + +// When producing or consuming many elements, the most efficient way is to: +// 1) Use one of the bulk-operation methods of the queue with a token +// 2) Failing that, use the bulk-operation methods without a token +// 3) Failing that, create a token and use that with the single-item methods +// 4) Failing that, use the single-parameter methods of the queue +// Having said that, don't create tokens willy-nilly -- ideally there should be +// a maximum of one token per thread (of each kind). 
+struct ProducerToken; +struct ConsumerToken; + +template +class ConcurrentQueue; +template +class BlockingConcurrentQueue; +class ConcurrentQueueTests; + + +namespace details { +struct ConcurrentQueueProducerTypelessBase { + ConcurrentQueueProducerTypelessBase *next; + std::atomic inactive; + ProducerToken *token; + + ConcurrentQueueProducerTypelessBase() + : next(nullptr), inactive(false), token(nullptr) {} +}; + +template +struct _hash_32_or_64 { + static inline std::uint32_t hash(std::uint32_t h) { + // MurmurHash3 finalizer -- see + // https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp + // Since the thread ID is already unique, all we really want to do is + // propagate that uniqueness evenly across all the bits, so that we can use + // a subset of the bits while reducing collisions significantly + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + return h ^ (h >> 16); + } +}; +template <> +struct _hash_32_or_64<1> { + static inline std::uint64_t hash(std::uint64_t h) { + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + return h ^ (h >> 33); + } +}; +template +struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> {}; + +static inline size_t hash_thread_id(thread_id_t id) { + static_assert( + sizeof(thread_id_t) <= 8, + "Expected a platform where thread IDs are at most 64-bit values"); + return static_cast( + hash_32_or_64::thread_id_hash_t)>:: + hash(thread_id_converter::prehash(id))); +} + +template +static inline bool circular_less_than(T a, T b) { + static_assert( + std::is_integral::value && !std::numeric_limits::is_signed, + "circular_less_than is intended to be used only with unsigned integer " + "types"); + return static_cast(a - b) > + static_cast(static_cast(1) + << (static_cast(sizeof(T) * CHAR_BIT - 1))); + // Note: extra parens around rhs of operator<< is MSVC bug: + // https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931 + // silencing the bug requires #pragma warning(disable: 4554) around the + // calling code and has no effect when done here. 
+} + +template +static inline char *align_for(char *ptr) { + const std::size_t alignment = std::alignment_of::value; + return ptr + + (alignment - (reinterpret_cast(ptr) % alignment)) % + alignment; +} + +template +static inline T ceil_to_pow_2(T x) { + static_assert( + std::is_integral::value && !std::numeric_limits::is_signed, + "ceil_to_pow_2 is intended to be used only with unsigned integer types"); + + // Adapted from + // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + for (std::size_t i = 1; i < sizeof(T); i <<= 1) { + x |= x >> (i << 3); + } + ++x; + return x; +} + +template +static inline void swap_relaxed(std::atomic &left, std::atomic &right) { + T temp = left.load(std::memory_order_relaxed); + left.store(right.load(std::memory_order_relaxed), std::memory_order_relaxed); + right.store(temp, std::memory_order_relaxed); +} + +template +static inline T const &nomove(T const &x) { + return x; +} + +template +struct nomove_if { + template + static inline T const &eval(T const &x) { + return x; + } +}; + +template <> +struct nomove_if { + template + static inline auto eval(U &&x) -> decltype(std::forward(x)) { + return std::forward(x); + } +}; + +template +static inline auto deref_noexcept(It &it) MOODYCAMEL_NOEXCEPT -> decltype(*it) { + return *it; +} + +#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || \ + (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) +template +struct is_trivially_destructible : std::is_trivially_destructible {}; +#else +template +struct is_trivially_destructible : std::has_trivial_destructor {}; +#endif + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#ifdef MCDBGQ_USE_RELACY +typedef RelacyThreadExitListener ThreadExitListener; +typedef RelacyThreadExitNotifier ThreadExitNotifier; +#else +class ThreadExitNotifier; + +struct ThreadExitListener { + typedef void (*callback_t)(void *); + callback_t callback; + void *userData; + + ThreadExitListener *next; // reserved for use by the ThreadExitNotifier + ThreadExitNotifier *chain; // reserved for use by the ThreadExitNotifier +}; + +class ThreadExitNotifier { + public: + static void subscribe(ThreadExitListener *listener) { + auto &tlsInst = instance(); + std::lock_guard guard(mutex()); + listener->next = tlsInst.tail; + listener->chain = &tlsInst; + tlsInst.tail = listener; + } + + static void unsubscribe(ThreadExitListener *listener) { + std::lock_guard guard(mutex()); + if (!listener->chain) { + return; // race with ~ThreadExitNotifier + } + auto &tlsInst = *listener->chain; + listener->chain = nullptr; + ThreadExitListener **prev = &tlsInst.tail; + for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { + if (ptr == listener) { + *prev = ptr->next; + break; + } + prev = &ptr->next; + } + } + + private: + ThreadExitNotifier() : tail(nullptr) {} + ThreadExitNotifier(ThreadExitNotifier const &) MOODYCAMEL_DELETE_FUNCTION; + ThreadExitNotifier &operator=(ThreadExitNotifier const &) + MOODYCAMEL_DELETE_FUNCTION; + + ~ThreadExitNotifier() { + // This thread is about to exit, let everyone know! + assert(this == &instance() && + "If this assert fails, you likely have a buggy compiler! 
Change the " + "preprocessor conditions such that " + "MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); + std::lock_guard guard(mutex()); + for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { + ptr->chain = nullptr; + ptr->callback(ptr->userData); + } + } + + // Thread-local + static inline ThreadExitNotifier &instance() { + static thread_local ThreadExitNotifier notifier; + return notifier; + } + + static inline std::mutex &mutex() { + // Must be static because the ThreadExitNotifier could be destroyed while + // unsubscribe is called + static std::mutex mutex; + return mutex; + } + + private: + ThreadExitListener *tail; +}; +#endif +#endif + +template +struct static_is_lock_free_num { + enum { value = 0 }; +}; +template <> +struct static_is_lock_free_num { + enum { value = ATOMIC_CHAR_LOCK_FREE }; +}; +template <> +struct static_is_lock_free_num { + enum { value = ATOMIC_SHORT_LOCK_FREE }; +}; +template <> +struct static_is_lock_free_num { + enum { value = ATOMIC_INT_LOCK_FREE }; +}; +template <> +struct static_is_lock_free_num { + enum { value = ATOMIC_LONG_LOCK_FREE }; +}; +template <> +struct static_is_lock_free_num { + enum { value = ATOMIC_LLONG_LOCK_FREE }; +}; +template +struct static_is_lock_free + : static_is_lock_free_num::type> {}; +template <> +struct static_is_lock_free { + enum { value = ATOMIC_BOOL_LOCK_FREE }; +}; +template +struct static_is_lock_free { + enum { value = ATOMIC_POINTER_LOCK_FREE }; +}; +} // namespace details + + +struct ProducerToken { + template + explicit ProducerToken(ConcurrentQueue &queue); + + template + explicit ProducerToken(BlockingConcurrentQueue &queue); + + ProducerToken(ProducerToken &&other) MOODYCAMEL_NOEXCEPT + : producer(other.producer) { + other.producer = nullptr; + if (producer != nullptr) { + producer->token = this; + } + } + + inline ProducerToken &operator=(ProducerToken &&other) MOODYCAMEL_NOEXCEPT { + swap(other); + return *this; + } + + void swap(ProducerToken &other) MOODYCAMEL_NOEXCEPT { + std::swap(producer, other.producer); + if (producer != nullptr) { + producer->token = this; + } + if (other.producer != nullptr) { + other.producer->token = &other; + } + } + + // A token is always valid unless: + // 1) Memory allocation failed during construction + // 2) It was moved via the move constructor + // (Note: assignment does a swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. 
+ inline bool valid() const { + return producer != nullptr; + } + + ~ProducerToken() { + if (producer != nullptr) { + producer->token = nullptr; + producer->inactive.store(true, std::memory_order_release); + } + } + + // Disable copying and assignment + ProducerToken(ProducerToken const &) MOODYCAMEL_DELETE_FUNCTION; + ProducerToken &operator=(ProducerToken const &) MOODYCAMEL_DELETE_FUNCTION; + + private: + template + friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + + protected: + details::ConcurrentQueueProducerTypelessBase *producer; +}; + + +struct ConsumerToken { + template + explicit ConsumerToken(ConcurrentQueue &q); + + template + explicit ConsumerToken(BlockingConcurrentQueue &q); + + ConsumerToken(ConsumerToken &&other) MOODYCAMEL_NOEXCEPT + : initialOffset(other.initialOffset), + lastKnownGlobalOffset(other.lastKnownGlobalOffset), + itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), + currentProducer(other.currentProducer), + desiredProducer(other.desiredProducer) {} + + inline ConsumerToken &operator=(ConsumerToken &&other) MOODYCAMEL_NOEXCEPT { + swap(other); + return *this; + } + + void swap(ConsumerToken &other) MOODYCAMEL_NOEXCEPT { + std::swap(initialOffset, other.initialOffset); + std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); + std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); + std::swap(currentProducer, other.currentProducer); + std::swap(desiredProducer, other.desiredProducer); + } + + // Disable copying and assignment + ConsumerToken(ConsumerToken const &) MOODYCAMEL_DELETE_FUNCTION; + ConsumerToken &operator=(ConsumerToken const &) MOODYCAMEL_DELETE_FUNCTION; + + private: + template + friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + + private: // but shared with ConcurrentQueue + std::uint32_t initialOffset; + std::uint32_t lastKnownGlobalOffset; + std::uint32_t itemsConsumedFromCurrent; + details::ConcurrentQueueProducerTypelessBase *currentProducer; + details::ConcurrentQueueProducerTypelessBase *desiredProducer; +}; + +// Need to forward-declare this swap because it's in a namespace. +// See +// http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP &a, + typename ConcurrentQueue::ImplicitProducerKVP &b) + MOODYCAMEL_NOEXCEPT; + + +template +class ConcurrentQueue { + public: + typedef ::moodycamel::ProducerToken producer_token_t; + typedef ::moodycamel::ConsumerToken consumer_token_t; + + typedef typename Traits::index_t index_t; + typedef typename Traits::size_t size_t; + + static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = + static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = + static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = + static_cast(Traits::IMPLICIT_INITIAL_INDEX_SIZE); + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = + static_cast(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = + static_cast( + Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4307) // + integral constant overflow (that's what + // the ternary expression is for!) 
+#pragma warning(disable : 4309) // static_cast: Truncation of constant value +#endif + static const size_t MAX_SUBQUEUE_SIZE = + (details::const_numeric_max::value - + static_cast(Traits::MAX_SUBQUEUE_SIZE) < + BLOCK_SIZE) + ? details::const_numeric_max::value + : ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + + (BLOCK_SIZE - 1)) / + BLOCK_SIZE * BLOCK_SIZE); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + static_assert(!std::numeric_limits::is_signed && + std::is_integral::value, + "Traits::size_t must be an unsigned integral type"); + static_assert(!std::numeric_limits::is_signed && + std::is_integral::value, + "Traits::index_t must be an unsigned integral type"); + static_assert(sizeof(index_t) >= sizeof(size_t), + "Traits::index_t must be at least as wide as Traits::size_t"); + static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), + "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); + static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && + !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & + (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), + "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a " + "power of 2 (and greater than 1)"); + static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && + !(EXPLICIT_INITIAL_INDEX_SIZE & + (EXPLICIT_INITIAL_INDEX_SIZE - 1)), + "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and " + "greater than 1)"); + static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && + !(IMPLICIT_INITIAL_INDEX_SIZE & + (IMPLICIT_INITIAL_INDEX_SIZE - 1)), + "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and " + "greater than 1)"); + static_assert( + (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || + !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & + (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), + "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); + static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || + INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, + "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least " + "1 (or 0 to disable implicit enqueueing)"); + + public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be + // allocated up-front, which means only a single producer will be able to + // enqueue elements without an extra allocation -- blocks aren't shared + // between producers). This method is not thread safe -- it is up to the user + // to ensure that the queue is fully constructed before it starts being used + // by other threads (this includes making the memory effects of construction + // visible, possibly with a memory barrier). + explicit ConcurrentQueue(size_t capacity = 32 * BLOCK_SIZE) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + populate_initial_block_list(capacity / BLOCK_SIZE + + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + // Track all the producers using a fully-resolved typed list for + // each kind; this makes it possible to debug them starting from + // the root queue object (otherwise wacky casts are needed that + // don't compile in the debugger's expression evaluator). 
+ explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. + ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, + size_t maxImplicitProducers) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * + (maxExplicitProducers + 1) + + 2 * (maxExplicitProducers + maxImplicitProducers); + populate_initial_block_list(blocks); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. + ~ConcurrentQueue() { + // Destroy producers + auto ptr = producerListTail.load(std::memory_order_relaxed); + while (ptr != nullptr) { + auto next = ptr->next_prod(); + if (ptr->token != nullptr) { + ptr->token->producer = nullptr; + } + destroy(ptr); + ptr = next; + } + + // Destroy implicit producer hash tables + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { + auto hash = implicitProducerHash.load(std::memory_order_relaxed); + while (hash != nullptr) { + auto prev = hash->prev; + if (prev != nullptr) { // The last hash is part of this object and was + // not allocated dynamically + for (size_t i = 0; i != hash->capacity; ++i) { + hash->entries[i].~ImplicitProducerKVP(); + } + hash->~ImplicitProducerHash(); + (Traits::free)(hash); + } + hash = prev; + } + } + + // Destroy global free list + auto block = freeList.head_unsafe(); + while (block != nullptr) { + auto next = block->freeListNext.load(std::memory_order_relaxed); + if (block->dynamicallyAllocated) { + destroy(block); + } + block = next; + } + + // Destroy initial free list + destroy_array(initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue(ConcurrentQueue const &) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue &operator=(ConcurrentQueue const &) + MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). 
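+ // Added commentary (illustrative sketch, not part of the original header):
+ // moving a queue, assuming int elements and hypothetical names src/dst.
+ //
+ //   moodycamel::ConcurrentQueue<int> src;
+ //   src.enqueue(42);
+ //   moodycamel::ConcurrentQueue<int> dst(std::move(src));  // not thread-safe
+ //   int v;
+ //   bool ok = dst.try_dequeue(v);  // ok == true, v == 42; src is now empty
+ //
+ // Any tokens that were created against src must only be used with dst after
+ // the move, as described above.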
+ ConcurrentQueue(ConcurrentQueue &&other) MOODYCAMEL_NOEXCEPT + : producerListTail( + other.producerListTail.load(std::memory_order_relaxed)), + producerCount(other.producerCount.load(std::memory_order_relaxed)), + initialBlockPoolIndex( + other.initialBlockPoolIndex.load(std::memory_order_relaxed)), + initialBlockPool(other.initialBlockPool), + initialBlockPoolSize(other.initialBlockPoolSize), + freeList(std::move(other.freeList)), + nextExplicitConsumerId( + other.nextExplicitConsumerId.load(std::memory_order_relaxed)), + globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load( + std::memory_order_relaxed)) { + // Move the other one into this, and leave the other one as an empty queue + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + swap_implicit_producer_hashes(other); + + other.producerListTail.store(nullptr, std::memory_order_relaxed); + other.producerCount.store(0, std::memory_order_relaxed); + other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); + other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store( + other.explicitProducers.load(std::memory_order_relaxed), + std::memory_order_relaxed); + other.explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store( + other.implicitProducers.load(std::memory_order_relaxed), + std::memory_order_relaxed); + other.implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + + other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); + other.initialBlockPoolSize = 0; + other.initialBlockPool = nullptr; + + reown_producers(); + } + + inline ConcurrentQueue &operator=(ConcurrentQueue &&other) + MOODYCAMEL_NOEXCEPT { + return swap_internal(other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). + inline void swap(ConcurrentQueue &other) MOODYCAMEL_NOEXCEPT { + swap_internal(other); + } + + private: + ConcurrentQueue &swap_internal(ConcurrentQueue &other) { + if (this == &other) { + return *this; + } + + details::swap_relaxed(producerListTail, other.producerListTail); + details::swap_relaxed(producerCount, other.producerCount); + details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); + std::swap(initialBlockPool, other.initialBlockPool); + std::swap(initialBlockPoolSize, other.initialBlockPoolSize); + freeList.swap(other.freeList); + details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); + details::swap_relaxed(globalExplicitConsumerOffset, + other.globalExplicitConsumerOffset); + + swap_implicit_producer_hashes(other); + + reown_producers(); + other.reown_producers(); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + details::swap_relaxed(explicitProducers, other.explicitProducers); + details::swap_relaxed(implicitProducers, other.implicitProducers); +#endif + + return *this; + } + + public: + // Enqueues a single item (by copying it). + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. 
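+ // Added commentary (illustrative usage sketch, not part of the original
+ // header), assuming int elements and a hypothetical queue q:
+ //
+ //   moodycamel::ConcurrentQueue<int> q;
+ //   q.enqueue(1);                  // copies; may allocate a block
+ //   q.enqueue(2);
+ //   int item;
+ //   while (q.try_dequeue(item)) {  // dequeuing never allocates
+ //     // consume item
+ //   }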
+ inline bool enqueue(T const &item) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T &&item) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const &token, T const &item) { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit + // producer token. Allocates memory if required. Only fails if memory + // allocation fails (or Traits::MAX_SUBQUEUE_SIZE has been defined and would + // be surpassed). Thread-safe. + inline bool enqueue(producer_token_t const &token, T &&item) { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). Note: + // Use std::make_move_iterator if the elements should be moved instead of + // copied. Thread-safe. + template + bool enqueue_bulk(It itemFirst, size_t count) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool enqueue_bulk(producer_token_t const &token, It itemFirst, size_t count) { + return inner_enqueue_bulk(token, itemFirst, count); + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. + inline bool try_enqueue(T const &item) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T &&item) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. 
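+ // Added commentary (illustrative sketch, not part of the original header):
+ // pairing a pre-sized queue with an explicit producer token so that
+ // try_enqueue never touches the heap. Assumes int elements and hypothetical
+ // names:
+ //
+ //   moodycamel::ConcurrentQueue<int> q(1024);  // pre-allocates blocks
+ //   moodycamel::ProducerToken ptok(q);
+ //   if (!q.try_enqueue(ptok, 7)) {
+ //     // no free block was available for this producer; nothing was allocated
+ //   }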
+ inline bool try_enqueue(producer_token_t const &token, T const &item) { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit + // producer token. Does not allocate memory. Fails if not enough room to + // enqueue. Thread-safe. + inline bool try_enqueue(producer_token_t const &token, T &&item) { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(It itemFirst, size_t count) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(producer_token_t const &token, It itemFirst, + size_t count) { + return inner_enqueue_bulk(token, itemFirst, count); + } + + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + bool try_dequeue(U &item) { + // Instead of simply trying each producer in turn (which could cause + // needless contention on the first producer), we score them heuristically. + size_t nonEmptyCount = 0; + ProducerBase *best = nullptr; + size_t bestSize = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); + nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) { + auto size = ptr->size_approx(); + if (size > 0) { + if (size > bestSize) { + bestSize = size; + best = ptr; + } + ++nonEmptyCount; + } + } + + // If there was at least one non-empty queue but it appears empty at the + // time we try to dequeue from it, we need to make sure every queue's been + // tried + if (nonEmptyCount > 0) { + if ((details::likely)(best->dequeue(item))) { + return true; + } + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr != best && ptr->dequeue(item)) { + return true; + } + } + } + return false; + } + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // This differs from the try_dequeue(item) method in that this one does + // not attempt to reduce contention by interleaving the order that producer + // streams are dequeued from. So, using this method can reduce overall + // throughput under contention, but will give more predictable results in + // single-threaded consumer scenarios. This is mostly only useful for internal + // unit tests. Never allocates. Thread-safe. + template + bool try_dequeue_non_interleaved(U &item) { + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->dequeue(item)) { + return true; + } + } + return false; + } + + // Attempts to dequeue from the queue using an explicit consumer token. 
+ // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + bool try_dequeue(consumer_token_t &token, U &item) { + // The idea is roughly as follows: + // Every 256 items from one producer, make everyone rotate (increase the + // global offset) -> this means the highest efficiency consumer dictates the + // rotation speed of everyone else, more or less If you see that the global + // offset has changed, you must reset your consumption counter and move to + // your designated place If there's no items where you're supposed to be, + // keep moving until you find a producer with some items If the global + // offset has not changed but you've run out of items to consume, move over + // from your current position until you find an producer with something in + // it + + if (token.desiredProducer == nullptr || + token.lastKnownGlobalOffset != + globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return false; + } + } + + // If there was at least one non-empty queue but it appears empty at the + // time we try to dequeue from it, we need to make sure every queue's been + // tried + if (static_cast(token.currentProducer)->dequeue(item)) { + if (++token.itemsConsumedFromCurrent == + EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return true; + } + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + if (ptr->dequeue(item)) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = 1; + return true; + } + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return false; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + size_t try_dequeue_bulk(It itemFirst, size_t max) { + size_t count = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) { + count += ptr->dequeue_bulk(itemFirst, max - count); + if (count == max) { + break; + } + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit + // consumer token. Returns the number of items actually dequeued. Returns 0 if + // all producer streams appeared empty at the time they were checked (so, the + // queue is likely but not guaranteed to be empty). Never allocates. + // Thread-safe. 
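+ // Added commentary (illustrative sketch, not part of the original header):
+ // bulk dequeue with a consumer token, assuming int elements and hypothetical
+ // names:
+ //
+ //   moodycamel::ConcurrentQueue<int> q;
+ //   moodycamel::ConsumerToken ctok(q);
+ //   int buf[64];
+ //   size_t n = q.try_dequeue_bulk(ctok, buf, 64);  // n <= 64 items written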
+ template + size_t try_dequeue_bulk(consumer_token_t &token, It itemFirst, size_t max) { + if (token.desiredProducer == nullptr || + token.lastKnownGlobalOffset != + globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return 0; + } + } + + size_t count = static_cast(token.currentProducer) + ->dequeue_bulk(itemFirst, max); + if (count == max) { + if ((token.itemsConsumedFromCurrent += static_cast(max)) >= + EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return max; + } + token.itemsConsumedFromCurrent += static_cast(count); + max -= count; + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + auto dequeued = ptr->dequeue_bulk(itemFirst, max); + count += dequeued; + if (dequeued != 0) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = static_cast(dequeued); + } + if (dequeued == max) { + break; + } + max -= dequeued; + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return count; + } + + + // Attempts to dequeue from a specific producer's inner queue. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns false if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue_from_producer(producer_token_t const &producer, + U &item) { + return static_cast(producer.producer)->dequeue(item); + } + + // Attempts to dequeue several elements from a specific producer's inner + // queue. Returns the number of items actually dequeued. If you happen to know + // which producer you want to dequeue from, this is significantly faster than + // using the general-case try_dequeue methods. Returns 0 if the producer's + // queue appeared empty at the time it was checked (so, the queue is likely + // but not guaranteed to be empty). Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk_from_producer(producer_token_t const &producer, + It itemFirst, size_t max) { + return static_cast(producer.producer) + ->dequeue_bulk(itemFirst, max); + } + + + // Returns an estimate of the total number of elements currently in the queue. + // This estimate is only accurate if the queue has completely stabilized + // before it is called (i.e. all enqueue and dequeue operations have completed + // and their memory effects are visible on the calling thread, and no further + // operations start while this method is being called). Thread-safe. + size_t size_approx() const { + size_t size = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) { + size += ptr->size_approx(); + } + return size; + } + + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. 
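+ // Added commentary (illustrative sketch, not part of the original header):
+ // since is_lock_free() is constexpr, lock-freedom can be checked at compile
+ // time on platforms where it is expected to hold:
+ //
+ //   static_assert(moodycamel::ConcurrentQueue<int>::is_lock_free(),
+ //                 "expected lock-free atomics on this platform");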
+ static constexpr bool is_lock_free() { + return details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::thread_id_numeric_size_t>::value == 2; + } + + + private: + friend struct ProducerToken; + friend struct ConsumerToken; + struct ExplicitProducer; + friend struct ExplicitProducer; + struct ImplicitProducer; + friend struct ImplicitProducer; + friend class ConcurrentQueueTests; + + enum AllocationMode { CanAlloc, CannotAlloc }; + + + /////////////////////////////// + // Queue methods + /////////////////////////////// + + template + inline bool inner_enqueue(producer_token_t const &token, U &&element) { + return static_cast(token.producer) + ->ConcurrentQueue::ExplicitProducer::template enqueue( + std::forward(element)); + } + + template + inline bool inner_enqueue(U &&element) { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr + ? false + : producer->ConcurrentQueue::ImplicitProducer::template enqueue< + canAlloc>(std::forward(element)); + } + + template + inline bool inner_enqueue_bulk(producer_token_t const &token, It itemFirst, + size_t count) { + return static_cast(token.producer) + ->ConcurrentQueue::ExplicitProducer::template enqueue_bulk( + itemFirst, count); + } + + template + inline bool inner_enqueue_bulk(It itemFirst, size_t count) { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr + ? false + : producer->ConcurrentQueue::ImplicitProducer:: + template enqueue_bulk(itemFirst, count); + } + + inline bool update_current_producer_after_rotation(consumer_token_t &token) { + // Ah, there's been a rotation, figure out where we should be! + auto tail = producerListTail.load(std::memory_order_acquire); + if (token.desiredProducer == nullptr && tail == nullptr) { + return false; + } + auto prodCount = producerCount.load(std::memory_order_relaxed); + auto globalOffset = + globalExplicitConsumerOffset.load(std::memory_order_relaxed); + if ((details::unlikely)(token.desiredProducer == nullptr)) { + // Aha, first time we're dequeueing anything. + // Figure out our local position + // Note: offset is from start, not end, but we're traversing from end -- + // subtract from count first + std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); + token.desiredProducer = tail; + for (std::uint32_t i = 0; i != offset; ++i) { + token.desiredProducer = + static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + } + + std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; + if (delta >= prodCount) { + delta = delta % prodCount; + } + for (std::uint32_t i = 0; i != delta; ++i) { + token.desiredProducer = + static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + + token.lastKnownGlobalOffset = globalOffset; + token.currentProducer = token.desiredProducer; + token.itemsConsumedFromCurrent = 0; + return true; + } + + + /////////////////////////// + // Free list + /////////////////////////// + + template + struct FreeListNode { + FreeListNode() : freeListRefs(0), freeListNext(nullptr) {} + + std::atomic freeListRefs; + std::atomic freeListNext; + }; + + // A simple CAS-based lock-free free list. 
Not the fastest thing in the world + // under heavy contention, but simple and correct (assuming nodes are never + // freed until after the free list is destroyed), and fairly speedy under low + // contention. + template // N must inherit FreeListNode or have the same fields + // (and initialization of them) + struct FreeList { + FreeList() : freeListHead(nullptr) {} + FreeList(FreeList &&other) + : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { + other.freeListHead.store(nullptr, std::memory_order_relaxed); + } + void swap(FreeList &other) { + details::swap_relaxed(freeListHead, other.freeListHead); + } + + FreeList(FreeList const &) MOODYCAMEL_DELETE_FUNCTION; + FreeList &operator=(FreeList const &) MOODYCAMEL_DELETE_FUNCTION; + + inline void add(N *node) { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + // We know that the should-be-on-freelist bit is 0 at this point, so it's + // safe to set it using a fetch_add + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, + std::memory_order_acq_rel) == 0) { + // Oh look! We were the last ones referencing this node, and we know + // we want to add it to the free list, so let's do it! + add_knowing_refcount_is_zero(node); + } + } + + inline N *try_get() { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + auto head = freeListHead.load(std::memory_order_acquire); + while (head != nullptr) { + auto prevHead = head; + auto refs = head->freeListRefs.load(std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 || + !head->freeListRefs.compare_exchange_strong( + refs, refs + 1, std::memory_order_acquire)) { + head = freeListHead.load(std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at zero), which + // means we can read the next and not worry about it changing between + // now and the time we do the CAS + auto next = head->freeListNext.load(std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong(head, next, + std::memory_order_acquire, + std::memory_order_relaxed)) { + // Yay, got the node. This means it was on the list, which means + // shouldBeOnFreeList must be false no matter the refcount (because + // nobody else knows it's been taken off yet, it can't have been put + // back on). + assert((head->freeListRefs.load(std::memory_order_relaxed) & + SHOULD_BE_ON_FREELIST) == 0); + + // Decrease refcount twice, once for our ref, and once for the list's + // ref + head->freeListRefs.fetch_sub(2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to decrease + // the refcount we increased. Note that we don't need to release any + // memory effects, but we do need to ensure that the reference count + // decrement happens-after the CAS on the head. + refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) { + add_knowing_refcount_is_zero(prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. to + // destroy remaining nodes) + N *head_unsafe() const { + return freeListHead.load(std::memory_order_relaxed); + } + + private: + inline void add_knowing_refcount_is_zero(N *node) { + // Since the refcount is zero, and nobody can increase it once it's zero + // (except us, and we run only one copy of this method per node at a time, + // i.e. 
the single thread case), then we know we can safely change the + // next pointer of the node; however, once the refcount is back above + // zero, then other threads could increase it (happens under heavy + // contention, when the refcount goes to zero in between a load and a + // refcount increment of a node in try_get, then back up to something + // non-zero, then the refcount increment is done by the other thread) -- + // so, if the CAS to add the node to the actual list fails, decrease the + // refcount and leave the add operation to the next thread who puts the + // refcount back at zero (which could be us, hence the loop). + auto head = freeListHead.load(std::memory_order_relaxed); + while (true) { + node->freeListNext.store(head, std::memory_order_relaxed); + node->freeListRefs.store(1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong(head, node, + std::memory_order_release, + std::memory_order_relaxed)) { + // Hmm, the add failed, but we can only try again when the refcount + // goes back to zero + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, + std::memory_order_acq_rel) == 1) { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes are + // inserted out of order under contention) + std::atomic freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugMutex mutex; +#endif + }; + + + /////////////////////////// + // Block + /////////////////////////// + + enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; + + struct Block { + Block() + : next(nullptr), + elementsCompletelyDequeued(0), + freeListRefs(0), + freeListNext(nullptr), + dynamicallyAllocated(true) { +#ifdef MCDBGQ_TRACKMEM + owner = nullptr; +#endif + } + + template + inline bool is_empty() const { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) { + if (!emptyFlags[i].load(std::memory_order_relaxed)) { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that happened + // before the empty flags were set + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + else { + // Check counter + if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == + BLOCK_SIZE) { + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= + BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit + // context) + template + inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flag + assert(!emptyFlags[BLOCK_SIZE - 1 - + static_cast( + i & static_cast(BLOCK_SIZE - 1))] + .load(std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 - + static_cast(i & + static_cast(BLOCK_SIZE - 1))] + .store(true, std::memory_order_release); + return false; + } + else { + // Increment counter + auto prevVal = + elementsCompletelyDequeued.fetch_add(1, std::memory_order_acq_rel); + assert(prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping + // and count > 0). 
Returns true if the block is now empty (does not apply in + // explicit context). + template + inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, + size_t count) { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flags + std::atomic_thread_fence(std::memory_order_release); + i = BLOCK_SIZE - 1 - + static_cast(i & static_cast(BLOCK_SIZE - 1)) - + count + 1; + for (size_t j = 0; j != count; ++j) { + assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); + emptyFlags[i + j].store(true, std::memory_order_relaxed); + } + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add( + count, std::memory_order_acq_rel); + assert(prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + template + inline void set_all_empty() { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(true, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); + } + } + + template + inline void reset_empty() { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(false, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(0, std::memory_order_relaxed); + } + } + + inline T *operator[](index_t idx) MOODYCAMEL_NOEXCEPT { + return static_cast(static_cast(elements)) + + static_cast(idx & static_cast(BLOCK_SIZE - 1)); + } + inline T const *operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { + return static_cast(static_cast(elements)) + + static_cast(idx & static_cast(BLOCK_SIZE - 1)); + } + + private: + static_assert(std::alignment_of::value <= sizeof(T), + "The queue does not support types with an alignment greater " + "than their size at this time"); + MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements; + + public: + Block *next; + std::atomic elementsCompletelyDequeued; + std::atomic emptyFlags + [BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
BLOCK_SIZE : 1]; + + public: + std::atomic freeListRefs; + std::atomic freeListNext; + bool dynamicallyAllocated; // Perhaps a better name for this would be + // 'isNotPartOfInitialBlockPool' + +#ifdef MCDBGQ_TRACKMEM + void *owner; +#endif + }; + static_assert(std::alignment_of::value >= std::alignment_of::value, + "Internal error: Blocks must be at least as aligned as the " + "type they are wrapping"); + + +#ifdef MCDBGQ_TRACKMEM + public: + struct MemStats; + + private: +#endif + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase { + ProducerBase(ConcurrentQueue *parent_, bool isExplicit_) + : tailIndex(0), + headIndex(0), + dequeueOptimisticCount(0), + dequeueOvercommit(0), + tailBlock(nullptr), + isExplicit(isExplicit_), + parent(parent_) {} + + virtual ~ProducerBase() {} + + template + inline bool dequeue(U &element) { + if (isExplicit) { + return static_cast(this)->dequeue(element); + } else { + return static_cast(this)->dequeue(element); + } + } + + template + inline size_t dequeue_bulk(It &itemFirst, size_t max) { + if (isExplicit) { + return static_cast(this)->dequeue_bulk(itemFirst, + max); + } else { + return static_cast(this)->dequeue_bulk(itemFirst, + max); + } + } + + inline ProducerBase *next_prod() const { + return static_cast(next); + } + + inline size_t size_approx() const { + auto tail = tailIndex.load(std::memory_order_relaxed); + auto head = headIndex.load(std::memory_order_relaxed); + return details::circular_less_than(head, tail) + ? static_cast(tail - head) + : 0; + } + + inline index_t getTail() const { + return tailIndex.load(std::memory_order_relaxed); + } + + protected: + std::atomic tailIndex; // Where to enqueue to next + std::atomic headIndex; // Where to dequeue from next + + std::atomic dequeueOptimisticCount; + std::atomic dequeueOvercommit; + + Block *tailBlock; + + public: + bool isExplicit; + ConcurrentQueue *parent; + + protected: +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + /////////////////////////// + // Explicit queue + /////////////////////////// + + struct ExplicitProducer : public ProducerBase { + explicit ExplicitProducer(ConcurrentQueue *parent_) + : ProducerBase(parent_, true), + blockIndex(nullptr), + pr_blockIndexSlotsUsed(0), + pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), + pr_blockIndexFront(0), + pr_blockIndexEntries(nullptr), + pr_blockIndexRaw(nullptr) { + size_t poolBasedIndexSize = + details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index(0); // This creates an index with double the number of + // current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer() { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). 
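+ // Added commentary (illustrative numbers only, e.g. BLOCK_SIZE == 32): if
+ // headIndex == 70, then headIndex & (BLOCK_SIZE - 1) == 6, so the head is not
+ // on a block boundary; the block whose base index is 64 is only partially
+ // dequeued, and element destruction inside it starts at offset 6.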
+ if (this->tailBlock != + nullptr) { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block *halfDequeuedBlock = nullptr; + if ((this->headIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)) != 0) { + // The head's not on a block boundary, meaning a block somewhere is + // partially dequeued (or the head block is the tail block and was + // fully dequeued, but the head/tail are still not on a boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & + (pr_blockIndexSize - 1); + while (details::circular_less_than( + pr_blockIndexEntries[i].base + BLOCK_SIZE, + this->headIndex.load(std::memory_order_relaxed))) { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert(details::circular_less_than( + pr_blockIndexEntries[i].base, + this->headIndex.load(std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop gives us the + // head from the tail on the first iteration) + auto block = this->tailBlock; + do { + block = block->next; + if (block->ConcurrentQueue::Block::template is_empty< + explicit_context>()) { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) { + i = static_cast( + this->headIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the tail block, + // we need to stop when we reach the tail index + auto lastValidIndex = + (this->tailIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)) == 0 + ? BLOCK_SIZE + : static_cast( + this->tailIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)); + while (i != BLOCK_SIZE && + (block != this->tailBlock || i != lastValidIndex)) { + (*block)[i++]->~T(); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) { + auto block = this->tailBlock; + do { + auto nextBlock = block->next; + this->parent->add_block_to_free_list(block); + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast(pr_blockIndexRaw); + while (header != nullptr) { + auto prev = static_cast(header->prev); + header->~BlockIndexHeader(); + (Traits::free)(header); + header = prev; + } + } + + template + inline bool enqueue(U &&element) { + index_t currentTailIndex = + this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto startBlock = this->tailBlock; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + if (this->tailBlock != nullptr && + this->tailBlock->next->ConcurrentQueue::Block::template is_empty< + explicit_context>()) { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock->ConcurrentQueue::Block::template reset_empty< + explicit_context>(); + + // We'll put the block on the block index (guaranteed to be room since + // we're conceptually removing the last block from it first -- except + // instead of removing then adding, we can just overwrite). 
Note that + // there must be a valid block index here, since even if allocation + // failed in the ctor, it would have been re-attempted when adding the + // first block to the queue; since there is such a block, a block + // index must have been successfully allocated. + } else { + // Whatever head value we see here is >= the last value we saw here + // (relatively), and <= its current value. Since we have the most + // recent tail, the head must be + // <= to it. + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than( + head, currentTailIndex + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + // We can't enqueue in another block because there's not enough + // leeway -- the tail could surpass the head by the time the block + // fills up! (Or we'll exceed the size limit, if the second part of + // the condition was true.) + return false; + } + // We're going to need a new block; check that the block index has + // room + if (pr_blockIndexRaw == nullptr || + pr_blockIndexSlotsUsed == pr_blockIndexSize) { + // Hmm, the circular block index is already full -- we'll need + // to allocate a new index. Note pr_blockIndexRaw can only be + // nullptr if the initial allocation failed in the constructor. + + MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) { + return false; + } + else if (!new_block_index(pr_blockIndexSlotsUsed)) { + return false; + } + } + + // Insert a new block in the circular linked list + auto newBlock = + this->parent + ->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty< + explicit_context>(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( + T, U, + new (static_cast(nullptr)) T(std::forward(element)))) { + // The constructor may throw. We want the element not to appear in the + // queue in that case (without corrupting the queue): + MOODYCAMEL_TRY { + new ((*this->tailBlock)[currentTailIndex]) + T(std::forward(element)); + } + MOODYCAMEL_CATCH(...) { + // Revert change to the current block, but leave the new block + // available for next time + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = + startBlock == nullptr ? 
this->tailBlock : startBlock; + MOODYCAMEL_RETHROW; + } + } + else { + (void)startBlock; + (void)originalBlockIndexSlotsUsed; + } + + // Add block to block index + auto &entry = blockIndex.load(std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + blockIndex.load(std::memory_order_relaxed) + ->front.store(pr_blockIndexFront, std::memory_order_release); + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + + MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( + T, U, + new (static_cast(nullptr)) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U &element) { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than( + this->dequeueOptimisticCount.load(std::memory_order_relaxed) - + overcommit, + tail)) { + // Might be something to dequeue, let's give it a try + + // Note that this if is purely for performance purposes in the common + // case when the queue is empty and the values are eventually consistent + // -- we may enter here spuriously. + + // Note that whatever the values of overcommit and tail are, they are + // not going to change (unless we change them) and must be the same + // value at this point (inside the if) as when the if condition was + // evaluated. + + // We insert an acquire fence here to synchronize-with the release upon + // incrementing dequeueOvercommit below. This ensures that whatever the + // value we got loaded into overcommit, the load of dequeueOptisticCount + // in the fetch_add below will result in a value at least as recent as + // that (and therefore at least as large). Note that I believe a + // compiler (signal) fence here would be sufficient due to the nature of + // fetch_add (all read-modify-write operations are guaranteed to work on + // the latest value in the modification order), but unfortunately that + // can't be shown to be correct using only the C++11 standard. See + // http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case + std::atomic_thread_fence(std::memory_order_acquire); + + // Increment optimistic counter, then check if it went over the boundary + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( + 1, std::memory_order_relaxed); + + // Note that since dequeueOvercommit must be <= dequeueOptimisticCount + // (because dequeueOvercommit is only ever incremented after + // dequeueOptimisticCount -- this is enforced in the `else` block + // below), and since we now have a version of dequeueOptimisticCount + // that is at least as recent as overcommit (due to the release upon + // incrementing dequeueOvercommit and the acquire above that + // synchronizes with it), overcommit <= myDequeueCount. However, we + // can't assert this since both dequeueOptimisticCount and + // dequeueOvercommit may (independently) overflow; in such a case, + // though, the logic still holds since the difference between the two is + // maintained. 
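+ // Added commentary (illustrative numbers only): suppose tail == 5,
+ // dequeueOptimisticCount == 3 and overcommit == 0. Then 3 - 0 < 5, so the
+ // attempt proceeds; the first two racing consumers observe myDequeueCount
+ // values of 3 and 4 and pass the recheck below, while any further consumers
+ // fail it and each add 1 to dequeueOvercommit, keeping
+ // (dequeueOptimisticCount - dequeueOvercommit) in step with the number of
+ // dequeues that actually succeeded.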
+ + // Note that we reload tail here in case it changed; it will be the same + // value as before or greater, since this load is sequenced after + // (happens after) the earlier load above. This is supported by + // read-read coherency (as defined in the standard), explained here: + // http://en.cppreference.com/w/cpp/atomic/memory_order + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than( + myDequeueCount - overcommit, tail))) { + // Guaranteed to be at least one element to dequeue! + + // Get the index. Note that since there's guaranteed to be at least + // one element, this will never exceed tail. We need to do an + // acquire-release fence here since it's possible that whatever + // condition got us to this point was for an earlier enqueued element + // (that we already see the memory effects for), but that by the time + // we increment somebody else has incremented it, and we need to see + // the memory effects for *that* element, which is in such a case is + // necessarily visible on the thread that incremented it in the first + // place with the more current condition (they must have acquired a + // tail that is at least as recent). + auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + + // Determine which block the element is in + + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = + localBlockIndex->front.load(std::memory_order_acquire); + + // We need to be careful here about subtracting and dividing because + // of index wrap-around. When an index wraps, we need to preserve the + // sign of the offset when dividing it by the block size (in order to + // get a correct signed block count offset in all cases): + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto blockBaseIndex = index & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast( + static_cast::type>( + blockBaseIndex - headBase) / + static_cast::type>( + BLOCK_SIZE)); + auto block = localBlockIndex + ->entries[(localBlockIndexHead + offset) & + (localBlockIndex->size - 1)] + .block; + + // Dequeue + auto &el = *((*block)[index]); + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T &&, element = std::move(el))) { + // Make sure the element is still fully dequeued and destroyed even + // if the assignment throws + struct Guard { + Block *block; + index_t index; + + ~Guard() { + (*block)[index]->~T(); + block->ConcurrentQueue::Block::template set_empty< + explicit_context>(index); + } + } guard = {block, index}; + + element = std::move(el); // NOLINT + } else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + block->ConcurrentQueue::Block::template set_empty( + index); + } + + return true; + } else { + // Wasn't anything to dequeue after all; make the effective dequeue + // count eventually consistent + this->dequeueOvercommit.fetch_add( + 1, std::memory_order_release); // Release so that the fetch_add + // on dequeueOptimisticCount is + // guaranteed to happen before + // this write + } + } + + return false; + } + + template + bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) { + // First, we need to make sure we have enough room to enqueue all of the + // elements; this means pre-allocating blocks and putting them in the + // block index (but only if all the allocations succeeded). 
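+ // Added commentary (illustrative numbers only, e.g. BLOCK_SIZE == 32): with
+ // startTailIndex == 40 and count == 100 the elements occupy indices 40..139,
+ // so blockBaseDiff = (139 & ~31) - (39 & ~31) = 128 - 32 = 96, i.e. three
+ // additional blocks (bases 64, 96 and 128) must be secured -- reused from
+ // ahead in the circular list or requisitioned -- before any element is
+ // constructed.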
+ index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + auto originalBlockIndexFront = pr_blockIndexFront; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + + Block *firstAllocatedBlock = nullptr; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = + ((startTailIndex + count - 1) & + ~static_cast(BLOCK_SIZE - 1)) - + ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = + (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { + // Allocate as many blocks as possible from ahead + while (blockBaseDiff > 0 && this->tailBlock != nullptr && + this->tailBlock->next != firstAllocatedBlock && + this->tailBlock->next->ConcurrentQueue::Block::template is_empty< + explicit_context>()) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + this->tailBlock = this->tailBlock->next; + firstAllocatedBlock = firstAllocatedBlock == nullptr + ? this->tailBlock + : firstAllocatedBlock; + + auto &entry = blockIndex.load(std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = + (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Now allocate as many blocks as necessary from the block pool + while (blockBaseDiff > 0) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = + !details::circular_less_than( + head, currentTailIndex + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + if (pr_blockIndexRaw == nullptr || + pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { + MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = + startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + else if (full || !new_block_index(originalBlockIndexSlotsUsed)) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = + startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + + // pr_blockIndexFront is updated inside new_block_index, so we need + // to update our fallback value too (since we keep the new index + // even if we later fail) + originalBlockIndexFront = originalBlockIndexSlotsUsed; + } + + // Insert a new block in the circular linked list + auto newBlock = + this->parent + ->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = + startBlock == nullptr ? 
firstAllocatedBlock : startBlock; + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template set_all_empty< + explicit_context>(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr + ? this->tailBlock + : firstAllocatedBlock; + + ++pr_blockIndexSlotsUsed; + + auto &entry = blockIndex.load(std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = + (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Excellent, all allocations succeeded. Reset each block's emptiness + // before we fill them up, and publish the new block index front + auto block = firstAllocatedBlock; + while (true) { + block->ConcurrentQueue::Block::template reset_empty< + explicit_context>(); + if (block == this->tailBlock) { + break; + } + block = block->next; + } + + MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (static_cast(nullptr)) + T(details::deref_noexcept(itemFirst)))) { + blockIndex.load(std::memory_order_relaxed) + ->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), + std::memory_order_release); + } + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + auto endBlock = this->tailBlock; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || + firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && + firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + index_t stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (static_cast(nullptr)) + T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + // Must use copy constructor even if move constructor is available + // because we may have to revert if there's an exception. + // Sorry about the horrible templated next line, but it was the + // only way to disable moving *at compile time*, which is + // important because a type may only define a (noexcept) move + // constructor, and so calls to the cctor will not compile, even + // if they are in an if branch that will never be executed + new ((*this->tailBlock)[currentTailIndex]) T( + details::nomove_if(nullptr)) T(details::deref_noexcept( + itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH(...) { + // Oh dear, an exception's been thrown -- destroy the elements that + // were enqueued so far and revert the entire bulk operation (we'll + // keep any allocated blocks in our linked list for later, though). + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = + startBlock == nullptr ? 
firstAllocatedBlock : startBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == + 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, + stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + + MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (static_cast(nullptr)) + T(details::deref_noexcept(itemFirst)))) { + if (firstAllocatedBlock != nullptr) + blockIndex.load(std::memory_order_relaxed) + ->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), + std::memory_order_release); + } + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + size_t dequeue_bulk(It &itemFirst, size_t max) { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast( + tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - + overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( + desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = + static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, + std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at + // least actualCount elements, this will never exceed tail. + auto firstIndex = + this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = + localBlockIndex->front.load(std::memory_order_acquire); + + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = + firstIndex & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast( + static_cast::type>( + firstBlockBaseIndex - headBase) / + static_cast::type>( + BLOCK_SIZE)); + auto indexIndex = + (localBlockIndexHead + offset) & (localBlockIndex->size - 1); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do { + auto firstIndexInBlock = index; + index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast(actualCount), endIndex) + ? 
firstIndex + static_cast(actualCount) + : endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T &&, + details::deref_noexcept(itemFirst) = + std::move((*(*block)[index])))) { + while (index != endIndex) { + auto &el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto &el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH(...) { + // It's too late to revert the dequeue, but we can make sure + // that all the dequeued objects are properly destroyed and the + // block index (and empty count) are properly updated before we + // propagate the exception + do { + block = localBlockIndex->entries[indexIndex].block; + while (index != endIndex) { + (*block)[index++]->~T(); + } + block->ConcurrentQueue::Block::template set_many_empty< + explicit_context>( + firstIndexInBlock, + static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + + firstIndexInBlock = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast(actualCount), + endIndex) + ? firstIndex + static_cast(actualCount) + : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + block->ConcurrentQueue::Block::template set_many_empty< + explicit_context>( + firstIndexInBlock, + static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } else { + // Wasn't anything to dequeue after all; make the effective dequeue + // count eventually consistent + this->dequeueOvercommit.fetch_add(desiredCount, + std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry { + index_t base; + Block *block; + }; + + struct BlockIndexHeader { + size_t size; + std::atomic + front; // Current slot (not next, like pr_blockIndexFront) + BlockIndexEntry *entries; + void *prev; + }; + + + bool new_block_index(size_t numberOfFilledSlotsToExpose) { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast((Traits::malloc)( + sizeof(BlockIndexHeader) + std::alignment_of::value - + 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == nullptr) { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return false; + } + + auto newBlockIndexEntries = reinterpret_cast( + details::align_for(newRawPtr + + sizeof(BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) { + auto i = + (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; + do { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store(numberOfFilledSlotsToExpose - 1, + std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = pr_blockIndexRaw; // we link the new block to the old one + // so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + pr_blockIndexRaw = newRawPtr; + blockIndex.store(header, std::memory_order_release); 
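For context, the resize pattern above (double the power-of-two capacity, copy the live entries in logical order, keep the old allocation reachable through prev, then publish with a release store) can be reduced to a small single-producer sketch. Everything here (Ring, Node, the int payload) is invented for illustration and is not part of the patch; the real index stores BlockIndexEntry records and frees the prev chain in the producer's destructor.

#include <cstddef>
#include <vector>

struct Ring {
  struct Node {
    std::size_t capacity;      // always a power of two
    std::vector<int> entries;  // stand-in for the BlockIndexEntry array
    Node *prev;                // older, smaller index kept alive for readers
  };

  Node *head = nullptr;
  std::size_t used = 0;   // number of live entries
  std::size_t front = 0;  // next slot to fill (like pr_blockIndexFront)

  ~Ring() {
    while (head != nullptr) {  // free the whole prev chain, newest first
      Node *older = head->prev;
      delete head;
      head = older;
    }
  }

  void grow(std::size_t initial_capacity = 4) {
    std::size_t cap = head ? head->capacity * 2 : initial_capacity;
    Node *bigger = new Node{cap, std::vector<int>(cap), head};
    if (head != nullptr && used != 0) {
      // Copy the old entries oldest-first, like the loop in the patch that
      // starts walking at (pr_blockIndexFront - pr_blockIndexSlotsUsed).
      std::size_t mask = head->capacity - 1;
      std::size_t i = (front - used) & mask;
      std::size_t j = 0;
      do {
        bigger->entries[j++] = head->entries[i];
        i = (i + 1) & mask;
      } while (i != front);
      front = j;
    } else {
      front = 0;
    }
    head = bigger;  // the patch publishes this with a release store instead
  }

  void push(int value) {
    if (head == nullptr || used == head->capacity) grow();
    head->entries[front] = value;
    front = (front + 1) & (head->capacity - 1);
    ++used;
  }
};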
+ + return true; + } + + private: + std::atomic blockIndex; + + // To be used by producer only -- consumer must use the ones in referenced + // by blockIndex + size_t pr_blockIndexSlotsUsed; + size_t pr_blockIndexSize; + size_t pr_blockIndexFront; // Next slot (not current) + BlockIndexEntry *pr_blockIndexEntries; + void *pr_blockIndexRaw; + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ExplicitProducer *nextExplicitProducer; + + private: +#endif + +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Implicit queue + ////////////////////////////////// + + struct ImplicitProducer : public ProducerBase { + ImplicitProducer(ConcurrentQueue *parent_) + : ProducerBase(parent_, false), + nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), + blockIndex(nullptr) { + new_block_index(); + } + + ~ImplicitProducer() { + // Note that since we're in the destructor we can assume that all + // enqueue/dequeue operations completed already; this means that all + // undequeued elements are placed contiguously across contiguous blocks, + // and that only the first and last remaining blocks can be only partially + // empty (all other remaining blocks must be completely full). + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + // Unregister ourselves for thread termination notification + if (!this->inactive.load(std::memory_order_relaxed)) { + details::ThreadExitNotifier::unsubscribe(&threadExitListener); + } +#endif + + // Destroy all remaining elements! + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto index = this->headIndex.load(std::memory_order_relaxed); + Block *block = nullptr; + assert(index == tail || details::circular_less_than(index, tail)); + bool forceFreeLastBlock = + index != tail; // If we enter the loop, then the last (tail) block + // will not be freed + while (index != tail) { + if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || + block == nullptr) { + if (block != nullptr) { + // Free the old block + this->parent->add_block_to_free_list(block); + } + + block = get_block_index_entry_for_index(index)->value.load( + std::memory_order_relaxed); + } + + ((*block)[index])->~T(); + ++index; + } + // Even if the queue is empty, there's still one block that's not on the + // free list (unless the head index reached the end of it, in which case + // the tail will be poised to create a new block). 
+ if (this->tailBlock != nullptr && + (forceFreeLastBlock || + (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { + this->parent->add_block_to_free_list(this->tailBlock); + } + + // Destroy block index + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + if (localBlockIndex != nullptr) { + for (size_t i = 0; i != localBlockIndex->capacity; ++i) { + localBlockIndex->index[i]->~BlockIndexEntry(); + } + do { + auto prev = localBlockIndex->prev; + localBlockIndex->~BlockIndexHeader(); + (Traits::free)(localBlockIndex); + localBlockIndex = prev; + } while (localBlockIndex != nullptr); + } + } + + template + inline bool enqueue(U &&element) { + index_t currentTailIndex = + this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than( + head, currentTailIndex + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + return false; + } +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Find out where we'll be inserting this block in the block index + BlockIndexEntry *idxEntry; + if (!insert_block_index_entry(idxEntry, currentTailIndex)) { + return false; + } + + // Get ahold of a new block + auto newBlock = + this->parent + ->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock + ->ConcurrentQueue::Block::template reset_empty(); + + MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( + T, U, + new (static_cast(nullptr)) T(std::forward(element)))) { + // May throw, try to insert now before we publish the fact that we + // have this new block + MOODYCAMEL_TRY { + new ((*newBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH(...) 
{ + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(newBlock); + MOODYCAMEL_RETHROW; + } + } + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + this->tailBlock = newBlock; + + MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( + T, U, + new (static_cast(nullptr)) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U &element) { + // See ExplicitProducer::dequeue for rationale and explanation + index_t tail = this->tailIndex.load(std::memory_order_relaxed); + index_t overcommit = + this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than( + this->dequeueOptimisticCount.load(std::memory_order_relaxed) - + overcommit, + tail)) { + std::atomic_thread_fence(std::memory_order_acquire); + + index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add( + 1, std::memory_order_relaxed); + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than( + myDequeueCount - overcommit, tail))) { + index_t index = + this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + // Determine which block the element is in + auto entry = get_block_index_entry_for_index(index); + + // Dequeue + auto block = entry->value.load(std::memory_order_relaxed); + auto &el = *((*block)[index]); + + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T &&, element = std::move(el))) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + // Note: Acquiring the mutex with every dequeue instead of only when + // a block is released is very sub-optimal, but it is, after all, + // purely debug code. + debug::DebugLock lock(producer->mutex); +#endif + struct Guard { + Block *block; + index_t index; + BlockIndexEntry *entry; + ConcurrentQueue *parent; + + ~Guard() { + (*block)[index]->~T(); + if (block->ConcurrentQueue::Block::template set_empty< + implicit_context>(index)) { + entry->value.store(nullptr, std::memory_order_relaxed); + parent->add_block_to_free_list(block); + } + } + } guard = {block, index, entry, this->parent}; + + element = std::move(el); // NOLINT + } else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + + if (block->ConcurrentQueue::Block::template set_empty< + implicit_context>(index)) { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Add the block back into the global free pool (and remove from + // block index) + entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list( + block); // releases the above store + } + } + + return true; + } else { + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); + } + } + + return false; + } + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4706) // assignment within conditional expression +#endif + template + bool enqueue_bulk(It itemFirst, size_t count) { + // First, we need to make sure we have enough room to enqueue all of the + // elements; this means pre-allocating blocks and putting them in the + // block index (but only if all the allocations succeeded). 
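As a side note, the blockBaseDiff arithmetic that follows can be checked with a few concrete numbers. This is an editorial sketch, not part of the patch: kBlockSize and blocks_needed are made-up names, the block size of 32 is just an example, and count is assumed to be at least 1.

#include <cstdint>

constexpr std::uint64_t kBlockSize = 32;  // example value; must be a power of two

constexpr std::uint64_t blocks_needed(std::uint64_t startTailIndex,
                                      std::uint64_t count) {
  // Block base of the last element to be written, minus the block base of
  // the last element already written, in units of whole blocks: that is how
  // many fresh blocks the bulk enqueue has to allocate and index.
  std::uint64_t last_base  = (startTailIndex + count - 1) & ~(kBlockSize - 1);
  std::uint64_t prior_base = (startTailIndex - 1) & ~(kBlockSize - 1);
  return (last_base - prior_base) / kBlockSize;
}

static_assert(blocks_needed(32, 1) == 1, "tail at a block boundary needs a new block");
static_assert(blocks_needed(33, 31) == 0, "a partial block absorbs the whole batch");
static_assert(blocks_needed(33, 32) == 1, "one element spills into the next block");
static_assert(blocks_needed(32, 96) == 3, "three whole blocks from a boundary");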
+ + // Note that the tailBlock we start off with may not be owned by us any + // more; this happens if it was filled up exactly to the top (setting + // tailIndex to the first index of the next block which is not yet + // allocated), then dequeued completely (putting it on the free list) + // before we enqueue again. + + index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + Block *firstAllocatedBlock = nullptr; + auto endBlock = this->tailBlock; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = + ((startTailIndex + count - 1) & + ~static_cast(BLOCK_SIZE - 1)) - + ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = + (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + do { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + // Find out where we'll be inserting this block in the block index + BlockIndexEntry *idxEntry = + nullptr; // initialization here unnecessary but compiler can't + // always tell + Block *newBlock; + bool indexInserted = false; + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = + !details::circular_less_than( + head, currentTailIndex + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + + if (full || + !(indexInserted = insert_block_index_entry( + idxEntry, currentTailIndex)) || + (newBlock = + this->parent->ConcurrentQueue::template requisition_block< + allocMode>()) == nullptr) { + // Index allocation or block allocation failed; revert any other + // allocations and index insertions done so far for this operation + if (indexInserted) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + } + currentTailIndex = + (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; + block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty< + implicit_context>(); + newBlock->next = nullptr; + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + // Store the chain of blocks so that we can undo if later allocations + // fail, and so that we can find the blocks when we do the actual + // enqueueing + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || + firstAllocatedBlock != nullptr) { + assert(this->tailBlock != nullptr); + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + endBlock = newBlock; + firstAllocatedBlock = + firstAllocatedBlock == nullptr ? 
newBlock : firstAllocatedBlock; + } while (blockBaseDiff > 0); + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || + firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && + firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + index_t stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (static_cast(nullptr)) + T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex]) T( + details::nomove_if(nullptr)) T(details::deref_noexcept( + itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH(...) { + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == + 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, + stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + + currentTailIndex = + (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; + block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + auto idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + template + size_t dequeue_bulk(It &itemFirst, size_t max) { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast( + tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - + overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? 
desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( + desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = + static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, + std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at + // least actualCount elements, this will never exceed tail. + auto firstIndex = + this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Iterate the blocks and dequeue + auto index = firstIndex; + BlockIndexHeader *localBlockIndex; + auto indexIndex = + get_block_index_index_for_index(index, localBlockIndex); + do { + auto blockStartIndex = index; + index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast(actualCount), endIndex) + ? firstIndex + static_cast(actualCount) + : endIndex; + + auto entry = localBlockIndex->index[indexIndex]; + auto block = entry->value.load(std::memory_order_relaxed); + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T &&, + details::deref_noexcept(itemFirst) = + std::move((*(*block)[index])))) { + while (index != endIndex) { + auto &el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto &el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH(...) { + do { + entry = localBlockIndex->index[indexIndex]; + block = entry->value.load(std::memory_order_relaxed); + while (index != endIndex) { + (*block)[index++]->~T(); + } + + if (block->ConcurrentQueue::Block::template set_many_empty< + implicit_context>( + blockStartIndex, + static_cast(endIndex - blockStartIndex))) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + entry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(block); + } + indexIndex = + (indexIndex + 1) & (localBlockIndex->capacity - 1); + + blockStartIndex = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast(actualCount), + endIndex) + ? firstIndex + static_cast(actualCount) + : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + if (block->ConcurrentQueue::Block::template set_many_empty< + implicit_context>( + blockStartIndex, + static_cast(endIndex - blockStartIndex))) { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Note that the set_many_empty above did a release, meaning + // that anybody who acquires the block we're about to free can + // use it safely since our writes (and reads!) will have + // happened-before then. 
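The happens-before claim in the comment above is the standard release/acquire pairing; the following standalone sketch (editorial, not part of the patch, with made-up names payload/published) shows the same guarantee with a single flag.

#include <atomic>
#include <cassert>
#include <thread>

int payload = 0;                         // ordinary, non-atomic data
std::atomic<bool> published{false};

int main() {
  std::thread producer([] {
    payload = 42;                        // written before the release store
    published.store(true, std::memory_order_release);
  });
  std::thread consumer([] {
    while (!published.load(std::memory_order_acquire)) {
      // spin until the release store becomes visible
    }
    // The acquire load synchronizes with the release store, so the write to
    // payload happened-before this read -- the same argument made above for
    // reusing a block after set_many_empty's release.
    assert(payload == 42);
  });
  producer.join();
  consumer.join();
  return 0;
}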
+ entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list( + block); // releases the above store + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } else { + this->dequeueOvercommit.fetch_add(desiredCount, + std::memory_order_release); + } + } + + return 0; + } + + private: + // The block size must be > 1, so any number with the low bit set is an + // invalid block base index + static const index_t INVALID_BLOCK_BASE = 1; + + struct BlockIndexEntry { + std::atomic key; + std::atomic value; + }; + + struct BlockIndexHeader { + size_t capacity; + std::atomic tail; + BlockIndexEntry *entries; + BlockIndexEntry **index; + BlockIndexHeader *prev; + }; + + template + inline bool insert_block_index_entry(BlockIndexEntry *&idxEntry, + index_t blockStartIndex) { + auto localBlockIndex = + blockIndex.load(std::memory_order_relaxed); // We're the only writer + // thread, relaxed is OK + if (localBlockIndex == nullptr) { + return false; // this can happen if new_block_index failed in the + // constructor + } + size_t newTail = + (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & + (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || + idxEntry->value.load(std::memory_order_relaxed) == nullptr) { + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + // No room in the old block index, try to allocate another one! + MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) { + return false; + } + else if (!new_block_index()) { + return false; + } + else { + localBlockIndex = blockIndex.load(std::memory_order_relaxed); + newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & + (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + assert(idxEntry->key.load(std::memory_order_relaxed) == + INVALID_BLOCK_BASE); + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + } + + inline void rewind_block_index_tail() { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + localBlockIndex->tail.store( + (localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & + (localBlockIndex->capacity - 1), + std::memory_order_relaxed); + } + + inline BlockIndexEntry *get_block_index_entry_for_index( + index_t index) const { + BlockIndexHeader *localBlockIndex; + auto idx = get_block_index_index_for_index(index, localBlockIndex); + return localBlockIndex->index[idx]; + } + + inline size_t get_block_index_index_for_index( + index_t index, BlockIndexHeader *&localBlockIndex) const { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + index &= ~static_cast(BLOCK_SIZE - 1); + localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto tail = localBlockIndex->tail.load(std::memory_order_acquire); + auto tailBase = + localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); + assert(tailBase != INVALID_BLOCK_BASE); + // Note: Must use division instead of shift because the index may wrap + // around, causing a negative offset, whose negativity we want to preserve + auto offset = static_cast( + static_cast::type>(index - + tailBase) / + static_cast::type>(BLOCK_SIZE)); + size_t idx 
= (tail + offset) & (localBlockIndex->capacity - 1); + assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == + index && + localBlockIndex->index[idx]->value.load( + std::memory_order_relaxed) != nullptr); + return idx; + } + + bool new_block_index() { + auto prev = blockIndex.load(std::memory_order_relaxed); + size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; + auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity; + auto raw = static_cast((Traits::malloc)( + sizeof(BlockIndexHeader) + std::alignment_of::value - + 1 + sizeof(BlockIndexEntry) * entryCount + + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry *) * nextBlockIndexCapacity)); + if (raw == nullptr) { + return false; + } + + auto header = new (raw) BlockIndexHeader; + auto entries = reinterpret_cast( + details::align_for(raw + sizeof(BlockIndexHeader))); + auto index = reinterpret_cast( + details::align_for( + reinterpret_cast(entries) + + sizeof(BlockIndexEntry) * entryCount)); + if (prev != nullptr) { + auto prevTail = prev->tail.load(std::memory_order_relaxed); + auto prevPos = prevTail; + size_t i = 0; + do { + prevPos = (prevPos + 1) & (prev->capacity - 1); + index[i++] = prev->index[prevPos]; + } while (prevPos != prevTail); + assert(i == prevCapacity); + } + for (size_t i = 0; i != entryCount; ++i) { + new (entries + i) BlockIndexEntry; + entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed); + index[prevCapacity + i] = entries + i; + } + header->prev = prev; + header->entries = entries; + header->index = index; + header->capacity = nextBlockIndexCapacity; + header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), + std::memory_order_relaxed); + + blockIndex.store(header, std::memory_order_release); + + nextBlockIndexCapacity <<= 1; + + return true; + } + + private: + size_t nextBlockIndexCapacity; + std::atomic blockIndex; + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + public: + details::ThreadExitListener threadExitListener; + + private: +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ImplicitProducer *nextImplicitProducer; + + private: +#endif + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + mutable debug::DebugMutex mutex; +#endif +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list(size_t blockCount) { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array(blockCount); + if (initialBlockPool == nullptr) { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block *try_get_block_from_initial_pool() { + if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= + initialBlockPoolSize) { + return nullptr; + } + + auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? 
(initialBlockPool + index) : nullptr; + } + + inline void add_block_to_free_list(Block *block) { +#ifdef MCDBGQ_TRACKMEM + block->owner = nullptr; +#endif + if (!Traits::RECYCLE_ALLOCATED_BLOCKS && block->dynamicallyAllocated) { + destroy(block); + } else { + freeList.add(block); + } + } + + inline void add_blocks_to_free_list(Block *block) { + while (block != nullptr) { + auto next = block->next; + add_block_to_free_list(block); + block = next; + } + } + + inline Block *try_get_block_from_free_list() { + return freeList.try_get(); + } + + // Gets a free block from one of the memory pools, or allocates a new one (if + // applicable) + template + Block *requisition_block() { + auto block = try_get_block_from_initial_pool(); + if (block != nullptr) { + return block; + } + + block = try_get_block_from_free_list(); + if (block != nullptr) { + return block; + } + + MOODYCAMEL_CONSTEXPR_IF(canAlloc == CanAlloc) { + return create(); + } + else { + return nullptr; + } + } + + +#ifdef MCDBGQ_TRACKMEM + public: + struct MemStats { + size_t allocatedBlocks; + size_t usedBlocks; + size_t freeBlocks; + size_t ownedBlocksExplicit; + size_t ownedBlocksImplicit; + size_t implicitProducers; + size_t explicitProducers; + size_t elementsEnqueued; + size_t blockClassBytes; + size_t queueClassBytes; + size_t implicitBlockIndexBytes; + size_t explicitBlockIndexBytes; + + friend class ConcurrentQueue; + + private: + static MemStats getFor(ConcurrentQueue *q) { + MemStats stats = {0}; + + stats.elementsEnqueued = q->size_approx(); + + auto block = q->freeList.head_unsafe(); + while (block != nullptr) { + ++stats.allocatedBlocks; + ++stats.freeBlocks; + block = block->freeListNext.load(std::memory_order_relaxed); + } + + for (auto ptr = q->producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) { + bool implicit = dynamic_cast(ptr) != nullptr; + stats.implicitProducers += implicit ? 1 : 0; + stats.explicitProducers += implicit ? 
0 : 1; + + if (implicit) { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ImplicitProducer); + auto head = prod->headIndex.load(std::memory_order_relaxed); + auto tail = prod->tailIndex.load(std::memory_order_relaxed); + auto hash = prod->blockIndex.load(std::memory_order_relaxed); + if (hash != nullptr) { + for (size_t i = 0; i != hash->capacity; ++i) { + if (hash->index[i]->key.load(std::memory_order_relaxed) != + ImplicitProducer::INVALID_BLOCK_BASE && + hash->index[i]->value.load(std::memory_order_relaxed) != + nullptr) { + ++stats.allocatedBlocks; + ++stats.ownedBlocksImplicit; + } + } + stats.implicitBlockIndexBytes += + hash->capacity * + sizeof(typename ImplicitProducer::BlockIndexEntry); + for (; hash != nullptr; hash = hash->prev) { + stats.implicitBlockIndexBytes += + sizeof(typename ImplicitProducer::BlockIndexHeader) + + hash->capacity * + sizeof(typename ImplicitProducer::BlockIndexEntry *); + } + } + for (; details::circular_less_than(head, tail); + head += BLOCK_SIZE) { + // auto block = prod->get_block_index_entry_for_index(head); + ++stats.usedBlocks; + } + } else { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ExplicitProducer); + auto tailBlock = prod->tailBlock; + bool wasNonEmpty = false; + if (tailBlock != nullptr) { + auto block = tailBlock; + do { + ++stats.allocatedBlocks; + if (!block->ConcurrentQueue::Block::template is_empty< + explicit_context>() || + wasNonEmpty) { + ++stats.usedBlocks; + wasNonEmpty = wasNonEmpty || block != tailBlock; + } + ++stats.ownedBlocksExplicit; + block = block->next; + } while (block != tailBlock); + } + auto index = prod->blockIndex.load(std::memory_order_relaxed); + while (index != nullptr) { + stats.explicitBlockIndexBytes += + sizeof(typename ExplicitProducer::BlockIndexHeader) + + index->size * + sizeof(typename ExplicitProducer::BlockIndexEntry); + index = static_cast( + index->prev); + } + } + } + + auto freeOnInitialPool = + q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= + q->initialBlockPoolSize + ? 0 + : q->initialBlockPoolSize - + q->initialBlockPoolIndex.load(std::memory_order_relaxed); + stats.allocatedBlocks += freeOnInitialPool; + stats.freeBlocks += freeOnInitialPool; + + stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; + stats.queueClassBytes += sizeof(ConcurrentQueue); + + return stats; + } + }; + + // For debugging only. Not thread-safe. + MemStats getMemStats() { + return MemStats::getFor(this); + } + + private: + friend struct MemStats; +#endif + + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase *recycle_or_create_producer(bool isExplicit) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + // Try to re-use one first + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->inactive.load(std::memory_order_relaxed) && + ptr->isExplicit == isExplicit) { + bool expected = true; + if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, + std::memory_order_acquire, + std::memory_order_relaxed)) { + // We caught one! It's been marked as activated, the caller can have + // it + return ptr; + } + } + } + + return add_producer( + isExplicit ? 
static_cast(create(this)) + : create(this)); + } + + ProducerBase *add_producer(ProducerBase *producer) { + // Handle failed memory allocation + if (producer == nullptr) { + return nullptr; + } + + producerCount.fetch_add(1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load(std::memory_order_relaxed); + do { + producer->next = prevTail; + } while (!producerListTail.compare_exchange_weak( + prevTail, producer, std::memory_order_release, + std::memory_order_relaxed)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + if (producer->isExplicit) { + auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextExplicitProducer = + prevTailExplicit; + } while (!explicitProducers.compare_exchange_weak( + prevTailExplicit, static_cast(producer), + std::memory_order_release, std::memory_order_relaxed)); + } else { + auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextImplicitProducer = + prevTailImplicit; + } while (!implicitProducers.compare_exchange_weak( + prevTailImplicit, static_cast(producer), + std::memory_order_release, std::memory_order_relaxed)); + } +#endif + + return producer; + } + + void reown_producers() { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! + for (auto ptr = producerListTail.load(std::memory_order_relaxed); + ptr != nullptr; ptr = ptr->next_prod()) { + ptr->parent = this; + } + } + + + ////////////////////////////////// + // Implicit producer hash + ////////////////////////////////// + + struct ImplicitProducerKVP { + std::atomic key; + ImplicitProducer *value; // No need for atomicity since it's only read by + // the thread that sets it in the first place + + ImplicitProducerKVP() : value(nullptr) {} + + ImplicitProducerKVP(ImplicitProducerKVP &&other) MOODYCAMEL_NOEXCEPT { + key.store(other.key.load(std::memory_order_relaxed), + std::memory_order_relaxed); + value = other.value; + } + + inline ImplicitProducerKVP &operator=(ImplicitProducerKVP &&other) + MOODYCAMEL_NOEXCEPT { + swap(other); + return *this; + } + + inline void swap(ImplicitProducerKVP &other) MOODYCAMEL_NOEXCEPT { + if (this != &other) { + details::swap_relaxed(key, other.key); + std::swap(value, other.value); + } + } + }; + + template + friend void moodycamel::swap( + typename ConcurrentQueue::ImplicitProducerKVP &, + typename ConcurrentQueue::ImplicitProducerKVP &) + MOODYCAMEL_NOEXCEPT; + + struct ImplicitProducerHash { + size_t capacity; + ImplicitProducerKVP *entries; + ImplicitProducerHash *prev; + }; + + inline void populate_initial_implicit_producer_hash() { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { + return; + } + else { + implicitProducerHashCount.store(0, std::memory_order_relaxed); + auto hash = &initialImplicitProducerHash; + hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + hash->entries = &initialImplicitProducerHashEntries[0]; + for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { + initialImplicitProducerHashEntries[i].key.store( + details::invalid_thread_id, std::memory_order_relaxed); + } + hash->prev = nullptr; + implicitProducerHash.store(hash, std::memory_order_relaxed); + } + } + + void swap_implicit_producer_hashes(ConcurrentQueue &other) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { + return; + } + else { + // Swap (assumes our implicit 
producer hash is initialized) + initialImplicitProducerHashEntries.swap( + other.initialImplicitProducerHashEntries); + initialImplicitProducerHash.entries = + &initialImplicitProducerHashEntries[0]; + other.initialImplicitProducerHash.entries = + &other.initialImplicitProducerHashEntries[0]; + + details::swap_relaxed(implicitProducerHashCount, + other.implicitProducerHashCount); + + details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); + if (implicitProducerHash.load(std::memory_order_relaxed) == + &other.initialImplicitProducerHash) { + implicitProducerHash.store(&initialImplicitProducerHash, + std::memory_order_relaxed); + } else { + ImplicitProducerHash *hash; + for (hash = implicitProducerHash.load(std::memory_order_relaxed); + hash->prev != &other.initialImplicitProducerHash; + hash = hash->prev) { + continue; + } + hash->prev = &initialImplicitProducerHash; + } + if (other.implicitProducerHash.load(std::memory_order_relaxed) == + &initialImplicitProducerHash) { + other.implicitProducerHash.store(&other.initialImplicitProducerHash, + std::memory_order_relaxed); + } else { + ImplicitProducerHash *hash; + for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); + hash->prev != &initialImplicitProducerHash; hash = hash->prev) { + continue; + } + hash->prev = &other.initialImplicitProducerHash; + } + } + } + + // Only fails (returns nullptr) if memory allocation fails + ImplicitProducer *get_or_add_implicit_producer() { + // Note that since the data is essentially thread-local (key is thread ID), + // there's a reduced need for fences (memory ordering is already consistent + // for any individual thread), except for the current table itself. + + // Start by looking for the thread ID in the current and all previous hash + // tables. If it's not found, it must not be in there yet, since this same + // thread would have added it previously to one of the tables that we + // traversed. + + // Code and algorithm adapted from + // http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + + auto mainHash = implicitProducerHash.load(std::memory_order_acquire); + assert( + mainHash != + nullptr); // silence clang-tidy and MSVC warnings (hash cannot be null) + for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { + // Look for the id in this hash + auto index = hashedId; + while (true) { // Not an infinite loop because at least one slot is free + // in the hash table + index &= hash->capacity - 1u; + + auto probedKey = + hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) { + // Found it! If we had to search several hashes deep, though, we + // should lazily add it to the current main hash table to avoid the + // extended search next time. Note there's guaranteed to be room in + // the current hash table since every subsequent table implicitly + // reserves space for all previous tables (there's only one + // implicitProducerHashCount). 
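For readers unfamiliar with the Preshing-style table referenced above, the slot-claiming core looks roughly like the sketch below. It is editorial and not part of the patch: kCapacity, kEmpty, and find_or_claim are invented names, the table never grows, and key 0 is reserved to mean "free", whereas the real code uses details::invalid_thread_id and chains to larger tables through prev.

#include <atomic>
#include <cstddef>
#include <cstdint>

constexpr std::size_t kCapacity = 64;   // power of two; assumed "large enough"
constexpr std::uint64_t kEmpty = 0;     // reserved key meaning "free slot"

// Static storage, so every key starts zero-initialized, i.e. kEmpty.
std::atomic<std::uint64_t> g_keys[kCapacity];

// Find the slot holding `id`, claiming a free slot for it if necessary.
std::size_t find_or_claim(std::uint64_t id) {
  std::size_t index = static_cast<std::size_t>(id * 0x9E3779B97F4A7C15ull);
  for (;;) {
    index &= kCapacity - 1;             // linear probing with wrap-around
    std::uint64_t probed = g_keys[index].load(std::memory_order_relaxed);
    if (probed == id) {
      return index;                     // already inserted
    }
    if (probed == kEmpty) {
      std::uint64_t expected = kEmpty;
      if (g_keys[index].compare_exchange_strong(expected, id,
                                                std::memory_order_seq_cst,
                                                std::memory_order_relaxed)) {
        return index;                   // we claimed this slot
      }
      if (expected == id) {
        return index;                   // another thread inserted the same id
      }
      // A different id won the race for this slot; keep probing.
    }
    ++index;
  }
}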
+ auto value = hash->entries[index].value; + if (hash != mainHash) { + index = hashedId; + while (true) { + index &= mainHash->capacity - 1u; + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if (mainHash->entries[index].key.compare_exchange_strong( + empty, id, std::memory_order_seq_cst, + std::memory_order_relaxed) || + mainHash->entries[index].key.compare_exchange_strong( + reusable, id, std::memory_order_seq_cst, + std::memory_order_relaxed)) { +#else + if (mainHash->entries[index].key.compare_exchange_strong( + empty, id, std::memory_order_seq_cst, + std::memory_order_relaxed)) { +#endif + mainHash->entries[index].value = value; + break; + } + ++index; + } + } + + return value; + } + if (probedKey == details::invalid_thread_id) { + break; // Not in this hash table + } + ++index; + } + } + + // Insert! + auto newCount = + 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); + while (true) { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + if (newCount >= (mainHash->capacity >> 1) && + !implicitProducerHashResizeInProgress.test_and_set( + std::memory_order_acquire)) { + // We've acquired the resize lock, try to allocate a bigger hash table. + // Note the acquire fence synchronizes with the release fence at the end + // of this block, and hence when we reload implicitProducerHash it must + // be the most recent version (it only gets changed within this locked + // block). + mainHash = implicitProducerHash.load(std::memory_order_acquire); + if (newCount >= (mainHash->capacity >> 1)) { + size_t newCapacity = mainHash->capacity << 1; + while (newCount >= (newCapacity >> 1)) { + newCapacity <<= 1; + } + auto raw = static_cast( + (Traits::malloc)(sizeof(ImplicitProducerHash) + + std::alignment_of::value - + 1 + sizeof(ImplicitProducerKVP) * newCapacity)); + if (raw == nullptr) { + // Allocation failed + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + implicitProducerHashResizeInProgress.clear( + std::memory_order_relaxed); + return nullptr; + } + + auto newHash = new (raw) ImplicitProducerHash; + newHash->capacity = static_cast(newCapacity); + newHash->entries = reinterpret_cast( + details::align_for( + raw + sizeof(ImplicitProducerHash))); + for (size_t i = 0; i != newCapacity; ++i) { + new (newHash->entries + i) ImplicitProducerKVP; + newHash->entries[i].key.store(details::invalid_thread_id, + std::memory_order_relaxed); + } + newHash->prev = mainHash; + implicitProducerHash.store(newHash, std::memory_order_release); + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + mainHash = newHash; + } else { + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + } + } + + // If it's < three-quarters full, add to the old one anyway so that we + // don't have to wait for the next table to finish being allocated by + // another thread (and if we just finished allocating above, the condition + // will always be true) + if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { + auto producer = + static_cast(recycle_or_create_producer(false)); + if (producer == nullptr) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + return nullptr; + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + producer->threadExitListener.callback = + &ConcurrentQueue::implicit_producer_thread_exited_callback; + producer->threadExitListener.userData = producer; + 
details::ThreadExitNotifier::subscribe(&producer->threadExitListener); +#endif + + auto index = hashedId; + while (true) { + index &= mainHash->capacity - 1u; + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if (mainHash->entries[index].key.compare_exchange_strong( + reusable, id, std::memory_order_seq_cst, + std::memory_order_relaxed)) { + implicitProducerHashCount.fetch_sub( + 1, + std::memory_order_relaxed); // already counted as a used slot + mainHash->entries[index].value = producer; + break; + } +#endif + if (mainHash->entries[index].key.compare_exchange_strong( + empty, id, std::memory_order_seq_cst, + std::memory_order_relaxed)) { + mainHash->entries[index].value = producer; + break; + } + ++index; + } + return producer; + } + + // Hmm, the old hash is quite full and somebody else is busy allocating a + // new one. We need to wait for the allocating thread to finish (if it + // succeeds, we add, if not, we try to allocate ourselves). + mainHash = implicitProducerHash.load(std::memory_order_acquire); + } + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + void implicit_producer_thread_exited(ImplicitProducer *producer) { + // Remove from hash +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + auto hash = implicitProducerHash.load(std::memory_order_acquire); + assert(hash != nullptr); // The thread exit listener is only registered if + // we were added to a hash in the first place + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + details::thread_id_t probedKey; + + // We need to traverse all the hashes just in case other threads aren't on + // the current one yet and are trying to add an entry thinking there's a + // free slot (because they reused a producer) + for (; hash != nullptr; hash = hash->prev) { + auto index = hashedId; + do { + index &= hash->capacity - 1u; + probedKey = id; + if (hash->entries[index].key.compare_exchange_strong( + probedKey, details::invalid_thread_id2, + std::memory_order_seq_cst, std::memory_order_relaxed)) { + break; + } + ++index; + } while ( + probedKey != + details::invalid_thread_id); // Can happen if the hash has changed + // but we weren't put back in it yet, or + // if we weren't added to this hash in + // the first place + } + + // Mark the queue as being recyclable + producer->inactive.store(true, std::memory_order_release); + } + + static void implicit_producer_thread_exited_callback(void *userData) { + auto producer = static_cast(userData); + auto queue = producer->parent; + queue->implicit_producer_thread_exited(producer); + } +#endif + + ////////////////////////////////// + // Utility functions + ////////////////////////////////// + + template + static inline void *aligned_malloc(size_t size) { + MOODYCAMEL_CONSTEXPR_IF(std::alignment_of::value <= + std::alignment_of::value) + return (Traits::malloc)(size); + else { + size_t alignment = std::alignment_of::value; + void *raw = (Traits::malloc)(size + alignment - 1 + sizeof(void *)); + if (!raw) return nullptr; + char *ptr = details::align_for(reinterpret_cast(raw) + + sizeof(void *)); + *(reinterpret_cast(ptr) - 1) = raw; + return ptr; + } + } + + template + static inline void aligned_free(void *ptr) { + MOODYCAMEL_CONSTEXPR_IF(std::alignment_of::value <= + std::alignment_of::value) + return (Traits::free)(ptr); + else(Traits::free)(ptr ? 
*(reinterpret_cast(ptr) - 1) : nullptr); + } + + template + static inline U *create_array(size_t count) { + assert(count > 0); + U *p = static_cast(aligned_malloc(sizeof(U) * count)); + if (p == nullptr) return nullptr; + + for (size_t i = 0; i != count; ++i) new (p + i) U(); + return p; + } + + template + static inline void destroy_array(U *p, size_t count) { + if (p != nullptr) { + assert(count > 0); + for (size_t i = count; i != 0;) (p + --i)->~U(); + } + aligned_free(p); + } + + template + static inline U *create() { + void *p = aligned_malloc(sizeof(U)); + return p != nullptr ? new (p) U : nullptr; + } + + template + static inline U *create(A1 &&a1) { + void *p = aligned_malloc(sizeof(U)); + return p != nullptr ? new (p) U(std::forward(a1)) : nullptr; + } + + template + static inline void destroy(U *p) { + if (p != nullptr) p->~U(); + aligned_free(p); + } + + private: + std::atomic producerListTail; + std::atomic producerCount; + + std::atomic initialBlockPoolIndex; + Block *initialBlockPool; + size_t initialBlockPoolSize; + +#ifndef MCDBGQ_USEDEBUGFREELIST + FreeList freeList; +#else + debug::DebugFreeList freeList; +#endif + + std::atomic implicitProducerHash; + std::atomic + implicitProducerHashCount; // Number of slots logically used + ImplicitProducerHash initialImplicitProducerHash; + std::array + initialImplicitProducerHashEntries; + std::atomic_flag implicitProducerHashResizeInProgress; + + std::atomic nextExplicitConsumerId; + std::atomic globalExplicitConsumerOffset; + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugMutex implicitProdMutex; +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + std::atomic explicitProducers; + std::atomic implicitProducers; +#endif +}; + + +template +ProducerToken::ProducerToken(ConcurrentQueue &queue) + : producer(queue.recycle_or_create_producer(true)) { + if (producer != nullptr) { + producer->token = this; + } +} + +template +ProducerToken::ProducerToken(BlockingConcurrentQueue &queue) + : producer(reinterpret_cast *>(&queue) + ->recycle_or_create_producer(true)) { + if (producer != nullptr) { + producer->token = this; + } +} + +template +ConsumerToken::ConsumerToken(ConcurrentQueue &queue) + : itemsConsumedFromCurrent(0), + currentProducer(nullptr), + desiredProducer(nullptr) { + initialOffset = + queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = static_cast(-1); +} + +template +ConsumerToken::ConsumerToken(BlockingConcurrentQueue &queue) + : itemsConsumedFromCurrent(0), + currentProducer(nullptr), + desiredProducer(nullptr) { + initialOffset = + reinterpret_cast *>(&queue) + ->nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = static_cast(-1); +} + +template +inline void swap(ConcurrentQueue &a, + ConcurrentQueue &b) MOODYCAMEL_NOEXCEPT { + a.swap(b); +} + +inline void swap(ProducerToken &a, ProducerToken &b) MOODYCAMEL_NOEXCEPT { + a.swap(b); +} + +inline void swap(ConsumerToken &a, ConsumerToken &b) MOODYCAMEL_NOEXCEPT { + a.swap(b); +} + +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP &a, + typename ConcurrentQueue::ImplicitProducerKVP &b) + MOODYCAMEL_NOEXCEPT { + a.swap(b); +} + +} // namespace moodycamel + +#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17) +#pragma warning(pop) +#endif + +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) +#pragma GCC diagnostic pop +#endif \ No newline at end of file diff --git a/src/include/zvec/core/framework/index_storage.h 
b/src/include/zvec/core/framework/index_storage.h index 8673d63e..9173da3e 100644 --- a/src/include/zvec/core/framework/index_storage.h +++ b/src/include/zvec/core/framework/index_storage.h @@ -14,7 +14,7 @@ #pragma once -#include +#include #include #include #include @@ -37,10 +37,12 @@ class IndexStorage : public IndexModule { }; MemoryBlock() {} - MemoryBlock(ailego::BufferHandle::Pointer &&buffer_handle) - : type_(MemoryBlockType::MBT_BUFFERPOOL), - buffer_handle_(std::move(buffer_handle)) { - data_ = buffer_handle_->pin_vector_data(); + MemoryBlock(ailego::VecBufferPoolHandle *buffer_pool_handle, int block_id, + void *data) + : type_(MemoryBlockType::MBT_BUFFERPOOL) { + buffer_pool_handle_ = buffer_pool_handle; + buffer_block_id_ = block_id; + data_ = data; } MemoryBlock(void *data) : type_(MemoryBlockType::MBT_MMAP), data_(data) {} @@ -50,7 +52,8 @@ class IndexStorage : public IndexModule { this->reset(rhs.data_); break; case MemoryBlockType::MBT_BUFFERPOOL: - this->reset(rhs.buffer_handle_); + this->reset(rhs.buffer_pool_handle_, rhs.buffer_block_id_, rhs.data_); + buffer_pool_handle_->acquire_one(buffer_block_id_); break; default: break; @@ -63,7 +66,8 @@ class IndexStorage : public IndexModule { this->reset(std::move(rhs.data_)); break; case MemoryBlockType::MBT_BUFFERPOOL: - this->reset(std::move(rhs.buffer_handle_)); + this->reset(std::move(rhs.buffer_pool_handle_), + std::move(rhs.buffer_block_id_), std::move(rhs.data_)); break; default: break; @@ -77,7 +81,9 @@ class IndexStorage : public IndexModule { this->reset(rhs.data_); break; case MemoryBlockType::MBT_BUFFERPOOL: - this->reset(rhs.buffer_handle_); + this->reset(rhs.buffer_pool_handle_, rhs.buffer_block_id_, + rhs.data_); + buffer_pool_handle_->acquire_one(buffer_block_id_); break; default: break; @@ -93,7 +99,8 @@ class IndexStorage : public IndexModule { this->reset(std::move(rhs.data_)); break; case MemoryBlockType::MBT_BUFFERPOOL: - this->reset(std::move(rhs.buffer_handle_)); + this->reset(std::move(rhs.buffer_pool_handle_), + std::move(rhs.buffer_block_id_), std::move(rhs.data_)); break; default: break; @@ -107,9 +114,8 @@ class IndexStorage : public IndexModule { case MemoryBlockType::MBT_MMAP: break; case MemoryBlockType::MBT_BUFFERPOOL: - if (buffer_handle_) { - buffer_handle_->unpin_vector_data(); - // buffer_handle_.reset(); + if (buffer_pool_handle_) { + buffer_pool_handle_->release_one(buffer_block_id_); } break; default: @@ -122,34 +128,21 @@ class IndexStorage : public IndexModule { return data_; } - void reset(ailego::BufferHandle::Pointer &buffer_handle) { + void reset(ailego::VecBufferPoolHandle *buffer_pool_handle, int block_id, + void *data) { if (type_ == MemoryBlockType::MBT_BUFFERPOOL) { - buffer_handle_->unpin_vector_data(); - buffer_handle_.reset(); + buffer_pool_handle_->release_one(buffer_block_id_); } type_ = MemoryBlockType::MBT_BUFFERPOOL; - if (buffer_handle) { - buffer_handle_.reset(buffer_handle.release()); - } - data_ = buffer_handle_->pin_vector_data(); - } - - void reset(ailego::BufferHandle::Pointer &&buffer_handle) { - if (type_ == MemoryBlockType::MBT_BUFFERPOOL) { - buffer_handle_->unpin_vector_data(); - buffer_handle_.reset(); - } - type_ = MemoryBlockType::MBT_BUFFERPOOL; - if (buffer_handle) { - buffer_handle_ = std::move(buffer_handle); - } - data_ = buffer_handle_->pin_vector_data(); + buffer_pool_handle_ = buffer_pool_handle; + buffer_block_id_ = block_id; + data_ = data; } void reset(void *data) { if (type_ == MemoryBlockType::MBT_BUFFERPOOL) { - 
buffer_handle_->unpin_vector_data(); - buffer_handle_.reset(); + buffer_pool_handle_->release_one(buffer_block_id_); + buffer_pool_handle_ = nullptr; } type_ = MemoryBlockType::MBT_MMAP; data_ = data; @@ -157,7 +150,8 @@ class IndexStorage : public IndexModule { MemoryBlockType type_{MBT_UNKNOWN}; void *data_{nullptr}; - mutable ailego::BufferHandle::Pointer buffer_handle_{nullptr}; + mutable ailego::VecBufferPoolHandle *buffer_pool_handle_; + int buffer_block_id_{0}; }; struct SegmentData { diff --git a/tests/core/algorithm/flat/flat_streamer_buffer_test.cpp b/tests/core/algorithm/flat/flat_streamer_buffer_test.cc similarity index 78% rename from tests/core/algorithm/flat/flat_streamer_buffer_test.cpp rename to tests/core/algorithm/flat/flat_streamer_buffer_test.cc index 62b25e23..fbc404b4 100644 --- a/tests/core/algorithm/flat/flat_streamer_buffer_test.cpp +++ b/tests/core/algorithm/flat/flat_streamer_buffer_test.cc @@ -50,7 +50,6 @@ void FlatStreamerTest::TearDown(void) { } TEST_F(FlatStreamerTest, TestLinearSearch) { - BufferManager::Instance().init(300 * 1024 / 2 * 1024, 1); IndexStreamer::Pointer write_streamer = IndexFactory::CreateStreamer("FlatStreamer"); ASSERT_TRUE(write_streamer != nullptr); @@ -165,31 +164,33 @@ TEST_F(FlatStreamerTest, TestLinearSearch) { ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key()); } + cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl; read_streamer->close(); read_streamer.reset(); - cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl; } -TEST_F(FlatStreamerTest, TestLinearSearchMMap) { - BufferManager::Instance().init(3 * 1024 / 2 * 1024, 1); +TEST_F(FlatStreamerTest, TestLinearSearchWithLRU) { + constexpr size_t static dim = 1600; IndexStreamer::Pointer write_streamer = IndexFactory::CreateStreamer("FlatStreamer"); ASSERT_TRUE(write_streamer != nullptr); Params params; - ASSERT_EQ(0, write_streamer->init(*index_meta_ptr_, params)); + IndexMeta meta = IndexMeta(IndexMeta::DataType::DT_FP32, dim); + meta.set_metric("SquaredEuclidean", 0, Params()); + ASSERT_EQ(0, write_streamer->init(meta, params)); auto storage = IndexFactory::CreateStorage("MMapFileStorage"); ASSERT_NE(nullptr, storage); Params stg_params; ASSERT_EQ(0, storage->init(stg_params)); - ASSERT_EQ(0, storage->open(dir_ + "/Test/LinearSearchMMap", true)); + ASSERT_EQ(0, storage->open(dir_ + "/Test/LinearSearchWithLRU", true)); ASSERT_EQ(0, write_streamer->open(storage)); auto ctx = write_streamer->create_context(); ASSERT_TRUE(!!ctx); - size_t cnt = 10000UL; + size_t cnt = 1000000UL; IndexQueryMeta qmeta(IndexMeta::DT_FP32, dim); for (size_t i = 0; i < cnt; i++) { NumericalVector vec(dim); @@ -202,18 +203,19 @@ TEST_F(FlatStreamerTest, TestLinearSearchMMap) { write_streamer->close(); write_streamer.reset(); - ElapsedTime elapsed_time; + IndexStreamer::Pointer read_streamer = IndexFactory::CreateStreamer("FlatStreamer"); - ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params)); - auto read_storage = IndexFactory::CreateStorage("MMapFileStorage"); + ASSERT_EQ(0, read_streamer->init(meta, params)); + auto read_storage = IndexFactory::CreateStorage("BufferStorage"); ASSERT_NE(nullptr, read_storage); ASSERT_EQ(0, read_storage->init(stg_params)); - ASSERT_EQ(0, read_storage->open(dir_ + "/Test/LinearSearchMMap", false)); + ASSERT_EQ(0, read_storage->open(dir_ + "/Test/LinearSearchWithLRU", false)); ASSERT_EQ(0, read_streamer->open(read_storage)); size_t topk = 3; 
auto provider = read_streamer->create_provider(); - for (size_t i = 0; i < cnt; i += 1) { + ElapsedTime elapsed_time; + for (size_t i = 0; i < 10; i += 1) { NumericalVector vec(dim); for (size_t j = 0; j < dim; ++j) { vec[j] = i; @@ -241,122 +243,132 @@ TEST_F(FlatStreamerTest, TestLinearSearchMMap) { ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key()); } - - ctx->set_topk(100U); - NumericalVector vec(dim); - for (size_t j = 0; j < dim; ++j) { - vec[j] = 10.1f; - } - ASSERT_EQ(0, read_streamer->search_bf_impl(vec.data(), qmeta, ctx)); - auto &result = ctx->result(); - ASSERT_EQ(100U, result.size()); - ASSERT_EQ(10, result[0].key()); - ASSERT_EQ(11, result[1].key()); - ASSERT_EQ(5, result[10].key()); - ASSERT_EQ(0, result[20].key()); - ASSERT_EQ(30, result[30].key()); - ASSERT_EQ(35, result[35].key()); - ASSERT_EQ(99, result[99].key()); + cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl; read_streamer->close(); read_streamer.reset(); - cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl; } -TEST_F(FlatStreamerTest, TestBufferStorage) { - BufferManager::Instance().init(10 * 1024 * 1024, 1); - IndexStreamer::Pointer streamer = +TEST_F(FlatStreamerTest, TestLinearSearchMMap) { + IndexStreamer::Pointer write_streamer = IndexFactory::CreateStreamer("FlatStreamer"); - ASSERT_TRUE(streamer != nullptr); - const int dim = 16; - IndexMeta meta = IndexMeta(IndexMeta::DT_FP32, dim); - meta.set_metric("SquaredEuclidean", 0, Params()); + ASSERT_TRUE(write_streamer != nullptr); Params params; - EXPECT_EQ(0, streamer->init(meta, params)); + ASSERT_EQ(0, write_streamer->init(*index_meta_ptr_, params)); auto storage = IndexFactory::CreateStorage("MMapFileStorage"); ASSERT_NE(nullptr, storage); Params stg_params; - EXPECT_EQ(0, storage->init(stg_params)); - EXPECT_EQ(0, storage->open(dir_ + "/Test/LinearSearch", true)); - EXPECT_EQ(0, streamer->open(storage)); + ASSERT_EQ(0, storage->init(stg_params)); + ASSERT_EQ(0, storage->open(dir_ + "/Test/LinearSearchMMap", true)); + ASSERT_EQ(0, write_streamer->open(storage)); - auto ctx = streamer->create_context(); + auto ctx = write_streamer->create_context(); ASSERT_TRUE(!!ctx); - size_t cnt = 1000UL; + size_t cnt = 10000UL; IndexQueryMeta qmeta(IndexMeta::DT_FP32, dim); for (size_t i = 0; i < cnt; i++) { NumericalVector vec(dim); for (size_t j = 0; j < dim; ++j) { vec[j] = i; } - streamer->add_impl(i, vec.data(), qmeta, ctx); + write_streamer->add_impl(i, vec.data(), qmeta, ctx); } - streamer->flush(0UL); - streamer.reset(); + write_streamer->flush(0UL); + write_streamer->close(); + write_streamer.reset(); IndexStreamer::Pointer read_streamer = IndexFactory::CreateStreamer("FlatStreamer"); - ASSERT_TRUE(read_streamer != nullptr); - EXPECT_EQ(0, read_streamer->init(meta, params)); - auto read_storage = IndexFactory::CreateStorage("BufferStorage"); + ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params)); + auto read_storage = IndexFactory::CreateStorage("MMapFileStorage"); ASSERT_NE(nullptr, read_storage); - EXPECT_EQ(0, read_storage->init(stg_params)); - EXPECT_EQ(0, read_storage->open(dir_ + "/Test/LinearSearch", false)); - EXPECT_EQ(0, read_streamer->open(read_storage)); - auto read_ctx = read_streamer->create_context(); - auto provider = read_streamer->create_provider(); - + ASSERT_EQ(0, read_storage->init(stg_params)); + ASSERT_EQ(0, read_storage->open(dir_ + "/Test/LinearSearchMMap", false)); + ASSERT_EQ(0, 
read_streamer->open(read_storage)); size_t topk = 3; + auto provider = read_streamer->create_provider(); for (size_t i = 0; i < cnt; i += 1) { NumericalVector vec(dim); for (size_t j = 0; j < dim; ++j) { vec[j] = i; } - read_ctx->set_topk(topk); - EXPECT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, read_ctx)); - auto &result1 = read_ctx->result(); - EXPECT_EQ(topk, result1.size()); + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result1 = ctx->result(); + ASSERT_EQ(topk, result1.size()); + IndexStorage::MemoryBlock block; + ASSERT_EQ(0, provider->get_vector(result1[0].key(), block)); + const float *data = (float *)block.data(); for (size_t j = 0; j < dim; ++j) { - const float *data = (float *)provider->get_vector(result1[0].key()); - EXPECT_EQ(data[j], i); + ASSERT_EQ(data[j], i); } - EXPECT_EQ(i, result1[0].key()); + ASSERT_EQ(i, result1[0].key()); for (size_t j = 0; j < dim; ++j) { vec[j] = i + 0.1f; } - read_ctx->set_topk(topk); - EXPECT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, read_ctx)); - auto &result2 = read_ctx->result(); - EXPECT_EQ(topk, result2.size()); - EXPECT_EQ(i, result2[0].key()); - EXPECT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); - EXPECT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key()); + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result2 = ctx->result(); + ASSERT_EQ(topk, result2.size()); + ASSERT_EQ(i, result2[0].key()); + ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key()); } - read_ctx->set_topk(100U); + ctx->set_topk(100U); NumericalVector vec(dim); for (size_t j = 0; j < dim; ++j) { vec[j] = 10.1f; } - EXPECT_EQ(0, read_streamer->search_bf_impl(vec.data(), qmeta, read_ctx)); - auto &result = read_ctx->result(); - EXPECT_EQ(100U, result.size()); - EXPECT_EQ(10, result[0].key()); - EXPECT_EQ(11, result[1].key()); - EXPECT_EQ(5, result[10].key()); - EXPECT_EQ(0, result[20].key()); - EXPECT_EQ(30, result[30].key()); - EXPECT_EQ(35, result[35].key()); - EXPECT_EQ(99, result[99].key()); - - read_streamer->flush(0UL); + ASSERT_EQ(0, read_streamer->search_bf_impl(vec.data(), qmeta, ctx)); + auto &result = ctx->result(); + ASSERT_EQ(100U, result.size()); + ASSERT_EQ(10, result[0].key()); + ASSERT_EQ(11, result[1].key()); + ASSERT_EQ(5, result[10].key()); + ASSERT_EQ(0, result[20].key()); + ASSERT_EQ(30, result[30].key()); + ASSERT_EQ(35, result[35].key()); + ASSERT_EQ(99, result[99].key()); + + ElapsedTime elapsed_time; + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result1 = ctx->result(); + ASSERT_EQ(topk, result1.size()); + IndexStorage::MemoryBlock block; + ASSERT_EQ(0, provider->get_vector(result1[0].key(), block)); + const float *data = (float *)block.data(); + for (size_t j = 0; j < dim; ++j) { + ASSERT_EQ(data[j], i); + } + ASSERT_EQ(i, result1[0].key()); + + for (size_t j = 0; j < dim; ++j) { + vec[j] = i + 0.1f; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result2 = ctx->result(); + ASSERT_EQ(topk, result2.size()); + ASSERT_EQ(i, result2[0].key()); + ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? 
i - 2 : i - 1), result2[2].key()); + } + + read_streamer->close(); read_streamer.reset(); + cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl; } - #if defined(__GNUC__) || defined(__GNUG__) #pragma GCC diagnostic pop #endif \ No newline at end of file diff --git a/tests/core/algorithm/flat/flat_streamer_buffer_time_test.cc b/tests/core/algorithm/flat/flat_streamer_buffer_time_test.cc new file mode 100644 index 00000000..435ecccc --- /dev/null +++ b/tests/core/algorithm/flat/flat_streamer_buffer_time_test.cc @@ -0,0 +1,235 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace zvec::core; +using namespace zvec::ailego; +using namespace std; + +#if defined(__GNUC__) || defined(__GNUG__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-result" +#endif + +constexpr size_t static dim = 128; + +class FlatStreamerTest : public testing::Test { + protected: + void SetUp(void); + void TearDown(void); + void hybrid_scale(std::vector &dense_value, + std::vector &sparse_value, float alpha_scale); + + static std::string dir_; + static std::shared_ptr index_meta_ptr_; +}; + +std::string FlatStreamerTest::dir_("streamer_test/"); +std::shared_ptr FlatStreamerTest::index_meta_ptr_; + +void FlatStreamerTest::SetUp(void) { + index_meta_ptr_.reset(new (std::nothrow) + IndexMeta(IndexMeta::DataType::DT_FP32, dim)); + index_meta_ptr_->set_metric("SquaredEuclidean", 0, Params()); + + char cmdBuf[100]; + snprintf(cmdBuf, 100, "rm -rf %s", dir_.c_str()); + system(cmdBuf); +} + +void FlatStreamerTest::TearDown(void) { + char cmdBuf[100]; + snprintf(cmdBuf, 100, "rm -rf %s", dir_.c_str()); + system(cmdBuf); +} + +TEST_F(FlatStreamerTest, TestLinearSearchMMap) { + BufferManager::Instance().init(50 * 1024 * 1024, 1); + IndexStreamer::Pointer write_streamer = + IndexFactory::CreateStreamer("FlatStreamer"); + ASSERT_TRUE(write_streamer != nullptr); + + Params params; + ASSERT_EQ(0, write_streamer->init(*index_meta_ptr_, params)); + auto storage = IndexFactory::CreateStorage("MMapFileStorage"); + ASSERT_NE(nullptr, storage); + Params stg_params; + ASSERT_EQ(0, storage->init(stg_params)); + ASSERT_EQ(0, storage->open(dir_ + "/Test/LinearSearchMMap", true)); + ASSERT_EQ(0, write_streamer->open(storage)); + + auto ctx = write_streamer->create_context(); + ASSERT_TRUE(!!ctx); + + size_t data_cnt = 300000UL, cnt = 500UL; + IndexQueryMeta qmeta(IndexMeta::DT_FP32, dim); + for (size_t i = 0; i < data_cnt; i++) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + write_streamer->add_impl(i, vec.data(), qmeta, ctx); + } + write_streamer->flush(0UL); + write_streamer->close(); + write_streamer.reset(); + + IndexStreamer::Pointer read_streamer = + IndexFactory::CreateStreamer("FlatStreamer"); + ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params)); + auto read_storage = IndexFactory::CreateStorage("MMapFileStorage"); + ASSERT_NE(nullptr, read_storage); + ASSERT_EQ(0, read_storage->init(stg_params)); + ASSERT_EQ(0, read_storage->open(dir_ + "/Test/LinearSearchMMap", false)); + ASSERT_EQ(0, read_streamer->open(read_storage)); + size_t topk = 30; + ElapsedTime elapsed_time; + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + // auto &result1 = ctx->result(); + // ASSERT_EQ(topk, result1.size()); + // ASSERT_EQ(i, result1[0].key()); + 
+ // for (size_t j = 0; j < dim; ++j) { + // vec[j] = i + 0.1f; + // } + // ctx->set_topk(topk); + // ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + // auto &result2 = ctx->result(); + // ASSERT_EQ(topk, result2.size()); + // ASSERT_EQ(i, result2[0].key()); + // ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + // ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key()); + } + cout << "Elapsed time: " << elapsed_time.micro_seconds() << " us" << endl; + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + // auto &result1 = ctx->result(); + // ASSERT_EQ(topk, result1.size()); + // ASSERT_EQ(i, result1[0].key()); + + // for (size_t j = 0; j < dim; ++j) { + // vec[j] = i + 0.1f; + // } + // ctx->set_topk(topk); + // ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + // auto &result2 = ctx->result(); + // ASSERT_EQ(topk, result2.size()); + // ASSERT_EQ(i, result2[0].key()); + // ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + // ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key()); + } + cout << "Elapsed time: " << elapsed_time.micro_seconds() << " us" << endl; + read_streamer->close(); + read_streamer.reset(); +} + +TEST_F(FlatStreamerTest, TestLinearSearchBuffer) { + IndexStreamer::Pointer write_streamer = + IndexFactory::CreateStreamer("FlatStreamer"); + ASSERT_TRUE(write_streamer != nullptr); + + Params params; + ASSERT_EQ(0, write_streamer->init(*index_meta_ptr_, params)); + auto storage = IndexFactory::CreateStorage("MMapFileStorage"); + ASSERT_NE(nullptr, storage); + Params stg_params; + ASSERT_EQ(0, storage->init(stg_params)); + ASSERT_EQ(0, storage->open(dir_ + "/Test/LinearSearchBuffer", true)); + ASSERT_EQ(0, write_streamer->open(storage)); + + auto ctx = write_streamer->create_context(); + ASSERT_TRUE(!!ctx); + + size_t data_cnt = 300000UL, cnt = 500UL; + IndexQueryMeta qmeta(IndexMeta::DT_FP32, dim); + for (size_t i = 0; i < data_cnt; i++) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + write_streamer->add_impl(i, vec.data(), qmeta, ctx); + } + write_streamer->flush(0UL); + write_streamer->close(); + write_streamer.reset(); + + IndexStreamer::Pointer read_streamer = + IndexFactory::CreateStreamer("FlatStreamer"); + ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params)); + auto read_storage = IndexFactory::CreateStorage("BufferStorage"); + ASSERT_NE(nullptr, read_storage); + ASSERT_EQ(0, read_storage->init(stg_params)); + ASSERT_EQ(0, read_storage->open(dir_ + "/Test/LinearSearchBuffer", false)); + ASSERT_EQ(0, read_streamer->open(read_storage)); + size_t topk = 30; + ElapsedTime elapsed_time; + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + // auto &result1 = ctx->result(); + // ASSERT_EQ(topk, result1.size()); + // ASSERT_EQ(i, result1[0].key()); + + // for (size_t j = 0; j < dim; ++j) { + // vec[j] = i + 0.1f; + // } + // ctx->set_topk(topk); + // ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + // auto &result2 = ctx->result(); + // ASSERT_EQ(topk, result2.size()); + // ASSERT_EQ(i, result2[0].key()); + // ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + // ASSERT_EQ(i == 0 ? 
2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key()); + } + cout << "Elapsed time: " << elapsed_time.micro_seconds() << " us" << endl; + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + // auto &result1 = ctx->result(); + // ASSERT_EQ(topk, result1.size()); + // ASSERT_EQ(i, result1[0].key()); + + // for (size_t j = 0; j < dim; ++j) { + // vec[j] = i + 0.1f; + // } + // ctx->set_topk(topk); + // ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + // auto &result2 = ctx->result(); + // ASSERT_EQ(topk, result2.size()); + // ASSERT_EQ(i, result2[0].key()); + // ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + // ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key()); + } + cout << "Elapsed time: " << elapsed_time.micro_seconds() << " us" << endl; + read_streamer->close(); + read_streamer.reset(); +} + +#if defined(__GNUC__) || defined(__GNUG__) +#pragma GCC diagnostic pop +#endif \ No newline at end of file diff --git a/tests/core/algorithm/flat/flat_streamer_buffer_time_test.cpp b/tests/core/algorithm/flat/flat_streamer_buffer_time_test.cpp deleted file mode 100644 index c919e9fe..00000000 --- a/tests/core/algorithm/flat/flat_streamer_buffer_time_test.cpp +++ /dev/null @@ -1,140 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace zvec::core; -using namespace zvec::ailego; -using namespace std; - -#if defined(__GNUC__) || defined(__GNUG__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-result" -#endif - -constexpr size_t static dim = 128; - -class FlatStreamerTest : public testing::Test { - protected: - void SetUp(void); - void TearDown(void); - void hybrid_scale(std::vector &dense_value, - std::vector &sparse_value, float alpha_scale); - - static std::string dir_; - static std::shared_ptr index_meta_ptr_; -}; - -std::string FlatStreamerTest::dir_("streamer_test/"); -std::shared_ptr FlatStreamerTest::index_meta_ptr_; - -void FlatStreamerTest::SetUp(void) { - index_meta_ptr_.reset(new (std::nothrow) - IndexMeta(IndexMeta::DataType::DT_FP32, dim)); - index_meta_ptr_->set_metric("SquaredEuclidean", 0, Params()); - - char cmdBuf[100]; - snprintf(cmdBuf, 100, "rm -rf %s", dir_.c_str()); - system(cmdBuf); -} - -void FlatStreamerTest::TearDown(void) { - char cmdBuf[100]; - snprintf(cmdBuf, 100, "rm -rf %s", dir_.c_str()); - system(cmdBuf); -} - -TEST_F(FlatStreamerTest, TestLinearSearchMMap) { - BufferManager::Instance().init(50 * 1024 * 1024, 1); - IndexStreamer::Pointer write_streamer = - IndexFactory::CreateStreamer("FlatStreamer"); - ASSERT_TRUE(write_streamer != nullptr); - - Params params; - ASSERT_EQ(0, write_streamer->init(*index_meta_ptr_, params)); - auto storage = IndexFactory::CreateStorage("MMapFileStorage"); - ASSERT_NE(nullptr, storage); - Params stg_params; - ASSERT_EQ(0, storage->init(stg_params)); - ASSERT_EQ(0, storage->open(dir_ + "/Test/LinearSearchMMap", true)); - ASSERT_EQ(0, write_streamer->open(storage)); - - auto ctx = write_streamer->create_context(); - ASSERT_TRUE(!!ctx); - - size_t data_cnt = 300000UL, cnt = 500UL; - IndexQueryMeta qmeta(IndexMeta::DT_FP32, dim); - for (size_t i = 0; i < data_cnt; i++) { - NumericalVector vec(dim); - for (size_t j = 0; j < dim; ++j) { - vec[j] = i; - } - write_streamer->add_impl(i, vec.data(), qmeta, ctx); - } - write_streamer->flush(0UL); - 
write_streamer->close(); - write_streamer.reset(); - - IndexStreamer::Pointer read_streamer = - IndexFactory::CreateStreamer("FlatStreamer"); - ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params)); - auto read_storage = IndexFactory::CreateStorage("BufferStorage"); - ASSERT_NE(nullptr, read_storage); - ASSERT_EQ(0, read_storage->init(stg_params)); - ASSERT_EQ(0, read_storage->open(dir_ + "/Test/LinearSearchMMap", false)); - ASSERT_EQ(0, read_streamer->open(read_storage)); - size_t topk = 30; - ElapsedTime elapsed_time; - for (size_t i = 0; i < cnt; i += 1) { - NumericalVector vec(dim); - for (size_t j = 0; j < dim; ++j) { - vec[j] = i; - } - ctx->set_topk(topk); - ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); - // auto &result1 = ctx->result(); - // ASSERT_EQ(topk, result1.size()); - // ASSERT_EQ(i, result1[0].key()); - - // for (size_t j = 0; j < dim; ++j) { - // vec[j] = i + 0.1f; - // } - // ctx->set_topk(topk); - // ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); - // auto &result2 = ctx->result(); - // ASSERT_EQ(topk, result2.size()); - // ASSERT_EQ(i, result2[0].key()); - // ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); - // ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key()); - } - cout << "Elapsed time: " << elapsed_time.micro_seconds() << " us" << endl; - - // ctx->set_topk(100U); - // NumericalVector vec(dim); - // for (size_t j = 0; j < dim; ++j) { - // vec[j] = 10.1f; - // } - // ASSERT_EQ(0, read_streamer->search_bf_impl(vec.data(), qmeta, ctx)); - // auto &result = ctx->result(); - // ASSERT_EQ(100U, result.size()); - // ASSERT_EQ(10, result[0].key()); - // ASSERT_EQ(11, result[1].key()); - // ASSERT_EQ(5, result[10].key()); - // ASSERT_EQ(0, result[20].key()); - // ASSERT_EQ(30, result[30].key()); - // ASSERT_EQ(35, result[35].key()); - // ASSERT_EQ(99, result[99].key()); - - read_streamer->close(); - read_streamer.reset(); - // cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl; -} - -#if defined(__GNUC__) || defined(__GNUG__) -#pragma GCC diagnostic pop -#endif \ No newline at end of file diff --git a/tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cpp b/tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cc similarity index 100% rename from tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cpp rename to tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cc diff --git a/tests/db/index/column/vector_column_indexer_test.cc b/tests/db/index/column/vector_column_indexer_test.cc index 483efcde..251e5a18 100644 --- a/tests/db/index/column/vector_column_indexer_test.cc +++ b/tests/db/index/column/vector_column_indexer_test.cc @@ -2160,7 +2160,6 @@ TEST(VectorColumnIndexerTest, Failure) { ASSERT_TRUE(indexer->Flush().ok()); ASSERT_TRUE(indexer->Close().ok()); { - ailego::BufferManager::Instance().init(10 * 1024 * 1024, 1); auto indexer = std::make_shared( index_file_path, FieldSchema("test", DataType::VECTOR_FP32, 3, false,
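
Editorial note on the reworked MemoryBlock ownership model (a sketch added for clarity, not part of the patch): after this change a buffer-pool-backed IndexStorage::MemoryBlock no longer holds an ailego::BufferHandle that pins and unpins vector data. It instead stores a raw ailego::VecBufferPoolHandle pointer plus a block id, calls acquire_one() on the pool when a block is copied, and calls release_one() when it is reset (and, from the remaining hunk, apparently also when the block is torn down). A minimal usage sketch of the caller-side pattern exercised by the updated tests, assuming provider, result1, dim and qmeta are set up as in TestLinearSearchMMap above and that get_vector() returns 0 on success (error handling omitted):

    // Hypothetical caller; mirrors the test's call pattern only.
    IndexStorage::MemoryBlock block;
    if (provider->get_vector(result1[0].key(), block) == 0) {
      // While `block` is alive it holds one reference on the underlying
      // buffer-pool block, so the pointer returned by data() stays valid.
      const float *vec_data = static_cast<const float *>(block.data());
      for (size_t j = 0; j < dim; ++j) {
        // ... consume vec_data[j] ...
      }
    }
    // Resetting or destroying `block` triggers release_one(), after which the
    // pool may evict the block again.

The design trade-off implied by the diff: reference counting moves from a heap-allocated handle object into the pool itself, so copying a MemoryBlock is a cheap acquire_one()/release_one() pair on an existing entry rather than a handle allocation, at the cost of the caller having to keep the pool handle alive for the lifetime of every outstanding block.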