From 160bb77d58163a522efdccd75de5b22088cb70ba Mon Sep 17 00:00:00 2001 From: Open Source Bot Date: Wed, 15 Feb 2023 19:44:16 -0800 Subject: [PATCH 01/47] Updating submodules Summary: GitHub commits: https://github.com/facebook/wangle/commit/43e56205514ee4d77e483f4f52ff27dac3216514 https://github.com/facebookexperimental/edencommon/commit/3934bece13d925ae5dd99bd993caa0258f480fcc https://github.com/facebookincubator/katran/commit/a4c94aa4b0899c5480f803b557fb894c4e0a5ac3 Reviewed By: jurajh-fb fbshipit-source-id: 720310662068e4aa245198d817666700c9dde1d5 --- cachelib/external/fbthrift | 2 +- cachelib/external/fizz | 2 +- cachelib/external/wangle | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cachelib/external/fbthrift b/cachelib/external/fbthrift index 33a9fbc258..1dea977766 160000 --- a/cachelib/external/fbthrift +++ b/cachelib/external/fbthrift @@ -1 +1 @@ -Subproject commit 33a9fbc258f21818f20ea03a55c979014882e84d +Subproject commit 1dea977766b157ff2f5df9922534361b44f9698e diff --git a/cachelib/external/fizz b/cachelib/external/fizz index 9198ca6e7d..2716439e6d 160000 --- a/cachelib/external/fizz +++ b/cachelib/external/fizz @@ -1 +1 @@ -Subproject commit 9198ca6e7daa50fae6b8413d745b4faaf97dfd10 +Subproject commit 2716439e6d0efdeec3c27d75793249bd42accdf0 diff --git a/cachelib/external/wangle b/cachelib/external/wangle index 6bc77c8d46..43e5620551 160000 --- a/cachelib/external/wangle +++ b/cachelib/external/wangle @@ -1 +1 @@ -Subproject commit 6bc77c8d46b5ef68d77e921bb1e3d1e576adb8fe +Subproject commit 43e56205514ee4d77e483f4f52ff27dac3216514 From ba8d6b003a4f5856d88af02b09fac74d7f12ae07 Mon Sep 17 00:00:00 2001 From: Open Source Bot Date: Thu, 16 Feb 2023 13:23:03 -0800 Subject: [PATCH 02/47] Updating submodules Summary: GitHub commits: https://github.com/facebook/fb303/commit/c51a2906c5be9f12223b2314501b7cc3e9f801d9 https://github.com/facebook/fbthrift/commit/5e97023405d6bddc686e836e9a6d1117947cd9dd https://github.com/facebook/proxygen/commit/3710a3267346e43095e1f140e72706538ac22f23 https://github.com/facebook/wangle/commit/fd298004026d8f72fcfc37e6b4ff1749a88ea4d1 https://github.com/facebook/watchman/commit/fca53d2d93d8fc2f256aa99b4ebecc4e9bb5ac1a https://github.com/facebookexperimental/edencommon/commit/3d8a6d905c416ff7b8ae6c70765d34231d747a9d https://github.com/facebookexperimental/rust-shed/commit/75c5d75f7ec67dacf44db3dcf444b7511f038c58 https://github.com/facebookincubator/fizz/commit/bae2d7ebe3b348679e832102b65d47aceb4ac2f8 https://github.com/facebookincubator/katran/commit/139c75dfb61626dea75951ec1097a1373b5a182e https://github.com/facebookincubator/mvfst/commit/001332d5e9c5a8ee47e9c932e640474037832abf https://github.com/facebookincubator/velox/commit/ec0ea1fd5ffcf6dec60d2e53831478cbaf2ca4a0 Reviewed By: jurajh-fb fbshipit-source-id: a8a8ff6a7fddeaa7bbccb3faa6a737c50d22cfe2 --- cachelib/external/fbthrift | 2 +- cachelib/external/fizz | 2 +- cachelib/external/folly | 2 +- cachelib/external/wangle | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cachelib/external/fbthrift b/cachelib/external/fbthrift index 1dea977766..5e97023405 160000 --- a/cachelib/external/fbthrift +++ b/cachelib/external/fbthrift @@ -1 +1 @@ -Subproject commit 1dea977766b157ff2f5df9922534361b44f9698e +Subproject commit 5e97023405d6bddc686e836e9a6d1117947cd9dd diff --git a/cachelib/external/fizz b/cachelib/external/fizz index 2716439e6d..bae2d7ebe3 160000 --- a/cachelib/external/fizz +++ b/cachelib/external/fizz @@ -1 +1 @@ -Subproject commit 
2716439e6d0efdeec3c27d75793249bd42accdf0 +Subproject commit bae2d7ebe3b348679e832102b65d47aceb4ac2f8 diff --git a/cachelib/external/folly b/cachelib/external/folly index 128cfac6ac..9aeeaf4933 160000 --- a/cachelib/external/folly +++ b/cachelib/external/folly @@ -1 +1 @@ -Subproject commit 128cfac6ac3d69825bad2af852fced3f63d87411 +Subproject commit 9aeeaf4933c271559c995df862b52af5bd9645c2 diff --git a/cachelib/external/wangle b/cachelib/external/wangle index 43e5620551..fd29800402 160000 --- a/cachelib/external/wangle +++ b/cachelib/external/wangle @@ -1 +1 @@ -Subproject commit 43e56205514ee4d77e483f4f52ff27dac3216514 +Subproject commit fd298004026d8f72fcfc37e6b4ff1749a88ea4d1 From ba9dd68a1bbae05676d332e9aa4c1a6d6b79348e Mon Sep 17 00:00:00 2001 From: Hao Wu Date: Fri, 17 Feb 2023 10:09:28 -0800 Subject: [PATCH 03/47] Change some of the stats to rate Summary: As mentioned in the tasks, these stats make more sense in rates. Reviewed By: jiayuebao Differential Revision: D43243498 fbshipit-source-id: 556b0b4a062235bb3b34dd076e5f7a5a88316f8c --- cachelib/navy/block_cache/BlockCache.cpp | 9 ++++++--- cachelib/navy/driver/Driver.cpp | 6 ++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/cachelib/navy/block_cache/BlockCache.cpp b/cachelib/navy/block_cache/BlockCache.cpp index f34605b68b..84dadd13e7 100644 --- a/cachelib/navy/block_cache/BlockCache.cpp +++ b/cachelib/navy/block_cache/BlockCache.cpp @@ -723,9 +723,11 @@ void BlockCache::getCounters(const CounterVisitor& visitor) const { reclaimValueChecksumErrorCount_.get(), CounterVisitor::CounterType::RATE); visitor("navy_bc_cleanup_entry_header_checksum_errors", - cleanupEntryHeaderChecksumErrorCount_.get()); + cleanupEntryHeaderChecksumErrorCount_.get(), + CounterVisitor::CounterType::RATE); visitor("navy_bc_cleanup_value_checksum_errors", - cleanupValueChecksumErrorCount_.get()); + cleanupValueChecksumErrorCount_.get(), + CounterVisitor::CounterType::RATE); visitor("navy_bc_succ_lookups", succLookupCount_.get(), CounterVisitor::CounterType::RATE); visitor("navy_bc_removes", removeCount_.get(), @@ -750,7 +752,8 @@ void BlockCache::getCounters(const CounterVisitor& visitor) const { visitor("navy_bc_reinsertion_errors", reinsertionErrorCount_.get(), CounterVisitor::CounterType::RATE); visitor("navy_bc_lookup_for_item_destructor_errors", - lookupForItemDestructorErrorCount_.get()); + lookupForItemDestructorErrorCount_.get(), + CounterVisitor::CounterType::RATE); visitor("navy_bc_remove_attempt_collisions", removeAttemptCollisions_.get(), CounterVisitor::CounterType::RATE); // Allocator visits region manager diff --git a/cachelib/navy/driver/Driver.cpp b/cachelib/navy/driver/Driver.cpp index 1615d1cc48..29215cc161 100644 --- a/cachelib/navy/driver/Driver.cpp +++ b/cachelib/navy/driver/Driver.cpp @@ -273,8 +273,10 @@ void Driver::getCounters(const CounterVisitor& visitor) const { CounterVisitor::CounterType::RATE); visitor("navy_rejected_bytes", rejectedBytes_.get(), CounterVisitor::CounterType::RATE); - visitor("navy_accepted_bytes", acceptedBytes_.get()); - visitor("navy_accepted", acceptedCount_.get()); + visitor("navy_accepted_bytes", acceptedBytes_.get(), + CounterVisitor::CounterType::RATE); + visitor("navy_accepted", acceptedCount_.get(), + CounterVisitor::CounterType::RATE); visitor("navy_parcel_memory", parcelMemory_.get()); visitor("navy_concurrent_inserts", concurrentInserts_.get()); From ad8bfd693f824f904cdcab4f2319aa68a6bf7e4b Mon Sep 17 00:00:00 2001 From: Open Source Bot Date: Fri, 17 Feb 2023 11:40:50 -0800 
Subject: [PATCH 04/47] Updating submodules Summary: GitHub commits: https://github.com/facebook/cachelib/commit/ba9dd68a1bbae05676d332e9aa4c1a6d6b79348e https://github.com/facebook/fbthrift/commit/23208ff909ee751de695e6806f14820137045803 https://github.com/facebookincubator/velox/commit/68b8a45ff019bb20ed204451c29ac34352760840 Reviewed By: jurajh-fb fbshipit-source-id: f8581e13cd1c0a2510d87f42c813ca2c65eaef1d --- cachelib/external/fbthrift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cachelib/external/fbthrift b/cachelib/external/fbthrift index 5e97023405..23208ff909 160000 --- a/cachelib/external/fbthrift +++ b/cachelib/external/fbthrift @@ -1 +1 @@ -Subproject commit 5e97023405d6bddc686e836e9a6d1117947cd9dd +Subproject commit 23208ff909ee751de695e6806f14820137045803 From bdad762aa2bc38e97c9afc45c4edb9186750b1d3 Mon Sep 17 00:00:00 2001 From: Open Source Bot Date: Sat, 18 Feb 2023 11:28:38 -0800 Subject: [PATCH 05/47] Updating submodules Summary: GitHub commits: https://github.com/facebook/fbthrift/commit/3b00d58af9a028db696e2034c54cf9500fda53a7 Reviewed By: jurajh-fb fbshipit-source-id: 23ecb07e7e9ebd4dfbf3980de141ea1ebe36bb12 --- cachelib/external/fbthrift | 2 +- cachelib/external/fizz | 2 +- cachelib/external/folly | 2 +- cachelib/external/wangle | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cachelib/external/fbthrift b/cachelib/external/fbthrift index 23208ff909..3b00d58af9 160000 --- a/cachelib/external/fbthrift +++ b/cachelib/external/fbthrift @@ -1 +1 @@ -Subproject commit 23208ff909ee751de695e6806f14820137045803 +Subproject commit 3b00d58af9a028db696e2034c54cf9500fda53a7 diff --git a/cachelib/external/fizz b/cachelib/external/fizz index bae2d7ebe3..2da27e939d 160000 --- a/cachelib/external/fizz +++ b/cachelib/external/fizz @@ -1 +1 @@ -Subproject commit bae2d7ebe3b348679e832102b65d47aceb4ac2f8 +Subproject commit 2da27e939de1aa4eeecc6bc8d3a32844a75bd42b diff --git a/cachelib/external/folly b/cachelib/external/folly index 9aeeaf4933..3c5efbdff2 160000 --- a/cachelib/external/folly +++ b/cachelib/external/folly @@ -1 +1 @@ -Subproject commit 9aeeaf4933c271559c995df862b52af5bd9645c2 +Subproject commit 3c5efbdff2d01b83fc76827518c14a786f9c28ce diff --git a/cachelib/external/wangle b/cachelib/external/wangle index fd29800402..8733674dc8 160000 --- a/cachelib/external/wangle +++ b/cachelib/external/wangle @@ -1 +1 @@ -Subproject commit fd298004026d8f72fcfc37e6b4ff1749a88ea4d1 +Subproject commit 8733674dc8977b22324b2dfbd956aa6763a9834c From b42327aea0839c726e8579fc9b36165d2d74bd30 Mon Sep 17 00:00:00 2001 From: Open Source Bot Date: Sat, 18 Feb 2023 18:09:16 -0800 Subject: [PATCH 06/47] Updating submodules Summary: GitHub commits: https://github.com/facebook/fbthrift/commit/df0eb612d7cb64ab73ed2f3b42bfe787a6ed4653 https://github.com/facebook/proxygen/commit/7fc3feabba6dca34b03e012d9a981f05e4ea27a3 Reviewed By: jurajh-fb fbshipit-source-id: fa982baf35fd98edc52be391f03d15a9c2c333bf --- cachelib/external/fbthrift | 2 +- cachelib/external/wangle | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cachelib/external/fbthrift b/cachelib/external/fbthrift index 3b00d58af9..df0eb612d7 160000 --- a/cachelib/external/fbthrift +++ b/cachelib/external/fbthrift @@ -1 +1 @@ -Subproject commit 3b00d58af9a028db696e2034c54cf9500fda53a7 +Subproject commit df0eb612d7cb64ab73ed2f3b42bfe787a6ed4653 diff --git a/cachelib/external/wangle b/cachelib/external/wangle index 8733674dc8..cb61ed1759 160000 --- a/cachelib/external/wangle +++ 
b/cachelib/external/wangle @@ -1 +1 @@ -Subproject commit 8733674dc8977b22324b2dfbd956aa6763a9834c +Subproject commit cb61ed1759c692a4c69b85df7cb983ee920e91cf From 0d49b53fc50fbe00b48ef854454b884e11768878 Mon Sep 17 00:00:00 2001 From: Open Source Bot Date: Sun, 19 Feb 2023 20:30:30 -0800 Subject: [PATCH 07/47] Updating submodules Summary: GitHub commits: https://github.com/facebook/fbthrift/commit/1d01d76ff25b7d5c5d36a08f111834082277f8e7 Reviewed By: jurajh-fb fbshipit-source-id: bdefba88e8d002c65d8b7c11e86c8aa772996d9b --- cachelib/external/fbthrift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cachelib/external/fbthrift b/cachelib/external/fbthrift index df0eb612d7..1d01d76ff2 160000 --- a/cachelib/external/fbthrift +++ b/cachelib/external/fbthrift @@ -1 +1 @@ -Subproject commit df0eb612d7cb64ab73ed2f3b42bfe787a6ed4653 +Subproject commit 1d01d76ff25b7d5c5d36a08f111834082277f8e7 From bb2d1c657667a15f5ff96fc8a2dd5208bd93aee3 Mon Sep 17 00:00:00 2001 From: Open Source Bot Date: Tue, 21 Feb 2023 10:54:23 -0800 Subject: [PATCH 08/47] Updating submodules Summary: GitHub commits: https://github.com/facebook/fbthrift/commit/1b848d85947b8d20e00521e95f78690a73543678 https://github.com/facebook/rocksdb/commit/cfe50f7e77326aac5b04050afcda05059a25667c https://github.com/pytorch/fbgemm/commit/c55d8006722eae1ec886502363d8386862a13719 Reviewed By: bigfootjon fbshipit-source-id: e840d1116311ac52336e598442cf66cdc50d7725 --- cachelib/external/fbthrift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cachelib/external/fbthrift b/cachelib/external/fbthrift index 1d01d76ff2..1b848d8594 160000 --- a/cachelib/external/fbthrift +++ b/cachelib/external/fbthrift @@ -1 +1 @@ -Subproject commit 1d01d76ff25b7d5c5d36a08f111834082277f8e7 +Subproject commit 1b848d85947b8d20e00521e95f78690a73543678 From c167a8327810110cac2ab739ab2346a05cde5a88 Mon Sep 17 00:00:00 2001 From: Open Source Bot Date: Tue, 21 Feb 2023 14:59:14 -0800 Subject: [PATCH 09/47] Updating submodules Summary: GitHub commits: https://github.com/facebook/fbthrift/commit/90fddab67b2a40ec0edda28fd014402258726eef Reviewed By: bigfootjon fbshipit-source-id: cf7ac48c00c548805fb74663fe3e9bd9f5906cc2 --- cachelib/external/fbthrift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cachelib/external/fbthrift b/cachelib/external/fbthrift index 1b848d8594..90fddab67b 160000 --- a/cachelib/external/fbthrift +++ b/cachelib/external/fbthrift @@ -1 +1 @@ -Subproject commit 1b848d85947b8d20e00521e95f78690a73543678 +Subproject commit 90fddab67b2a40ec0edda28fd014402258726eef From 32ed439b6b319896386cdd4aa818b2ccf285ae23 Mon Sep 17 00:00:00 2001 From: Jiayue Bao Date: Tue, 21 Feb 2023 15:17:04 -0800 Subject: [PATCH 10/47] Return replacedPtr from InsertOrReplace API Summary: Currently `replacedPtr` is passed as an argument and such usage isn't easy to understand (we kept receiving user feedback on that). Now we want to return it. 
Reviewed By: therealgymmy, antonf, jaesoo-fb Differential Revision: D43143989 fbshipit-source-id: eaae055107050cd67a7ca8b26b716fffc0c002f2 --- .../experimental/objcache2/ObjectCache-inl.h | 28 +++++----- cachelib/experimental/objcache2/ObjectCache.h | 18 +++---- .../objcache2/persistence/Serialization.h | 4 +- .../objcache2/tests/ObjectCacheTest.cpp | 51 +++++++++++-------- 4 files changed, 54 insertions(+), 47 deletions(-) diff --git a/cachelib/experimental/objcache2/ObjectCache-inl.h b/cachelib/experimental/objcache2/ObjectCache-inl.h index 345a27d528..70cfc445ba 100644 --- a/cachelib/experimental/objcache2/ObjectCache-inl.h +++ b/cachelib/experimental/objcache2/ObjectCache-inl.h @@ -153,12 +153,13 @@ std::shared_ptr ObjectCache::findToWrite( template template -std::pair::AllocStatus, std::shared_ptr> +std::tuple::AllocStatus, + std::shared_ptr, + std::shared_ptr> ObjectCache::insertOrReplace(folly::StringPiece key, std::unique_ptr object, size_t objectSize, - uint32_t ttlSecs, - std::shared_ptr* replacedPtr) { + uint32_t ttlSecs) { if (config_.objectSizeTrackingEnabled && objectSize == 0) { throw std::invalid_argument( "Object size tracking is enabled but object size is set to be 0."); @@ -176,7 +177,8 @@ ObjectCache::insertOrReplace(folly::StringPiece key, allocateFromL1(key, ttlSecs, 0 /* use current time as creationTime */); if (!handle) { insertErrors_.inc(); - return {AllocStatus::kAllocError, std::shared_ptr(std::move(object))}; + return {AllocStatus::kAllocError, std::shared_ptr(std::move(object)), + nullptr}; } // We don't release the object here because insertOrReplace could throw when // the replaced item is out of refcount; in this case, the object isn't @@ -187,16 +189,15 @@ ObjectCache::insertOrReplace(folly::StringPiece key, auto replaced = this->l1Cache_->insertOrReplace(handle); + std::shared_ptr replacedPtr = nullptr; if (replaced) { replaces_.inc(); - if (replacedPtr) { - auto itemPtr = reinterpret_cast(replaced->getMemory()); - // Just release the handle. Cache destorys object when all handles - // released. - auto deleter = [h = std::move(replaced)](T*) {}; - *replacedPtr = std::shared_ptr( - reinterpret_cast(itemPtr->objectPtr), std::move(deleter)); - } + auto itemPtr = reinterpret_cast(replaced->getMemory()); + // Just release the handle. Cache destorys object when all handles + // released. + auto deleter = [h = std::move(replaced)](T*) {}; + replacedPtr = std::shared_ptr(reinterpret_cast(itemPtr->objectPtr), + std::move(deleter)); } // Just release the handle. Cache destorys object when all handles released. @@ -209,7 +210,8 @@ ObjectCache::insertOrReplace(folly::StringPiece key, // Release the object as it has been successfully inserted to the cache. object.release(); - return {AllocStatus::kSuccess, std::shared_ptr(ptr, std::move(deleter))}; + return {AllocStatus::kSuccess, std::shared_ptr(ptr, std::move(deleter)), + replacedPtr}; } template diff --git a/cachelib/experimental/objcache2/ObjectCache.h b/cachelib/experimental/objcache2/ObjectCache.h index 85abac068e..f4cd2a9bb7 100644 --- a/cachelib/experimental/objcache2/ObjectCache.h +++ b/cachelib/experimental/objcache2/ObjectCache.h @@ -146,22 +146,20 @@ class ObjectCache : public ObjectCacheBase { // if objectSizeTracking is enabled, a non-zero value must // be passed. // @param ttlSecs object expiring seconds. - // @param replacedPtr a pointer to a shared_ptr, if it is not nullptr it will - // be assigned to the replaced object. 
// // @throw cachelib::exception::RefcountOverflow if the item we are replacing // is already out of refcounts. // @throw std::invalid_argument if objectSizeTracking is enabled but // objectSize is 0. - // @return a pair of allocation status and shared_ptr of newly inserted - // object. + // @return a tuple of allocation status, shared_ptr of newly inserted + // object and shared_ptr of old object that has been replaced (nullptr + // if no replacement happened) template - std::pair> insertOrReplace( - folly::StringPiece key, - std::unique_ptr object, - size_t objectSize = 0, - uint32_t ttlSecs = 0, - std::shared_ptr* replacedPtr = nullptr); + std::tuple, std::shared_ptr> + insertOrReplace(folly::StringPiece key, + std::unique_ptr object, + size_t objectSize = 0, + uint32_t ttlSecs = 0); // Insert the object into the cache with given key. If the key exists in the // cache, the new object won't be inserted. diff --git a/cachelib/experimental/objcache2/persistence/Serialization.h b/cachelib/experimental/objcache2/persistence/Serialization.h index cccb414b45..4edad88e4b 100644 --- a/cachelib/experimental/objcache2/persistence/Serialization.h +++ b/cachelib/experimental/objcache2/persistence/Serialization.h @@ -68,9 +68,9 @@ struct ObjectDeserializer { Deserializer deserializer{reinterpret_cast(payload.begin()), reinterpret_cast(payload.end())}; auto ptr = std::make_unique(deserializer.deserialize()); - auto [allocStatus, _] = + auto res = objCache_.insertOrReplace(key, std::move(ptr), objectSize, ttlSecs); - return allocStatus == ObjectCache::AllocStatus::kSuccess; + return std::get<0>(res) == ObjectCache::AllocStatus::kSuccess; } // cache key of the object to be deserialized diff --git a/cachelib/experimental/objcache2/tests/ObjectCacheTest.cpp b/cachelib/experimental/objcache2/tests/ObjectCacheTest.cpp index 0bcc120de9..6bafa40589 100644 --- a/cachelib/experimental/objcache2/tests/ObjectCacheTest.cpp +++ b/cachelib/experimental/objcache2/tests/ObjectCacheTest.cpp @@ -206,12 +206,12 @@ class ObjectCacheTest : public ::testing::Test { foo->a = 1; foo->b = 2; foo->c = 3; - auto res = objcache->insertOrReplace("Foo", std::move(foo)); - EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, res.first); - ASSERT_NE(nullptr, res.second); - EXPECT_EQ(1, res.second->a); - EXPECT_EQ(2, res.second->b); - EXPECT_EQ(3, res.second->c); + auto [allocRes, ptr, _] = objcache->insertOrReplace("Foo", std::move(foo)); + EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, allocRes); + ASSERT_NE(nullptr, ptr); + EXPECT_EQ(1, ptr->a); + EXPECT_EQ(2, ptr->b); + EXPECT_EQ(3, ptr->c); auto found2 = objcache->template find("Foo"); ASSERT_NE(nullptr, found2); @@ -238,7 +238,7 @@ class ObjectCacheTest : public ::testing::Test { foo->b = 2; foo->c = 3; auto res1 = objcache->insertOrReplace("Foo", std::move(foo)); - EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, res1.first); + EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, std::get<0>(res1)); auto found1 = objcache->template find("Foo"); ASSERT_NE(nullptr, found1); @@ -251,7 +251,7 @@ class ObjectCacheTest : public ::testing::Test { foo2->e = 5; foo2->f = 6; auto res2 = objcache->insertOrReplace("Foo2", std::move(foo2)); - EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, res2.first); + EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, std::get<0>(res2)); auto found2 = objcache->template find("Foo2"); ASSERT_NE(nullptr, found2); @@ -272,7 +272,7 @@ class ObjectCacheTest : public ::testing::Test { foo4->b = 2; foo4->c = 3; auto res1 = objcache->insertOrReplace("Foo4", std::move(foo4)); - 
EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, res1.first); + EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, std::get<0>(res1)); auto found1 = objcache->template find("Foo4"); ASSERT_NE(nullptr, found1); @@ -285,7 +285,7 @@ class ObjectCacheTest : public ::testing::Test { foo5->e = 5; foo5->f = 6; auto res2 = objcache->insertOrReplace("Foo5", std::move(foo5)); - EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, res2.first); + EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, std::get<0>(res2)); auto found2 = objcache->template find("Foo5"); ASSERT_NE(nullptr, found2); @@ -385,11 +385,14 @@ class ObjectCacheTest : public ::testing::Test { foo1->a = 1; foo1->b = 2; foo1->c = 3; - std::shared_ptr replaced; - auto res = - objcache->insertOrReplace("Foo", std::move(foo1), 0, 0, &replaced); - EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, res.first); - EXPECT_EQ(nullptr, replaced); + + auto [res1, ptr1, replaced1] = + objcache->insertOrReplace("Foo", std::move(foo1)); + EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, res1); + EXPECT_EQ(1, ptr1->a); + EXPECT_EQ(2, ptr1->b); + EXPECT_EQ(3, ptr1->c); + EXPECT_EQ(nullptr, replaced1); auto found1 = objcache->template find("Foo"); ASSERT_NE(nullptr, found1); @@ -401,12 +404,16 @@ class ObjectCacheTest : public ::testing::Test { foo2->a = 10; foo2->b = 20; foo2->c = 30; - res = objcache->insertOrReplace("Foo", std::move(foo2), 0, 0, &replaced); - EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, res.first); - ASSERT_NE(nullptr, replaced); - EXPECT_EQ(1, replaced->a); - EXPECT_EQ(2, replaced->b); - EXPECT_EQ(3, replaced->c); + auto [res2, ptr2, replaced2] = + objcache->insertOrReplace("Foo", std::move(foo2)); + EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, res2); + EXPECT_EQ(10, ptr2->a); + EXPECT_EQ(20, ptr2->b); + EXPECT_EQ(30, ptr2->c); + ASSERT_NE(nullptr, replaced2); + EXPECT_EQ(1, replaced2->a); + EXPECT_EQ(2, replaced2->b); + EXPECT_EQ(3, replaced2->c); auto found2 = objcache->template find("Foo"); ASSERT_NE(nullptr, found2); @@ -497,7 +504,7 @@ class ObjectCacheTest : public ::testing::Test { // replace foo1 with foo2 { auto res = objcache->insertOrReplace("Foo", std::move(foo2), foo2Size); - ASSERT_EQ(ObjectCache::AllocStatus::kSuccess, res.first); + ASSERT_EQ(ObjectCache::AllocStatus::kSuccess, std::get<0>(res)); auto found = objcache->template find("Foo"); ASSERT_NE(nullptr, found); From dd0af61a5fcb621ac253f67715b5fad8994627c7 Mon Sep 17 00:00:00 2001 From: Open Source Bot Date: Tue, 21 Feb 2023 15:51:28 -0800 Subject: [PATCH 11/47] Updating submodules Summary: GitHub commits: https://github.com/facebook/folly/commit/fdb333b2bb6dd9424d19c4e12385726ff87e60d4 https://github.com/facebook/watchman/commit/0b238b069f54a4de27c35a3e2352c37bccc12d60 https://github.com/pytorch/fbgemm/commit/d142cdf42560496e5f473a0f85220ff75681acc8 Reviewed By: bigfootjon fbshipit-source-id: ac185dc9dc24ba7d2765dca010958ce6b38bd029 --- cachelib/external/folly | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cachelib/external/folly b/cachelib/external/folly index 3c5efbdff2..fdb333b2bb 160000 --- a/cachelib/external/folly +++ b/cachelib/external/folly @@ -1 +1 @@ -Subproject commit 3c5efbdff2d01b83fc76827518c14a786f9c28ce +Subproject commit fdb333b2bb6dd9424d19c4e12385726ff87e60d4 From cce19ea5feeba497e8e72e7bda9e899d146a37df Mon Sep 17 00:00:00 2001 From: Open Source Bot Date: Wed, 22 Feb 2023 13:29:47 -0800 Subject: [PATCH 12/47] Updating submodules Summary: GitHub commits: https://github.com/facebook/fbthrift/commit/cbc3de581fdf36ba474b0c135b9e785e504f1c1e 
https://github.com/facebook/rocksdb/commit/229297d1b83c1885e7db2573b9b44736a7be23a5 https://github.com/facebook/watchman/commit/d873e11529fae6d0bbd74cfe95d41732dbfbded8 https://github.com/facebookexperimental/rust-shed/commit/2f45b886ae2eb0bf59664036f0fd36e3b20923d9 Reviewed By: bigfootjon fbshipit-source-id: a65fa2098f6b6f1704154e0509e2f5423f9679eb --- cachelib/external/fbthrift | 2 +- cachelib/external/fizz | 2 +- cachelib/external/folly | 2 +- cachelib/external/wangle | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cachelib/external/fbthrift b/cachelib/external/fbthrift index 90fddab67b..cbc3de581f 160000 --- a/cachelib/external/fbthrift +++ b/cachelib/external/fbthrift @@ -1 +1 @@ -Subproject commit 90fddab67b2a40ec0edda28fd014402258726eef +Subproject commit cbc3de581fdf36ba474b0c135b9e785e504f1c1e diff --git a/cachelib/external/fizz b/cachelib/external/fizz index 2da27e939d..287625bd66 160000 --- a/cachelib/external/fizz +++ b/cachelib/external/fizz @@ -1 +1 @@ -Subproject commit 2da27e939de1aa4eeecc6bc8d3a32844a75bd42b +Subproject commit 287625bd6676b812e75ad0b088a61f72b4c9e681 diff --git a/cachelib/external/folly b/cachelib/external/folly index fdb333b2bb..ce2b95715d 160000 --- a/cachelib/external/folly +++ b/cachelib/external/folly @@ -1 +1 @@ -Subproject commit fdb333b2bb6dd9424d19c4e12385726ff87e60d4 +Subproject commit ce2b95715de229fcb51bd97410469a3ad4d2bfb2 diff --git a/cachelib/external/wangle b/cachelib/external/wangle index cb61ed1759..44690e7894 160000 --- a/cachelib/external/wangle +++ b/cachelib/external/wangle @@ -1 +1 @@ -Subproject commit cb61ed1759c692a4c69b85df7cb983ee920e91cf +Subproject commit 44690e7894842a7127245837b69627d4b964aabd From 6357906c331954f34eccc5b870a7984da79f27f4 Mon Sep 17 00:00:00 2001 From: Daniel Wong Date: Thu, 23 Feb 2023 20:29:55 -0800 Subject: [PATCH 13/47] Pin fmt version at 8.0.1 (#196) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Change build script to pin fmt version to same as folly's to minimize future breaks. Pull Request resolved: https://github.com/facebook/CacheLib/pull/196 Test Plan: Built successfully on a fresh clone of CacheLib. (Also had to change `external_git_branch=dev` for zstd to deal with the cmake/zstd issue in https://github.com/facebook/CacheLib/issues/194, but that should resolve when gets merged into release) **Context:** OSS build broke between 3-5 Jan 2023, likely due to changes in folly. While switching to v9.1.0 or 9.0.0 fixes the issue at hand, it seems sensible to match folly, which specifies fmt v8.0.1: https://github.com/facebook/folly/blob/main/build/fbcode_builder/manifests/fmt > https://github.com/facebook/CacheLib/issues/62 agordon: For the other packages, you'll notice we do use a specific git tag or branch… I notice `fmt` is an exception - not pinned to a specific git tag or revision - likely an omission that can be fixed. Related CacheLib issues: https://github.com/facebook/CacheLib/issues/186, https://github.com/facebook/CacheLib/issues/189, https://github.com/facebook/CacheLib/issues/107, https://github.com/facebook/CacheLib/issues/97, https://github.com/facebook/CacheLib/issues/62 Possibly related CacheLib commit: 67cc11ad6f5fb7b1e1948513292ef00edee34f5e Last working (Jan 3): https://github.com/facebook/CacheLib/actions/runs/3826992478 First failed (Jan 5): https://github.com/facebook/CacheLib/actions/runs/3844002307/jobs/6546742348 Error: `error: static assertion failed: Cannot format an argument. 
To make type T formattable provide a formatter specialization: https://fmt.dev/latest/api.html#udt` Reviewed By: therealgymmy Differential Revision: D43517927 Pulled By: jiayuebao fbshipit-source-id: 2d28791f7804d862b646263b96b10b835f843d8c --- contrib/build-package.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/build-package.sh b/contrib/build-package.sh index ff487967cb..6e7acac5c2 100755 --- a/contrib/build-package.sh +++ b/contrib/build-package.sh @@ -160,6 +160,7 @@ case "$1" in REPODIR=cachelib/external/$NAME SRCDIR=$REPODIR external_git_clone=yes + external_git_tag="8.0.1" cmake_custom_params="-DBUILD_SHARED_LIBS=ON" if test "$build_tests" = "yes" ; then cmake_custom_params="$cmake_custom_params -DFMT_TEST=YES" From a9257379ab72ef2733f35e814c425d12785f0ea9 Mon Sep 17 00:00:00 2001 From: Jaesoo Lee Date: Fri, 24 Feb 2023 19:09:53 -0800 Subject: [PATCH 14/47] KVReplayGenerator: parse GET_LEASE and SET_LEASE operations Summary: Memcached's WSA logger will now emits GET_LEASE and SET_LEASE operations as well. This changes makes the cachebench treats those as GET and SET, respectively, for compatibility. Reviewed By: therealgymmy Differential Revision: D43336316 fbshipit-source-id: c9b842d567b9fb2128b822bd429f5dce30b378da --- cachelib/cachebench/workload/KVReplayGenerator.h | 6 +++--- .../workload/tests/KVReplayGeneratorTest.cpp | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/cachelib/cachebench/workload/KVReplayGenerator.h b/cachelib/cachebench/workload/KVReplayGenerator.h index 4b12970081..a9124e2bd7 100644 --- a/cachelib/cachebench/workload/KVReplayGenerator.h +++ b/cachelib/cachebench/workload/KVReplayGenerator.h @@ -230,10 +230,10 @@ inline bool KVReplayGenerator::parseRequest(const std::string& line, // Set op const auto& op = fields[SampleFields::OP]; - // TODO only memcache optypes are supported - if (!op.compare("GET")) { + // TODO implement GET_LEASE and SET_LEASE emulations + if (!op.compare("GET") || !op.compare("GET_LEASE")) { req->req_.setOp(OpType::kGet); - } else if (!op.compare("SET")) { + } else if (!op.compare("SET") || !op.compare("SET_LEASE")) { req->req_.setOp(OpType::kSet); } else if (!op.compare("DELETE")) { req->req_.setOp(OpType::kDel); diff --git a/cachelib/cachebench/workload/tests/KVReplayGeneratorTest.cpp b/cachelib/cachebench/workload/tests/KVReplayGeneratorTest.cpp index 72a55a4020..16e4e52060 100644 --- a/cachelib/cachebench/workload/tests/KVReplayGeneratorTest.cpp +++ b/cachelib/cachebench/workload/tests/KVReplayGeneratorTest.cpp @@ -56,6 +56,18 @@ struct TraceEntry { size_t expKeySize = std::max(keySize_, reqKey.size()); expKeySize = std::min(expKeySize, 256); ASSERT_EQ(reqKey.size(), expKeySize); + ASSERT_EQ(req.req_.getOp(), getOpType()); + } + + OpType getOpType() { + if (!op_.compare("GET") || !op_.compare("GET_LEASE")) { + return OpType::kGet; + } else if (!op_.compare("SET") || !op_.compare("SET_LEASE")) { + return OpType::kSet; + } else if (!op_.compare("DELETE")) { + return OpType::kDel; + } + return OpType::kSize; } std::string key_; @@ -86,8 +98,11 @@ TEST(KVReplayGeneratorTest, BasicFormat) { // ,,,,, {7, "GET", 0, 2, std::nullopt, true}, {7, "GET", 0, 2, 50, true}, + {7, "GET_LEASE", 0, 2, 50, true}, {20, "SET", 100, 35, std::nullopt, true}, {20, "SET", 100, 35, 3600, true}, + {20, "SAT", 100, 35, 3600, false}, // invalid op name + {20, "SET_LEASE", 100, 35, 3600, true}, {7, "GET", 0, 0, std::nullopt, false}, // invalid op count {7, "GET", 0, 0, 600, false}, // invalid op count {1024, "SET", 
100, 35, 300, true}, // key truncated From f7e13a4bf723a6e128e2707accc12c8c3594bfa2 Mon Sep 17 00:00:00 2001 From: Jiayue Bao Date: Mon, 27 Feb 2023 08:41:42 -0800 Subject: [PATCH 15/47] Update @braintree/sanitize-url version Summary: update to >6.0.1 version Reviewed By: antonk52 Differential Revision: D43575577 fbshipit-source-id: 7143da212ab6f124bffdfdaf3be29ff3ab986ffb --- website/package.json | 3 ++- website/yarn.lock | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/website/package.json b/website/package.json index 8c58fda9a2..ac9801eeeb 100644 --- a/website/package.json +++ b/website/package.json @@ -43,7 +43,8 @@ "ansi-html": "0.0.8", "ua-parser-js": "^1.0.33", "eta": "^2.0.0", - "http-cache-semantics": "^4.1.1" + "http-cache-semantics": "^4.1.1", + "@braintree/sanitize-url": "^6.0.1" }, "browserslist": { "production": [ diff --git a/website/yarn.lock b/website/yarn.lock index 19f12eb0d5..51e55da8c7 100644 --- a/website/yarn.lock +++ b/website/yarn.lock @@ -1809,10 +1809,10 @@ "@babel/helper-validator-identifier" "^7.18.6" to-fast-properties "^2.0.0" -"@braintree/sanitize-url@^6.0.0": - version "6.0.0" - resolved "https://registry.yarnpkg.com/@braintree/sanitize-url/-/sanitize-url-6.0.0.tgz#fe364f025ba74f6de6c837a84ef44bdb1d61e68f" - integrity sha512-mgmE7XBYY/21erpzhexk4Cj1cyTQ9LzvnTxtzM17BJ7ERMNE6W72mQRo0I1Ud8eFJ+RVVIcBNhLFZ3GX4XFz5w== +"@braintree/sanitize-url@^6.0.0", "@braintree/sanitize-url@^6.0.1": + version "6.0.2" + resolved "https://registry.yarnpkg.com/@braintree/sanitize-url/-/sanitize-url-6.0.2.tgz#6110f918d273fe2af8ea1c4398a88774bb9fc12f" + integrity sha512-Tbsj02wXCbqGmzdnXNk0SOF19ChhRU70BsroIi4Pm6Ehp56in6vch94mfbdQ17DozxkL3BAVjbZ4Qc1a0HFRAg== "@colors/colors@1.5.0": version "1.5.0" From 70ff91f9558fb17e67003e272d2b75df5317d6bc Mon Sep 17 00:00:00 2001 From: Daniel Wong Date: Mon, 27 Feb 2023 08:47:26 -0800 Subject: [PATCH 16/47] Add missing numa deps for fedora, rocky, arch (#197) Summary: Fix OSS builds by adding numa deps to build files. Currently some fail on missing `numa.h`. Context: https://github.com/facebook/CacheLib/issues/161 added the dependencies to the centOS, debian, and ubuntu18 build files. The PR was opened in Sep 2022 but only landed in Dec 2022, and so probably missed out on the fedora, rocky and arch build files which were added in-between those dates. Having had those build actions run on PRs would have caught this (currently, they are only scheduled.) Pull Request resolved: https://github.com/facebook/CacheLib/pull/197 Test Plan: Github Actions builds (ideally, https://github.com/facebook/CacheLib/issues/198 would be landed first.) I've checked that those packages exist for the respective repositories but didn't run them myself. 
Reviewed By: jaesoo-fb Differential Revision: D43587970 Pulled By: jiayuebao fbshipit-source-id: 8c59e48528042350e576a45ffc3bf2520699f5a9 --- contrib/prerequisites-arch.sh | 1 + contrib/prerequisites-fedora32.sh | 1 + contrib/prerequisites-fedora34.sh | 3 ++- contrib/prerequisites-rocky9.sh | 3 ++- 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/contrib/prerequisites-arch.sh b/contrib/prerequisites-arch.sh index 85a8656f7b..249f6c8082 100755 --- a/contrib/prerequisites-arch.sh +++ b/contrib/prerequisites-arch.sh @@ -19,4 +19,5 @@ sudo pacman -S --needed --noconfirm cmake \ boost \ double-conversion \ libdwarf \ + numactl \ libsodium diff --git a/contrib/prerequisites-fedora32.sh b/contrib/prerequisites-fedora32.sh index 235d6c1a8a..942cac0470 100755 --- a/contrib/prerequisites-fedora32.sh +++ b/contrib/prerequisites-fedora32.sh @@ -21,6 +21,7 @@ sudo dnf -y install bison flex patch bzip2 cmake \ zlib-devel lz4-devel xz-devel bzip2-devel \ jemalloc-devel snappy-devel libsodium-devel libdwarf-devel libaio-devel \ gmock-devel gflags-devel gtest gtest-devel \ + numactl-devel \ fmt fmt-devel # DO NOT INSTALL glog-devel - need to build from source for the glog-*.cmake files diff --git a/contrib/prerequisites-fedora34.sh b/contrib/prerequisites-fedora34.sh index 7e45c8740d..c7182cc513 100755 --- a/contrib/prerequisites-fedora34.sh +++ b/contrib/prerequisites-fedora34.sh @@ -19,4 +19,5 @@ sudo dnf -y install bison flex patch bzip2 cmake \ double-conversion double-conversion-devel make g++ \ boost-devel libevent-devel openssl-devel libunwind-devel \ zlib-devel lz4-devel xz-devel bzip2-devel \ - jemalloc-devel snappy-devel libsodium-devel libdwarf-devel libaio-devel + jemalloc-devel snappy-devel libsodium-devel libdwarf-devel libaio-devel \ + numactl-devel diff --git a/contrib/prerequisites-rocky9.sh b/contrib/prerequisites-rocky9.sh index bec5b82011..06720aba2e 100755 --- a/contrib/prerequisites-rocky9.sh +++ b/contrib/prerequisites-rocky9.sh @@ -38,7 +38,8 @@ sudo dnf install -y \ jemalloc-devel \ libsodium-devel \ libaio-devel \ - binutils-devel + binutils-devel \ + numactl-devel sudo dnf install -y \ From 6ad7a318b43de25ba41067c3d6a851d5a60d1633 Mon Sep 17 00:00:00 2001 From: generatedunixname89002005287564 Date: Mon, 27 Feb 2023 10:31:09 -0800 Subject: [PATCH 17/47] fbcode/cachelib/allocator/datastruct/serialize Reviewed By: avalonalex Differential Revision: D43616310 fbshipit-source-id: 3367ad01ba31e5dc561d63a4f3b9746170e64912 --- .../allocator/datastruct/serialize/objects.thrift | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cachelib/allocator/datastruct/serialize/objects.thrift b/cachelib/allocator/datastruct/serialize/objects.thrift index bd2c8b79bc..223b804e5b 100644 --- a/cachelib/allocator/datastruct/serialize/objects.thrift +++ b/cachelib/allocator/datastruct/serialize/objects.thrift @@ -22,17 +22,17 @@ namespace cpp2 facebook.cachelib.serialization // Saved state for an SList struct SListObject { - 2: required i64 size, - 3: required i64 compressedHead, // Pointer to the head element + 2: required i64 size; + 3: required i64 compressedHead; // Pointer to the head element // TODO(bwatling): remove the default value and clean up SList::SList() once // we can rely on 'compressedTail' always being valid. 
- 4: i64 compressedTail = -1, // Pointer to the tail element + 4: i64 compressedTail = -1; // Pointer to the tail element } struct DListObject { - 1: required i64 compressedHead, - 2: required i64 compressedTail, - 3: required i64 size, + 1: required i64 compressedHead; + 2: required i64 compressedTail; + 3: required i64 size; } struct MultiDListObject { From df5b9f6ef35c55e432b6713c52397a03dd19c34c Mon Sep 17 00:00:00 2001 From: Daniel Wong Date: Mon, 27 Feb 2023 16:21:05 -0800 Subject: [PATCH 18/47] Run Github Actions on pull requests (#198) Summary: Run GitHub action builds on every pull request, in addition to currently daily scheduled runs. Benefit: avoid accidentally breaking other OS builds. This would have caught https://github.com/facebook/CacheLib/issues/197. This triggers whenever PRs are opened or when commits are added to a PR. Frequency of PRs (total of 76 PRs over last 18 months since CacheLib was open-sourced) is much less than frequency of daily scheduled builds, so this shouldn't add too many builds overall. Pull Request resolved: https://github.com/facebook/CacheLib/pull/198 Reviewed By: therealgymmy Differential Revision: D43625609 Pulled By: jiayuebao fbshipit-source-id: 1572f6da32584ce6a1983d5e64afedf17ff17457 --- .github/workflows/build-cachelib-centos-8-1.yml | 1 + .github/workflows/build-cachelib-centos-8-5.yml | 1 + .github/workflows/build-cachelib-debian-10.yml | 1 + .github/workflows/build-cachelib-fedora-36.yml | 1 + .github/workflows/build-cachelib-rockylinux-8.yml | 1 + .github/workflows/build-cachelib-rockylinux-9.yml | 1 + .github/workflows/build-cachelib-ubuntu-18.yml | 1 + .github/workflows/build-cachelib-ubuntu-20.yml | 1 + .github/workflows/build-cachelib-ubuntu-22.yml | 1 + 9 files changed, 9 insertions(+) diff --git a/.github/workflows/build-cachelib-centos-8-1.yml b/.github/workflows/build-cachelib-centos-8-1.yml index 5eb1090b0a..3983e0c78b 100644 --- a/.github/workflows/build-cachelib-centos-8-1.yml +++ b/.github/workflows/build-cachelib-centos-8-1.yml @@ -14,6 +14,7 @@ name: build-cachelib-centos-8-1 on: # push: + pull_request: schedule: - cron: '0 11 * * 1,3,5' jobs: diff --git a/.github/workflows/build-cachelib-centos-8-5.yml b/.github/workflows/build-cachelib-centos-8-5.yml index 3ffee37765..4e6c2d12e1 100644 --- a/.github/workflows/build-cachelib-centos-8-5.yml +++ b/.github/workflows/build-cachelib-centos-8-5.yml @@ -14,6 +14,7 @@ name: build-cachelib-centos-8.5 on: # push: + pull_request: schedule: - cron: '0 9 * * *' jobs: diff --git a/.github/workflows/build-cachelib-debian-10.yml b/.github/workflows/build-cachelib-debian-10.yml index c7c67e0724..56fb576298 100644 --- a/.github/workflows/build-cachelib-debian-10.yml +++ b/.github/workflows/build-cachelib-debian-10.yml @@ -14,6 +14,7 @@ name: build-cachelib-debian-10 on: # push: + pull_request: schedule: - cron: '0 13 * * *' jobs: diff --git a/.github/workflows/build-cachelib-fedora-36.yml b/.github/workflows/build-cachelib-fedora-36.yml index 216dbf5841..f8c0424400 100644 --- a/.github/workflows/build-cachelib-fedora-36.yml +++ b/.github/workflows/build-cachelib-fedora-36.yml @@ -14,6 +14,7 @@ name: build-cachelib-fedora-36 on: # push: + pull_request: schedule: - cron: '0 19 * * *' jobs: diff --git a/.github/workflows/build-cachelib-rockylinux-8.yml b/.github/workflows/build-cachelib-rockylinux-8.yml index 879dc27566..c8af12327d 100644 --- a/.github/workflows/build-cachelib-rockylinux-8.yml +++ b/.github/workflows/build-cachelib-rockylinux-8.yml @@ -14,6 +14,7 @@ name: 
build-cachelib-rockylinux-8.6 on: # push: + pull_request: schedule: - cron: '0 15 * * 2,4,6' jobs: diff --git a/.github/workflows/build-cachelib-rockylinux-9.yml b/.github/workflows/build-cachelib-rockylinux-9.yml index f6a86d75a0..e26eac6ff1 100644 --- a/.github/workflows/build-cachelib-rockylinux-9.yml +++ b/.github/workflows/build-cachelib-rockylinux-9.yml @@ -14,6 +14,7 @@ name: build-cachelib-rockylinux-9.0 on: # push: + pull_request: schedule: - cron: '0 17 * * *' jobs: diff --git a/.github/workflows/build-cachelib-ubuntu-18.yml b/.github/workflows/build-cachelib-ubuntu-18.yml index fad34c0897..ad068278a4 100644 --- a/.github/workflows/build-cachelib-ubuntu-18.yml +++ b/.github/workflows/build-cachelib-ubuntu-18.yml @@ -19,6 +19,7 @@ name: build-cachelib-ubuntu-18 on: # push: + pull_request: schedule: - cron: '0 5 * * 2,4,6' jobs: diff --git a/.github/workflows/build-cachelib-ubuntu-20.yml b/.github/workflows/build-cachelib-ubuntu-20.yml index 35a3f507e2..a8380fdb96 100644 --- a/.github/workflows/build-cachelib-ubuntu-20.yml +++ b/.github/workflows/build-cachelib-ubuntu-20.yml @@ -15,6 +15,7 @@ name: build-cachelib-ubuntu-20 on: # push: + pull_request: schedule: - cron: '0 5 * * 1,3,5' jobs: diff --git a/.github/workflows/build-cachelib-ubuntu-22.yml b/.github/workflows/build-cachelib-ubuntu-22.yml index b4374a5b96..4db194431d 100644 --- a/.github/workflows/build-cachelib-ubuntu-22.yml +++ b/.github/workflows/build-cachelib-ubuntu-22.yml @@ -15,6 +15,7 @@ name: build-cachelib-ubuntu-22 on: # push: + pull_request: schedule: - cron: '0 7 * * *' jobs: From e8151adb8bb1fa4f628232c35cab06cad2ffd052 Mon Sep 17 00:00:00 2001 From: Jiayue Bao Date: Tue, 28 Feb 2023 10:25:27 -0800 Subject: [PATCH 19/47] Add a custom deleter class to access the Item Handle Summary: Add a custom deleter class that stores a `handle`. This allows object-cache to access the Item Handle via a shared_ptr. Both size-awareness feature and getting/updating object's TTL require that. Deleter class is marked as private because we don't want to expose `Handle` to object-cache users. Reviewed By: therealgymmy, jaesoo-fb Differential Revision: D42503594 fbshipit-source-id: 16ac14e6a84a1cfa80a3c145d440790002734a34 --- .../experimental/objcache2/ObjectCache-inl.h | 18 ++++----- cachelib/experimental/objcache2/ObjectCache.h | 37 +++++++++++++++++++ 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/cachelib/experimental/objcache2/ObjectCache-inl.h b/cachelib/experimental/objcache2/ObjectCache-inl.h index 70cfc445ba..9f1b91631d 100644 --- a/cachelib/experimental/objcache2/ObjectCache-inl.h +++ b/cachelib/experimental/objcache2/ObjectCache-inl.h @@ -128,8 +128,8 @@ std::shared_ptr ObjectCache::find(folly::StringPiece key) { succL1Lookups_.inc(); auto ptr = found->template getMemoryAs()->objectPtr; - // Just release the handle. Cache destorys object when all handles released. - auto deleter = [h = std::move(found)](const T*) {}; + // Use custom deleter + auto deleter = Deleter(std::move(found)); return std::shared_ptr(reinterpret_cast(ptr), std::move(deleter)); } @@ -146,8 +146,8 @@ std::shared_ptr ObjectCache::findToWrite( succL1Lookups_.inc(); auto ptr = found->template getMemoryAs()->objectPtr; - // Just release the handle. Cache destorys object when all handles released. 
- auto deleter = [h = std::move(found)](T*) {}; + // Use custom deleter + auto deleter = Deleter(std::move(found)); return std::shared_ptr(reinterpret_cast(ptr), std::move(deleter)); } @@ -200,9 +200,6 @@ ObjectCache::insertOrReplace(folly::StringPiece key, std::move(deleter)); } - // Just release the handle. Cache destorys object when all handles released. - auto deleter = [h = std::move(handle)](T*) {}; - // update total object size if (config_.objectSizeTrackingEnabled) { totalObjectSizeBytes_.fetch_add(objectSize, std::memory_order_relaxed); @@ -210,6 +207,9 @@ ObjectCache::insertOrReplace(folly::StringPiece key, // Release the object as it has been successfully inserted to the cache. object.release(); + + // Use custom deleter + auto deleter = Deleter(std::move(handle)); return {AllocStatus::kSuccess, std::shared_ptr(ptr, std::move(deleter)), replacedPtr}; } @@ -256,8 +256,8 @@ ObjectCache::insert(folly::StringPiece key, object.release(); } - // Just release the handle. Cache destorys object when all handles released. - auto deleter = [h = std::move(handle)](T*) {}; + // Use custom deleter + auto deleter = Deleter(std::move(handle)); return {success ? AllocStatus::kSuccess : AllocStatus::kKeyAlreadyExists, std::shared_ptr(ptr, std::move(deleter))}; } diff --git a/cachelib/experimental/objcache2/ObjectCache.h b/cachelib/experimental/objcache2/ObjectCache.h index f4cd2a9bb7..5f8aab85ad 100644 --- a/cachelib/experimental/objcache2/ObjectCache.h +++ b/cachelib/experimental/objcache2/ObjectCache.h @@ -94,6 +94,43 @@ class ObjectCache : public ObjectCacheBase { // make constructor private, but constructable by std::make_unique struct InternalConstructor {}; + template + class Deleter { + public: + using ReadHandle = typename AllocatorT::ReadHandle; + using WriteHandle = typename AllocatorT::WriteHandle; + using Handle = std::variant; + + explicit Deleter(typename AllocatorT::ReadHandle&& hdl) + : hdl_(std::move(hdl)) {} + explicit Deleter(typename AllocatorT::WriteHandle&& hdl) + : hdl_(std::move(hdl)) {} + + void operator()(T*) { + // Just release the handle. + // Cache destorys object when all handles released. + std::holds_alternative(hdl_) + ? std::get(hdl_).reset() + : std::get(hdl_).reset(); + } + + WriteHandle& getWriteHandleRef() { + if (std::holds_alternative(hdl_)) { + hdl_ = std::move(std::get(hdl_)).toWriteHandle(); + } + return std::get(hdl_); + } + + ReadHandle& getReadHandleRef() { + return std::holds_alternative(hdl_) + ? std::get(hdl_) + : std::get(hdl_); + } + + private: + Handle hdl_; + }; + public: using ItemDestructor = std::function; using Key = KAllocation::Key; From 982781860c0ccc6c61cde83f33e72341d87e72d7 Mon Sep 17 00:00:00 2001 From: Jaesoo Lee Date: Tue, 28 Feb 2023 15:59:14 -0800 Subject: [PATCH 20/47] fix flaky tests in NvmCacheTests Summary: This change fixes following flaky tests in NvmCacheTests. * NvmCacheTest.Delete * NvmCacheTest.NvmEvicted * NvmCacheTest.EvictToNvmGetCheckCtime The root cause of the failures are essentially the same as D42443647 (https://github.com/facebook/CacheLib/commit/5e7ff9ab28cc00c74b19b176885d6af2e3d27d60) which fixed the problem for NvmCacheTest.EvictToNvmGet; we are inserting enough items that could be spilled to NVM cache, where the NvmCache::put() can be dropped and the item is evicted completely when the delete operations (and tombstones) issued as part of the insertion are still outstanding. In order to fix the problem, this change flushes the NVM cache periodically during the insertions. 
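Condensed sketch of the periodic-flush pattern this fix applies in the affected tests (loop bound, key construction, and allocation size are illustrative; the actual key scheme is not shown in the hunks below):

```cpp
for (unsigned int i = 0; i < nKeys; i++) {
  // Key construction is an assumption; the diff only shows `key` in scope.
  std::string key = "key" + std::to_string(i);
  auto it = nvm.allocate(pid, key, 15 * 1024);
  ASSERT_NE(nullptr, it);
  nvm.insertOrReplace(it);
  // Flush periodically so an NvmCache::put() is not dropped while a remove
  // (tombstone) issued by an earlier insertion is still outstanding.
  if (i % 100 == 0) {
    nvm.flushNvmCache();
  }
}
// Final flush before asserting on NVM contents.
nvm.flushNvmCache();
```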
Also, since this could cause more regions are used, the size of NVM cache needs to be increased. This change bumps the default size of NVM cache to 200MB (previous 100MB). Also, the size of persist storage used in the test PersistenceCache has been bumped by 100MB accordingly, i.e., from 400MB to 500MB. This change addresses the github issue https://github.com/facebook/CacheLib/issues/169 Reviewed By: therealgymmy Differential Revision: D43592888 fbshipit-source-id: f0968884eb39fb5728b59129e98345df3240f01e --- .../allocator/nvmcache/tests/NvmCacheTests.cpp | 16 ++++++++++++++++ cachelib/allocator/tests/NvmTestUtils.h | 2 +- cachelib/persistence/tests/PersistenceCache.h | 2 +- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/cachelib/allocator/nvmcache/tests/NvmCacheTests.cpp b/cachelib/allocator/nvmcache/tests/NvmCacheTests.cpp index 7355627fea..ec74c51980 100644 --- a/cachelib/allocator/nvmcache/tests/NvmCacheTests.cpp +++ b/cachelib/allocator/nvmcache/tests/NvmCacheTests.cpp @@ -245,7 +245,13 @@ TEST_F(NvmCacheTest, EvictToNvmGetCheckCtime) { ASSERT_NE(nullptr, it); cache_->insertOrReplace(it); keyToCtime.insert({key, it->getCreationTime()}); + // Avoid any nvm eviction being dropped due to the race with still + // outstanding remove operation for insertion + if (i % 100 == 0) { + nvm.flushNvmCache(); + } } + nvm.flushNvmCache(); const auto nEvictions = this->evictionCount() - evictBefore; ASSERT_LT(0, nEvictions); @@ -331,6 +337,11 @@ TEST_F(NvmCacheTest, Delete) { auto it = nvm.allocate(pid, key, 15 * 1024); ASSERT_NE(nullptr, it); nvm.insertOrReplace(it); + // Avoid any nvm eviction being dropped due to the race with still + // outstanding remove operation for insertion + if (i % 100 == 0) { + nvm.flushNvmCache(); + } } nvm.flushNvmCache(); @@ -533,6 +544,11 @@ TEST_F(NvmCacheTest, NvmEvicted) { auto it = nvm.allocate(pid, key, allocSize); ASSERT_NE(nullptr, it); nvm.insertOrReplace(it); + // Avoid any nvm eviction being dropped due to the race with still + // outstanding remove operation for insertion + if (i % 100 == 0) { + nvm.flushNvmCache(); + } } nvm.flushNvmCache(); diff --git a/cachelib/allocator/tests/NvmTestUtils.h b/cachelib/allocator/tests/NvmTestUtils.h index 6d6242aadf..cad96c41d4 100644 --- a/cachelib/allocator/tests/NvmTestUtils.h +++ b/cachelib/allocator/tests/NvmTestUtils.h @@ -27,7 +27,7 @@ namespace utils { using NavyConfig = navy::NavyConfig; inline NavyConfig getNvmTestConfig(const std::string& cacheDir) { NavyConfig config{}; - config.setSimpleFile(cacheDir + "/navy", 100 * 1024ULL * 1024ULL); + config.setSimpleFile(cacheDir + "/navy", 200 * 1024ULL * 1024ULL); config.setDeviceMetadataSize(4 * 1024 * 1024); config.setBlockSize(1024); config.setNavyReqOrderingShards(10); diff --git a/cachelib/persistence/tests/PersistenceCache.h b/cachelib/persistence/tests/PersistenceCache.h index 5400b4d4ea..1db5b5fc8a 100644 --- a/cachelib/persistence/tests/PersistenceCache.h +++ b/cachelib/persistence/tests/PersistenceCache.h @@ -213,7 +213,7 @@ class PersistenceCache { public: const uint32_t kNumKeys = 1024 * 1024; // 1 million const size_t kCacheSize = 100 * kNumKeys; // 100MB - const size_t kCapacity = 4 * kCacheSize; // 400MB + const size_t kCapacity = 5 * kCacheSize; // 500MB std::unique_ptr buffer_; std::string cacheDir_; From 9447a8acfb84c70c93333063bbee22d5e748a3e1 Mon Sep 17 00:00:00 2001 From: Darryl Gardner Date: Tue, 28 Feb 2023 17:07:02 -0800 Subject: [PATCH 21/47] Added PM9A3 support for Cachebench Write Bytes Calculations Summary: The Samsung 
PM9A3 does not report samsung in the model number so I added the specific model number to the vendorMap. Reviewed By: jaesoo-fb Differential Revision: D43676582 fbshipit-source-id: 6df19c40dd9da9563b75aa5847a1d1f9eb6a9aef --- cachelib/cachebench/util/NandWrites.cpp | 1 + .../cachebench/util/tests/NandWritesTest.cpp | 90 +++++++++++++++++++ 2 files changed, 91 insertions(+) diff --git a/cachelib/cachebench/util/NandWrites.cpp b/cachelib/cachebench/util/NandWrites.cpp index ae82aca65c..370ddfa2b6 100644 --- a/cachelib/cachebench/util/NandWrites.cpp +++ b/cachelib/cachebench/util/NandWrites.cpp @@ -400,6 +400,7 @@ uint64_t nandWriteBytes(const folly::StringPiece& deviceName, const folly::StringPiece&)>> vendorMap{{"samsung", samsungWriteBytes}, {"mz1lb960hbjr-", samsungWriteBytes}, + {"mzol23t8hcls-", samsungWriteBytes}, // The Samsung PM983a doesn't include Samsung in the model // number at this time, but it's a Samsung device. {"liteon", liteonWriteBytes}, diff --git a/cachelib/cachebench/util/tests/NandWritesTest.cpp b/cachelib/cachebench/util/tests/NandWritesTest.cpp index 0002e8a837..af09593f41 100644 --- a/cachelib/cachebench/util/tests/NandWritesTest.cpp +++ b/cachelib/cachebench/util/tests/NandWritesTest.cpp @@ -240,6 +240,96 @@ TEST_F(NandWritesTest, nandWriteBytes_handlesSamsungPM983aDevice) { EXPECT_EQ(nandWriteBytes("nvme1n1", kNvmePath, mockFactory_), 35061362294784); } +TEST_F(NandWritesTest, nandWriteBytes_handlesSamsungPM9A3Device) { + constexpr auto& kListOutput = R"EOF({ + "Devices" : [ + { + "DevicePath" : "/dev/nvme0n1", + "Firmware" : "P1FB007", + "Index" : 0, + "NameSpace" : 1, + "ModelNumber" : "MTFDHBA512TCK", + "ProductName" : "Non-Volatile memory controller: Micron Technology Inc Device 0x5410", + "SerialNumber" : " 21062E6B8061", + "UsedBytes" : 512110190592, + "MaximumLBA" : 1000215216, + "PhysicalSize" : 512110190592, + "SectorSize" : 512 + }, + { + "DevicePath" : "/dev/nvme1n1", + "Firmware" : "GDA82F2Q", + "Index" : 1, + "NameSpace" : 1, + "ModelNumber" : "MZOL23T8HCLS-00AFB", + "ProductName" : "Unknown device", + "SerialNumber" : "S5X9NG0T116005", + "UsedBytes" : 104910848, + "MaximumLBA" : 918149526, + "PhysicalSize" : 3760740458496, + "SectorSize" : 4096 + }, + { + "DevicePath" : "/dev/nvme2n1", + "Firmware" : "GDA82F2Q", + "Index" : 2, + "NameSpace" : 1, + "ModelNumber" : "MZOL23T8HCLS-00AFB", + "ProductName" : "Unknown device", + "SerialNumber" : "S5X9NG0T116027", + "UsedBytes" : 0, + "MaximumLBA" : 918149526, + "PhysicalSize" : 3760740458496, + "SectorSize" : 4096 + } + ] +})EOF"; + + constexpr auto& kSmartLogOutput = R"EOF( +[015:000] PhysicallyWrittenBytes : 241393664 +[031:016] Physically Read Bytes : 106217472 +[037:032] Bad NAND Block Count (Raw Value) : 0 +[039:038] Bad NAND Block Count (Normalized Value) : 100 +[047:040] Uncorrectable Read Error Count : 0 +[055:048] Soft ECC Error Count : 0 +[059:056] SSD End to end Correction Count (Detected Errors) : 0 +[063:060] SSD End to end Correction Count (Corrected Errors): 0 +[064:064] System Data Percentage Used : 0 +[068:065] User Data Erase Count (Min) : 0 +[072:069] User Data Erase Count (Max) : 1 +[080:073] Refresh Count : 0 +[086:081] Program Fail Count (Raw Value) : 0 +[088:087] Program Fail Count (Normalized Value) : 100 +[094:089] User Data Erase Fail Count (Raw Value) : 0 +[096:095] User Data Erase Fail Count (Normalized Value) : 100 +[102:097] System Area Erase Fail Count (Raw Value) : 0 +[104:103] System Area Erase Fail Count (Normalized value) : 100 +[105:105] Thermal Throttling Status : 0 
+[106:106] Thermal Throttling Count : 0 +[108:107] PHY Error Count : 0 +[110:109] Bad DLLP Count : 0 +[112:111] Bad TLP Count : 0 +[114:113] Reserved : 0 +[118:115] Incomplete Shutdowns : 0 +[119:119] % Free Blocks : 96 +[121:120] PCIe Correctable Error Count (RTS) : 0 +[123:122] PCIe Correctable Error Count (RRS) : 0 +[131:124] XOR Recovery Count : 0 +[137:132] Bad System NAND block count (Raw Value) : 0 +[139:138] Bad System NAND block count (Normalized Value) : 100 +[141:140] Capacitor Health : 163 +[157:142] Endurance Estimate : 28862181 +[165:158] Security Version Number : 4294967296 +[167:166] Log Page Version : 1 +)EOF"; + + mockFactory_->expectedCommands( + {{{kNvmePath, "list", "-o", "json"}, kListOutput}, + {{kNvmePath, "samsung", "vs-smart-add-log", "/dev/nvme1n1"}, + kSmartLogOutput}}); + EXPECT_EQ(nandWriteBytes("nvme1n1", kNvmePath, mockFactory_), 241393664); +} + TEST_F(NandWritesTest, nandWriteBytes_handlesSeagateDevice) { constexpr auto& kListOutput = R"EOF({ "Devices" : [ From 293118bfed1d63726ab24e3bee39962da268ac44 Mon Sep 17 00:00:00 2001 From: Daniel Wong Date: Wed, 1 Mar 2023 09:47:43 -0800 Subject: [PATCH 22/47] Fix XDCHECK regression in lambda for gcc-8.x (#201) Summary: XDCHECK (or XCHECK) fails here for gcc-8.x in a lambda, so we move it outside. This occurs in CacheBench's AsyncCacheStressor.h. ``` /__w/CacheLib/CacheLib/cachelib/../cachelib/cachebench/runner/AsyncCacheStressor.h:306:7: internal compiler error: in cp_build_addr_expr_1, at cp/typeck.c:5965 ``` This line compiled fine with 8.5 before Jan 2023 so I suspect a regression in folly or other external library. It still compiles fine with gcc-7.5, 9.4, and 11.3.1. Possibly related commits: https://github.com/facebook/folly/commit/1aafad45f316896a7504396f421dacd6c10d7d5f and https://github.com/facebook/folly/commit/e6d09f66b9fc473bc108361d4c8dce8f29f7bcaf Line of gcc-8.5 that it fails on: https://github.com/gcc-mirror/gcc/blob/releases/gcc-8.5.0/gcc/cp/typeck.c#L5965 The version that isn't in a lambda compiles just fine: https://github.com/facebook/CacheLib/blob/df5b9f6ef35c55e432b6713c52397a03dd19c34c/cachelib/cachebench/runner/CacheStressor.h#L399 Pull Request resolved: https://github.com/facebook/CacheLib/pull/201 Test Plan: GitHub actions. Built fine on my fork with CentOS 8.5/gcc-8.5. This issue currently causes 3 builds that use gcc-8.5 to fail (2 CentOS and RockyLinux-8.6) and 1 build using gcc-8.3 (Debian). Reviewed By: therealgymmy Differential Revision: D43681854 Pulled By: jaesoo-fb fbshipit-source-id: f3a65aefedcd98a26a80bb6ad009ad0d64e2395b --- cachelib/cachebench/runner/AsyncCacheStressor.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cachelib/cachebench/runner/AsyncCacheStressor.h b/cachelib/cachebench/runner/AsyncCacheStressor.h index 5b50db43b4..830b503795 100644 --- a/cachelib/cachebench/runner/AsyncCacheStressor.h +++ b/cachelib/cachebench/runner/AsyncCacheStressor.h @@ -287,6 +287,10 @@ class AsyncCacheStressor : public Stressor { ++stats.get; auto lock = chainedItemAcquireUniqueLock(*key); + // This was moved outside the lambda, as otherwise gcc-8.x crashes with an + // internal compiler error here (suspected regression in folly). 
+ XDCHECK(req->sizeBegin + 1 != req->sizeEnd); + auto onReadyFn = [&, req, key, l = std::move(lock), pid](auto hdl) { WriteHandle wHdl; if (hdl == nullptr) { @@ -303,7 +307,6 @@ class AsyncCacheStressor : public Stressor { } else { wHdl = std::move(hdl).toWriteHandle(); } - XDCHECK(req->sizeBegin + 1 != req->sizeEnd); bool chainSuccessful = false; for (auto j = req->sizeBegin + 1; j != req->sizeEnd; j++) { ++stats.addChained; From f853a42a804c7259d55272d70b010078c50ffe52 Mon Sep 17 00:00:00 2001 From: Jiayue Bao Date: Wed, 1 Mar 2023 15:39:43 -0800 Subject: [PATCH 23/47] Get or update object TTL via object shared_ptr Summary: Add the following TTL-related APIs: - getConfiguredTtlSec - getExpiryTimeSec - extendTtlSec - updateExpiryTimeSec Usage: ``` auto ptr = objcache->find("key"); auto configuredTtl = objcache->getConfiguredTtl(ptr); auto expiryTime = objcache->getExpiryTimeSec(ptr); objcache->extendTtl(ptr, std::chrono::seconds(3)); objcache->updateExpiryTimeSec(ptr, newExpiryTimeSecs); ``` Reviewed By: therealgymmy, jaesoo-fb Differential Revision: D43167879 fbshipit-source-id: 3b11fb0a2b9a3b5c38fcfd856ade100e6ae27470 --- cachelib/experimental/objcache2/ObjectCache.h | 79 +++++++++++++++ .../objcache2/tests/ObjectCacheTest.cpp | 95 +++++++++++++++++++ 2 files changed, 174 insertions(+) diff --git a/cachelib/experimental/objcache2/ObjectCache.h b/cachelib/experimental/objcache2/ObjectCache.h index 5f8aab85ad..1253213284 100644 --- a/cachelib/experimental/objcache2/ObjectCache.h +++ b/cachelib/experimental/objcache2/ObjectCache.h @@ -267,6 +267,63 @@ class ObjectCache : public ObjectCacheBase { : sizeController_->getCurrentEntriesLimit(); } + // Get the expiry timestamp of the object + // @param object object shared pointer returned from ObjectCache APIs + // + // @return the expiry timestamp in seconds of the object + // 0 if object is nullptr + template + uint32_t getExpiryTimeSec(const std::shared_ptr& object) const { + if (object == nullptr) { + return 0; + } + return getReadHandleRefInternal(object)->getExpiryTime(); + } + + // Get the configured TTL of the object + // @param object object shared pointer returned from ObjectCache APIs + // + // @return the configured TTL in seconds of the object + // 0 if object is nullptr + template + std::chrono::seconds getConfiguredTtl( + const std::shared_ptr& object) const { + if (object == nullptr) { + return std::chrono::seconds{0}; + } + return getReadHandleRefInternal(object)->getConfiguredTTL(); + } + + // Update the expiry timestamp of an object + // + // @param object object shared pointer returned from ObjectCache APIs + // @param expiryTimeSecs the expiryTime in seconds to update + // + // @return boolean indicating whether expiry time was successfully updated + template + bool updateExpiryTimeSec(std::shared_ptr& object, + uint32_t expiryTimeSecs) { + if (object == nullptr) { + return false; + } + return getWriteHandleRefInternal(object)->updateExpiryTime( + expiryTimeSecs); + } + + // Update expiry time to @ttl seconds from now. 
+ // + // @param object object shared pointer returned from ObjectCache APIs + // @param ttl TTL in seconds (from now) + // + // @return boolean indicating whether TTL was successfully extended + template + bool extendTtl(std::shared_ptr& object, std::chrono::seconds ttl) { + if (object == nullptr) { + return false; + } + return getWriteHandleRefInternal(object)->extendTTL(ttl); + } + protected: // Serialize cache allocator config for exporting to Scuba std::map serializeConfigParams() const override; @@ -307,6 +364,28 @@ class ObjectCache : public ObjectCacheBase { bool stopSizeController(std::chrono::seconds timeout = std::chrono::seconds{ 0}); + // Get a ReadHandle reference from the object shared_ptr + template + typename AllocatorT::ReadHandle& getReadHandleRefInternal( + const std::shared_ptr& object) const { + auto* deleter = std::get_deleter>(object); + XDCHECK(deleter != nullptr); + auto& hdl = deleter->getReadHandleRef(); + XDCHECK(hdl != nullptr); + return hdl; + } + + // Get a WriteHandle reference from the object shared_ptr + template + typename AllocatorT::WriteHandle& getWriteHandleRefInternal( + std::shared_ptr& object) { + auto* deleter = std::get_deleter>(object); + XDCHECK(deleter != nullptr); + auto& hdl = deleter->getWriteHandleRef(); + XDCHECK(hdl != nullptr); + return hdl; + } + // Config passed to the cache. Config config_{}; diff --git a/cachelib/experimental/objcache2/tests/ObjectCacheTest.cpp b/cachelib/experimental/objcache2/tests/ObjectCacheTest.cpp index 6bafa40589..701654cd1c 100644 --- a/cachelib/experimental/objcache2/tests/ObjectCacheTest.cpp +++ b/cachelib/experimental/objcache2/tests/ObjectCacheTest.cpp @@ -886,6 +886,69 @@ class ObjectCacheTest : public ::testing::Test { } } + void testGetTtl() { + const uint32_t ttlSecs = 600; + + ObjectCacheConfig config; + config.setCacheName("test").setCacheCapacity(10'000).setItemDestructor( + [&](ObjectCacheDestructorData data) { data.deleteObject(); }); + auto objcache = ObjectCache::create(config); + + auto before = util::getCurrentTimeSec(); + std::this_thread::sleep_for(std::chrono::seconds{3}); + objcache->insertOrReplace("Foo", std::make_unique(), 0 /*object size*/, + ttlSecs); + + // lookup via find API + auto found1 = objcache->template find("Foo"); + ASSERT_NE(nullptr, found1); + + // get TTL info + EXPECT_EQ(ttlSecs, objcache->getConfiguredTtl(found1).count()); + EXPECT_LE(before + ttlSecs, objcache->getExpiryTimeSec(found1)); + + // lookup via findToWrite API + auto found2 = objcache->template findToWrite("Foo"); + ASSERT_NE(nullptr, found2); + + // get TTL info + EXPECT_EQ(ttlSecs, objcache->getConfiguredTtl(found2).count()); + EXPECT_LE(before + ttlSecs, objcache->getExpiryTimeSec(found2)); + } + + void testUpdateTtl() { + const uint32_t ttlSecs = 600; + + ObjectCacheConfig config; + config.setCacheName("test").setCacheCapacity(10'000).setItemDestructor( + [&](ObjectCacheDestructorData data) { data.deleteObject(); }); + auto objcache = ObjectCache::create(config); + + auto insertionTime = util::getCurrentTimeSec(); + objcache->insertOrReplace("Foo", std::make_unique(), 0 /*object size*/, + ttlSecs); + + auto found = objcache->template find("Foo"); + ASSERT_NE(nullptr, found); + + // get TTL info + EXPECT_EQ(ttlSecs, objcache->getConfiguredTtl(found).count()); + EXPECT_LE(insertionTime + ttlSecs, objcache->getExpiryTimeSec(found)); + + // update expiry time + auto currExpTime = objcache->getExpiryTimeSec(found); + EXPECT_TRUE(objcache->updateExpiryTimeSec(found, currExpTime + ttlSecs)); + EXPECT_EQ(2 * 
ttlSecs, objcache->getConfiguredTtl(found).count()); + EXPECT_EQ(currExpTime + ttlSecs, objcache->getExpiryTimeSec(found)); + + // extend TTL + auto now = util::getCurrentTimeSec(); + std::this_thread::sleep_for(std::chrono::seconds{3}); + EXPECT_TRUE(objcache->extendTtl(found, std::chrono::seconds(3 * ttlSecs))); + EXPECT_LE(now + ttlSecs, objcache->getExpiryTimeSec(found)); + EXPECT_LE(3 * ttlSecs, objcache->getConfiguredTtl(found).count()); + } + void testMultithreadReplace() { // Sanity test to see if insertOrReplace across multiple // threads are safe. @@ -1079,6 +1142,32 @@ class ObjectCacheTest : public ::testing::Test { fs[i].join(); } } + + void testMultithreadUpdateTtl() { + // Sanity test to see if update TTL across multiple + // threads is safe. + ObjectCacheConfig config; + config.setCacheName("test").setCacheCapacity(10'000).setItemDestructor( + [&](ObjectCacheDestructorData data) { data.deleteObject(); }); + auto objcache = ObjectCache::create(config); + objcache->insertOrReplace("key", std::make_unique(), 0, 60); + + auto runUpdateTtlOps = [&] { + for (int i = 0; i < 2000; i++) { + auto found = objcache->template find("key"); + auto configuredTtlSecs = objcache->getConfiguredTtl(found).count(); + objcache->extendTtl(found, std::chrono::seconds{configuredTtlSecs}); + } + }; + + std::vector ts; + for (int i = 0; i < 10; i++) { + ts.push_back(std::thread{runUpdateTtlOps}); + } + for (int i = 0; i < 10; i++) { + ts[i].join(); + } + } }; using AllocatorTypes = ::testing::TypestestPersistenceHighLoad(); } +TYPED_TEST(ObjectCacheTest, GetTtl) { this->testGetTtl(); } +TYPED_TEST(ObjectCacheTest, UpdateTtl) { this->testUpdateTtl(); } + TYPED_TEST(ObjectCacheTest, MultithreadReplace) { this->testMultithreadReplace(); } @@ -1135,6 +1227,9 @@ TYPED_TEST(ObjectCacheTest, MultithreadFindAndEviction) { TYPED_TEST(ObjectCacheTest, MultithreadFindAndReplaceWith10Shards) { this->testMultithreadFindAndReplaceWith10Shards(); } +TYPED_TEST(ObjectCacheTest, MultithreadUpdateTtl) { + this->testMultithreadUpdateTtl(); +} using ObjectCache = ObjectCache; TEST(ObjectCacheTest, LruEviction) { From c120a5307c3062b21253c27f4a3ea598ae780b8b Mon Sep 17 00:00:00 2001 From: Jiayue Bao Date: Wed, 1 Mar 2023 16:13:29 -0800 Subject: [PATCH 24/47] Remove isWriteHandle() API Summary: Based on our discussion, this API would be confusing if a reference of ReadHandle is obtained from a WriteHandle. We also don't want to make it virtual because this will increase `sizeof(ReadHandle)` / `sizeof(WriteHandle)` by 8 bytes. Reviewed By: therealgymmy Differential Revision: D43667308 fbshipit-source-id: a77a17113f1a23f84332ebfa7ea6772d7647339c --- cachelib/allocator/Handle.h | 4 ---- cachelib/allocator/tests/BaseAllocatorTest.h | 8 +------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/cachelib/allocator/Handle.h b/cachelib/allocator/Handle.h index a125ace1b7..11d2bed2be 100644 --- a/cachelib/allocator/Handle.h +++ b/cachelib/allocator/Handle.h @@ -242,8 +242,6 @@ struct ReadHandleImpl { return hdl; } - bool isWriteHandle() const { return false; } - protected: // accessor. Calling getInternal() on handle with isReady() == false blocks // the thread until the handle is ready. @@ -571,8 +569,6 @@ struct WriteHandleImpl : public ReadHandleImpl { // creating this item handle. 
WriteHandleImpl clone() const { return WriteHandleImpl{ReadHandle::clone()}; } - bool isWriteHandle() const { return true; } - // Friends friend ReadHandle; // Only CacheAllocator and NvmCache can create non-default constructed handles diff --git a/cachelib/allocator/tests/BaseAllocatorTest.h b/cachelib/allocator/tests/BaseAllocatorTest.h index d684545cb9..aa9d38a857 100644 --- a/cachelib/allocator/tests/BaseAllocatorTest.h +++ b/cachelib/allocator/tests/BaseAllocatorTest.h @@ -713,35 +713,29 @@ class BaseAllocatorTest : public AllocatorTest { auto handle = alloc.find("key"); ASSERT_NE(handle, nullptr); ASSERT_TRUE(isConst(handle->getMemory())); - ASSERT_EQ(handle.isWriteHandle(), false); // read handle clone auto handle2 = handle.clone(); ASSERT_TRUE(isConst(handle2->getMemory())); - ASSERT_EQ(handle2.isWriteHandle(), false); // upgrade a read handle to a write handle auto handle3 = std::move(handle).toWriteHandle(); ASSERT_FALSE(isConst(handle3->getMemory())); - ASSERT_EQ(handle3.isWriteHandle(), true); } { auto handle = alloc.findToWrite("key"); ASSERT_NE(handle, nullptr); ASSERT_FALSE(isConst(handle->getMemory())); - ASSERT_EQ(handle.isWriteHandle(), true); // write handle clone auto handle2 = handle.clone(); ASSERT_FALSE(isConst(handle2->getMemory())); - ASSERT_EQ(handle2.isWriteHandle(), true); // downgrade a write handle to a read handle ReadHandle handle3 = handle.clone(); ASSERT_NE(handle3, nullptr); ASSERT_TRUE(isConst(handle3->getMemory())); - ASSERT_EQ(handle3.isWriteHandle(), false); } { @@ -752,7 +746,7 @@ class BaseAllocatorTest : public AllocatorTest { // This is like doing a "clone" and setting it into wait context waitContext->set(alloc.find("key")); auto handle2 = std::move(handle).toWriteHandle(); - ASSERT_EQ(handle2.isWriteHandle(), true); + ASSERT_FALSE(isConst(handle2->getMemory())); } } From 7785d24b3030a64a2e7d885e8ad552d5fa758f9b Mon Sep 17 00:00:00 2001 From: Jaesoo Lee Date: Wed, 1 Mar 2023 16:16:43 -0800 Subject: [PATCH 25/47] Edit Cachebench_FB_HW_eval.md using inpage editor Summary: This diff has been automatically generated by the inpage editor. NOTE: If you want to update this diff, go via the preview link inside the static docs section below. Ensure you are editing the same page that was used to create this diff. Reviewed By: therealgymmy Differential Revision: D43667238 fbshipit-source-id: 0be4c1ef376a5a1a2de92afc311af24f66d10afd --- .../Cache_Library_User_Guides/Cachebench_FB_HW_eval.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/website/docs/Cache_Library_User_Guides/Cachebench_FB_HW_eval.md b/website/docs/Cache_Library_User_Guides/Cachebench_FB_HW_eval.md index ce99649133..09abe4b050 100644 --- a/website/docs/Cache_Library_User_Guides/Cachebench_FB_HW_eval.md +++ b/website/docs/Cache_Library_User_Guides/Cachebench_FB_HW_eval.md @@ -17,10 +17,11 @@ sufficient free memory (50+GB) and SSD capacity (1TB). * SSD Capacity: 100GB or more available capacity * Internet connection capable of accessing github.com and installing packages -## Set up the SSD devices using mdraid +## Set up the SSD devices -To gather SSD performance metrics, the SSD must be setup first. An example -below sets up a raid device to handle two ssds being used by CacheBench. +To gather SSD performance metrics, the SSD must be setup first. Cachebench (and CacheLib) supports using various types of devices for NVM cache including a raw block device or a regular file. 
When one wants to use multiple SSDs as NVM cache, the CacheLib also provides a native support for RAID0 (i.e., striping). + +Optionally, as an example, an user can setup and use md devices as follows. In this example, the md device is created from two ssd devices to be used as a raw block device in CacheBench. ```sh mdadm --create /dev/md0 --force --raid-devices=2 --level=0 --chunk=256 /dev/nvme1n1 /dev/nvme2n1 From 185bbe6664167a680f4070713987548573cbbd2c Mon Sep 17 00:00:00 2001 From: Daniel Wong Date: Thu, 2 Mar 2023 11:23:44 -0800 Subject: [PATCH 26/47] Fix Debian GitHub build & zstd CMake error (#200) Summary: 1. Workaround for Debian Docker image bug that is breaking Debian build on GitHub (Explicitly mark Git repo as safe). 2. Pin zstd to a commit that resolves problems with older CMakes (note: affects all OSes, not just Debian) Context for 1: In the latest Debian Docker image, there is a regression that affects the checkout action. From https://github.com/actions/checkout/issues/1169: > - Checkout runs, and runs /usr/bin/git config --global --add safe.directory > - The global .gitconfig does not exist > - Any calls to git remain unsafe/dubious The suggested workaround was to use --system instead of --global. Pull Request resolved: https://github.com/facebook/CacheLib/pull/200 Test Plan: See if GitHub Action Debian build is fixed. Reviewed By: therealgymmy Differential Revision: D43720363 Pulled By: jaesoo-fb fbshipit-source-id: 54f3586cc7f8e72045e60d8dd454c7a77725e6b2 --- .github/workflows/build-cachelib-debian-10.yml | 3 +++ contrib/build-package.sh | 8 ++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-cachelib-debian-10.yml b/.github/workflows/build-cachelib-debian-10.yml index 56fb576298..7f0ab29a6c 100644 --- a/.github/workflows/build-cachelib-debian-10.yml +++ b/.github/workflows/build-cachelib-debian-10.yml @@ -52,6 +52,9 @@ jobs: g++ - || true - name: "checkout sources" uses: actions/checkout@v2 + - name: "Add Git safe directory" + # Workaround for Docker image bug (GitHub issue #199). + run: git config --system --add safe.directory $GITHUB_WORKSPACE - name: "Install Prerequisites" run: ./contrib/build.sh -S -B - name: "Test: update-submodules" diff --git a/contrib/build-package.sh b/contrib/build-package.sh index 6e7acac5c2..755933bd44 100755 --- a/contrib/build-package.sh +++ b/contrib/build-package.sh @@ -102,11 +102,12 @@ test "$#" -eq 0 \ && die "missing dependancy name to build. See -h for help" ###################################### -## Check which dependecy was requested +## Check which dependency was requested ###################################### external_git_clone= external_git_branch= +# external_git_tag can also be used for commit hashes external_git_tag= update_submodules= cmake_custom_params= @@ -175,7 +176,10 @@ case "$1" in REPODIR=cachelib/external/$NAME SRCDIR=$REPODIR/build/cmake external_git_clone=yes - external_git_branch=release + # Previously, we pinned to release branch. v1.5.4 needed + # CMake >= 3.18, later reverted. While waiting for v1.5.5, + # pin to the fix: https://github.com/facebook/zstd/pull/3510 + external_git_tag=8420502e if test "$build_tests" = "yes" ; then cmake_custom_params="-DZSTD_BUILD_TESTS=ON" else From 968533f58cfdc9fa70bdedc46918005150899a2d Mon Sep 17 00:00:00 2001 From: Jaesoo Lee Date: Fri, 3 Mar 2023 09:43:41 -0800 Subject: [PATCH 27/47] fix broken installation page link Summary: A user on GitHub reported an issue that the installation link is broken.
Somehow, docs/installation/installation.md cannot be referenced by `/docs/installation/installation`, but only by `/docs/installation`. The root cause could not figured out yet, but this change fixes it as such for now. Reviewed By: jiayuebao Differential Revision: D43757739 fbshipit-source-id: 4abd3208800c0b68e9162d381f6395897f047b24 --- README.md | 2 +- .../docs/Cache_Library_User_Guides/Cachebench_FB_HW_eval.md | 5 ++--- .../docs/Cache_Library_User_Guides/Cachebench_Overview.md | 2 +- website/docs/installation/testing.md | 4 ++-- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index e05c932d1e..523b8b20a1 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ cd CacheLib Re-running `./contrib/build.sh` will update CacheLib and its dependencies to their latest versions and rebuild them. -See [build](https://cachelib.org/docs/installation/installation) for more details about +See [build](https://cachelib.org/docs/installation/) for more details about the building and installation process. diff --git a/website/docs/Cache_Library_User_Guides/Cachebench_FB_HW_eval.md b/website/docs/Cache_Library_User_Guides/Cachebench_FB_HW_eval.md index 09abe4b050..d17b7ac522 100644 --- a/website/docs/Cache_Library_User_Guides/Cachebench_FB_HW_eval.md +++ b/website/docs/Cache_Library_User_Guides/Cachebench_FB_HW_eval.md @@ -19,7 +19,7 @@ sufficient free memory (50+GB) and SSD capacity (1TB). ## Set up the SSD devices -To gather SSD performance metrics, the SSD must be setup first. Cachebench (and CacheLib) supports using various types of devices for NVM cache including a raw block device or a regular file. When one wants to use multiple SSDs as NVM cache, the CacheLib also provides a native support for RAID0 (i.e., striping). +To gather SSD performance metrics, the SSD must be setup first. Cachebench (and CacheLib) supports using various types of devices for NVM cache including a raw block device or a regular file. When one wants to use multiple SSDs as NVM cache, the CacheLib also provides a native support for RAID0 (i.e., striping). Optionally, as an example, an user can setup and use md devices as follows. In this example, the md device is created from two ssd devices to be used as a raw block device in CacheBench. @@ -143,7 +143,7 @@ mdadm --create /dev/md0 --force --raid-devices=2 --level=0 --chunk=256 /dev/nvme make install ``` -See [build and installation](/docs/installation/installation) for further details. +See [build and installation](/docs/installation) for further details. ## Running the benchmark for SSD perf testing @@ -197,7 +197,6 @@ For a full list of options that can be configured, see [configuring cachebench]( using the `--progress` and specifying a duration in seconds. If `--progress-stats-file` is also specified, on every progress interval, `cachebench` would log the internal stats to the specified file. - ## Running cachebench with the trace workload Meta is sharing anonymized traces captured from large scale production cache services. These traces are licensed under the same license as CacheLib. They are meant to help academic and industry researchers to optimize for our caching workloads. One can freely download it from our AWS S3 bucket and run the CacheBench to replay the trace with varying configuration as follows. 
diff --git a/website/docs/Cache_Library_User_Guides/Cachebench_Overview.md b/website/docs/Cache_Library_User_Guides/Cachebench_Overview.md index eb72646542..8c878e1be6 100644 --- a/website/docs/Cache_Library_User_Guides/Cachebench_Overview.md +++ b/website/docs/Cache_Library_User_Guides/Cachebench_Overview.md @@ -53,6 +53,6 @@ developer's need. The following are few examples. ## Building cachebench -Follow instructions in [Installation](/docs/installation/installation) to build +Follow instructions in [Installation](/docs/installation) to build cachebench. This should install cachebench in your local machine under ```opt/cachelib/bin/cachebench``` diff --git a/website/docs/installation/testing.md b/website/docs/installation/testing.md index 02b2cb747c..d8730127b4 100644 --- a/website/docs/installation/testing.md +++ b/website/docs/installation/testing.md @@ -11,7 +11,7 @@ of the cache infrastructure. ## Building CacheLib Unit Tests To build the cachelib unit tests, use one of the following commands -(see [installation](docs/installation/installation) instructions for more details): +(see [installation](/docs/installation) instructions for more details): 1. Use `./contrib/build.sh` script with the `-T` option. 2. Use `./contrib/build-package.sh -t cachelib` (with the `-t` option) @@ -42,7 +42,7 @@ Running a single unit test binary: ```sh $ cd opt/cachelib/tests -$ ./allocator-test-ItemTest +$ ./allocator-test-ItemTest [==========] Running 6 tests from 1 test suite. [----------] Global test environment set-up. [----------] 6 tests from ItemTest From b791b774ba6e92278b57bb2041a9980686392113 Mon Sep 17 00:00:00 2001 From: "Chorazewicz, Igor" Date: Tue, 2 Nov 2021 16:00:53 +0100 Subject: [PATCH 28/47] Run centos and debian workflows on push and PR Run tests on CI Run long tests (navy/bench) every day on CI Run CI on prebuild docker image Run only centos build on CI Update docker file used in CI Centos8 is EOL Disable failing clang-format-check Add extra param to build-package.sh Add scripts for rebuilding/pushing docker images Taken from: https://github.com/pmem/dev-utils-kit/commit/30794c3e1bbc9273e87da3e8f3ce7e5a2792b19e Extend CI to rebuild docker automatically Update build-cachelib-docker.yml Do not use shallow clone to make sure Docker rebuild logic works correctly. Added required packages to install Intel ittapi Update CI to use intel/CacheLib repo (#17) Add multi-tier navy benchmark and run it on CI - fix navy multi-tier config for NUMA bindings added code coverage support in CacheLib Adding libdml to CentOS docker image (#53) only exclude allocator-test-NavySetupTestm, shm-test-test_page_size tests added perf and numactl to docker packages --------------------------------------------- one large commit for all CI and code coverage see above for the change history. 
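Usage (an illustrative sketch only, not taken from the files in this diff; the build directory, install prefix, and working directory below are assumptions): the coverage pieces added here are the `COVERAGE_ENABLED` CMake option in cachelib/CMakeLists.txt plus the `run_tests.sh` and `run_code_coverage.sh` helpers, and they could be exercised roughly as follows.

```sh
# Sketch: configure a coverage-instrumented build; flags mirror docker/run-build.sh
# added in this patch, with the new COVERAGE_ENABLED option switched on.
mkdir build && cd build
cmake ../cachelib -DBUILD_TESTS=ON -DCMAKE_INSTALL_PREFIX=/opt \
      -DCOVERAGE_ENABLED=ON -DCMAKE_BUILD_TYPE=Debug
make install -j"$(nproc)"

# Sketch: run the test suite and produce an lcov/genhtml report with the new
# helper; it wraps run_tests.sh and leaves a coverage_report.tgz behind.
# Run it from wherever run_tests.sh and the installed test binaries are
# reachable; adjust relative paths to the local layout.
cd .. && ./run_code_coverage.sh
```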
--- .../workflows/build-cachelib-centos-long.yml | 39 ++++++ .github/workflows/build-cachelib-debian.yml | 43 ++++++ .github/workflows/build-cachelib-docker.yml | 49 +++++++ .github/workflows/clang-format-check.yml | 2 +- cachelib/CMakeLists.txt | 5 + .../consistency/navy-multi-tier.json | 54 ++++++++ .../test_configs/consistency/navy.json | 4 +- contrib/build-package.sh | 8 +- docker/build.sh | 97 ++++++++++++++ docker/images/build-image.sh | 38 ++++++ docker/images/centos-8streams.Dockerfile | 24 ++++ docker/images/install-cachelib-deps.sh | 14 ++ docker/images/install-dsa-deps.sh | 23 ++++ docker/images/push-image.sh | 49 +++++++ docker/pull-or-rebuild-image.sh | 124 ++++++++++++++++++ docker/run-build.sh | 17 +++ docker/set-ci-vars.sh | 111 ++++++++++++++++ run_code_coverage.sh | 20 +++ run_tests.sh | 14 ++ 19 files changed, 728 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/build-cachelib-centos-long.yml create mode 100644 .github/workflows/build-cachelib-debian.yml create mode 100644 .github/workflows/build-cachelib-docker.yml create mode 100644 cachelib/cachebench/test_configs/consistency/navy-multi-tier.json create mode 100755 docker/build.sh create mode 100755 docker/images/build-image.sh create mode 100644 docker/images/centos-8streams.Dockerfile create mode 100755 docker/images/install-cachelib-deps.sh create mode 100755 docker/images/install-dsa-deps.sh create mode 100755 docker/images/push-image.sh create mode 100755 docker/pull-or-rebuild-image.sh create mode 100755 docker/run-build.sh create mode 100755 docker/set-ci-vars.sh create mode 100755 run_code_coverage.sh create mode 100755 run_tests.sh diff --git a/.github/workflows/build-cachelib-centos-long.yml b/.github/workflows/build-cachelib-centos-long.yml new file mode 100644 index 0000000000..92165f603b --- /dev/null +++ b/.github/workflows/build-cachelib-centos-long.yml @@ -0,0 +1,39 @@ +name: build-cachelib-centos-latest +on: + schedule: + - cron: '0 7 * * *' + +jobs: + build-cachelib-centos8-latest: + name: "CentOS/latest - Build CacheLib with all dependencies" + runs-on: ubuntu-latest + # Docker container image name + container: "centos:latest" + steps: + - name: "update packages" + run: dnf upgrade -y + - name: "install sudo,git" + run: dnf install -y sudo git cmake gcc + - name: "System Information" + run: | + echo === uname === + uname -a + echo === /etc/os-release === + cat /etc/os-release + echo === df -hl === + df -hl + echo === free -h === + free -h + echo === top === + top -b -n1 -1 -Eg || timeout 1 top -b -n1 + echo === env === + env + echo === gcc -v === + gcc -v + - name: "checkout sources" + uses: actions/checkout@v2 + - name: "build CacheLib using build script" + run: ./contrib/build.sh -j -v -T + - name: "run tests" + timeout-minutes: 60 + run: cd opt/cachelib/tests && ../../../run_tests.sh long diff --git a/.github/workflows/build-cachelib-debian.yml b/.github/workflows/build-cachelib-debian.yml new file mode 100644 index 0000000000..5bc3ad3c70 --- /dev/null +++ b/.github/workflows/build-cachelib-debian.yml @@ -0,0 +1,43 @@ +name: build-cachelib-debian-10 +on: + schedule: + - cron: '30 5 * * 0,3' + +jobs: + build-cachelib-debian-10: + name: "Debian/Buster - Build CacheLib with all dependencies" + runs-on: ubuntu-latest + # Docker container image name + container: "debian:buster-slim" + steps: + - name: "update packages" + run: apt-get update + - name: "upgrade packages" + run: apt-get -y upgrade + - name: "install sudo,git" + run: apt-get install -y sudo git procps + - name: "System 
Information" + run: | + echo === uname === + uname -a + echo === /etc/os-release === + cat /etc/os-release + echo === df -hl === + df -hl + echo === free -h === + free -h + echo === top === + top -b -n1 -1 -Eg || timeout 1 top -b -n1 ; true + echo === env === + env + echo === cc -v === + cc -v || true + echo === g++ -v === + g++ - || true + - name: "checkout sources" + uses: actions/checkout@v2 + - name: "build CacheLib using build script" + run: ./contrib/build.sh -j -v -T + - name: "run tests" + timeout-minutes: 60 + run: cd opt/cachelib/tests && ../../../run_tests.sh diff --git a/.github/workflows/build-cachelib-docker.yml b/.github/workflows/build-cachelib-docker.yml new file mode 100644 index 0000000000..be28bc233c --- /dev/null +++ b/.github/workflows/build-cachelib-docker.yml @@ -0,0 +1,49 @@ +name: build-cachelib-docker +on: + push: + pull_request: + +jobs: + build-cachelib-docker: + name: "CentOS/latest - Build CacheLib with all dependencies" + runs-on: ubuntu-latest + env: + REPO: cachelib + GITHUB_REPO: intel/CacheLib + CONTAINER_REG: ghcr.io/pmem/cachelib + CONTAINER_REG_USER: ${{ secrets.GH_CR_USER }} + CONTAINER_REG_PASS: ${{ secrets.GH_CR_PAT }} + FORCE_IMAGE_ACTION: ${{ secrets.FORCE_IMAGE_ACTION }} + HOST_WORKDIR: ${{ github.workspace }} + WORKDIR: docker + IMG_VER: devel + strategy: + matrix: + CONFIG: ["OS=centos OS_VER=8streams PUSH_IMAGE=1"] + steps: + - name: "System Information" + run: | + echo === uname === + uname -a + echo === /etc/os-release === + cat /etc/os-release + echo === df -hl === + df -hl + echo === free -h === + free -h + echo === top === + top -b -n1 -1 -Eg || timeout 1 top -b -n1 + echo === env === + env + echo === gcc -v === + gcc -v + - name: "checkout sources" + uses: actions/checkout@v2 + with: + fetch-depth: 0 + + - name: Pull the image or rebuild and push it + run: cd $WORKDIR && ${{ matrix.CONFIG }} ./pull-or-rebuild-image.sh $FORCE_IMAGE_ACTION + + - name: Run the build + run: cd $WORKDIR && ${{ matrix.CONFIG }} ./build.sh diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml index 4b4897b610..90c8d739c6 100644 --- a/.github/workflows/clang-format-check.yml +++ b/.github/workflows/clang-format-check.yml @@ -1,6 +1,6 @@ # From: https://github.com/marketplace/actions/clang-format-check#multiple-paths name: clang-format Check -on: [pull_request] +on: [] jobs: formatting-check: name: Formatting Check diff --git a/cachelib/CMakeLists.txt b/cachelib/CMakeLists.txt index 36df0dc19f..e77c25085c 100644 --- a/cachelib/CMakeLists.txt +++ b/cachelib/CMakeLists.txt @@ -85,6 +85,11 @@ set(CMAKE_MODULE_PATH set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED True) +if(COVERAGE_ENABLED) + # Add code coverage + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --coverage -fprofile-arcs -ftest-coverage") +endif() + # include(fb_cxx_flags) message(STATUS "Update CXXFLAGS: ${CMAKE_CXX_FLAGS}") diff --git a/cachelib/cachebench/test_configs/consistency/navy-multi-tier.json b/cachelib/cachebench/test_configs/consistency/navy-multi-tier.json new file mode 100644 index 0000000000..076550bc5c --- /dev/null +++ b/cachelib/cachebench/test_configs/consistency/navy-multi-tier.json @@ -0,0 +1,54 @@ +{ + "cache_config" : { + "cacheSizeMB" : 300, + "poolRebalanceIntervalSec" : 1, + "moveOnSlabRelease" : true, + + "cacheDir": "/tmp/mem-tier2", + "memoryTiers" : [ + { + "ratio": 1, + "memBindNodes": 0 + }, + { + "ratio": 1, + "memBindNodes": 0 + } + ], + + "numPools" : 2, + "poolSizes" : [0.5, 0.5], + "allocFactor" : 2.0, + 
"nvmCacheSizeMB" : 1024 + }, + "test_config" : + { + + "checkConsistency" : true, + + "numOps" : 60000, + "numThreads" : 20, + "numKeys" : 200000, + + + "keySizeRange" : [1, 8, 64], + "keySizeRangeProbability" : [0.5, 0.5], + + "valSizeRange" : [256, 1024, 4096, 8192], + "valSizeRangeProbability" : [0.2, 0.7, 0.1], + + "chainedItemLengthRange" : [1, 2, 4, 32], + "chainedItemLengthRangeProbability" : [0.8, 0.18, 0.02], + + "chainedItemValSizeRange" : [1, 128, 256, 1024, 4096, 20480], + "chainedItemValSizeRangeProbability" : [0.1, 0.1, 0.2, 0.3, 0.3], + + "getRatio" : 0.8, + "setRatio" : 0.1, + "delRatio" : 0.0, + "addChainedRatio" : 0.05, + "keyPoolDistribution": [0.5, 0.5], + "opPoolDistribution" : [0.5, 0.5] + } + +} diff --git a/cachelib/cachebench/test_configs/consistency/navy.json b/cachelib/cachebench/test_configs/consistency/navy.json index 73b016a50f..b95b056d31 100644 --- a/cachelib/cachebench/test_configs/consistency/navy.json +++ b/cachelib/cachebench/test_configs/consistency/navy.json @@ -14,8 +14,8 @@ "checkConsistency" : true, - "numOps" : 30000000, - "numThreads" : 40, + "numOps" : 600000, + "numThreads" : 20, "numKeys" : 200000, diff --git a/contrib/build-package.sh b/contrib/build-package.sh index 755933bd44..f0f3283df0 100755 --- a/contrib/build-package.sh +++ b/contrib/build-package.sh @@ -78,9 +78,8 @@ build_tests= show_help= many_jobs= verbose= -PREFIX="$PWD/opt/cachelib/" - -while getopts :BSdhijtvp: param +install_path= +while getopts :BSdhijtvI: param do case $param in i) install=yes ;; @@ -91,7 +90,7 @@ do v) verbose=yes ;; j) many_jobs=yes ;; t) build_tests=yes ;; - p) PREFIX=$OPTARG ;; + I) install_path=${OPTARG} ; install=yes ;; ?) die "unknown option. See -h for help." esac done @@ -281,6 +280,7 @@ test -d cachelib || die "expected 'cachelib' directory not found in $PWD" # After ensuring we are in the correct directory, set the installation prefix" +PREFIX=${install_path:-"$PWD/opt/cachelib/"} CMAKE_PARAMS="$CMAKE_PARAMS -DCMAKE_INSTALL_PREFIX=$PREFIX" CMAKE_PREFIX_PATH="$PREFIX/lib/cmake:$PREFIX/lib64/cmake:$PREFIX/lib:$PREFIX/lib64:$PREFIX:${CMAKE_PREFIX_PATH:-}" export CMAKE_PREFIX_PATH diff --git a/docker/build.sh b/docker/build.sh new file mode 100755 index 0000000000..bb82f0142d --- /dev/null +++ b/docker/build.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2022, Intel Corporation + +# +# build.sh - runs a Docker container from a Docker image with environment +# prepared for running CacheLib builds and tests. It uses Docker image +# tagged as described in ./images/build-image.sh. +# +# Notes: +# - set env var 'HOST_WORKDIR' to where the root of this project is on the host machine, +# - set env var 'OS' and 'OS_VER' properly to a system/Docker you want to build this +# repo on (for proper values take a look at the list of Dockerfiles at the +# utils/docker/images directory in this repo), e.g. OS=ubuntu, OS_VER=20.04, +# - set env var 'CONTAINER_REG' to container registry address +# [and possibly user/org name, and package name], e.g. "/pmem/CacheLib", +# - set env var 'DNS_SERVER' if you use one, +# - set env var 'COMMAND' to execute specific command within Docker container or +# env var 'TYPE' to pick command based on one of the predefined types of build (see below). 
+# + +set -e + +source $(dirname ${0})/set-ci-vars.sh +IMG_VER=${IMG_VER:-devel} +TAG="${OS}-${OS_VER}-${IMG_VER}" +IMAGE_NAME=${CONTAINER_REG}:${TAG} +CONTAINER_NAME=CacheLib-${OS}-${OS_VER} +WORKDIR=/CacheLib # working dir within Docker container +SCRIPTSDIR=${WORKDIR}/docker + +if [[ -z "${OS}" || -z "${OS_VER}" ]]; then + echo "ERROR: The variables OS and OS_VER have to be set " \ + "(e.g. OS=fedora, OS_VER=32)." + exit 1 +fi + +if [[ -z "${HOST_WORKDIR}" ]]; then + echo "ERROR: The variable HOST_WORKDIR has to contain a path to " \ + "the root of this project on the host machine." + exit 1 +fi + +if [[ -z "${CONTAINER_REG}" ]]; then + echo "ERROR: CONTAINER_REG environment variable is not set " \ + "(e.g. \"//\")." + exit 1 +fi + +# Set command to execute in the Docker container +COMMAND="./run-build.sh"; +echo "COMMAND to execute within Docker container: ${COMMAND}" + +if [ -n "${DNS_SERVER}" ]; then DOCKER_OPTS="${DOCKER_OPTS} --dns=${DNS_SERVER}"; fi + +# Check if we are running on a CI (Travis or GitHub Actions) +[ -n "${GITHUB_ACTIONS}" -o -n "${TRAVIS}" ] && CI_RUN="YES" || CI_RUN="NO" + +# Do not allocate a pseudo-TTY if we are running on GitHub Actions +[ ! "${GITHUB_ACTIONS}" ] && DOCKER_OPTS="${DOCKER_OPTS} --tty=true" + + +echo "Running build using Docker image: ${IMAGE_NAME}" + +# Run a container with +# - environment variables set (--env) +# - host directory containing source mounted (-v) +# - working directory set (-w) +docker run --privileged=true --name=${CONTAINER_NAME} -i \ + ${DOCKER_OPTS} \ + --env http_proxy=${http_proxy} \ + --env https_proxy=${https_proxy} \ + --env TERM=xterm-256color \ + --env WORKDIR=${WORKDIR} \ + --env SCRIPTSDIR=${SCRIPTSDIR} \ + --env GITHUB_REPO=${GITHUB_REPO} \ + --env CI_RUN=${CI_RUN} \ + --env TRAVIS=${TRAVIS} \ + --env GITHUB_ACTIONS=${GITHUB_ACTIONS} \ + --env CI_COMMIT=${CI_COMMIT} \ + --env CI_COMMIT_RANGE=${CI_COMMIT_RANGE} \ + --env CI_BRANCH=${CI_BRANCH} \ + --env CI_EVENT_TYPE=${CI_EVENT_TYPE} \ + --env CI_REPO_SLUG=${CI_REPO_SLUG} \ + --env DOC_UPDATE_GITHUB_TOKEN=${DOC_UPDATE_GITHUB_TOKEN} \ + --env DOC_UPDATE_BOT_NAME=${DOC_UPDATE_BOT_NAME} \ + --env DOC_REPO_OWNER=${DOC_REPO_OWNER} \ + --env COVERITY_SCAN_TOKEN=${COVERITY_SCAN_TOKEN} \ + --env COVERITY_SCAN_NOTIFICATION_EMAIL=${COVERITY_SCAN_NOTIFICATION_EMAIL} \ + --env TEST_TIMEOUT=${TEST_TIMEOUT} \ + --env TZ='Europe/Warsaw' \ + --shm-size=4G \ + -v ${HOST_WORKDIR}:${WORKDIR} \ + -v /etc/localtime:/etc/localtime \ + -w ${SCRIPTSDIR} \ + ${IMAGE_NAME} ${COMMAND} + diff --git a/docker/images/build-image.sh b/docker/images/build-image.sh new file mode 100755 index 0000000000..985a6e0ff1 --- /dev/null +++ b/docker/images/build-image.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2016-2021, Intel Corporation +# +# build-image.sh - prepares a Docker image with -based environment for +# testing (or dev) purpose, tagged with ${CONTAINER_REG}:${OS}-${OS_VER}-${IMG_VER}, +# according to the ${OS}-${OS_VER}.Dockerfile file located in the same directory. +# IMG_VER is a version of Docker image (it usually relates to project's release tag) +# and it defaults to "devel". +# + +set -e +IMG_VER=${IMG_VER:-devel} +TAG="${OS}-${OS_VER}-${IMG_VER}" + +if [[ -z "${OS}" || -z "${OS_VER}" ]]; then + echo "ERROR: The variables OS and OS_VER have to be set " \ + "(e.g. OS=fedora, OS_VER=34)." + exit 1 +fi + +if [[ -z "${CONTAINER_REG}" ]]; then + echo "ERROR: CONTAINER_REG environment variable is not set " \ + "(e.g. \"//\")." 
+ exit 1 +fi + +echo "Check if the file ${OS}-${OS_VER}.Dockerfile exists" +if [[ ! -f "${OS}-${OS_VER}.Dockerfile" ]]; then + echo "Error: ${OS}-${OS_VER}.Dockerfile does not exist." + exit 1 +fi + +echo "Build a Docker image tagged with: ${CONTAINER_REG}:${TAG}" +docker build -t ${CONTAINER_REG}:${TAG} \ + --build-arg http_proxy=$http_proxy \ + --build-arg https_proxy=$https_proxy \ + -f ${OS}-${OS_VER}.Dockerfile . diff --git a/docker/images/centos-8streams.Dockerfile b/docker/images/centos-8streams.Dockerfile new file mode 100644 index 0000000000..29752c5d98 --- /dev/null +++ b/docker/images/centos-8streams.Dockerfile @@ -0,0 +1,24 @@ +FROM quay.io/centos/centos:stream8 + +RUN dnf install -y \ +cmake \ +sudo \ +git \ +tzdata \ +vim \ +gdb \ +clang \ +python36 \ +glibc-devel.i686 \ +xmlto \ +uuid \ +libuuid-devel \ +json-c-devel \ +perf \ +numactl + +COPY ./install-cachelib-deps.sh ./install-cachelib-deps.sh +RUN ./install-cachelib-deps.sh + +COPY ./install-dsa-deps.sh ./install-dsa-deps.sh +RUN ./install-dsa-deps.sh diff --git a/docker/images/install-cachelib-deps.sh b/docker/images/install-cachelib-deps.sh new file mode 100755 index 0000000000..6d8fbdef7b --- /dev/null +++ b/docker/images/install-cachelib-deps.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2022, Intel Corporation + +git clone -b develop https://github.com/intel/CacheLib CacheLib + +./CacheLib/contrib/prerequisites-centos8.sh + +for pkg in zstd googleflags googlelog googletest sparsemap fmt folly fizz wangle fbthrift ; +do + sudo ./CacheLib/contrib/build-package.sh -j -I /opt/ "$pkg" +done + +rm -rf CacheLib diff --git a/docker/images/install-dsa-deps.sh b/docker/images/install-dsa-deps.sh new file mode 100755 index 0000000000..b4c62ecc93 --- /dev/null +++ b/docker/images/install-dsa-deps.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# Copyright 2023, Intel Corporation + +# Install idxd-config +git clone https://github.com/intel/idxd-config.git +cd idxd-config +./autogen.sh +./configure CFLAGS='-g -O2' --prefix=/usr --sysconfdir=/etc --libdir=/usr/lib64 +make +make check +sudo make install +cd ../ +rm -rf idxd-config + +# Install DML Library +git clone --recursive https://github.com/intel/DML.git +cd DML +mkdir build +cd build +cmake -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_BUILD_TYPE=RelWithDebInfo .. +cmake --build . --target install +cd ../../ +rm -rf DML diff --git a/docker/images/push-image.sh b/docker/images/push-image.sh new file mode 100755 index 0000000000..8f516b4205 --- /dev/null +++ b/docker/images/push-image.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2016-2021, Intel Corporation + +# +# push-image.sh - pushes the Docker image tagged as described in +# ./build-image.sh, to the ${CONTAINER_REG}. +# +# The script utilizes ${CONTAINER_REG_USER} and ${CONTAINER_REG_PASS} variables to +# log in to the ${CONTAINER_REG}. The variables can be set in the CI's configuration +# for automated builds. +# + +set -e +IMG_VER=${IMG_VER:-devel} +TAG="${OS}-${OS_VER}-${IMG_VER}" + +if [[ -z "${OS}" || -z "${OS_VER}" ]]; then + echo "ERROR: The variables OS and OS_VER have to be set " \ + "(e.g. OS=fedora, OS_VER=34)." + exit 1 +fi + +if [[ -z "${CONTAINER_REG}" ]]; then + echo "ERROR: CONTAINER_REG environment variable is not set " \ + "(e.g. \"//\")." 
+ exit 1 +fi + +if [[ -z "${CONTAINER_REG_USER}" || -z "${CONTAINER_REG_PASS}" ]]; then + echo "ERROR: variables CONTAINER_REG_USER=\"${CONTAINER_REG_USER}\" and " \ + "CONTAINER_REG_PASS=\"${CONTAINER_REG_PASS}\"" \ + "have to be set properly to allow login to the Container Registry." + exit 1 +fi + +# Check if the image tagged with ${CONTAINER_REG}:${TAG} exists locally +if [[ ! $(docker images -a | awk -v pattern="^${CONTAINER_REG}:${TAG}\$" \ + '$1":"$2 ~ pattern') ]] +then + echo "ERROR: Docker image tagged ${CONTAINER_REG}:${TAG} does not exist locally." + exit 1 +fi + +echo "Log in to the Container Registry: ${CONTAINER_REG}" +echo "${CONTAINER_REG_PASS}" | docker login ghcr.io -u="${CONTAINER_REG_USER}" --password-stdin + +echo "Push the image to the Container Registry" +docker push ${CONTAINER_REG}:${TAG} diff --git a/docker/pull-or-rebuild-image.sh b/docker/pull-or-rebuild-image.sh new file mode 100755 index 0000000000..dcdcb40e8c --- /dev/null +++ b/docker/pull-or-rebuild-image.sh @@ -0,0 +1,124 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2016-2021, Intel Corporation + +# +# pull-or-rebuild-image.sh - rebuilds the Docker image used in the +# current build (if necessary) or pulls it from the Container Registry. +# Docker image is tagged as described in docker/build-image.sh, +# but IMG_VER defaults in this script to "latest" (just in case it's +# used locally without building any images). +# +# If Docker was rebuilt and all requirements are fulfilled (more details in +# push_image function below) image will be pushed to the ${CONTAINER_REG}. +# +# The script rebuilds the Docker image if: +# 1. the Dockerfile for the current OS version (${OS}-${OS_VER}.Dockerfile) +# or any .sh script in the Dockerfiles directory were modified and committed, or +# 2. "rebuild" param was passed as a first argument to this script. +# +# The script pulls the Docker image if: +# 1. it does not have to be rebuilt (based on committed changes), or +# 2. "pull" param was passed as a first argument to this script. +# + +set -e + +source $(dirname ${0})/set-ci-vars.sh +IMG_VER=${IMG_VER:-latest} +TAG="${OS}-${OS_VER}-${IMG_VER}" +IMAGES_DIR_NAME=images +BASE_DIR=docker/${IMAGES_DIR_NAME} + +if [[ -z "${OS}" || -z "${OS_VER}" ]]; then + echo "ERROR: The variables OS and OS_VER have to be set properly " \ + "(eg. OS=fedora, OS_VER=34)." + exit 1 +fi + +if [[ -z "${CONTAINER_REG}" ]]; then + echo "ERROR: CONTAINER_REG environment variable is not set " \ + "(e.g. \"//\")." + exit 1 +fi + +function build_image() { + echo "Building the Docker image for the ${OS}-${OS_VER}.Dockerfile" + pushd ${IMAGES_DIR_NAME} + ./build-image.sh + popd +} + +function pull_image() { + echo "Pull the image '${CONTAINER_REG}:${TAG}' from the Container Registry." + docker pull ${CONTAINER_REG}:${TAG} +} + +function push_image { + # Check if the image has to be pushed to the Container Registry: + # - only upstream (not forked) repository, + # - stable-* or master branch, + # - not a pull_request event, + # - and PUSH_IMAGE flag was set for current build. + if [[ "${CI_REPO_SLUG}" == "${GITHUB_REPO}" \ + && (${CI_BRANCH} == develop || ${CI_BRANCH} == main) \ + && ${CI_EVENT_TYPE} != "pull_request" \ + && ${PUSH_IMAGE} == "1" ]] + then + echo "The image will be pushed to the Container Registry: ${CONTAINER_REG}" + pushd ${IMAGES_DIR_NAME} + ./push-image.sh + popd + else + echo "Skip pushing the image to the Container Registry." 
+ fi +} + +# If "rebuild" or "pull" are passed to the script as param, force rebuild/pull. +if [[ "${1}" == "rebuild" ]]; then + build_image + push_image + exit 0 +elif [[ "${1}" == "pull" ]]; then + pull_image + exit 0 +fi + +# Determine if we need to rebuild the image or just pull it from +# the Container Registry, based on committed changes. +if [ -n "${CI_COMMIT_RANGE}" ]; then + commits=$(git rev-list ${CI_COMMIT_RANGE}) +else + commits=${CI_COMMIT} +fi + +if [[ -z "${commits}" ]]; then + echo "'commits' variable is empty. Docker image will be pulled." +fi + +echo "Commits in the commit range:" +for commit in ${commits}; do echo ${commit}; done + +echo "Files modified within the commit range:" +files=$(for commit in ${commits}; do git diff-tree --no-commit-id --name-only \ + -r ${commit}; done | sort -u) +for file in ${files}; do echo ${file}; done + +# Check if committed file modifications require the Docker image to be rebuilt +for file in ${files}; do + # Check if modified files are relevant to the current build + if [[ ${file} =~ ^(${BASE_DIR})\/(${OS})-(${OS_VER})\.Dockerfile$ ]] \ + || [[ ${file} =~ ^(${BASE_DIR})\/.*\.sh$ ]] + then + build_image + push_image + exit 0 + fi +done + +# Getting here means rebuilding the Docker image isn't required (based on changed files). +# Pull the image from the Container Registry or rebuild anyway, if pull fails. +if ! pull_image; then + build_image + push_image +fi diff --git a/docker/run-build.sh b/docker/run-build.sh new file mode 100755 index 0000000000..02c7caf731 --- /dev/null +++ b/docker/run-build.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2022, Intel Corporation + +set -e + +function sudo_password() { + echo ${USERPASS} | sudo -Sk $* +} + +cd .. +mkdir build +cd build +cmake ../cachelib -DBUILD_TESTS=ON -DCMAKE_INSTALL_PREFIX=/opt -DCMAKE_BUILD_TYPE=Debug +sudo_password make install -j$(nproc) + +cd /opt/tests && $WORKDIR/run_tests.sh diff --git a/docker/set-ci-vars.sh b/docker/set-ci-vars.sh new file mode 100755 index 0000000000..f6f52132c8 --- /dev/null +++ b/docker/set-ci-vars.sh @@ -0,0 +1,111 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2020-2021, Intel Corporation + +# +# set-ci-vars.sh -- set CI variables common for both: +# Travis and GitHub Actions CIs +# + +set -e + +function get_commit_range_from_last_merge { + # get commit id of the last merge + LAST_MERGE=$(git log --merges --pretty=%H -1) + LAST_COMMIT=$(git log --pretty=%H -1) + RANGE_END="HEAD" + if [ -n "${GITHUB_ACTIONS}" ] && [ "${GITHUB_EVENT_NAME}" == "pull_request" ] && [ "${LAST_MERGE}" == "${LAST_COMMIT}" ]; then + # GitHub Actions commits its own merge in case of pull requests + # so the first merge commit has to be skipped. + + LAST_COMMIT=$(git log --pretty=%H -2 | tail -n1) + LAST_MERGE=$(git log --merges --pretty=%H -2 | tail -n1) + # If still the last commit is a merge commit it means we're manually + # merging changes (probably back from stable branch). We have to use + # left parent of the merge and the current commit for COMMIT_RANGE. + if [ "${LAST_MERGE}" == "${LAST_COMMIT}" ]; then + LAST_MERGE=$(git log --merges --pretty=%P -2 | tail -n1 | cut -d" " -f1) + RANGE_END=${LAST_COMMIT} + fi + elif [ "${LAST_MERGE}" == "${LAST_COMMIT}" ] && + ([ "${TRAVIS_EVENT_TYPE}" == "push" ] || [ "${GITHUB_EVENT_NAME}" == "push" ]); then + # Other case in which last commit equals last merge, is when committing + # a manual merge. Push events don't set proper COMMIT_RANGE. 
+ # It has to be then set: from merge's left parent to the current commit. + LAST_MERGE=$(git log --merges --pretty=%P -1 | cut -d" " -f1) + fi + if [ "${LAST_MERGE}" == "" ]; then + # possible in case of shallow clones + # or new repos with no merge commits yet + # - pick up the first commit + LAST_MERGE=$(git log --pretty=%H | tail -n1) + fi + COMMIT_RANGE="${LAST_MERGE}..${RANGE_END}" + # make sure it works now + if ! git rev-list ${COMMIT_RANGE} >/dev/null; then + COMMIT_RANGE="" + fi + echo ${COMMIT_RANGE} +} + +COMMIT_RANGE_FROM_LAST_MERGE=$(get_commit_range_from_last_merge) + +if [ -n "${TRAVIS}" ]; then + CI_COMMIT=${TRAVIS_COMMIT} + CI_COMMIT_RANGE="${TRAVIS_COMMIT_RANGE/.../..}" + CI_BRANCH=${TRAVIS_BRANCH} + CI_EVENT_TYPE=${TRAVIS_EVENT_TYPE} + CI_REPO_SLUG=${TRAVIS_REPO_SLUG} + + # CI_COMMIT_RANGE is usually invalid for force pushes - fix it when used + # with non-upstream repository + if [ -n "${CI_COMMIT_RANGE}" -a "${CI_REPO_SLUG}" != "${GITHUB_REPO}" ]; then + if ! git rev-list ${CI_COMMIT_RANGE}; then + CI_COMMIT_RANGE=${COMMIT_RANGE_FROM_LAST_MERGE} + fi + fi + + case "${TRAVIS_CPU_ARCH}" in + "amd64") + CI_CPU_ARCH="x86_64" + ;; + *) + CI_CPU_ARCH=${TRAVIS_CPU_ARCH} + ;; + esac + +elif [ -n "${GITHUB_ACTIONS}" ]; then + CI_COMMIT=${GITHUB_SHA} + CI_COMMIT_RANGE=${COMMIT_RANGE_FROM_LAST_MERGE} + CI_BRANCH=$(echo ${GITHUB_REF} | cut -d'/' -f3) + CI_REPO_SLUG=${GITHUB_REPOSITORY} + CI_CPU_ARCH="x86_64" # GitHub Actions supports only x86_64 + + case "${GITHUB_EVENT_NAME}" in + "schedule") + CI_EVENT_TYPE="cron" + ;; + *) + CI_EVENT_TYPE=${GITHUB_EVENT_NAME} + ;; + esac + +else + CI_COMMIT=$(git log --pretty=%H -1) + CI_COMMIT_RANGE=${COMMIT_RANGE_FROM_LAST_MERGE} + CI_CPU_ARCH="x86_64" +fi + +export CI_COMMIT=${CI_COMMIT} +export CI_COMMIT_RANGE=${CI_COMMIT_RANGE} +export CI_BRANCH=${CI_BRANCH} +export CI_EVENT_TYPE=${CI_EVENT_TYPE} +export CI_REPO_SLUG=${CI_REPO_SLUG} +export CI_CPU_ARCH=${CI_CPU_ARCH} + +echo CI_COMMIT=${CI_COMMIT} +echo CI_COMMIT_RANGE=${CI_COMMIT_RANGE} +echo CI_BRANCH=${CI_BRANCH} +echo CI_EVENT_TYPE=${CI_EVENT_TYPE} +echo CI_REPO_SLUG=${CI_REPO_SLUG} +echo CI_CPU_ARCH=${CI_CPU_ARCH} diff --git a/run_code_coverage.sh b/run_code_coverage.sh new file mode 100755 index 0000000000..7722e262bf --- /dev/null +++ b/run_code_coverage.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +#Build CacheLib with flag -DCOVERAGE_ENABLED=ON + +# Track coverage +lcov -c -i -b . -d . -o Coverage.baseline +./run_tests.sh +lcov -c -d . -b . 
-o Coverage.out +lcov -a Coverage.baseline -a Coverage.out -o Coverage.combined + +# Generate report +COVERAGE_DIR='coverage_report' +genhtml Coverage.combined -o ${COVERAGE_DIR} +COVERAGE_REPORT="${COVERAGE_DIR}.tgz" +tar -zcvf ${COVERAGE_REPORT} ${COVERAGE_DIR} +echo "Created coverage report ${COVERAGE_REPORT}" + +# Cleanup +rm Coverage.baseline Coverage.out Coverage.combined +rm -rf ${COVERAGE_DIR} diff --git a/run_tests.sh b/run_tests.sh new file mode 100755 index 0000000000..111e218333 --- /dev/null +++ b/run_tests.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# Newline separated list of tests to ignore +BLACKLIST="allocator-test-NavySetupTest +shm-test-test_page_size" + +if [ "$1" == "long" ]; then + find -type f -executable | grep -vF "$BLACKLIST" | xargs -n1 bash -c +else + find -type f \( -not -name "*bench*" -and -not -name "navy*" \) -executable | grep -vF "$BLACKLIST" | xargs -n1 bash -c +fi + +../bin/cachebench --json_test_config ../test_configs/consistency/navy.json +../bin/cachebench --json_test_config ../test_configs/consistency/navy-multi-tier.json From 470f563fe9fe77a8c544ebee7f7fdb7af3140e58 Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Thu, 15 Dec 2022 16:26:48 +0000 Subject: [PATCH 29/47] Introduce 'markedForEviction' state for the Item. It is similar to 'moving' but requires ref count to be 0. An item which is marked for eviction causes all incRef() calls to that item to fail. This will be used to ensure that once an item is selected for eviction, no one can interfere and prevent the eviction from succeeding. 'markedForEviction' relies on the same 'exclusive' bit as the 'moving' state. To distinguish between those two states, 'moving' adds 1 to the refCount. This is hidden from the user, so getRefCount() will not return that extra ref. --- cachelib/allocator/CacheAllocator-inl.h | 71 +++--- cachelib/allocator/CacheAllocator.h | 8 +- cachelib/allocator/CacheItem-inl.h | 55 +++-- cachelib/allocator/CacheItem.h | 57 +++-- cachelib/allocator/Refcount.h | 288 +++++++++++++++------- cachelib/allocator/tests/ItemTest.cpp | 14 +- cachelib/allocator/tests/RefCountTest.cpp | 117 +++++++-- 7 files changed, 434 insertions(+), 176 deletions(-) diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h index 1d89593268..3678ca131b 100644 --- a/cachelib/allocator/CacheAllocator-inl.h +++ b/cachelib/allocator/CacheAllocator-inl.h @@ -832,20 +832,21 @@ CacheAllocator::releaseBackToAllocator(Item& it, removeFromMMContainer(*head); - // If this chained item is marked as exclusive, we will not free it. - // We must capture the exclusive state before we do the decRef when + // If this chained item is marked as moving, we will not free it. + // We must capture the moving state before we do the decRef when // we know the item must still be valid - const bool wasExclusive = head->isExclusive(); + const bool wasMoving = head->isMoving(); + XDCHECK(!head->isMarkedForEviction()); // Decref and check if we were the last reference. Now if the item - // was marked exclusive, after decRef, it will be free to be released + // was marked moving, after decRef, it will be free to be released // by slab release thread const auto childRef = head->decRef(); - // If the item is already exclusive and we already decremented the + // If the item is already moving and we already decremented the // refcount, we don't need to free this item.
We'll let the slab // release thread take care of that - if (!wasExclusive) { + if (!wasMoving) { if (childRef != 0) { throw std::runtime_error(folly::sformat( "chained item refcount is not zero. We cannot proceed! " @@ -853,7 +854,7 @@ CacheAllocator::releaseBackToAllocator(Item& it, childRef, head->toString())); } - // Item is not exclusive and refcount is 0, we can proceed to + // Item is not moving and refcount is 0, we can proceed to // free it or recylce the memory if (head == toRecycle) { XDCHECK(ReleaseRes::kReleased != res); @@ -1179,7 +1180,7 @@ bool CacheAllocator::moveChainedItem(ChainedItem& oldItem, // This item has been unlinked from its parent and we're the only // owner of it, so we're done here - if (!oldItem.isInMMContainer() || oldItem.isOnlyExclusive()) { + if (!oldItem.isInMMContainer() || oldItem.isOnlyMoving()) { return false; } @@ -1210,7 +1211,7 @@ bool CacheAllocator::moveChainedItem(ChainedItem& oldItem, // In case someone else had removed this chained item from its parent by now // So we check again to see if the it has been unlinked from its parent - if (!oldItem.isInMMContainer() || oldItem.isOnlyExclusive()) { + if (!oldItem.isInMMContainer() || oldItem.isOnlyMoving()) { return false; } @@ -1226,7 +1227,7 @@ bool CacheAllocator::moveChainedItem(ChainedItem& oldItem, // parent's chain and the MMContainer. auto oldItemHandle = replaceChainedItemLocked(oldItem, std::move(newItemHdl), *parentHandle); - XDCHECK(oldItemHandle->isExclusive()); + XDCHECK(oldItemHandle->isMoving()); XDCHECK(!oldItemHandle->isInMMContainer()); return true; @@ -1255,7 +1256,7 @@ CacheAllocator::findEviction(PoolId pid, ClassId cid) { : toRecycle; // make sure no other thead is evicting the item - if (candidate->getRefCount() != 0 || !candidate->markExclusive()) { + if (candidate->getRefCount() != 0 || !candidate->markMoving()) { ++itr; continue; } @@ -1270,11 +1271,11 @@ CacheAllocator::findEviction(PoolId pid, ClassId cid) { ? advanceIteratorAndTryEvictChainedItem(itr) : advanceIteratorAndTryEvictRegularItem(mmContainer, itr); evictionSuccessful = toReleaseHandle != nullptr; - // destroy toReleseHandle. The item won't be released to allocator - // since we marked it as exclusive. + // destroy toReleaseHandle. The item won't be released to allocator + // since we marked for eviction. } - const auto ref = candidate->unmarkExclusive(); + const auto ref = candidate->unmarkMoving(); if (ref == 0u) { // Invalidate iterator since later on we may use this mmContainer // again, which cannot be done unless we drop this iterator @@ -2361,7 +2362,7 @@ void CacheAllocator::releaseSlabImpl( // Need to mark an item for release before proceeding // If we can't mark as moving, it means the item is already freed const bool isAlreadyFreed = - !markExclusiveForSlabRelease(releaseContext, alloc, throttler); + !markMovingForSlabRelease(releaseContext, alloc, throttler); if (isAlreadyFreed) { continue; } @@ -2406,8 +2407,8 @@ bool CacheAllocator::moveForSlabRelease( stats_.numMoveAttempts.inc(); // Nothing to move and the key is likely also bogus for chained items. - if (oldItem.isOnlyExclusive()) { - oldItem.unmarkExclusive(); + if (oldItem.isOnlyMoving()) { + oldItem.unmarkMoving(); const auto res = releaseBackToAllocator(oldItem, RemoveContext::kNormal, false); XDCHECK(res == ReleaseRes::kReleased); @@ -2446,7 +2447,7 @@ bool CacheAllocator::moveForSlabRelease( // that's identical to this one to replace it. Here we just need to wait // until all users have dropped the item handles before we can proceed. 
startTime = util::getCurrentTimeSec(); - while (!oldItem.isOnlyExclusive()) { + while (!oldItem.isOnlyMoving()) { throttleWith(throttler, [&] { XLOGF(WARN, "Spent {} seconds, slab release still waiting for refcount to " @@ -2500,8 +2501,8 @@ CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { return {}; } - // Set up the destination for the move. Since oldChainedItem would have - // the exclusive bit set, it won't be picked for eviction. + // Set up the destination for the move. Since oldChainedItem would be + // marked as moving, it won't be picked for eviction. auto newItemHdl = allocateChainedItemInternal(parentHandle, oldChainedItem.getSize()); if (!newItemHdl) { @@ -2553,7 +2554,7 @@ bool CacheAllocator::tryMovingForSlabRelease( // item is still valid. const std::string parentKey = oldItem.asChainedItem().getParentItem(compressor_).getKey().str(); - if (oldItem.isOnlyExclusive()) { + if (oldItem.isOnlyMoving()) { // If chained item no longer has a refcount, its parent is already // being released, so we abort this try to moving. return false; @@ -2583,10 +2584,10 @@ void CacheAllocator::evictForSlabRelease( stats_.numEvictionAttempts.inc(); // if the item is already in a state where only the exclusive bit is set, - // nothing needs to be done. We simply need to unmark exclusive bit and free + // nothing needs to be done. We simply need to call unmarkMoving and free // the item. - if (item.isOnlyExclusive()) { - item.unmarkExclusive(); + if (item.isOnlyMoving()) { + item.unmarkMoving(); const auto res = releaseBackToAllocator(item, RemoveContext::kNormal, false); XDCHECK(ReleaseRes::kReleased == res); @@ -2617,7 +2618,7 @@ void CacheAllocator::evictForSlabRelease( stats_.numEvictionSuccesses.inc(); // we have the last handle. no longer need to hold on to the exclusive bit - item.unmarkExclusive(); + item.unmarkMoving(); // manually decrement the refcount to call releaseBackToAllocator const auto ref = decRef(*owningHandle); @@ -2629,7 +2630,7 @@ void CacheAllocator::evictForSlabRelease( } if (shutDownInProgress_) { - item.unmarkExclusive(); + item.unmarkMoving(); allocator_->abortSlabRelease(ctx); throw exception::SlabReleaseAborted( folly::sformat("Slab Release aborted while trying to evict" @@ -2775,9 +2776,9 @@ CacheAllocator::advanceIteratorAndTryEvictChainedItem( template typename CacheAllocator::WriteHandle CacheAllocator::evictNormalItemForSlabRelease(Item& item) { - XDCHECK(item.isExclusive()); + XDCHECK(item.isMoving()); - if (item.isOnlyExclusive()) { + if (item.isOnlyMoving()) { return WriteHandle{}; } @@ -2789,7 +2790,7 @@ CacheAllocator::evictNormalItemForSlabRelease(Item& item) { // We remove the item from both access and mm containers. It doesn't matter // if someone else calls remove on the item at this moment, the item cannot - // be freed as long as we have the exclusive bit set. + // be freed as long as it's marked for eviction. auto handle = accessContainer_->removeIf(item, std::move(predicate)); if (!handle) { @@ -2813,7 +2814,7 @@ CacheAllocator::evictNormalItemForSlabRelease(Item& item) { template typename CacheAllocator::WriteHandle CacheAllocator::evictChainedItemForSlabRelease(ChainedItem& child) { - XDCHECK(child.isExclusive()); + XDCHECK(child.isMoving()); // We have the child marked as moving, but dont know anything about the // state of the parent. 
Unlike the case of regular eviction where we are @@ -2835,7 +2836,7 @@ CacheAllocator::evictChainedItemForSlabRelease(ChainedItem& child) { // check if the child is still in mmContainer and the expected parent is // valid under the chained item lock. if (expectedParent.getKey() != parentKey || !child.isInMMContainer() || - child.isOnlyExclusive() || + child.isOnlyMoving() || &expectedParent != &child.getParentItem(compressor_) || !expectedParent.isAccessible() || !expectedParent.hasChainedItem()) { return {}; @@ -2890,14 +2891,14 @@ CacheAllocator::evictChainedItemForSlabRelease(ChainedItem& child) { // In case someone else had removed this chained item from its parent by now // So we check again to see if it has been unlinked from its parent - if (!child.isInMMContainer() || child.isOnlyExclusive()) { + if (!child.isInMMContainer() || child.isOnlyMoving()) { return {}; } // check after removing from the MMContainer that the parent is still not // being marked as moving. If parent is moving, it will release the child // item and we will wait for that. - if (parentHandle->isExclusive()) { + if (parentHandle->isMoving()) { return {}; } @@ -2930,7 +2931,7 @@ bool CacheAllocator::removeIfExpired(const ReadHandle& handle) { } template -bool CacheAllocator::markExclusiveForSlabRelease( +bool CacheAllocator::markMovingForSlabRelease( const SlabReleaseContext& ctx, void* alloc, util::Throttler& throttler) { // MemoryAllocator::processAllocForRelease will execute the callback // if the item is not already free. So there are three outcomes here: @@ -2949,7 +2950,7 @@ bool CacheAllocator::markExclusiveForSlabRelease( // Since this callback is executed, the item is not yet freed itemFreed = false; Item* item = static_cast(memory); - if (item->markExclusive()) { + if (item->markMoving()) { markedMoving = true; } }; diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index ed0096390a..6c33772dac 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -1756,9 +1756,9 @@ class CacheAllocator : public CacheBase { // @return true when successfully marked as moving, // fasle when this item has already been freed - bool markExclusiveForSlabRelease(const SlabReleaseContext& ctx, - void* alloc, - util::Throttler& throttler); + bool markMovingForSlabRelease(const SlabReleaseContext& ctx, + void* alloc, + util::Throttler& throttler); // "Move" (by copying) the content in this item to another memory // location by invoking the move callback. 
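Editorial aside: the rename above splits the old exclusive bit into two caller-visible states. A minimal sketch of how a caller might choose between them, assuming only the Item methods declared in this patch (markMoving, markForEviction); the tryClaimForReclaim helper is a hypothetical name used purely for illustration and is not part of the change.

// Hypothetical helper, for illustration only.
// markForEviction() succeeds only when the refcount is 0 and makes the item
// unreachable to new readers; markMoving() tolerates outstanding readers by
// holding one internal reference while the bytes are copied elsewhere.
bool tryClaimForReclaim(Item& item, bool copyOutForSlabRelease) {
  if (copyOutForSlabRelease) {
    return item.markMoving();       // copy first, free once readers drain
  }
  return item.markForEviction();    // free immediately, no readers allowed
}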
@@ -1937,7 +1937,7 @@ class CacheAllocator : public CacheBase { } static bool parentEvictForSlabReleasePredicate(const Item& item) { - return item.getRefCount() == 1 && !item.isExclusive(); + return item.getRefCount() == 1 && !item.isMoving(); } std::unique_ptr createDeserializer(); diff --git a/cachelib/allocator/CacheItem-inl.h b/cachelib/allocator/CacheItem-inl.h index f59fa9d599..0028e2776a 100644 --- a/cachelib/allocator/CacheItem-inl.h +++ b/cachelib/allocator/CacheItem-inl.h @@ -148,15 +148,16 @@ std::string CacheItem::toString() const { return folly::sformat( "item: " "memory={}:raw-ref={}:size={}:key={}:hex-key={}:" - "isInMMContainer={}:isAccessible={}:isExclusive={}:references={}:ctime=" + "isInMMContainer={}:isAccessible={}:isMarkedForEviction={}:" + "isMoving={}:references={}:ctime=" "{}:" "expTime={}:updateTime={}:isNvmClean={}:isNvmEvicted={}:hasChainedItem=" "{}", this, getRefCountAndFlagsRaw(), getSize(), folly::humanify(getKey().str()), folly::hexlify(getKey()), - isInMMContainer(), isAccessible(), isExclusive(), getRefCount(), - getCreationTime(), getExpiryTime(), getLastAccessTime(), isNvmClean(), - isNvmEvicted(), hasChainedItem()); + isInMMContainer(), isAccessible(), isMarkedForEviction(), isMoving(), + getRefCount(), getCreationTime(), getExpiryTime(), getLastAccessTime(), + isNvmClean(), isNvmEvicted(), hasChainedItem()); } } @@ -217,23 +218,43 @@ bool CacheItem::isInMMContainer() const noexcept { } template -bool CacheItem::markExclusive() noexcept { - return ref_.markExclusive(); +bool CacheItem::markForEviction() noexcept { + return ref_.markForEviction(); } template -RefcountWithFlags::Value CacheItem::unmarkExclusive() noexcept { - return ref_.unmarkExclusive(); +RefcountWithFlags::Value CacheItem::unmarkForEviction() noexcept { + return ref_.unmarkForEviction(); } template -bool CacheItem::isExclusive() const noexcept { - return ref_.isExclusive(); +bool CacheItem::isMarkedForEviction() const noexcept { + return ref_.isMarkedForEviction(); } template -bool CacheItem::isOnlyExclusive() const noexcept { - return ref_.isOnlyExclusive(); +bool CacheItem::markForEvictionWhenMoving() { + return ref_.markForEvictionWhenMoving(); +} + +template +bool CacheItem::markMoving() { + return ref_.markMoving(); +} + +template +RefcountWithFlags::Value CacheItem::unmarkMoving() noexcept { + return ref_.unmarkMoving(); +} + +template +bool CacheItem::isMoving() const noexcept { + return ref_.isMoving(); +} + +template +bool CacheItem::isOnlyMoving() const noexcept { + return ref_.isOnlyMoving(); } template @@ -335,7 +356,7 @@ bool CacheItem::updateExpiryTime(uint32_t expiryTimeSecs) noexcept { // check for moving to make sure we are not updating the expiry time while at // the same time re-allocating the item with the old state of the expiry time // in moveRegularItem(). 
See D6852328 - if (isExclusive() || !isInMMContainer() || isChainedItem()) { + if (isMoving() || isMarkedForEviction() || !isInMMContainer() || isChainedItem()) { return false; } // attempt to atomically update the value of expiryTime @@ -451,12 +472,14 @@ std::string CacheChainedItem::toString() const { return folly::sformat( "chained item: " "memory={}:raw-ref={}:size={}:parent-compressed-ptr={}:" - "isInMMContainer={}:isAccessible={}:isExclusive={}:references={}:ctime={}" + "isInMMContainer={}:isAccessible={}:isMarkedForEviction={}:" + "isMoving={}:references={}:ctime={}" ":" "expTime={}:updateTime={}", this, Item::getRefCountAndFlagsRaw(), Item::getSize(), cPtr.getRaw(), - Item::isInMMContainer(), Item::isAccessible(), Item::isExclusive(), - Item::getRefCount(), Item::getCreationTime(), Item::getExpiryTime(), + Item::isInMMContainer(), Item::isAccessible(), + Item::isMarkedForEviction(), Item::isMoving(), Item::getRefCount(), + Item::getCreationTime(), Item::getExpiryTime(), Item::getLastAccessTime()); } diff --git a/cachelib/allocator/CacheItem.h b/cachelib/allocator/CacheItem.h index 06136db032..afee315cbb 100644 --- a/cachelib/allocator/CacheItem.h +++ b/cachelib/allocator/CacheItem.h @@ -305,12 +305,17 @@ class CACHELIB_PACKED_ATTR CacheItem { */ RefcountWithFlags::Value getRefCountAndFlagsRaw() const noexcept; - FOLLY_ALWAYS_INLINE void incRef() { - if (LIKELY(ref_.incRef())) { - return; + // Increments item's ref count + // + // @return true on success, failure if item is marked as exclusive + // @throw exception::RefcountOverflow on ref count overflow + FOLLY_ALWAYS_INLINE bool incRef() { + try { + return ref_.incRef(); + } catch (exception::RefcountOverflow& e) { + throw exception::RefcountOverflow( + folly::sformat("{} item: {}", e.what(), toString())); } - throw exception::RefcountOverflow( - folly::sformat("Refcount maxed out. item: {}", toString())); } FOLLY_ALWAYS_INLINE RefcountWithFlags::Value decRef() { @@ -344,23 +349,43 @@ class CACHELIB_PACKED_ATTR CacheItem { /** * The following two functions corresond to whether or not an item is - * currently in the process of being moved. This happens during a slab - * rebalance, eviction or resize operation. + * currently in the process of being evicted. * - * An item can only be marked exclusive when `isInMMContainer` returns true. + * An item can only be marked exclusive when `isInMMContainer` returns true + * and item is not already exclusive nor moving and the ref count is 0. * This operation is atomic. * - * User can also query if an item "isOnlyExclusive". This returns true only - * if the refcount is 0 and only the exclusive bit is set. - * - * Unmarking exclusive does not depend on `isInMMContainer`. + * Unmarking exclusive does not depend on `isInMMContainer` * Unmarking exclusive will also return the refcount at the moment of * unmarking. */ - bool markExclusive() noexcept; - RefcountWithFlags::Value unmarkExclusive() noexcept; - bool isExclusive() const noexcept; - bool isOnlyExclusive() const noexcept; + bool markForEviction() noexcept; + RefcountWithFlags::Value unmarkForEviction() noexcept; + bool isMarkedForEviction() const noexcept; + + /** + * The following functions correspond to whether or not an item is + * currently in the processed of being moved. When moving, ref count + * is always >= 1. + * + * An item can only be marked moving when `isInMMContainer` returns true + * and item is not already exclusive nor moving. + * + * User can also query if an item "isOnlyMoving". 
This returns true only + * if the refcount is one and only the exclusive bit is set. + * + * Unmarking moving does not depend on `isInMMContainer` + * Unmarking moving will also return the refcount at the moment of + * unmarking. + */ + bool markMoving(); + RefcountWithFlags::Value unmarkMoving() noexcept; + bool isMoving() const noexcept; + bool isOnlyMoving() const noexcept; + + /** This function attempts to mark item as exclusive. + * Can only be called on the item that is moving.*/ + bool markForEvictionWhenMoving(); /** * Item cannot be marked both chained allocation and diff --git a/cachelib/allocator/Refcount.h b/cachelib/allocator/Refcount.h index c60dea34f1..44a3facd3a 100644 --- a/cachelib/allocator/Refcount.h +++ b/cachelib/allocator/Refcount.h @@ -132,32 +132,28 @@ class FOLLY_PACK_ATTR RefcountWithFlags { RefcountWithFlags& operator=(RefcountWithFlags&&) = delete; // Bumps up the reference count only if the new count will be strictly less - // than or equal to the maxCount. - // @return true if refcount is bumped. false otherwise. - FOLLY_ALWAYS_INLINE bool incRef() noexcept { - Value* const refPtr = &refCount_; - unsigned int nCASFailures = 0; - constexpr bool isWeak = false; - Value oldVal = __atomic_load_n(refPtr, __ATOMIC_RELAXED); - - while (true) { - const Value newCount = oldVal + static_cast(1); - if (UNLIKELY((oldVal & kAccessRefMask) == (kAccessRefMask))) { - return false; + // than or equal to the maxCount and the item is not exclusive + // @return true if refcount is bumped. false otherwise (if item is exclusive) + // @throw exception::RefcountOverflow if new count would be greater than + // maxCount + FOLLY_ALWAYS_INLINE bool incRef() { + auto predicate = [](const Value curValue) { + Value bitMask = getAdminRef(); + + const bool exlusiveBitIsSet = curValue & bitMask; + if (UNLIKELY((curValue & kAccessRefMask) == (kAccessRefMask))) { + throw exception::RefcountOverflow("Refcount maxed out."); } - if (__atomic_compare_exchange_n(refPtr, &oldVal, newCount, isWeak, - __ATOMIC_ACQ_REL, __ATOMIC_RELAXED)) { - return true; - } + // Check if the item is not marked for eviction + return !exlusiveBitIsSet || ((curValue & kAccessRefMask) != 0); + }; - if ((++nCASFailures % 4) == 0) { - // this pause takes up to 40 clock cycles on intel and the lock cmpxchgl - // above should take about 100 clock cycles. we pause once every 400 - // cycles or so if we are extremely unlucky. - folly::asm_volatile_pause(); - } - } + auto newValue = [](const Value curValue) { + return (curValue + static_cast(1)); + }; + + return atomicUpdateValue(predicate, newValue); } // Bumps down the reference count @@ -167,33 +163,38 @@ class FOLLY_PACK_ATTR RefcountWithFlags { // @throw RefcountUnderflow when we are trying to decremenet from 0 // refcount and have a refcount leak. FOLLY_ALWAYS_INLINE Value decRef() { - Value* const refPtr = &refCount_; - unsigned int nCASFailures = 0; - constexpr bool isWeak = false; - - Value oldVal = __atomic_load_n(refPtr, __ATOMIC_RELAXED); - while (true) { - const Value newCount = oldVal - static_cast(1); - if ((oldVal & kAccessRefMask) == 0) { + auto predicate = [](const Value curValue) { + if ((curValue & kAccessRefMask) == 0) { throw exception::RefcountUnderflow( "Trying to decRef with no refcount. 
RefCount Leak!"); } + return true; + }; - if (__atomic_compare_exchange_n(refPtr, &oldVal, newCount, isWeak, - __ATOMIC_ACQ_REL, __ATOMIC_RELAXED)) { - return newCount & kRefMask; - } - if ((++nCASFailures % 4) == 0) { - // this pause takes up to 40 clock cycles on intel and the lock cmpxchgl - // above should take about 100 clock cycles. we pause once every 400 - // cycles or so if we are extremely unlucky - folly::asm_volatile_pause(); - } - } + Value retValue; + auto newValue = [&retValue](const Value curValue) { + retValue = (curValue - static_cast(1)); + return retValue; + }; + + auto updated = atomicUpdateValue(predicate, newValue); + XDCHECK(updated); + + return retValue & kRefMask; } - // Return refcount excluding control bits and flags - Value getAccessRef() const noexcept { return getRaw() & kAccessRefMask; } + // Return refcount excluding moving refcount, control bits and flags. + Value getAccessRef() const noexcept { + auto raw = getRaw(); + auto accessRef = raw & kAccessRefMask; + + if ((raw & getAdminRef()) && accessRef >= 1) { + // if item is moving, ignore the extra ref + return accessRef - static_cast(1); + } else { + return accessRef; + } + } // Return access ref and the admin ref bits Value getRefWithAccessAndAdmin() const noexcept { @@ -246,65 +247,143 @@ class FOLLY_PACK_ATTR RefcountWithFlags { } /** - * The following four functions are used to track whether or not - * an item is currently in the process of being moved. This happens during a - * slab rebalance or resize operation or during eviction. + * The following two functions correspond to whether or not an item is + * currently in the process of being evicted. When item is marked for + * eviction, `kExclusive` bit is set and ref count is zero. * - * An item can only be marked exclusive when `isInMMContainer` returns true - * and the item is not yet marked as exclusive. This operation is atomic. + * An item can only be marked for eviction when `isInMMContainer` and + * `isAccessible` return true and item is not already marked for eviction + * nor moving and the ref count is 0. This operation is atomic. * - * User can also query if an item "isOnlyExclusive". This returns true only - * if the refcount is 0 and only the exclusive bit is set. - * - * Unmarking exclusive does not depend on `isInMMContainer`. - * Unmarking exclusive will also return the refcount at the moment of - * unmarking. 
+ * Unmarking eviction does not depend on `isInMMContainer` nor `isAccessible` */ - bool markExclusive() noexcept { - Value bitMask = getAdminRef(); - Value conditionBitMask = getAdminRef(); - - Value* const refPtr = &refCount_; - unsigned int nCASFailures = 0; - constexpr bool isWeak = false; - Value curValue = __atomic_load_n(refPtr, __ATOMIC_RELAXED); - while (true) { + bool markForEviction() noexcept { + auto predicate = [](const Value curValue) { + Value conditionBitMask = getAdminRef(); const bool flagSet = curValue & conditionBitMask; - const bool alreadyExclusive = curValue & bitMask; + const bool alreadyExclusive = curValue & getAdminRef(); + const bool accessible = curValue & getAdminRef(); + if (!flagSet || alreadyExclusive) { return false; } - - const Value newValue = curValue | bitMask; - if (__atomic_compare_exchange_n(refPtr, &curValue, newValue, isWeak, - __ATOMIC_ACQ_REL, __ATOMIC_RELAXED)) { - XDCHECK(newValue & conditionBitMask); - return true; + if ((curValue & kAccessRefMask) != 0) { + return false; } - - if ((++nCASFailures % 4) == 0) { - // this pause takes up to 40 clock cycles on intel and the lock cmpxchgl - // above should take about 100 clock cycles. we pause once every 400 - // cycles or so if we are extremely unlucky. - folly::asm_volatile_pause(); + if (!accessible) { + return false; } - } + + return true; + }; + + auto newValue = [](const Value curValue) { + return curValue | getAdminRef(); + }; + + return atomicUpdateValue(predicate, newValue); } - Value unmarkExclusive() noexcept { + + Value unmarkForEviction() noexcept { + XDCHECK(isMarkedForEviction()); Value bitMask = ~getAdminRef(); return __atomic_and_fetch(&refCount_, bitMask, __ATOMIC_ACQ_REL) & kRefMask; } - bool isExclusive() const noexcept { - return getRaw() & getAdminRef(); + + bool isMarkedForEviction() const noexcept { + auto raw = getRaw(); + return (raw & getAdminRef()) && ((raw & kAccessRefMask) == 0); } - bool isOnlyExclusive() const noexcept { - // An item is only exclusive when its refcount is zero and only the - // exclusive bit among all the control bits is set. This indicates an item - // is exclusive to the current thread. No other thread is allowed to - // do anything with it. + + /** + * The following functions correspond to whether or not an item is + * currently in the processed of being moved. When moving, internal + * ref count is always >= 1 and `kExclusive` bit is set. getRefCount + * does not return the extra ref (it can return 0). + * + * An item can only be marked moving when `isInMMContainer` returns true + * and item is not already marked for eviction nor moving. + * + * User can also query if an item "isOnlyMoving". This returns true only + * if the refcount is one and only the exlusive bit is set. + * + * Unmarking moving does not depend on `isInMMContainer` + */ + bool markMoving() { + auto predicate = [](const Value curValue) { + Value conditionBitMask = getAdminRef(); + const bool flagSet = curValue & conditionBitMask; + const bool alreadyExclusive = curValue & getAdminRef(); + + if (!flagSet || alreadyExclusive) { + return false; + } + if (UNLIKELY((curValue & kAccessRefMask) == (kAccessRefMask))) { + throw exception::RefcountOverflow("Refcount maxed out."); + } + + return true; + }; + + auto newValue = [](const Value curValue) { + // Set exclusive flag and make the ref count non-zero (to distinguish + // from exclusive case). 
This extra ref will not be reported to the + // user + return (curValue + static_cast(1)) | getAdminRef(); + }; + + return atomicUpdateValue(predicate, newValue); + } + + Value unmarkMoving() noexcept { + XDCHECK(isMoving()); + auto predicate = [](const Value curValue) { + XDCHECK((curValue & kAccessRefMask) != 0); + return true; + }; + + Value retValue; + auto newValue = [&retValue](const Value curValue) { + retValue = + (curValue - static_cast(1)) & ~getAdminRef(); + return retValue; + }; + + auto updated = atomicUpdateValue(predicate, newValue); + XDCHECK(updated); + + return retValue & kRefMask; + } + + bool isMoving() const noexcept { + auto raw = getRaw(); + return (raw & getAdminRef()) && ((raw & kAccessRefMask) != 0); + } + + /** This function attempts to mark item for eviction. + * Can only be called on the item that is moving.*/ + bool markForEvictionWhenMoving() { + XDCHECK(isMoving()); + + auto predicate = [](const Value curValue) { + return (curValue & kAccessRefMask) == 1; + }; + + auto newValue = [](const Value curValue) { + XDCHECK((curValue & kAccessRefMask) == 1); + return (curValue - static_cast(1)); + }; + + return atomicUpdateValue(predicate, newValue); + } + + bool isOnlyMoving() const noexcept { + // An item is only moving when its refcount is one and only the exclusive + // bit among all the control bits is set. This indicates an item is already + // on its way out of cache and does not need to be moved. auto ref = getRefWithAccessAndAdmin(); - bool anyOtherBitSet = ref & ~getAdminRef(); - if (anyOtherBitSet) { + Value valueWithoutExclusiveBit = ref & ~getAdminRef(); + if (valueWithoutExclusiveBit != 1) { return false; } return ref & getAdminRef(); @@ -370,6 +449,39 @@ class FOLLY_PACK_ATTR RefcountWithFlags { } private: + /** + * Helper function to modify refCount_ atomically. + * + * If predicate(currentValue) is true, then it atomically assigns result + * of newValueF(currentValue) to refCount_ and returns true. Otherwise + * returns false and leaves refCount_ unmodified. + */ + template + bool atomicUpdateValue(P&& predicate, F&& newValueF) { + Value* const refPtr = &refCount_; + unsigned int nCASFailures = 0; + constexpr bool isWeak = false; + Value curValue = __atomic_load_n(refPtr, __ATOMIC_RELAXED); + while (true) { + if (!predicate(curValue)) { + return false; + } + + const Value newValue = newValueF(curValue); + if (__atomic_compare_exchange_n(refPtr, &curValue, newValue, isWeak, + __ATOMIC_ACQ_REL, __ATOMIC_RELAXED)) { + return true; + } + + if ((++nCASFailures % 4) == 0) { + // this pause takes up to 40 clock cycles on intel and the lock cmpxchgl + // above should take about 100 clock cycles. we pause once every 400 + // cycles or so if we are extremely unlucky. 
+ folly::asm_volatile_pause(); + } + } + } + template static Value getFlag() noexcept { static_assert(flagBit >= kNumAccessRefBits + kNumAdminRefBits, diff --git a/cachelib/allocator/tests/ItemTest.cpp b/cachelib/allocator/tests/ItemTest.cpp index b0f3a2fdec..70dd1277fe 100644 --- a/cachelib/allocator/tests/ItemTest.cpp +++ b/cachelib/allocator/tests/ItemTest.cpp @@ -83,10 +83,20 @@ TEST(ItemTest, ExpiryTime) { EXPECT_EQ(tenMins, item->getConfiguredTTL()); // Test that writes fail while the item is moving - item->markExclusive(); + result = item->markMoving(); + EXPECT_TRUE(result); + result = item->updateExpiryTime(0); + EXPECT_FALSE(result); + item->unmarkMoving(); + + // Test that writes fail while the item is marked for eviction + item->markAccessible(); + result = item->markForEviction(); + EXPECT_TRUE(result); result = item->updateExpiryTime(0); EXPECT_FALSE(result); - item->unmarkExclusive(); + item->unmarkForEviction(); + item->unmarkAccessible(); // Test that writes fail while the item is not in an MMContainer item->unmarkInMMContainer(); diff --git a/cachelib/allocator/tests/RefCountTest.cpp b/cachelib/allocator/tests/RefCountTest.cpp index b355a48a8e..d05be08c31 100644 --- a/cachelib/allocator/tests/RefCountTest.cpp +++ b/cachelib/allocator/tests/RefCountTest.cpp @@ -30,6 +30,7 @@ class RefCountTest : public AllocTestBase { public: static void testMultiThreaded(); static void testBasic(); + static void testMarkForEvictionAndMoving(); }; void RefCountTest::testMultiThreaded() { @@ -81,7 +82,7 @@ void RefCountTest::testBasic() { ASSERT_EQ(0, ref.getRaw()); ASSERT_FALSE(ref.isInMMContainer()); ASSERT_FALSE(ref.isAccessible()); - ASSERT_FALSE(ref.isExclusive()); + ASSERT_FALSE(ref.isMoving()); ASSERT_FALSE(ref.template isFlagSet()); ASSERT_FALSE(ref.template isFlagSet()); @@ -89,7 +90,7 @@ void RefCountTest::testBasic() { ref.markInMMContainer(); ASSERT_TRUE(ref.isInMMContainer()); ASSERT_FALSE(ref.isAccessible()); - ASSERT_FALSE(ref.isExclusive()); + ASSERT_FALSE(ref.isMoving()); ASSERT_EQ(0, ref.getAccessRef()); ASSERT_FALSE(ref.template isFlagSet()); ASSERT_FALSE(ref.template isFlagSet()); @@ -105,13 +106,13 @@ void RefCountTest::testBasic() { // Incrementing past the max will fail auto rawRef = ref.getRaw(); - ASSERT_FALSE(ref.incRef()); + ASSERT_THROW(ref.incRef(), std::overflow_error); ASSERT_EQ(rawRef, ref.getRaw()); // Bumping up access ref shouldn't affect admin ref and flags ASSERT_TRUE(ref.isInMMContainer()); ASSERT_FALSE(ref.isAccessible()); - ASSERT_FALSE(ref.isExclusive()); + ASSERT_FALSE(ref.isMoving()); ASSERT_EQ(RefcountWithFlags::kAccessRefMask, ref.getAccessRef()); ASSERT_TRUE(ref.template isFlagSet()); ASSERT_FALSE(ref.template isFlagSet()); @@ -128,7 +129,7 @@ void RefCountTest::testBasic() { // Bumping down access ref shouldn't affect admin ref and flags ASSERT_TRUE(ref.isInMMContainer()); ASSERT_FALSE(ref.isAccessible()); - ASSERT_FALSE(ref.isExclusive()); + ASSERT_FALSE(ref.isMoving()); ASSERT_EQ(0, ref.getAccessRef()); ASSERT_TRUE(ref.template isFlagSet()); ASSERT_FALSE(ref.template isFlagSet()); @@ -136,7 +137,7 @@ void RefCountTest::testBasic() { ref.template unSetFlag(); ASSERT_TRUE(ref.isInMMContainer()); ASSERT_FALSE(ref.isAccessible()); - ASSERT_FALSE(ref.isExclusive()); + ASSERT_FALSE(ref.isMoving()); ASSERT_EQ(0, ref.getAccessRef()); ASSERT_FALSE(ref.template isFlagSet()); ASSERT_FALSE(ref.template isFlagSet()); @@ -145,33 +146,119 @@ void RefCountTest::testBasic() { ASSERT_EQ(0, ref.getRaw()); ASSERT_FALSE(ref.isInMMContainer()); 
ASSERT_FALSE(ref.isAccessible()); - ASSERT_FALSE(ref.isExclusive()); + ASSERT_FALSE(ref.isMoving()); ASSERT_EQ(0, ref.getAccessRef()); ASSERT_FALSE(ref.template isFlagSet()); ASSERT_FALSE(ref.template isFlagSet()); // conditionally set flags - ASSERT_FALSE((ref.markExclusive())); + ASSERT_FALSE((ref.markMoving())); ref.markInMMContainer(); - ASSERT_TRUE((ref.markExclusive())); - ASSERT_FALSE((ref.isOnlyExclusive())); + // only first one succeeds + ASSERT_TRUE((ref.markMoving())); + ASSERT_FALSE((ref.markMoving())); ref.unmarkInMMContainer(); + ref.template setFlag(); - // Have no other admin refcount but with a flag still means "isOnlyExclusive" - ASSERT_TRUE((ref.isOnlyExclusive())); + // Have no other admin refcount but with a flag still means "isOnlyMoving" + ASSERT_TRUE((ref.isOnlyMoving())); - // Set some flags and verify that "isOnlyExclusive" does not care about flags + // Set some flags and verify that "isOnlyMoving" does not care about flags ref.markIsChainedItem(); ASSERT_TRUE(ref.isChainedItem()); - ASSERT_TRUE((ref.isOnlyExclusive())); + ASSERT_TRUE((ref.isOnlyMoving())); ref.unmarkIsChainedItem(); ASSERT_FALSE(ref.isChainedItem()); - ASSERT_TRUE((ref.isOnlyExclusive())); + ASSERT_TRUE((ref.isOnlyMoving())); +} + +void RefCountTest::testMarkForEvictionAndMoving() { + { + // cannot mark for eviction when not accessible or not in MMContainer + RefcountWithFlags ref; + ASSERT_FALSE(ref.markForEviction()); + + ref.markInMMContainer(); + ASSERT_FALSE(ref.markForEviction()); + ref.unmarkInMMContainer(); + + ref.markAccessible(); + ASSERT_FALSE(ref.markForEviction()); + } + + { + // can mark for eviction when accessible and in MMContainer + // and unmarkForEviction return value contains admin bits + RefcountWithFlags ref; + ref.markInMMContainer(); + ref.markAccessible(); + ASSERT_TRUE(ref.markForEviction()); + ASSERT_TRUE(ref.unmarkForEviction() > 0); + } + + { + // cannot mark for eviction when moving + RefcountWithFlags ref; + ref.markInMMContainer(); + ref.markAccessible(); + + ASSERT_TRUE(ref.markMoving()); + ASSERT_FALSE(ref.markForEviction()); + + ref.unmarkInMMContainer(); + ref.unmarkAccessible(); + auto ret = ref.unmarkMoving(); + ASSERT_EQ(ret, 0); + } + + { + // cannot mark moving when marked for eviction + RefcountWithFlags ref; + ref.markInMMContainer(); + ref.markAccessible(); + + ASSERT_TRUE(ref.markForEviction()); + ASSERT_FALSE(ref.markMoving()); + + ref.unmarkInMMContainer(); + ref.unmarkAccessible(); + auto ret = ref.unmarkForEviction(); + ASSERT_EQ(ret, 0); + } + + { + // can mark moving when ref count > 0 + RefcountWithFlags ref; + ref.markInMMContainer(); + ref.markAccessible(); + + ref.incRef(); + + ASSERT_TRUE(ref.markMoving()); + + ref.unmarkInMMContainer(); + ref.unmarkAccessible(); + auto ret = ref.unmarkMoving(); + ASSERT_EQ(ret, 1); + } + + { + // cannot mark for eviction when ref count > 0 + RefcountWithFlags ref; + ref.markInMMContainer(); + ref.markAccessible(); + + ref.incRef(); + ASSERT_FALSE(ref.markForEviction()); + } } } // namespace TEST_F(RefCountTest, MutliThreaded) { testMultiThreaded(); } TEST_F(RefCountTest, Basic) { testBasic(); } +TEST_F(RefCountTest, MarkForEvictionAndMoving) { + testMarkForEvictionAndMoving(); +} } // namespace tests } // namespace cachelib } // namespace facebook From c1020dfb6438eb4666bd6ff6723fd03980845ab7 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Fri, 3 Feb 2023 16:02:50 -0800 Subject: [PATCH 30/47] Adds createPutToken and switches findEviction to utilize combined locking. 
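Before the diffstat, a condensed orientation sketch of the combined-locking pattern this change adopts: eviction candidates are now selected inside mmContainer.withEvictionIterator(...) instead of by holding a LockedIterator across the whole attempt. The pickEvictionCandidate wrapper below is an illustrative reduction under assumed simplifications, not the code added by this patch; the real selection logic, including chained items and the NvmCache put token, is in CacheAllocator-inl.h below.

// Illustrative reduction of the new selection loop; error accounting,
// chained-item handling and the NVM put token are deliberately omitted.
Item* pickEvictionCandidate(MMContainer& mmContainer) {
  Item* candidate = nullptr;
  mmContainer.withEvictionIterator([&candidate, &mmContainer](auto&& itr) {
    while (itr) {
      if (itr->markForEviction()) {   // atomically claims a refcount-0 item
        candidate = itr.get();
        mmContainer.remove(itr);      // unlink while still under the lock
        return;
      }
      ++itr;                          // someone holds a handle; try the next
    }
  });
  return candidate;
}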
--- cachelib/allocator/CacheAllocator-inl.h | 613 +++++++----------- cachelib/allocator/CacheAllocator.h | 61 +- cachelib/allocator/MM2Q.h | 1 + cachelib/allocator/nvmcache/NvmCache-inl.h | 17 +- cachelib/allocator/nvmcache/NvmCache.h | 6 +- .../allocator/nvmcache/tests/NvmTestBase.h | 4 +- cachelib/allocator/tests/BaseAllocatorTest.h | 30 +- 7 files changed, 281 insertions(+), 451 deletions(-) diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h index 3678ca131b..9d33e8519a 100644 --- a/cachelib/allocator/CacheAllocator-inl.h +++ b/cachelib/allocator/CacheAllocator-inl.h @@ -834,7 +834,9 @@ CacheAllocator::releaseBackToAllocator(Item& it, // If this chained item is marked as moving, we will not free it. // We must capture the moving state before we do the decRef when - // we know the item must still be valid + // we know the item must still be valid. Item cannot be marked as + // exclusive. Only parent can be marked as such and even parent needs + // to be unmark prior to calling releaseBackToAllocator. const bool wasMoving = head->isMoving(); XDCHECK(!head->isMarkedForEviction()); @@ -882,9 +884,12 @@ CacheAllocator::releaseBackToAllocator(Item& it, } template -void CacheAllocator::incRef(Item& it) { - it.incRef(); - ++handleCount_.tlStats(); +bool CacheAllocator::incRef(Item& it) { + if (it.incRef()) { + ++handleCount_.tlStats(); + return true; + } + return false; } template @@ -904,8 +909,12 @@ CacheAllocator::acquire(Item* it) { SCOPE_FAIL { stats_.numRefcountOverflow.inc(); }; - incRef(*it); - return WriteHandle{it, *this}; + if (LIKELY(incRef(*it))) { + return WriteHandle{it, *this}; + } else { + // item is being evicted + return WriteHandle{}; + } } template @@ -1131,7 +1140,7 @@ bool CacheAllocator::moveRegularItem(Item& oldItem, // it is unsafe to replace the old item with a new one, so we should // also abort. if (!accessContainer_->replaceIf(oldItem, *newItemHdl, - itemExclusivePredicate)) { + itemSlabMovePredicate)) { return false; } @@ -1184,14 +1193,14 @@ bool CacheAllocator::moveChainedItem(ChainedItem& oldItem, return false; } - const auto parentKey = oldItem.getParentItem(compressor_).getKey(); - - // Grab lock to prevent anyone else from modifying the chain + auto& expectedParent = oldItem.getParentItem(compressor_); + const auto parentKey = expectedParent.getKey(); auto l = chainedItemLocks_.lockExclusive(parentKey); + // verify old item under the lock auto parentHandle = validateAndGetParentHandleForChainedMoveLocked(oldItem, parentKey); - if (!parentHandle) { + if (!parentHandle || &expectedParent != parentHandle.get()) { return false; } @@ -1233,6 +1242,28 @@ bool CacheAllocator::moveChainedItem(ChainedItem& oldItem, return true; } +template +typename CacheAllocator::NvmCacheT::PutToken +CacheAllocator::createPutToken(Item& item) { + const bool evictToNvmCache = shouldWriteToNvmCache(item); + return evictToNvmCache ? nvmCache_->createPutToken(item.getKey()) + : typename NvmCacheT::PutToken{}; +} + +template +void CacheAllocator::unlinkItemForEviction(Item& it) { + XDCHECK(it.isMarkedForEviction()); + XDCHECK(it.getRefCount() == 0); + + accessContainer_->remove(it); + removeFromMMContainer(it); + + // Since we managed to mark the item for eviction we must be the only + // owner of the item. 
+ const auto ref = it.unmarkForEviction(); + XDCHECK(ref == 0u); +} + template typename CacheAllocator::Item* CacheAllocator::findEviction(PoolId pid, ClassId cid) { @@ -1241,76 +1272,102 @@ CacheAllocator::findEviction(PoolId pid, ClassId cid) { // Keep searching for a candidate until we were able to evict it // or until the search limit has been exhausted unsigned int searchTries = 0; - auto itr = mmContainer.getEvictionIterator(); while ((config_.evictionSearchTries == 0 || - config_.evictionSearchTries > searchTries) && - itr) { - ++searchTries; - (*stats_.evictionAttempts)[pid][cid].inc(); + config_.evictionSearchTries > searchTries)) { + Item* toRecycle = nullptr; + Item* candidate = nullptr; + typename NvmCacheT::PutToken token; + + mmContainer.withEvictionIterator([this, pid, cid, &candidate, &toRecycle, + &searchTries, &mmContainer, + &token](auto&& itr) { + if (!itr) { + ++searchTries; + (*stats_.evictionAttempts)[pid][cid].inc(); + return; + } + + while ((config_.evictionSearchTries == 0 || + config_.evictionSearchTries > searchTries) && + itr) { + ++searchTries; + (*stats_.evictionAttempts)[pid][cid].inc(); + + auto* toRecycle_ = itr.get(); + auto* candidate_ = + toRecycle_->isChainedItem() + ? &toRecycle_->asChainedItem().getParentItem(compressor_) + : toRecycle_; + + token = createPutToken(*candidate_); + + if (shouldWriteToNvmCache(*candidate_) && !token.isValid()) { + stats_.evictFailConcurrentFill.inc(); + } else if (candidate_->markForEviction()) { + XDCHECK(candidate_->isMarkedForEviction()); + // markForEviction to make sure no other thead is evicting the item + // nor holding a handle to that item + + toRecycle = toRecycle_; + candidate = candidate_; + + // Check if parent changed for chained items - if yes, we cannot + // remove the child from the mmContainer as we will not be evicting + // it. We could abort right here, but we need to cleanup in case + // unmarkForEviction() returns 0 - so just go through normal path. + if (!toRecycle_->isChainedItem() || + &toRecycle->asChainedItem().getParentItem(compressor_) == + candidate) + mmContainer.remove(itr); + return; + } - Item* toRecycle = itr.get(); + if (candidate_->hasChainedItem()) { + stats_.evictFailParentAC.inc(); + } else { + stats_.evictFailAC.inc(); + } - Item* candidate = - toRecycle->isChainedItem() - ? &toRecycle->asChainedItem().getParentItem(compressor_) - : toRecycle; + ++itr; + XDCHECK(toRecycle == nullptr); + XDCHECK(candidate == nullptr); + } + }); - // make sure no other thead is evicting the item - if (candidate->getRefCount() != 0 || !candidate->markMoving()) { - ++itr; + if (!toRecycle) continue; - } + + XDCHECK(toRecycle); + XDCHECK(candidate); // for chained items, the ownership of the parent can change. We try to // evict what we think as parent and see if the eviction of parent // recycles the child we intend to. - bool evictionSuccessful = false; - { - auto toReleaseHandle = - itr->isChainedItem() - ? advanceIteratorAndTryEvictChainedItem(itr) - : advanceIteratorAndTryEvictRegularItem(mmContainer, itr); - evictionSuccessful = toReleaseHandle != nullptr; - // destroy toReleaseHandle. The item won't be released to allocator - // since we marked for eviction. - } - - const auto ref = candidate->unmarkMoving(); - if (ref == 0u) { - // Invalidate iterator since later on we may use this mmContainer - // again, which cannot be done unless we drop this iterator - itr.destroy(); - - // recycle the item. it's safe to do so, even if toReleaseHandle was - // NULL. 
If `ref` == 0 then it means that we are the last holder of - // that item. - if (candidate->hasChainedItem()) { - (*stats_.chainedItemEvictions)[pid][cid].inc(); - } else { - (*stats_.regularItemEvictions)[pid][cid].inc(); - } + unlinkItemForEviction(*candidate); + XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); - if (auto eventTracker = getEventTracker()) { - eventTracker->record(AllocatorApiEvent::DRAM_EVICT, candidate->getKey(), - AllocatorApiResult::EVICTED, candidate->getSize(), - candidate->getConfiguredTTL().count()); - } + if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate)) { + nvmCache_->put(*candidate, std::move(token)); + } - // check if by releasing the item we intend to, we actually - // recycle the candidate. - if (ReleaseRes::kRecycled == - releaseBackToAllocator(*candidate, RemoveContext::kEviction, - /* isNascent */ false, toRecycle)) { - return toRecycle; - } + if (candidate->hasChainedItem()) { + (*stats_.chainedItemEvictions)[pid][cid].inc(); } else { - XDCHECK(!evictionSuccessful); + (*stats_.regularItemEvictions)[pid][cid].inc(); + } + + if (auto eventTracker = getEventTracker()) { + eventTracker->record(AllocatorApiEvent::DRAM_EVICT, candidate->getKey(), + AllocatorApiResult::EVICTED, candidate->getSize(), + candidate->getConfiguredTTL().count()); } - // If we destroyed the itr to possibly evict and failed, we restart - // from the beginning again - if (!itr) { - itr.resetToBegin(); + // check if by releasing the item we intend to, we actually + // recycle the candidate. + auto ret = releaseBackToAllocator(*candidate, RemoveContext::kEviction, + /* isNascent */ false, toRecycle); + if (ret == ReleaseRes::kRecycled) { + return toRecycle; } } return nullptr; @@ -1454,7 +1511,7 @@ bool CacheAllocator::pushToNvmCacheFromRamForTesting( if (handle && nvmCache_ && shouldWriteToNvmCache(*handle) && shouldWriteToNvmCacheExclusive(*handle)) { - nvmCache_->put(handle, nvmCache_->createPutToken(handle->getKey())); + nvmCache_->put(*handle, nvmCache_->createPutToken(handle->getKey())); return true; } return false; @@ -1844,13 +1901,13 @@ std::vector CacheAllocator::dumpEvictionIterator( std::vector content; auto& mm = *mmContainers_[pid][cid]; - auto evictItr = mm.getEvictionIterator(); - size_t i = 0; - while (evictItr && i < numItems) { - content.push_back(evictItr->toString()); - ++evictItr; - ++i; - } + + mm.withEvictionIterator([&content, numItems](auto&& itr) { + while (itr && content.size() < numItems) { + content.push_back(itr->toString()); + ++itr; + } + }); return content; } @@ -2359,6 +2416,7 @@ void CacheAllocator::releaseSlabImpl( // 3. If 2 is successful, Move or Evict // 4. Move on to the next item if current item is freed for (auto alloc : releaseContext.getActiveAllocations()) { + auto startTimeSec = util::getCurrentTimeSec(); // Need to mark an item for release before proceeding // If we can't mark as moving, it means the item is already freed const bool isAlreadyFreed = @@ -2399,7 +2457,7 @@ bool CacheAllocator::moveForSlabRelease( bool isMoved = false; auto startTime = util::getCurrentTimeSec(); - WriteHandle newItemHdl = allocateNewItemForOldItem(oldItem); + WriteHandle newItemHdl{}; for (unsigned int itemMovingAttempts = 0; itemMovingAttempts < config_.movingTries; @@ -2415,8 +2473,15 @@ bool CacheAllocator::moveForSlabRelease( return true; } + throttleWith(throttler, [&] { + XLOGF(WARN, + "Spent {} seconds, slab release still trying to move Item: {}. 
" + "Pool: {}, Class: {}.", + util::getCurrentTimeSec() - startTime, oldItem.toString(), + ctx.getPoolId(), ctx.getClassId()); + }); + if (!newItemHdl) { - // try to allocate again if it previously wasn't successful newItemHdl = allocateNewItemForOldItem(oldItem); } @@ -2427,14 +2492,6 @@ bool CacheAllocator::moveForSlabRelease( break; } } - - throttleWith(throttler, [&] { - XLOGF(WARN, - "Spent {} seconds, slab release still trying to move Item: {}. " - "Pool: {}, Class: {}.", - util::getCurrentTimeSec() - startTime, oldItem.toString(), - ctx.getPoolId(), ctx.getClassId()); - }); } // Return false if we've exhausted moving tries. @@ -2456,6 +2513,8 @@ bool CacheAllocator::moveForSlabRelease( ctx.getPoolId(), ctx.getClassId()); }); } + auto ref = oldItem.unmarkMoving(); + XDCHECK_EQ(ref, 0); const auto allocInfo = allocator_->getAllocInfo(oldItem.getMemory()); allocator_->free(&oldItem); @@ -2466,10 +2525,10 @@ bool CacheAllocator::moveForSlabRelease( } template -typename CacheAllocator::ReadHandle +typename CacheAllocator::WriteHandle CacheAllocator::validateAndGetParentHandleForChainedMoveLocked( const ChainedItem& item, const Key& parentKey) { - ReadHandle parentHandle{}; + WriteHandle parentHandle{}; try { parentHandle = findInternal(parentKey); // If the parent is not the same as the parent of the chained item, @@ -2488,6 +2547,7 @@ CacheAllocator::validateAndGetParentHandleForChainedMoveLocked( template typename CacheAllocator::WriteHandle CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { + XDCHECK(oldItem.isMoving()); if (oldItem.isChainedItem()) { const auto& oldChainedItem = oldItem.asChainedItem(); const auto parentKey = oldChainedItem.getParentItem(compressor_).getKey(); @@ -2501,8 +2561,8 @@ CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { return {}; } - // Set up the destination for the move. Since oldChainedItem would be - // marked as moving, it won't be picked for eviction. + // Set up the destination for the move. Since oldChainedItem would + // be marked as moving, it won't be picked for eviction. auto newItemHdl = allocateChainedItemInternal(parentHandle, oldChainedItem.getSize()); if (!newItemHdl) { @@ -2513,7 +2573,7 @@ CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { auto parentPtr = parentHandle.getInternal(); XDCHECK_EQ(reinterpret_cast(parentPtr), reinterpret_cast( - &oldChainedItem.getParentItem(compressor_))); + &newItemHdl->asChainedItem().getParentItem(compressor_))); return newItemHdl; } @@ -2581,54 +2641,9 @@ void CacheAllocator::evictForSlabRelease( const SlabReleaseContext& ctx, Item& item, util::Throttler& throttler) { auto startTime = util::getCurrentTimeSec(); while (true) { + XDCHECK(item.isMoving()); stats_.numEvictionAttempts.inc(); - // if the item is already in a state where only the exclusive bit is set, - // nothing needs to be done. We simply need to call unmarkMoving and free - // the item. - if (item.isOnlyMoving()) { - item.unmarkMoving(); - const auto res = - releaseBackToAllocator(item, RemoveContext::kNormal, false); - XDCHECK(ReleaseRes::kReleased == res); - return; - } - - // Since we couldn't move, we now evict this item. Owning handle will be - // the item's handle for regular/normal items and will be the parent - // handle for chained items. - auto owningHandle = - item.isChainedItem() - ? evictChainedItemForSlabRelease(item.asChainedItem()) - : evictNormalItemForSlabRelease(item); - - // we managed to evict the corresponding owner of the item and have the - // last handle for the owner. 
- if (owningHandle) { - const auto allocInfo = - allocator_->getAllocInfo(static_cast(&item)); - if (owningHandle->hasChainedItem()) { - (*stats_.chainedItemEvictions)[allocInfo.poolId][allocInfo.classId] - .inc(); - } else { - (*stats_.regularItemEvictions)[allocInfo.poolId][allocInfo.classId] - .inc(); - } - - stats_.numEvictionSuccesses.inc(); - - // we have the last handle. no longer need to hold on to the exclusive bit - item.unmarkMoving(); - - // manually decrement the refcount to call releaseBackToAllocator - const auto ref = decRef(*owningHandle); - XDCHECK(ref == 0); - const auto res = releaseBackToAllocator(*owningHandle.release(), - RemoveContext::kEviction, false); - XDCHECK(res == ReleaseRes::kReleased); - return; - } - if (shutDownInProgress_) { item.unmarkMoving(); allocator_->abortSlabRelease(ctx); @@ -2650,266 +2665,102 @@ void CacheAllocator::evictForSlabRelease( .toString()) : ""); }); - } -} - -template -typename CacheAllocator::WriteHandle -CacheAllocator::advanceIteratorAndTryEvictRegularItem( - MMContainer& mmContainer, EvictionIterator& itr) { - // we should flush this to nvmcache if it is not already present in nvmcache - // and the item is not expired. - Item& item = *itr; - const bool evictToNvmCache = shouldWriteToNvmCache(item); - - auto token = evictToNvmCache ? nvmCache_->createPutToken(item.getKey()) - : typename NvmCacheT::PutToken{}; - - // record the in-flight eviciton. If not, we move on to next item to avoid - // stalling eviction. - if (evictToNvmCache && !token.isValid()) { - ++itr; - stats_.evictFailConcurrentFill.inc(); - return WriteHandle{}; - } - - // If there are other accessors, we should abort. Acquire a handle here since - // if we remove the item from both access containers and mm containers - // below, we will need a handle to ensure proper cleanup in case we end up - // not evicting this item - auto evictHandle = accessContainer_->removeIf(item, &itemExclusivePredicate); - if (!evictHandle) { - ++itr; - stats_.evictFailAC.inc(); - return evictHandle; - } - - mmContainer.remove(itr); - XDCHECK_EQ(reinterpret_cast(evictHandle.get()), - reinterpret_cast(&item)); - XDCHECK(!evictHandle->isInMMContainer()); - XDCHECK(!evictHandle->isAccessible()); - - // Invalidate iterator since later on if we are not evicting this - // item, we may need to rely on the handle we created above to ensure - // proper cleanup if the item's raw refcount has dropped to 0. - // And since this item may be a parent item that has some child items - // in this very same mmContainer, we need to make sure we drop this - // exclusive iterator so we can gain access to it when we're cleaning - // up the child items - itr.destroy(); - // Ensure that there are no accessors after removing from the access - // container - XDCHECK(evictHandle->getRefCount() == 1); - - if (evictToNvmCache && shouldWriteToNvmCacheExclusive(item)) { - XDCHECK(token.isValid()); - nvmCache_->put(evictHandle, std::move(token)); - } - return evictHandle; -} - -template -typename CacheAllocator::WriteHandle -CacheAllocator::advanceIteratorAndTryEvictChainedItem( - EvictionIterator& itr) { - XDCHECK(itr->isChainedItem()); - - ChainedItem* candidate = &itr->asChainedItem(); - ++itr; - - // The parent could change at any point through transferChain. However, if - // that happens, we would realize that the releaseBackToAllocator return - // kNotRecycled and we would try another chained item, leading to transient - // failure. 
- auto& parent = candidate->getParentItem(compressor_); - - const bool evictToNvmCache = shouldWriteToNvmCache(parent); - - auto token = evictToNvmCache ? nvmCache_->createPutToken(parent.getKey()) - : typename NvmCacheT::PutToken{}; - - // if token is invalid, return. iterator is already advanced. - if (evictToNvmCache && !token.isValid()) { - stats_.evictFailConcurrentFill.inc(); - return WriteHandle{}; - } - - // check if the parent exists in the hashtable and refcount is drained. - auto parentHandle = - accessContainer_->removeIf(parent, &itemExclusivePredicate); - if (!parentHandle) { - stats_.evictFailParentAC.inc(); - return parentHandle; - } - - // Invalidate iterator since later on we may use the mmContainer - // associated with this iterator which cannot be done unless we - // drop this iterator - // - // This must be done once we know the parent is not nullptr. - // Since we can very well be the last holder of this parent item, - // which may have a chained item that is linked in this MM container. - itr.destroy(); - - // Ensure we have the correct parent and we're the only user of the - // parent, then free it from access container. Otherwise, we abort - XDCHECK_EQ(reinterpret_cast(&parent), - reinterpret_cast(parentHandle.get())); - XDCHECK_EQ(1u, parent.getRefCount()); - - removeFromMMContainer(*parentHandle); - - XDCHECK(!parent.isInMMContainer()); - XDCHECK(!parent.isAccessible()); - - if (evictToNvmCache && shouldWriteToNvmCacheExclusive(*parentHandle)) { - XDCHECK(token.isValid()); - XDCHECK(parentHandle->hasChainedItem()); - nvmCache_->put(parentHandle, std::move(token)); - } - - return parentHandle; -} - -template -typename CacheAllocator::WriteHandle -CacheAllocator::evictNormalItemForSlabRelease(Item& item) { - XDCHECK(item.isMoving()); - - if (item.isOnlyMoving()) { - return WriteHandle{}; - } - - auto predicate = [](const Item& it) { return it.getRefCount() == 0; }; - - const bool evictToNvmCache = shouldWriteToNvmCache(item); - auto token = evictToNvmCache ? nvmCache_->createPutToken(item.getKey()) - : typename NvmCacheT::PutToken{}; - - // We remove the item from both access and mm containers. It doesn't matter - // if someone else calls remove on the item at this moment, the item cannot - // be freed as long as it's marked for eviction. - auto handle = accessContainer_->removeIf(item, std::move(predicate)); - - if (!handle) { - return handle; - } + // if the item is already in a state where only the exclusive bit is set, + // nothing needs to be done. We simply need to call unmarkMoving and free + // the item. + if (item.isOnlyMoving()) { + item.unmarkMoving(); + const auto res = + releaseBackToAllocator(item, RemoveContext::kNormal, false); + XDCHECK(ReleaseRes::kReleased == res); + return; + } - XDCHECK_EQ(reinterpret_cast(handle.get()), - reinterpret_cast(&item)); - XDCHECK_EQ(1u, handle->getRefCount()); - removeFromMMContainer(item); + typename NvmCacheT::PutToken token; + Item* evicted; + if (item.isChainedItem()) { + auto& expectedParent = item.asChainedItem().getParentItem(compressor_); + const std::string parentKey = expectedParent.getKey().str(); + auto l = chainedItemLocks_.lockExclusive(parentKey); + + // check if the child is still in mmContainer and the expected parent is + // valid under the chained item lock. 
+ if (expectedParent.getKey() != parentKey || !item.isInMMContainer() || + item.isOnlyMoving() || + &expectedParent != &item.asChainedItem().getParentItem(compressor_) || + !expectedParent.isAccessible() || !expectedParent.hasChainedItem()) { + continue; + } - // now that we are the only handle and we actually removed something from - // the RAM cache, we enqueue it to nvmcache. - if (evictToNvmCache && shouldWriteToNvmCacheExclusive(item)) { - nvmCache_->put(handle, std::move(token)); - } + // search if the child is present in the chain + { + auto parentHandle = findInternal(parentKey); + if (!parentHandle || parentHandle != &expectedParent) { + continue; + } - return handle; -} + ChainedItem* head = nullptr; + { // scope for the handle + auto headHandle = findChainedItem(expectedParent); + head = headHandle ? &headHandle->asChainedItem() : nullptr; + } -template -typename CacheAllocator::WriteHandle -CacheAllocator::evictChainedItemForSlabRelease(ChainedItem& child) { - XDCHECK(child.isMoving()); - - // We have the child marked as moving, but dont know anything about the - // state of the parent. Unlike the case of regular eviction where we are - // sure that the child is inside the MMContainer, ensuring its parent is - // valid, we can not make any assumptions here. We try to find the parent - // first through the access container and then verify that the parent's - // chain points to the child before cleaning up the parent. If the parent - // was in the process of being re-allocated or child was being removed - // concurrently, we would synchronize here on one of the checks. - Item& expectedParent = child.getParentItem(compressor_); - - // Grab exclusive lock since we are modifying the chain. at this point, we - // dont know the state of the parent. so we need to do some validity checks - // after we have the chained item lock to ensure that we got the lock off of - // a valid state. - const std::string parentKey = expectedParent.getKey().str(); - auto l = chainedItemLocks_.lockExclusive(parentKey); + bool found = false; + while (head) { + if (head == &item) { + found = true; + break; + } + head = head->getNext(compressor_); + } - // check if the child is still in mmContainer and the expected parent is - // valid under the chained item lock. - if (expectedParent.getKey() != parentKey || !child.isInMMContainer() || - child.isOnlyMoving() || - &expectedParent != &child.getParentItem(compressor_) || - !expectedParent.isAccessible() || !expectedParent.hasChainedItem()) { - return {}; - } + if (!found) { + continue; + } + } - // search if the child is present in the chain - auto parentHandle = findInternal(parentKey); - if (!parentHandle || parentHandle != &expectedParent) { - return {}; - } + evicted = &expectedParent; - ChainedItem* head = nullptr; - { // scope for the handle - auto headHandle = findChainedItem(expectedParent); - head = headHandle ? 
&headHandle->asChainedItem() : nullptr; - } + token = createPutToken(*evicted); + if (evicted->markForEviction()) { + // unmark the child so it will be freed + item.unmarkMoving(); + unlinkItemForEviction(*evicted); + } else { + continue; + } + } else { + evicted = &item; - bool found = false; - while (head) { - if (head == &child) { - found = true; - break; + token = createPutToken(*evicted); + if (evicted->markForEvictionWhenMoving()) { + unlinkItemForEviction(*evicted); + } else { + continue; + } } - head = head->getNext(compressor_); - } - - if (!found) { - return {}; - } - - // if we found the child in the parent's chain, we remove it and ensure that - // the handle we obtained was the last one. Before that, create a put token - // to guard any racing cache find to avoid item re-appearing in NvmCache. - const bool evictToNvmCache = shouldWriteToNvmCache(expectedParent); - - auto token = evictToNvmCache - ? nvmCache_->createPutToken(expectedParent.getKey()) - : typename NvmCacheT::PutToken{}; - if (!accessContainer_->removeIf(expectedParent, - parentEvictForSlabReleasePredicate)) { - return {}; - } - - // at this point, we should be the last handle owner - XDCHECK_EQ(1u, parentHandle->getRefCount()); - - // We remove the parent from both access and mm containers. It doesn't - // matter if someone else calls remove on the parent at this moment, it - // cannot be freed since we hold an active item handle - removeFromMMContainer(*parentHandle); + if (token.isValid() && shouldWriteToNvmCacheExclusive(*evicted)) { + nvmCache_->put(*evicted, std::move(token)); + } - // In case someone else had removed this chained item from its parent by now - // So we check again to see if it has been unlinked from its parent - if (!child.isInMMContainer() || child.isOnlyMoving()) { - return {}; - } + const auto allocInfo = + allocator_->getAllocInfo(static_cast(evicted)); + if (evicted->hasChainedItem()) { + (*stats_.chainedItemEvictions)[allocInfo.poolId][allocInfo.classId].inc(); + } else { + (*stats_.regularItemEvictions)[allocInfo.poolId][allocInfo.classId].inc(); + } - // check after removing from the MMContainer that the parent is still not - // being marked as moving. If parent is moving, it will release the child - // item and we will wait for that. - if (parentHandle->isMoving()) { - return {}; - } + stats_.numEvictionSuccesses.inc(); - // now that we are the only handle and we actually removed something from - // the RAM cache, we enqueue it to nvmcache. 
- if (evictToNvmCache && shouldWriteToNvmCacheExclusive(*parentHandle)) { - DCHECK(parentHandle->hasChainedItem()); - nvmCache_->put(parentHandle, std::move(token)); + XDCHECK(evicted->getRefCount() == 0); + const auto res = + releaseBackToAllocator(*evicted, RemoveContext::kEviction, false); + XDCHECK(res == ReleaseRes::kReleased); + return; } - - return parentHandle; } template diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index 6c33772dac..aeaa90beca 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -1308,7 +1308,7 @@ class CacheAllocator : public CacheBase { private: // wrapper around Item's refcount and active handle tracking - FOLLY_ALWAYS_INLINE void incRef(Item& it); + FOLLY_ALWAYS_INLINE bool incRef(Item& it); FOLLY_ALWAYS_INLINE RefcountWithFlags::Value decRef(Item& it); // drops the refcount and if needed, frees the allocation back to the memory @@ -1359,6 +1359,12 @@ class CacheAllocator : public CacheBase { bool nascent = false, const Item* toRecycle = nullptr); + // Must be called by the thread which called markForEviction and + // succeeded. After this call, the item is unlinked from Access and + // MM Containers. The item is no longer marked as exclusive and it's + // ref count is 0 - it's available for recycling. + void unlinkItemForEviction(Item& it); + // acquires an handle on the item. returns an empty handle if it is null. // @param it pointer to an item // @return WriteHandle return a handle to this item @@ -1448,17 +1454,17 @@ class CacheAllocator : public CacheBase { // @return handle to the parent item if the validations pass // otherwise, an empty Handle is returned. // - ReadHandle validateAndGetParentHandleForChainedMoveLocked( + WriteHandle validateAndGetParentHandleForChainedMoveLocked( const ChainedItem& item, const Key& parentKey); // Given an existing item, allocate a new one for the // existing one to later be moved into. // - // @param oldItem the item we want to allocate a new item for + // @param item reference to the item we want to allocate a new item for // // @return handle to the newly allocated item // - WriteHandle allocateNewItemForOldItem(const Item& oldItem); + WriteHandle allocateNewItemForOldItem(const Item& item); // internal helper that grabs a refcounted handle to the item. This does // not record the access to reflect in the mmContainer. @@ -1512,7 +1518,7 @@ class CacheAllocator : public CacheBase { // callback is responsible for copying the contents and fixing the semantics // of chained item. // - // @param oldItem Reference to the item being moved + // @param oldItem item being moved // @param newItemHdl Reference to the handle of the new item being moved into // // @return true If the move was completed, and the containers were updated @@ -1662,25 +1668,6 @@ class CacheAllocator : public CacheBase { using EvictionIterator = typename MMContainer::LockedIterator; - // Advance the current iterator and try to evict a regular item - // - // @param mmContainer the container to look for evictions. - // @param itr iterator holding the item - // - // @return valid handle to regular item on success. This will be the last - // handle to the item. On failure an empty handle. 
- WriteHandle advanceIteratorAndTryEvictRegularItem(MMContainer& mmContainer, - EvictionIterator& itr); - - // Advance the current iterator and try to evict a chained item - // Iterator may also be reset during the course of this function - // - // @param itr iterator holding the item - // - // @return valid handle to the parent item on success. This will be the last - // handle to the item - WriteHandle advanceIteratorAndTryEvictChainedItem(EvictionIterator& itr); - // Deserializer CacheAllocatorMetadata and verify the version // // @param deserializer Deserializer object @@ -1765,13 +1752,14 @@ class CacheAllocator : public CacheBase { // // // @param ctx slab release context - // @param item old item to be moved elsewhere + // @param oldItem old item to be moved elsewhere + // @param handle handle to the item or to it's parent (if chained) // @param throttler slow this function down as not to take too much cpu // // @return true if the item has been moved // false if we have exhausted moving attempts bool moveForSlabRelease(const SlabReleaseContext& ctx, - Item& item, + Item& oldItem, util::Throttler& throttler); // "Move" (by copying) the content in this item to another memory @@ -1794,18 +1782,7 @@ class CacheAllocator : public CacheBase { Item& item, util::Throttler& throttler); - // Helper function to evict a normal item for slab release - // - // @return last handle for corresponding to item on success. empty handle on - // failure. caller can retry if needed. - WriteHandle evictNormalItemForSlabRelease(Item& item); - - // Helper function to evict a child item for slab release - // As a side effect, the parent item is also evicted - // - // @return last handle to the parent item of the child on success. empty - // handle on failure. caller can retry. - WriteHandle evictChainedItemForSlabRelease(ChainedItem& item); + typename NvmCacheT::PutToken createPutToken(Item& item); // Helper function to remove a item if expired. // @@ -1928,18 +1905,14 @@ class CacheAllocator : public CacheBase { std::optional saveNvmCache(); void saveRamCache(); - static bool itemExclusivePredicate(const Item& item) { - return item.getRefCount() == 0; + static bool itemSlabMovePredicate(const Item& item) { + return item.isMoving() && item.getRefCount() == 0; } static bool itemExpiryPredicate(const Item& item) { return item.getRefCount() == 1 && item.isExpired(); } - static bool parentEvictForSlabReleasePredicate(const Item& item) { - return item.getRefCount() == 1 && !item.isMoving(); - } - std::unique_ptr createDeserializer(); // Execute func on each item. 
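// For orientation, the reworked eviction path above boils down to the
// following contract (a sketch only; the real logic lives in the hunks above
// and in the helpers declared here):
//
//   1. createPutToken(item)        - taken while the item is still linked, so
//                                    NvmCache can fence racing lookups for the
//                                    same key.
//   2. markForEviction() /         - marks the parent of a chained candidate,
//      markForEvictionWhenMoving()   or a regular candidate that is already
//                                    marked moving; if marking fails, another
//                                    thread won the race and the caller moves
//                                    on to the next candidate.
//   3. unlinkItemForEviction(item) - removes the item from the Access and MM
//                                    containers; afterwards its ref count is 0
//                                    and it is ready for recycling.
//   4. nvmCache_->put(item, token) - only when the token is valid and
//                                    shouldWriteToNvmCacheExclusive() allows
//                                    demoting the item to NVM.
//   5. releaseBackToAllocator(item, RemoveContext::kEviction, ...) hands the
//      memory back to the allocator.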
`func` can throw exception but must ensure diff --git a/cachelib/allocator/MM2Q.h b/cachelib/allocator/MM2Q.h index 982eca21f9..9c5cac834c 100644 --- a/cachelib/allocator/MM2Q.h +++ b/cachelib/allocator/MM2Q.h @@ -68,6 +68,7 @@ class MM2Q { enum LruType { Warm, WarmTail, Hot, Cold, ColdTail, NumTypes }; // Config class for MM2Q + // TODO: implement support for useCombinedLockForIterators struct Config { // Create from serialized config explicit Config(SerializationConfigType configState) diff --git a/cachelib/allocator/nvmcache/NvmCache-inl.h b/cachelib/allocator/nvmcache/NvmCache-inl.h index b79712e607..3cef6b7ad0 100644 --- a/cachelib/allocator/nvmcache/NvmCache-inl.h +++ b/cachelib/allocator/nvmcache/NvmCache-inl.h @@ -460,19 +460,18 @@ uint32_t NvmCache::getStorageSizeInNvm(const Item& it) { } template -std::unique_ptr NvmCache::makeNvmItem(const WriteHandle& hdl) { - const auto& item = *hdl; +std::unique_ptr NvmCache::makeNvmItem(const Item& item) { auto poolId = cache_.getAllocInfo((void*)(&item)).poolId; if (item.isChainedItem()) { throw std::invalid_argument(folly::sformat( - "Chained item can not be flushed separately {}", hdl->toString())); + "Chained item can not be flushed separately {}", item.toString())); } auto chainedItemRange = - CacheAPIWrapperForNvm::viewAsChainedAllocsRange(cache_, *hdl); + CacheAPIWrapperForNvm::viewAsChainedAllocsRange(cache_, item); if (config_.encodeCb && !config_.encodeCb(EncodeDecodeContext{ - *(hdl.getInternal()), chainedItemRange})) { + const_cast(item), chainedItemRange})) { return nullptr; } @@ -496,12 +495,10 @@ std::unique_ptr NvmCache::makeNvmItem(const WriteHandle& hdl) { } template -void NvmCache::put(WriteHandle& hdl, PutToken token) { +void NvmCache::put(Item& item, PutToken token) { util::LatencyTracker tracker(stats().nvmInsertLatency_); - HashedKey hk{hdl->getKey()}; + HashedKey hk{item.getKey()}; - XDCHECK(hdl); - auto& item = *hdl; // for regular items that can only write to nvmcache upon eviction, we // should not be recording a write for an nvmclean item unless it is marked // as evicted from nvmcache. @@ -526,7 +523,7 @@ void NvmCache::put(WriteHandle& hdl, PutToken token) { return; } - auto nvmItem = makeNvmItem(hdl); + auto nvmItem = makeNvmItem(item); if (!nvmItem) { stats().numNvmPutEncodeFailure.inc(); return; diff --git a/cachelib/allocator/nvmcache/NvmCache.h b/cachelib/allocator/nvmcache/NvmCache.h index c9f5c753f0..245431f7d0 100644 --- a/cachelib/allocator/nvmcache/NvmCache.h +++ b/cachelib/allocator/nvmcache/NvmCache.h @@ -158,11 +158,11 @@ class NvmCache { PutToken createPutToken(folly::StringPiece key); // store the given item in navy - // @param hdl handle to cache item. should not be null + // @param item cache item // @param token the put token for the item. this must have been // obtained before enqueueing the put to maintain // consistency - void put(WriteHandle& hdl, PutToken token); + void put(Item& item, PutToken token); // returns the current state of whether nvmcache is enabled or not. nvmcache // can be disabled if the backend implementation ends up in a corrupt state @@ -286,7 +286,7 @@ class NvmCache { // returns true if there is tombstone entry for the key. bool hasTombStone(HashedKey hk); - std::unique_ptr makeNvmItem(const WriteHandle& handle); + std::unique_ptr makeNvmItem(const Item& item); // wrap an item into a blob for writing into navy. 
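// With this signature change, callers hand over the Item itself together with
// a token created up front, e.g. (mirroring the updated test helper in
// NvmTestBase.h further down; `nvmCache` and `handle` are assumed to be a
// live NvmCache pointer and a WriteHandle):
//
//   auto token = nvmCache->createPutToken(handle->getKey());
//   nvmCache->put(*handle, std::move(token));
//
// The token still has to exist before the item becomes unreachable in RAM so
// that a racing lookup cannot make a stale copy re-appear in NvmCache.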
Blob makeBlob(const Item& it); diff --git a/cachelib/allocator/nvmcache/tests/NvmTestBase.h b/cachelib/allocator/nvmcache/tests/NvmTestBase.h index fd88875fa9..70f00f2e52 100644 --- a/cachelib/allocator/nvmcache/tests/NvmTestBase.h +++ b/cachelib/allocator/nvmcache/tests/NvmTestBase.h @@ -108,7 +108,7 @@ class NvmCacheTest : public testing::Test { void pushToNvmCacheFromRamForTesting(WriteHandle& handle) { auto nvmCache = getNvmCache(); if (nvmCache) { - nvmCache->put(handle, nvmCache->createPutToken(handle->getKey())); + nvmCache->put(*handle, nvmCache->createPutToken(handle->getKey())); } } @@ -127,7 +127,7 @@ class NvmCacheTest : public testing::Test { } std::unique_ptr makeNvmItem(const WriteHandle& handle) { - return getNvmCache()->makeNvmItem(handle); + return getNvmCache()->makeNvmItem(*handle); } std::unique_ptr createItemAsIOBuf(folly::StringPiece key, diff --git a/cachelib/allocator/tests/BaseAllocatorTest.h b/cachelib/allocator/tests/BaseAllocatorTest.h index aa9d38a857..4d070df37d 100644 --- a/cachelib/allocator/tests/BaseAllocatorTest.h +++ b/cachelib/allocator/tests/BaseAllocatorTest.h @@ -4074,15 +4074,16 @@ class BaseAllocatorTest : public AllocatorTest { // Check that item is in the expected container. bool findItem(AllocatorT& allocator, typename AllocatorT::Item* item) { auto& container = allocator.getMMContainer(*item); - auto itr = container.getEvictionIterator(); bool found = false; - while (itr) { - if (itr.get() == item) { - found = true; - break; + container.withEvictionIterator([&found, &item](auto&& itr) { + while (itr) { + if (itr.get() == item) { + found = true; + break; + } + ++itr; } - ++itr; - } + }); return found; } @@ -5470,8 +5471,12 @@ class BaseAllocatorTest : public AllocatorTest { ASSERT_TRUE(big->isInMMContainer()); auto& mmContainer = alloc.getMMContainer(*big); - auto itr = mmContainer.getEvictionIterator(); - ASSERT_EQ(big.get(), &(*itr)); + + typename AllocatorT::Item* evictionCandidate = nullptr; + mmContainer.withEvictionIterator( + [&evictionCandidate](auto&& itr) { evictionCandidate = itr.get(); }); + + ASSERT_EQ(big.get(), evictionCandidate); alloc.remove("hello"); } @@ -5485,8 +5490,11 @@ class BaseAllocatorTest : public AllocatorTest { ASSERT_TRUE(small2->isInMMContainer()); auto& mmContainer = alloc.getMMContainer(*small2); - auto itr = mmContainer.getEvictionIterator(); - ASSERT_EQ(small2.get(), &(*itr)); + + typename AllocatorT::Item* evictionCandidate = nullptr; + mmContainer.withEvictionIterator( + [&evictionCandidate](auto&& itr) { evictionCandidate = itr.get(); }); + ASSERT_EQ(small2.get(), evictionCandidate); alloc.remove("hello"); } From 9bb6775dfbe4188a2faa2f32a1506093c3d53f3b Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Wed, 4 Jan 2023 06:40:10 -0800 Subject: [PATCH 31/47] - Change the cache size calculation to use getCacheSize() - Switch to isUsingPosixShm() method --- cachelib/allocator/CacheAllocator-inl.h | 19 ++++++++++--------- cachelib/allocator/CacheAllocatorConfig.h | 2 +- cachelib/allocator/memory/SlabAllocator.cpp | 4 +++- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h index 9d33e8519a..903bc0e4b2 100644 --- a/cachelib/allocator/CacheAllocator-inl.h +++ b/cachelib/allocator/CacheAllocator-inl.h @@ -54,11 +54,11 @@ CacheAllocator::CacheAllocator( : config.memMonitoringEnabled()}, config_(config.validate()), tempShm_(type == InitMemType::kNone && isOnShm_ - ? std::make_unique(config_.size) + ? 
std::make_unique(config_.getCacheSize()) : nullptr), shmManager_(type != InitMemType::kNone ? std::make_unique(config_.cacheDir, - config_.usePosixShm) + config_.isUsingPosixShm()) : nullptr), deserializer_(type == InitMemType::kMemAttach ? createDeserializer() : nullptr), @@ -122,10 +122,10 @@ CacheAllocator::createNewMemoryAllocator() { return std::make_unique( getAllocatorConfig(config_), shmManager_ - ->createShm(detail::kShmCacheName, config_.size, + ->createShm(detail::kShmCacheName, config_.getCacheSize(), config_.slabMemoryBaseAddr, createShmCacheOpts()) .addr, - config_.size); + config_.getCacheSize()); } template @@ -137,7 +137,7 @@ CacheAllocator::restoreMemoryAllocator() { ->attachShm(detail::kShmCacheName, config_.slabMemoryBaseAddr, createShmCacheOpts()) .addr, - config_.size, + config_.getCacheSize(), config_.disableFullCoredump); } @@ -242,11 +242,12 @@ std::unique_ptr CacheAllocator::initAllocator( InitMemType type) { if (type == InitMemType::kNone) { if (isOnShm_ == true) { - return std::make_unique( - getAllocatorConfig(config_), tempShm_->getAddr(), config_.size); + return std::make_unique(getAllocatorConfig(config_), + tempShm_->getAddr(), + config_.getCacheSize()); } else { return std::make_unique(getAllocatorConfig(config_), - config_.size); + config_.getCacheSize()); } } else if (type == InitMemType::kMemNew) { return createNewMemoryAllocator(); @@ -2309,7 +2310,7 @@ PoolEvictionAgeStats CacheAllocator::getPoolEvictionAgeStats( template CacheMetadata CacheAllocator::getCacheMetadata() const noexcept { return CacheMetadata{kCachelibVersion, kCacheRamFormatVersion, - kCacheNvmFormatVersion, config_.size}; + kCacheNvmFormatVersion, config_.getCacheSize()}; } template diff --git a/cachelib/allocator/CacheAllocatorConfig.h b/cachelib/allocator/CacheAllocatorConfig.h index ec44ff8467..0831cbe008 100644 --- a/cachelib/allocator/CacheAllocatorConfig.h +++ b/cachelib/allocator/CacheAllocatorConfig.h @@ -1085,7 +1085,7 @@ std::map CacheAllocatorConfig::serialize() const { configMap["size"] = std::to_string(size); configMap["cacheDir"] = cacheDir; - configMap["posixShm"] = usePosixShm ? "set" : "empty"; + configMap["posixShm"] = isUsingPosixShm() ? "set" : "empty"; configMap["defaultAllocSizes"] = ""; // Stringify std::set diff --git a/cachelib/allocator/memory/SlabAllocator.cpp b/cachelib/allocator/memory/SlabAllocator.cpp index ade5a8e535..0106f1bf4e 100644 --- a/cachelib/allocator/memory/SlabAllocator.cpp +++ b/cachelib/allocator/memory/SlabAllocator.cpp @@ -40,7 +40,9 @@ using namespace facebook::cachelib; namespace { -size_t roundDownToSlabSize(size_t size) { return size - (size % sizeof(Slab)); } +static inline size_t roundDownToSlabSize(size_t size) { + return size - (size % sizeof(Slab)); +} } // namespace // definitions to avoid ODR violation. 
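The patch above routes all size reads through config_.getCacheSize() and the shared-memory check through isUsingPosixShm(), and turns roundDownToSlabSize() into a static inline helper; the rounding itself is plain truncation to a slab multiple. A self-contained sketch of that arithmetic (the 4 MiB slab size is an assumption here, implied by kNumSlabBits = 22; the real helper uses sizeof(Slab)):

#include <cstddef>

// Assumed slab size: 4 MiB, i.e. 1 << 22.
constexpr std::size_t kAssumedSlabSize = 4ULL * 1024 * 1024;

constexpr std::size_t roundDownToSlabSize(std::size_t size) {
  return size - (size % kAssumedSlabSize);
}

// A 1 GiB + 1 MiB region loses the trailing partial slab.
static_assert(roundDownToSlabSize(1025ULL * 1024 * 1024) == 1024ULL * 1024 * 1024, "");
// Anything smaller than a single slab rounds down to zero usable bytes.
static_assert(roundDownToSlabSize(kAssumedSlabSize - 1) == 0, "");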
From 1759e83a50a5115d144dc751bcbb475179a71401 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Wed, 4 Jan 2023 05:59:45 -0800 Subject: [PATCH 32/47] - Add memory tier configs to cache allocator based on NUMA bindings - Add in MemoryTier tests - Increase max number of tiers to 2 for tiers tests --- cachelib/allocator/CMakeLists.txt | 2 + cachelib/allocator/CacheAllocator-inl.h | 6 +- cachelib/allocator/CacheAllocator.h | 5 + cachelib/allocator/CacheAllocatorConfig.h | 6 +- cachelib/allocator/MemoryTierCacheConfig.h | 7 +- .../tests/AllocatorMemoryTiersTest.cpp | 32 ++++ .../tests/AllocatorMemoryTiersTest.h | 42 +++++ .../allocator/tests/AllocatorTypeTest.cpp | 6 + cachelib/allocator/tests/BaseAllocatorTest.h | 5 +- cachelib/allocator/tests/MemoryTiersTest.cpp | 177 ++++++++++++++++++ 10 files changed, 278 insertions(+), 10 deletions(-) create mode 100644 cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp create mode 100644 cachelib/allocator/tests/AllocatorMemoryTiersTest.h create mode 100644 cachelib/allocator/tests/MemoryTiersTest.cpp diff --git a/cachelib/allocator/CMakeLists.txt b/cachelib/allocator/CMakeLists.txt index 78cfa7ca06..f94c8c90c7 100644 --- a/cachelib/allocator/CMakeLists.txt +++ b/cachelib/allocator/CMakeLists.txt @@ -117,6 +117,8 @@ if (BUILD_TESTS) add_test (tests/ChainedHashTest.cpp) add_test (tests/AllocatorResizeTypeTest.cpp) add_test (tests/AllocatorHitStatsTypeTest.cpp) + add_test (tests/AllocatorMemoryTiersTest.cpp) + add_test (tests/MemoryTiersTest.cpp) add_test (tests/MultiAllocatorTest.cpp) add_test (tests/NvmAdmissionPolicyTest.cpp) add_test (tests/CacheAllocatorConfigTest.cpp) diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h index 903bc0e4b2..eb0f726a45 100644 --- a/cachelib/allocator/CacheAllocator-inl.h +++ b/cachelib/allocator/CacheAllocator-inl.h @@ -53,6 +53,7 @@ CacheAllocator::CacheAllocator( : isOnShm_{type != InitMemType::kNone ? true : config.memMonitoringEnabled()}, config_(config.validate()), + memoryTierConfigs(config.getMemoryTierConfigs()), tempShm_(type == InitMemType::kNone && isOnShm_ ? std::make_unique(config_.getCacheSize()) : nullptr), @@ -110,9 +111,10 @@ ShmSegmentOpts CacheAllocator::createShmCacheOpts() { opts.alignment = sizeof(Slab); auto memoryTierConfigs = config_.getMemoryTierConfigs(); // TODO: we support single tier so far - XDCHECK_EQ(memoryTierConfigs.size(), 1ul); + if (memoryTierConfigs.size() > 1) { + throw std::invalid_argument("CacheLib only supports a single memory tier"); + } opts.memBindNumaNodes = memoryTierConfigs[0].getMemBind(); - return opts; } diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index aeaa90beca..41ea947c34 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -1159,6 +1159,9 @@ class CacheAllocator : public CacheBase { // whether it is object-cache bool isObjectCache() const override final { return false; } + // combined pool size for all memory tiers + size_t getPoolSize(PoolId pid) const; + // pool stats by pool id PoolStats getPoolStats(PoolId pid) const override final; @@ -1959,6 +1962,8 @@ class CacheAllocator : public CacheBase { Config config_{}; + const typename Config::MemoryTierConfigs memoryTierConfigs; + // Manages the temporary shared memory segment for memory allocator that // is not persisted when cache process exits. 
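// For orientation, the tier setup exercised by the tests added later in this
// patch (AllocatorMemoryTiersTest / MemoryTiersTest) looks roughly like the
// following -- two shared-memory tiers with equal ratios, both bound to NUMA
// node 0; the cache directory is a placeholder:
//
//   typename AllocatorT::Config config;
//   config.setCacheSize(100 * Slab::kSize)
//       .enableCachePersistence("/tmp/two-tier-example")
//       .usePosixForShm()
//       .configureMemoryTiers(
//           {MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind(
//                std::string("0")),
//            MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind(
//                std::string("0"))});
//
// configureMemoryTiers() rejects an empty list or more than
// kMaxCacheMemoryTiers (now 2) entries with std::invalid_argument, and each
// tier's share of getCacheSize() is derived from its ratio.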
std::unique_ptr tempShm_; diff --git a/cachelib/allocator/CacheAllocatorConfig.h b/cachelib/allocator/CacheAllocatorConfig.h index 0831cbe008..74cd34c6a2 100644 --- a/cachelib/allocator/CacheAllocatorConfig.h +++ b/cachelib/allocator/CacheAllocatorConfig.h @@ -207,6 +207,9 @@ class CacheAllocatorConfig { // Accepts vector of MemoryTierCacheConfig. Each vector element describes // configuration for a single memory cache tier. Tier sizes are specified as // ratios, the number of parts of total cache size each tier would occupy. + // @throw std::invalid_argument if: + // - the size of configs is 0 + // - the size of configs is greater than kMaxCacheMemoryTiers CacheAllocatorConfig& configureMemoryTiers(const MemoryTierConfigs& configs); // Return reference to MemoryTierCacheConfigs. @@ -374,8 +377,7 @@ class CacheAllocatorConfig { std::map serialize() const; // The max number of memory cache tiers - // TODO: increase this number when multi-tier configs are enabled - inline static const size_t kMaxCacheMemoryTiers = 1; + inline static const size_t kMaxCacheMemoryTiers = 2; // Cache name for users to indentify their own cache. std::string cacheName{""}; diff --git a/cachelib/allocator/MemoryTierCacheConfig.h b/cachelib/allocator/MemoryTierCacheConfig.h index a60fb64d3e..1b9477c048 100644 --- a/cachelib/allocator/MemoryTierCacheConfig.h +++ b/cachelib/allocator/MemoryTierCacheConfig.h @@ -23,10 +23,7 @@ namespace cachelib { class MemoryTierCacheConfig { public: // Creates instance of MemoryTierCacheConfig for Posix/SysV Shared memory. - static MemoryTierCacheConfig fromShm() { - // TODO: expand this method when adding support for file-mapped memory - return MemoryTierCacheConfig(); - } + static MemoryTierCacheConfig fromShm() { return MemoryTierCacheConfig(); } // Specifies ratio of this memory tier to other tiers. Absolute size // of each tier can be calculated as: @@ -49,7 +46,7 @@ class MemoryTierCacheConfig { const NumaBitMask& getMemBind() const noexcept { return numaNodes; } - size_t calculateTierSize(size_t totalCacheSize, size_t partitionNum) { + size_t calculateTierSize(size_t totalCacheSize, size_t partitionNum) const { // TODO: Call this method when tiers are enabled in allocator // to calculate tier sizes in bytes. if (!partitionNum) { diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp new file mode 100644 index 0000000000..71059ee496 --- /dev/null +++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) Intel Corporation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cachelib/allocator/tests/AllocatorMemoryTiersTest.h" + +namespace facebook { +namespace cachelib { +namespace tests { + +using LruAllocatorMemoryTiersTest = AllocatorMemoryTiersTest; + +// TODO(MEMORY_TIER): add more tests with different eviction policies +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValid1) { + this->testMultiTiersValid1(); +} + +} // end of namespace tests +} // end of namespace cachelib +} // end of namespace facebook diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h new file mode 100644 index 0000000000..05d5020b52 --- /dev/null +++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cachelib/allocator/CacheAllocatorConfig.h" +#include "cachelib/allocator/MemoryTierCacheConfig.h" +#include "cachelib/allocator/tests/TestBase.h" + +namespace facebook { +namespace cachelib { +namespace tests { + +template +class AllocatorMemoryTiersTest : public AllocatorTest { + public: + void testMultiTiersValid1() { + typename AllocatorT::Config config; + config.setCacheSize(100 * Slab::kSize); + ASSERT_NO_THROW(config.configureMemoryTiers( + {MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( + std::string("0")), + MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( + std::string("0"))})); + } +}; +} // namespace tests +} // namespace cachelib +} // namespace facebook diff --git a/cachelib/allocator/tests/AllocatorTypeTest.cpp b/cachelib/allocator/tests/AllocatorTypeTest.cpp index 97e08cd7ed..1e98af29f2 100644 --- a/cachelib/allocator/tests/AllocatorTypeTest.cpp +++ b/cachelib/allocator/tests/AllocatorTypeTest.cpp @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "cachelib/allocator/MemoryTierCacheConfig.h" #include "cachelib/allocator/tests/BaseAllocatorTest.h" #include "cachelib/allocator/tests/TestBase.h" @@ -226,6 +227,11 @@ TYPED_TEST(BaseAllocatorTest, ReaperSkippingSlabTraversalWhileSlabReleasing) { } TYPED_TEST(BaseAllocatorTest, ReaperShutDown) { this->testReaperShutDown(); } +TYPED_TEST(BaseAllocatorTest, ReaperShutDownFile) { + this->testReaperShutDown( + {MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( + std::string("0"))}); +} TYPED_TEST(BaseAllocatorTest, ShutDownWithActiveHandles) { this->testShutDownWithActiveHandles(); diff --git a/cachelib/allocator/tests/BaseAllocatorTest.h b/cachelib/allocator/tests/BaseAllocatorTest.h index 4d070df37d..afc27e6fbd 100644 --- a/cachelib/allocator/tests/BaseAllocatorTest.h +++ b/cachelib/allocator/tests/BaseAllocatorTest.h @@ -1253,7 +1253,8 @@ class BaseAllocatorTest : public AllocatorTest { this->testLruLength(alloc, poolId, sizes, keyLen, evictedKeys); } - void testReaperShutDown() { + void testReaperShutDown( + typename AllocatorT::Config::MemoryTierConfigs cfgs = {}) { const size_t nSlabs = 20; const size_t size = nSlabs * Slab::kSize; @@ -1263,6 +1264,8 @@ class BaseAllocatorTest : public AllocatorTest { config.setAccessConfig({8, 8}); config.enableCachePersistence(this->cacheDir_); config.enableItemReaperInBackground(std::chrono::seconds(1), {}); + if (cfgs.size()) + config.configureMemoryTiers(cfgs); std::vector keys; { AllocatorT alloc(AllocatorT::SharedMemNew, config); diff --git a/cachelib/allocator/tests/MemoryTiersTest.cpp b/cachelib/allocator/tests/MemoryTiersTest.cpp new file mode 100644 index 0000000000..81eca12e62 --- /dev/null +++ b/cachelib/allocator/tests/MemoryTiersTest.cpp @@ -0,0 +1,177 @@ +/* + * Copyright (c) Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include "cachelib/allocator/CacheAllocator.h" +#include "cachelib/allocator/tests/TestBase.h" + +namespace facebook { +namespace cachelib { +namespace tests { + +using LruAllocatorConfig = CacheAllocatorConfig; +using LruMemoryTierConfigs = LruAllocatorConfig::MemoryTierConfigs; +using Strings = std::vector; +using Ratios = std::vector; + +constexpr size_t MB = 1024ULL * 1024ULL; +constexpr size_t GB = MB * 1024ULL; + +const size_t defaultTotalCacheSize{1 * GB}; +const std::string defaultCacheDir{"/tmp/metadataDir"}; + +template +class MemoryTiersTest : public AllocatorTest { + public: + void basicCheck(LruAllocatorConfig& actualConfig, + size_t expectedTotalCacheSize = defaultTotalCacheSize, + const std::string& expectedCacheDir = defaultCacheDir) { + EXPECT_EQ(actualConfig.getCacheSize(), expectedTotalCacheSize); + auto configs = actualConfig.getMemoryTierConfigs(); + + size_t sum_ratios = std::accumulate( + configs.begin(), configs.end(), 0UL, + [](const size_t i, const MemoryTierCacheConfig& config) { + return i + config.getRatio(); + }); + size_t sum_sizes = std::accumulate( + configs.begin(), configs.end(), 0UL, + [&](const size_t i, const MemoryTierCacheConfig& config) { + return i + config.calculateTierSize(actualConfig.getCacheSize(), + sum_ratios); + }); + + EXPECT_GE(expectedTotalCacheSize, sum_ratios * Slab::kSize); + EXPECT_LE(sum_sizes, expectedTotalCacheSize); + EXPECT_GE(sum_sizes, expectedTotalCacheSize - configs.size() * Slab::kSize); + } + + LruAllocatorConfig createTestCacheConfig( + const Ratios& tierRatios = {1}, + bool setPosixForShm = true, + size_t cacheSize = defaultTotalCacheSize, + const std::string& cacheDir = defaultCacheDir) { + LruAllocatorConfig cfg; + cfg.setCacheSize(cacheSize).enableCachePersistence(cacheDir); + + if (setPosixForShm) + cfg.usePosixForShm(); + LruMemoryTierConfigs tierConfigs; + tierConfigs.reserve(tierRatios.size()); + for (auto i = 0; i < tierRatios.size(); ++i) { + tierConfigs.push_back(MemoryTierCacheConfig::fromShm() + .setRatio(tierRatios[i]) + .setMemBind(std::string("0"))); + } + + cfg.configureMemoryTiers(tierConfigs); + return cfg; + } + + LruAllocatorConfig createTieredCacheConfig(size_t totalCacheSize, + size_t numTiers = 2) { + LruAllocatorConfig tieredCacheConfig{}; + std::vector configs; + for (auto i = 1; i <= numTiers; ++i) { + configs.push_back(MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( + std::string("0"))); + } + tieredCacheConfig.setCacheSize(totalCacheSize) + .enableCachePersistence( + folly::sformat("/tmp/multi-tier-test/{}", ::getpid())) + .usePosixForShm() + .configureMemoryTiers(configs); + return tieredCacheConfig; + } + + LruAllocatorConfig createDramCacheConfig(size_t totalCacheSize) { + LruAllocatorConfig dramConfig{}; + dramConfig.setCacheSize(totalCacheSize); + return dramConfig; + } + + void validatePoolSize(PoolId poolId, + std::unique_ptr& allocator, + size_t expectedSize) { + size_t actualSize = allocator->getPool(poolId).getPoolSize(); + EXPECT_EQ(actualSize, expectedSize); + } + + void testAddPool(std::unique_ptr& alloc, + size_t poolSize, + bool isSizeValid = true, + size_t numTiers = 2) { + if (isSizeValid) { + auto pool = alloc->addPool("validPoolSize", poolSize); + EXPECT_LE(alloc->getPool(pool).getPoolSize(), poolSize); + if (poolSize >= numTiers * Slab::kSize) + EXPECT_GE(alloc->getPool(pool).getPoolSize(), + poolSize - numTiers * Slab::kSize); + } else { + EXPECT_THROW(alloc->addPool("invalidPoolSize", poolSize), + std::invalid_argument); + // TODO: 
test this for all tiers + EXPECT_EQ(alloc->getPoolIds().size(), 0); + } + } +}; + +using LruMemoryTiersTest = MemoryTiersTest; + +TEST_F(LruMemoryTiersTest, TestValid1TierConfig) { + LruAllocatorConfig cfg = createTestCacheConfig().validate(); + basicCheck(cfg); +} + +TEST_F(LruMemoryTiersTest, TestValid2TierConfig) { + LruAllocatorConfig cfg = createTestCacheConfig({1, 1}); + basicCheck(cfg); +} + +TEST_F(LruMemoryTiersTest, TestValid2TierRatioConfig) { + LruAllocatorConfig cfg = createTestCacheConfig({5, 2}); + basicCheck(cfg); +} + +TEST_F(LruMemoryTiersTest, TestInvalid2TierConfigNumberOfPartitionsTooLarge) { + EXPECT_THROW(createTestCacheConfig({defaultTotalCacheSize, 1}).validate(), + std::invalid_argument); +} + +TEST_F(LruMemoryTiersTest, TestInvalid2TierConfigSizesAndRatioNotSet) { + EXPECT_THROW(createTestCacheConfig({1, 0}), std::invalid_argument); +} + +TEST_F(LruMemoryTiersTest, TestInvalid2TierConfigRatiosCacheSizeNotSet) { + EXPECT_THROW(createTestCacheConfig({1, 1}, true, + /* cacheSize */ 0) + .validate(), + std::invalid_argument); +} + +TEST_F(LruMemoryTiersTest, TestInvalid2TierConfigRatioNotSet) { + EXPECT_THROW(createTestCacheConfig({1, 0}), std::invalid_argument); +} + +TEST_F(LruMemoryTiersTest, TestInvalid2TierConfigSizesNeCacheSize) { + EXPECT_THROW(createTestCacheConfig({0, 0}), std::invalid_argument); +} +} // namespace tests +} // namespace cachelib +} // namespace facebook From 62b0c417a93730b25c52280d9dc521e337791525 Mon Sep 17 00:00:00 2001 From: Sounak Gupta Date: Mon, 14 Nov 2022 02:07:57 -0800 Subject: [PATCH 33/47] added ability for compressed pointer to use full 32 bits for addressing in single tier mode and use 31 bits for addressing in multi-tier mode --- cachelib/allocator/CCacheAllocator.cpp | 8 +- cachelib/allocator/memory/CompressedPtr.h | 96 ++++++++++++++----- cachelib/allocator/memory/MemoryAllocator.h | 10 +- cachelib/allocator/memory/Slab.h | 2 + cachelib/allocator/memory/SlabAllocator.cpp | 1 - cachelib/allocator/memory/SlabAllocator.h | 14 ++- .../memory/tests/MemoryAllocatorTest.cpp | 21 +++- cachelib/benchmarks/PtrCompressionBench.cpp | 7 +- 8 files changed, 117 insertions(+), 42 deletions(-) diff --git a/cachelib/allocator/CCacheAllocator.cpp b/cachelib/allocator/CCacheAllocator.cpp index 2709bde377..dd1986114b 100644 --- a/cachelib/allocator/CCacheAllocator.cpp +++ b/cachelib/allocator/CCacheAllocator.cpp @@ -36,7 +36,9 @@ CCacheAllocator::CCacheAllocator(MemoryAllocator& allocator, currentChunksIndex_(0) { auto& currentChunks = chunks_[currentChunksIndex_]; for (auto chunk : *object.chunks()) { - currentChunks.push_back(allocator_.unCompress(CompressedPtr(chunk))); + // TODO : pass multi-tier flag when compact cache supports multi-tier config + currentChunks.push_back( + allocator_.unCompress(CompressedPtr(chunk), false /* isMultiTier */)); } } @@ -97,7 +99,9 @@ CCacheAllocator::SerializationType CCacheAllocator::saveState() { std::lock_guard guard(resizeLock_); for (auto chunk : getCurrentChunks()) { - object.chunks()->push_back(allocator_.compress(chunk).saveState()); + // TODO : pass multi-tier flag when compact cache supports multi-tier config + object.chunks()->push_back( + allocator_.compress(chunk, false /* isMultiTier */).saveState()); } return object; } diff --git a/cachelib/allocator/memory/CompressedPtr.h b/cachelib/allocator/memory/CompressedPtr.h index 96d39ae2b9..029abd91b9 100644 --- a/cachelib/allocator/memory/CompressedPtr.h +++ b/cachelib/allocator/memory/CompressedPtr.h @@ -27,18 +27,29 @@ namespace cachelib { class 
SlabAllocator; -// the following are for pointer compression for the memory allocator. We -// compress pointers by storing the slab index and the alloc index of the -// allocation inside the slab. With slab worth kNumSlabBits of data, if we -// have the min allocation size as 64 bytes, that requires kNumSlabBits - 6 -// bits for storing the alloc index. This leaves the remaining (32 - -// (kNumSlabBits - 6)) bits for the slab index. Hence we can index 256 GiB -// of memory in slabs and index anything more than 64 byte allocations inside -// the slab using a 32 bit representation. -// // This CompressedPtr makes decompression fast by staying away from division and // modulo arithmetic and doing those during the compression time. We most often -// decompress a CompressedPtr than compress a pointer while creating one. +// decompress a CompressedPtr than compress a pointer while creating one. This +// is used for pointer compression by the memory allocator. + +// We compress pointers by storing the tier index, slab index and alloc index of +// the allocation inside the slab. + +// In original design (without memory tiers): +// Each slab addresses 22 bits of allocations (kNumSlabBits). This is split into +// allocation index and allocation size. If we have the min allocation size of +// 64 bytes (kMinAllocPower = 6 bits), remaining kNumSlabBits(22) - +// kMinAllocPower(6) = 16 bits for storing the alloc index. This leaves the +// remaining 32 - (kNumSlabBits - kMinAllocPower) = 16 bits for the slab +// index. Hence we can index 256 GiB of memory. + +// In multi-tier design: +// kNumSlabIds and kMinAllocPower remains unchanged. The tier id occupies the +// 32nd bit only since its value cannot exceed kMaxTiers(2). This leaves the +// remaining 32 - (kNumSlabBits - kMinAllocPower) - 1 bit for tier id = 15 bits +// for the slab index. Hence we can index 128 GiB of memory per tier in +// multi-tier configuration. + class CACHELIB_PACKED_ATTR CompressedPtr { public: using PtrType = uint32_t; @@ -62,9 +73,10 @@ class CACHELIB_PACKED_ATTR CompressedPtr { return static_cast(1) << (Slab::kMinAllocPower); } - // maximum adressable memory for pointer compression to work. + // maximum addressable memory for pointer compression to work. static constexpr size_t getMaxAddressableSize() noexcept { - return static_cast(1) << (kNumSlabIdxBits + Slab::kNumSlabBits); + return static_cast(1) + << (numSlabIdxBits(false) + Slab::kNumSlabBits); } // default construct to nullptr. @@ -89,8 +101,11 @@ class CACHELIB_PACKED_ATTR CompressedPtr { PtrType ptr_{kNull}; // create a compressed pointer for a valid memory allocation. - CompressedPtr(uint32_t slabIdx, uint32_t allocIdx) - : ptr_(compress(slabIdx, allocIdx)) {} + CompressedPtr(uint32_t slabIdx, + uint32_t allocIdx, + bool isMultiTiered, + TierId tid = 0) + : ptr_(compress(slabIdx, allocIdx, isMultiTiered, tid)) {} constexpr explicit CompressedPtr(PtrType ptr) noexcept : ptr_{ptr} {} @@ -100,33 +115,63 @@ class CACHELIB_PACKED_ATTR CompressedPtr { static constexpr unsigned int kNumAllocIdxBits = Slab::kNumSlabBits - Slab::kMinAllocPower; + // Use 32nd bit position for TierId + static constexpr unsigned int kNumTierIdxOffset = 31; + static constexpr PtrType kAllocIdxMask = ((PtrType)1 << kNumAllocIdxBits) - 1; - // Number of bits for the slab index. This will be the top 16 bits of the - // compressed ptr. 
- static constexpr unsigned int kNumSlabIdxBits = - NumBits::value - kNumAllocIdxBits; + // kNumTierIdxBits most significant bits + static constexpr PtrType kTierIdxMask = (PtrType)1 << kNumTierIdxOffset; + + // Number of bits for the slab index. + // If CacheLib is single tiered, slab index will be the top 16 bits + // of the compressed ptr. + // Else if CacheLib is multi-tiered, the topmost 32nd bit will be + // reserved for tier id. The following 15 bits will be reserved for + // the slab index. + static constexpr unsigned int numSlabIdxBits(bool isMultiTiered) { + return kNumTierIdxOffset - kNumAllocIdxBits + (!isMultiTiered); + } // Compress the given slabIdx and allocIdx into a 32-bit compressed // pointer. - static PtrType compress(uint32_t slabIdx, uint32_t allocIdx) noexcept { + static PtrType compress(uint32_t slabIdx, + uint32_t allocIdx, + bool isMultiTiered, + TierId tid) noexcept { XDCHECK_LE(allocIdx, kAllocIdxMask); - XDCHECK_LT(slabIdx, (1u << kNumSlabIdxBits) - 1); - return (slabIdx << kNumAllocIdxBits) + allocIdx; + XDCHECK_LT(slabIdx, (1u << numSlabIdxBits(isMultiTiered)) - 1); + if (!isMultiTiered) { + return (slabIdx << kNumAllocIdxBits) + allocIdx; + } + return (static_cast(tid) << kNumTierIdxOffset) + + (slabIdx << kNumAllocIdxBits) + allocIdx; } // Get the slab index of the compressed ptr - uint32_t getSlabIdx() const noexcept { + uint32_t getSlabIdx(bool isMultiTiered) const noexcept { XDCHECK(!isNull()); - return static_cast(ptr_ >> kNumAllocIdxBits); + auto noTierIdPtr = isMultiTiered ? ptr_ & ~kTierIdxMask : ptr_; + return static_cast(noTierIdPtr >> kNumAllocIdxBits); } // Get the allocation index of the compressed ptr uint32_t getAllocIdx() const noexcept { XDCHECK(!isNull()); + // Note: tid check not required in ptr_ since only + // the lower 16 bits are being read here. return static_cast(ptr_ & kAllocIdxMask); } + uint32_t getTierId(bool isMultiTiered) const noexcept { + XDCHECK(!isNull()); + return isMultiTiered ? static_cast(ptr_ >> kNumTierIdxOffset) : 0; + } + + void setTierId(TierId tid) noexcept { + ptr_ += static_cast(tid) << kNumTierIdxOffset; + } + friend SlabAllocator; }; @@ -137,11 +182,12 @@ class PtrCompressor { : allocator_(allocator) {} const CompressedPtr compress(const PtrType* uncompressed) const { - return allocator_.compress(uncompressed); + return allocator_.compress(uncompressed, false /* isMultiTiered */); } PtrType* unCompress(const CompressedPtr compressed) const { - return static_cast(allocator_.unCompress(compressed)); + return static_cast( + allocator_.unCompress(compressed, false /* isMultiTiered */)); } bool operator==(const PtrCompressor& rhs) const noexcept { diff --git a/cachelib/allocator/memory/MemoryAllocator.h b/cachelib/allocator/memory/MemoryAllocator.h index 509664afa6..1ce58857de 100644 --- a/cachelib/allocator/memory/MemoryAllocator.h +++ b/cachelib/allocator/memory/MemoryAllocator.h @@ -534,8 +534,9 @@ class MemoryAllocator { // as the original pointer is valid. // // @throw std::invalid_argument if the ptr is invalid. - CompressedPtr CACHELIB_INLINE compress(const void* ptr) const { - return slabAllocator_.compress(ptr); + CompressedPtr CACHELIB_INLINE compress(const void* ptr, + bool isMultiTiered) const { + return slabAllocator_.compress(ptr, isMultiTiered); } // retrieve the raw pointer corresponding to the compressed pointer. This is @@ -546,8 +547,9 @@ class MemoryAllocator { // @return the raw pointer corresponding to this compressed pointer. 
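// Worked example of the layout described above, written against plain
// uint32_t so it can be compiled standalone. The bit counts come from the
// CompressedPtr comments (kNumSlabBits = 22, kMinAllocPower = 6, tier id in
// bit 31); everything else is illustrative rather than the real
// implementation.

#include <cassert>
#include <cstdint>

int main() {
  constexpr unsigned kAllocIdxBits = 22 - 6; // kNumSlabBits - kMinAllocPower = 16
  constexpr unsigned kTierIdxOffset = 31;    // tier id occupies the most significant bit

  // Addressable memory is 2^(slab index bits + kNumSlabBits) bytes.
  static_assert((1ULL << (16 + 22)) == 256ULL * 1024 * 1024 * 1024,
                "single-tier: 16 slab index bits -> 256 GiB");
  static_assert((1ULL << (15 + 22)) == 128ULL * 1024 * 1024 * 1024,
                "multi-tier: 15 slab index bits -> 128 GiB per tier");

  // Packing mirrors compress(): tier id, then slab index, then alloc index.
  const uint32_t tid = 1, slabIdx = 0x1234, allocIdx = 0x0056;
  const uint32_t packed =
      (tid << kTierIdxOffset) + (slabIdx << kAllocIdxBits) + allocIdx;

  assert((packed & ((1u << kAllocIdxBits) - 1)) == allocIdx);               // getAllocIdx()
  assert(((packed & ~(1u << kTierIdxOffset)) >> kAllocIdxBits) == slabIdx); // getSlabIdx(true)
  assert((packed >> kTierIdxOffset) == tid);                                // getTierId(true)
  return 0;
}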
// // @throw std::invalid_argument if the compressed pointer is invalid. - void* CACHELIB_INLINE unCompress(const CompressedPtr cPtr) const { - return slabAllocator_.unCompress(cPtr); + void* CACHELIB_INLINE unCompress(const CompressedPtr cPtr, + bool isMultiTiered) const { + return slabAllocator_.unCompress(cPtr, isMultiTiered); } // a special implementation of pointer compression for benchmarking purposes. diff --git a/cachelib/allocator/memory/Slab.h b/cachelib/allocator/memory/Slab.h index 4784bee8e9..897ad4e349 100644 --- a/cachelib/allocator/memory/Slab.h +++ b/cachelib/allocator/memory/Slab.h @@ -50,6 +50,8 @@ namespace cachelib { * independantly by the SlabAllocator. */ +// identifier for the memory tier +using TierId = int8_t; // identifier for the memory pool using PoolId = int8_t; // identifier for the allocation class diff --git a/cachelib/allocator/memory/SlabAllocator.cpp b/cachelib/allocator/memory/SlabAllocator.cpp index 0106f1bf4e..080a3ee174 100644 --- a/cachelib/allocator/memory/SlabAllocator.cpp +++ b/cachelib/allocator/memory/SlabAllocator.cpp @@ -50,7 +50,6 @@ using PtrType = CompressedPtr::PtrType; constexpr uint64_t SlabAllocator::kAddressMask; constexpr PtrType CompressedPtr::kAllocIdxMask; constexpr unsigned int CompressedPtr::kNumAllocIdxBits; -constexpr unsigned int CompressedPtr::kNumSlabIdxBits; constexpr unsigned int SlabAllocator::kLockSleepMS; constexpr size_t SlabAllocator::kPagesPerStep; diff --git a/cachelib/allocator/memory/SlabAllocator.h b/cachelib/allocator/memory/SlabAllocator.h index 5f5bf3265a..d82cf5b947 100644 --- a/cachelib/allocator/memory/SlabAllocator.h +++ b/cachelib/allocator/memory/SlabAllocator.h @@ -225,7 +225,8 @@ class SlabAllocator { // the corresponding memory allocator. trying to inline this just increases // the code size and does not move the needle on the benchmarks much. // Calling this with invalid input in optimized build is undefined behavior. - CompressedPtr CACHELIB_INLINE compress(const void* ptr) const { + CompressedPtr CACHELIB_INLINE compress(const void* ptr, + bool isMultiTiered) const { if (ptr == nullptr) { return CompressedPtr{}; } @@ -246,18 +247,23 @@ class SlabAllocator { static_cast(reinterpret_cast(ptr) - reinterpret_cast(slab)) / allocSize; - return CompressedPtr{slabIndex, allocIdx}; + return CompressedPtr{slabIndex, allocIdx, isMultiTiered}; } // uncompress the point and return the raw ptr. This function never throws // in optimized build and assumes that the caller is responsible for calling // it with a valid compressed pointer. - void* CACHELIB_INLINE unCompress(const CompressedPtr ptr) const { + void* CACHELIB_INLINE unCompress(const CompressedPtr ptr, + bool isMultiTiered) const { if (ptr.isNull()) { return nullptr; } - const SlabIdx slabIndex = ptr.getSlabIdx(); + /* TODO: isMultiTiered set to false by default. + Multi-tiering flag will have no impact till + rest of the multi-tiering changes are merged. 
+ */ + const SlabIdx slabIndex = ptr.getSlabIdx(isMultiTiered); const uint32_t allocIdx = ptr.getAllocIdx(); const Slab* slab = &slabMemoryStart_[slabIndex]; diff --git a/cachelib/allocator/memory/tests/MemoryAllocatorTest.cpp b/cachelib/allocator/memory/tests/MemoryAllocatorTest.cpp index cbd8cbef17..a19b9749a6 100644 --- a/cachelib/allocator/memory/tests/MemoryAllocatorTest.cpp +++ b/cachelib/allocator/memory/tests/MemoryAllocatorTest.cpp @@ -401,13 +401,28 @@ TEST_F(MemoryAllocatorTest, PointerCompression) { for (const auto& pool : poolAllocs) { const auto& allocs = pool.second; for (const auto* alloc : allocs) { - CompressedPtr ptr = m.compress(alloc); + CompressedPtr ptr = m.compress(alloc, false /* isMultiTiered */); ASSERT_FALSE(ptr.isNull()); - ASSERT_EQ(alloc, m.unCompress(ptr)); + ASSERT_EQ(alloc, m.unCompress(ptr, false /* isMultiTiered */)); } } - ASSERT_EQ(nullptr, m.unCompress(m.compress(nullptr))); + ASSERT_EQ(nullptr, + m.unCompress(m.compress(nullptr, false /* isMultiTiered */), + false /* isMultiTiered */)); + + // test pointer compression with multi-tier + for (const auto& pool : poolAllocs) { + const auto& allocs = pool.second; + for (const auto* alloc : allocs) { + CompressedPtr ptr = m.compress(alloc, true /* isMultiTiered */); + ASSERT_FALSE(ptr.isNull()); + ASSERT_EQ(alloc, m.unCompress(ptr, true /* isMultiTiered */)); + } + } + + ASSERT_EQ(nullptr, m.unCompress(m.compress(nullptr, true /* isMultiTiered */), + true /* isMultiTiered */)); } TEST_F(MemoryAllocatorTest, Restorable) { diff --git a/cachelib/benchmarks/PtrCompressionBench.cpp b/cachelib/benchmarks/PtrCompressionBench.cpp index aeaa2c3b11..5daefc146f 100644 --- a/cachelib/benchmarks/PtrCompressionBench.cpp +++ b/cachelib/benchmarks/PtrCompressionBench.cpp @@ -61,7 +61,8 @@ void buildAllocs(size_t poolSize) { void* alloc = ma->allocate(pid, size); XDCHECK_GE(size, CompressedPtr::getMinAllocSize()); if (alloc != nullptr) { - validAllocs.push_back({alloc, ma->compress(alloc)}); + validAllocs.push_back( + {alloc, ma->compress(alloc, false /* isMultiTiered */)}); validAllocsAlt.push_back({alloc, ma->compressAlt(alloc)}); numAllocations++; } @@ -83,7 +84,7 @@ BENCHMARK(CompressionAlt) { BENCHMARK_RELATIVE(Compression) { for (const auto& alloc : validAllocs) { - CompressedPtr c = m->compress(alloc.first); + CompressedPtr c = m->compress(alloc.first, false /* isMultiTiered */); folly::doNotOptimizeAway(c); } } @@ -97,7 +98,7 @@ BENCHMARK(DeCompressAlt) { BENCHMARK_RELATIVE(DeCompress) { for (const auto& alloc : validAllocs) { - void* ptr = m->unCompress(alloc.second); + void* ptr = m->unCompress(alloc.second, false /* isMultiTiered */); folly::doNotOptimizeAway(ptr); } } From b7459eedd7a5782d72bbbb9b046d96d7d6cd51a2 Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Wed, 6 Jul 2022 10:15:17 +0000 Subject: [PATCH 34/47] Add memory usage statistics for allocation classes This includes printing: - allocSize - allocated memory size - memory usage fraction --- cachelib/allocator/Cache.h | 6 ++ cachelib/allocator/CacheAllocator-inl.h | 8 +++ cachelib/allocator/CacheAllocator.h | 3 + cachelib/allocator/memory/AllocationClass.h | 8 --- .../allocator/memory/MemoryAllocatorStats.h | 11 ++++ cachelib/allocator/tests/CacheBaseTest.cpp | 1 + cachelib/cachebench/cache/Cache-inl.h | 9 +++ cachelib/cachebench/cache/Cache.cpp | 6 ++ cachelib/cachebench/cache/Cache.h | 5 ++ cachelib/cachebench/cache/CacheStats.h | 58 +++++++++++++++++++ 10 files changed, 107 insertions(+), 8 deletions(-) diff --git a/cachelib/allocator/Cache.h 
b/cachelib/allocator/Cache.h index e225ba8a01..082db65f7a 100644 --- a/cachelib/allocator/Cache.h +++ b/cachelib/allocator/Cache.h @@ -102,6 +102,12 @@ class CacheBase { // @param poolId the pool id virtual PoolStats getPoolStats(PoolId poolId) const = 0; + // Get Allocation Class specific stats. + // + // @param poolId the pool id + // @param classId the class id + virtual ACStats getACStats(PoolId poolId, ClassId classId) const = 0; + // @param poolId the pool id virtual AllSlabReleaseEvents getAllSlabReleaseEvents(PoolId poolId) const = 0; diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h index eb0f726a45..dc16785fe0 100644 --- a/cachelib/allocator/CacheAllocator-inl.h +++ b/cachelib/allocator/CacheAllocator-inl.h @@ -2290,6 +2290,14 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { return ret; } +template +ACStats CacheAllocator::getACStats(PoolId poolId, + ClassId classId) const { + const auto& pool = allocator_->getPool(poolId); + const auto& ac = pool.getAllocationClass(classId); + return ac.getStats(); +} + template PoolEvictionAgeStats CacheAllocator::getPoolEvictionAgeStats( PoolId pid, unsigned int slabProjectionLength) const { diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index 41ea947c34..6cadc34ebb 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -1178,6 +1178,9 @@ class CacheAllocator : public CacheBase { // return cache's memory usage stats CacheMemoryStats getCacheMemoryStats() const override final; + // return stats for Allocation Class + ACStats getACStats(PoolId pid, ClassId cid) const override final; + // return the nvm cache stats map util::StatsMap getNvmCacheStatsMap() const override final; diff --git a/cachelib/allocator/memory/AllocationClass.h b/cachelib/allocator/memory/AllocationClass.h index b602e4d66d..d45a45c6cd 100644 --- a/cachelib/allocator/memory/AllocationClass.h +++ b/cachelib/allocator/memory/AllocationClass.h @@ -94,14 +94,6 @@ class AllocationClass { return static_cast(Slab::kSize / allocationSize_); } - // total number of slabs under this AllocationClass. - unsigned int getNumSlabs() const { - return lock_->lock_combine([this]() { - return static_cast(freeSlabs_.size() + - allocatedSlabs_.size()); - }); - } - // fetch stats about this allocation class. 
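// Worked example for the two helpers this series adds to ACStats (see
// MemoryAllocatorStats.h below), assuming the usual 4 MiB slab and a 1 KiB
// allocation class:
//
//   allocSize     = 1024
//   allocsPerSlab = 4 MiB / 1024 = 4096
//   usedSlabs     = 10            -> capacity of 40960 allocations
//   activeAllocs  = 30720
//
//   usageFraction()      = 30720 / (10 * 4096) = 0.75
//   totalAllocatedSize() = 30720 * 1024         = 31457280 bytes (~30 MiB)
//
// Note: if the counters involved are integral types, the division in
// usageFraction() truncates before the implicit conversion to double, so a
// static_cast<double> on the numerator may be needed to actually observe the
// fractional value shown here.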
ACStats getStats() const; diff --git a/cachelib/allocator/memory/MemoryAllocatorStats.h b/cachelib/allocator/memory/MemoryAllocatorStats.h index 74ebbe64dd..65d82e000d 100644 --- a/cachelib/allocator/memory/MemoryAllocatorStats.h +++ b/cachelib/allocator/memory/MemoryAllocatorStats.h @@ -54,6 +54,17 @@ struct ACStats { constexpr size_t getTotalFreeMemory() const noexcept { return Slab::kSize * freeSlabs + freeAllocs * allocSize; } + + constexpr double usageFraction() const noexcept { + if (usedSlabs == 0) + return 0.0; + + return activeAllocs / (usedSlabs * allocsPerSlab); + } + + constexpr size_t totalAllocatedSize() const noexcept { + return activeAllocs * allocSize; + } }; // structure to query stats corresponding to a MemoryPool diff --git a/cachelib/allocator/tests/CacheBaseTest.cpp b/cachelib/allocator/tests/CacheBaseTest.cpp index 928fcc0c67..f249786743 100644 --- a/cachelib/allocator/tests/CacheBaseTest.cpp +++ b/cachelib/allocator/tests/CacheBaseTest.cpp @@ -34,6 +34,7 @@ class CacheBaseTest : public CacheBase, public SlabAllocatorTestBase { bool isObjectCache() const override { return false; } const MemoryPool& getPool(PoolId) const override { return memoryPool_; } PoolStats getPoolStats(PoolId) const override { return PoolStats(); } + ACStats getACStats(PoolId, ClassId) const { return ACStats(); }; AllSlabReleaseEvents getAllSlabReleaseEvents(PoolId) const override { return AllSlabReleaseEvents{}; } diff --git a/cachelib/cachebench/cache/Cache-inl.h b/cachelib/cachebench/cache/Cache-inl.h index ed8bfd1b04..688654de96 100644 --- a/cachelib/cachebench/cache/Cache-inl.h +++ b/cachelib/cachebench/cache/Cache-inl.h @@ -620,10 +620,19 @@ Stats Cache::getStats() const { aggregate += poolStats; } + std::map> allocationClassStats{}; + + for (size_t pid = 0; pid < pools_.size(); pid++) { + auto cids = cache_->getPoolStats(static_cast(pid)).getClassIds(); + for (auto cid : cids) + allocationClassStats[pid][cid] = cache_->getACStats(pid, cid); + } + const auto cacheStats = cache_->getGlobalCacheStats(); const auto rebalanceStats = cache_->getSlabReleaseStats(); const auto navyStats = cache_->getNvmCacheStatsMap().toMap(); + ret.allocationClassStats = allocationClassStats; ret.numEvictions = aggregate.numEvictions(); ret.numItems = aggregate.numItems(); ret.evictAttempts = cacheStats.evictionAttempts; diff --git a/cachelib/cachebench/cache/Cache.cpp b/cachelib/cachebench/cache/Cache.cpp index 70feb0f764..ea9d6b106c 100644 --- a/cachelib/cachebench/cache/Cache.cpp +++ b/cachelib/cachebench/cache/Cache.cpp @@ -20,6 +20,12 @@ DEFINE_bool(report_api_latency, false, "Enable reporting cache API latency tracking"); +DEFINE_string( + report_ac_memory_usage_stats, + "", + "Enable reporting statistics for each allocation class. Set to" + "'human_readable' to print KB/MB/GB or to 'raw' to print in bytes."); + namespace facebook { namespace cachelib { namespace cachebench {} // namespace cachebench diff --git a/cachelib/cachebench/cache/Cache.h b/cachelib/cachebench/cache/Cache.h index a86dd06c49..65c70c30c1 100644 --- a/cachelib/cachebench/cache/Cache.h +++ b/cachelib/cachebench/cache/Cache.h @@ -44,6 +44,7 @@ #include "cachelib/cachebench/util/NandWrites.h" DECLARE_bool(report_api_latency); +DECLARE_string(report_ac_memory_usage_stats); namespace facebook { namespace cachelib { @@ -324,6 +325,10 @@ class Cache { // return the stats for the pool. 
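// The per-AC lines assembled above (allocationClassStats in Cache-inl.h) are
// only printed by cachebench when the new --report_ac_memory_usage_stats flag
// is non-empty: "raw" keeps plain bytes, any other value (e.g.
// "human_readable") scales to KB/MB/GB. With the numbers from the ACStats
// example earlier, formatMemory(31457280) yields {"MB", 30.0}, so a printed
// line looks roughly like
//
//   pid 0 cid  12     1.00KB memorySize:    30.00MB
//
// (exact field widths come from the folly::sformat patterns in CacheStats.h
// below).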
PoolStats getPoolStats(PoolId pid) const { return cache_->getPoolStats(pid); } + ACStats getACStats(PoolId pid, ClassId cid) const { + return cache_->getACStats(pid, cid); + } + // return the total number of inconsistent operations detected since start. unsigned int getInconsistencyCount() const { return inconsistencyCount_.load(std::memory_order_relaxed); diff --git a/cachelib/cachebench/cache/CacheStats.h b/cachelib/cachebench/cache/CacheStats.h index d6c9e53584..38c48aa4c8 100644 --- a/cachelib/cachebench/cache/CacheStats.h +++ b/cachelib/cachebench/cache/CacheStats.h @@ -21,6 +21,7 @@ #include "cachelib/common/PercentileStats.h" DECLARE_bool(report_api_latency); +DECLARE_string(report_ac_memory_usage_stats); namespace facebook { namespace cachelib { @@ -100,6 +101,8 @@ struct Stats { uint64_t invalidDestructorCount{0}; int64_t unDestructedItemCount{0}; + std::map> allocationClassStats; + // populate the counters related to nvm usage. Cache implementation can decide // what to populate since not all of those are interesting when running // cachebench. @@ -131,6 +134,61 @@ struct Stats { << std::endl; } + if (FLAGS_report_ac_memory_usage_stats != "") { + auto formatMemory = [&](size_t bytes) -> std::tuple { + if (FLAGS_report_ac_memory_usage_stats == "raw") { + return {"B", bytes}; + } + + constexpr double KB = 1024.0; + constexpr double MB = 1024.0 * 1024; + constexpr double GB = 1024.0 * 1024 * 1024; + + if (bytes >= GB) { + return {"GB", static_cast(bytes) / GB}; + } else if (bytes >= MB) { + return {"MB", static_cast(bytes) / MB}; + } else if (bytes >= KB) { + return {"KB", static_cast(bytes) / KB}; + } else { + return {"B", bytes}; + } + }; + + auto foreachAC = [&](auto cb) { + for (auto& pidStat : allocationClassStats) { + for (auto& cidStat : pidStat.second) { + cb(pidStat.first, cidStat.first, cidStat.second); + } + } + }; + + foreachAC([&](auto pid, auto cid, auto stats) { + auto [allocSizeSuffix, allocSize] = formatMemory(stats.allocSize); + auto [memorySizeSuffix, memorySize] = + formatMemory(stats.totalAllocatedSize()); + out << folly::sformat("pid{:2} cid{:4} {:8.2f}{} memorySize: {:8.2f}{}", + pid, cid, allocSize, allocSizeSuffix, memorySize, + memorySizeSuffix) + << std::endl; + }); + + foreachAC([&](auto pid, auto cid, auto stats) { + auto [allocSizeSuffix, allocSize] = formatMemory(stats.allocSize); + + // If the pool is not full, extrapolate usageFraction for AC assuming it + // will grow at the same rate. This value will be the same for all ACs. + auto acUsageFraction = (poolUsageFraction[pid] < 1.0) + ? 
poolUsageFraction[pid] + : stats.usageFraction(); + + out << folly::sformat( + "pid{:2} cid{:4} {:8.2f}{} usageFraction: {:4.2f}", pid, cid, + allocSize, allocSizeSuffix, acUsageFraction) + << std::endl; + }); + } + if (numCacheGets > 0) { out << folly::sformat("Cache Gets : {:,}", numCacheGets) << std::endl; out << folly::sformat("Hit Ratio : {:6.2f}%", overallHitRatio) From fafface2905f39cbb697d09da7a71895a759410c Mon Sep 17 00:00:00 2001 From: "Chorazewicz, Igor" Date: Tue, 28 Sep 2021 15:11:07 +0200 Subject: [PATCH 35/47] Initial multi-tier support implementation (rebased with NUMA and cs part 2) fix for compressed ptr (upstream) -> compress from false to true --- cachelib/allocator/CacheAllocator-inl.h | 453 ++++++++++++------ cachelib/allocator/CacheAllocator.h | 101 +++- cachelib/allocator/PoolOptimizer.cpp | 2 + cachelib/allocator/memory/MemoryAllocator.h | 7 + cachelib/allocator/memory/SlabAllocator.h | 17 +- .../allocator/tests/AllocatorResizeTest.h | 8 +- cachelib/allocator/tests/BaseAllocatorTest.h | 8 +- cachelib/allocator/tests/TestBase-inl.h | 4 +- 8 files changed, 405 insertions(+), 195 deletions(-) diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h index dc16785fe0..8bc794de66 100644 --- a/cachelib/allocator/CacheAllocator-inl.h +++ b/cachelib/allocator/CacheAllocator-inl.h @@ -16,6 +16,8 @@ #pragma once +#include + namespace facebook { namespace cachelib { @@ -35,6 +37,7 @@ CacheAllocator::CacheAllocator(SharedMemNewT, Config config) template CacheAllocator::CacheAllocator(SharedMemAttachT, Config config) : CacheAllocator(InitMemType::kMemAttach, config) { + /* TODO - per tier? */ for (auto pid : *metadata_.compactCachePools()) { isCompactCachePool_[pid] = true; } @@ -68,12 +71,12 @@ CacheAllocator::CacheAllocator( : serialization::CacheAllocatorMetadata{}}, allocator_(initAllocator(type)), compactCacheManager_(type != InitMemType::kMemAttach - ? std::make_unique(*allocator_) - : restoreCCacheManager()), + ? std::make_unique(*allocator_[0] /* TODO: per tier */) + : restoreCCacheManager(0/* TODO: per tier */)), compressor_(createPtrCompressor()), mmContainers_(type == InitMemType::kMemAttach ? 
deserializeMMContainers(*deserializer_, compressor_) - : MMContainers{}), + : MMContainers{getNumTiers()}), accessContainer_(initAccessContainer( type, detail::kShmHashTableName, config.accessConfig)), chainedItemAccessContainer_( @@ -106,49 +109,87 @@ CacheAllocator::~CacheAllocator() { } template -ShmSegmentOpts CacheAllocator::createShmCacheOpts() { +ShmSegmentOpts CacheAllocator::createShmCacheOpts(TierId tid) { ShmSegmentOpts opts; opts.alignment = sizeof(Slab); auto memoryTierConfigs = config_.getMemoryTierConfigs(); // TODO: we support single tier so far - if (memoryTierConfigs.size() > 1) { - throw std::invalid_argument("CacheLib only supports a single memory tier"); + if (memoryTierConfigs.size() > 2) { + throw std::invalid_argument("CacheLib only supports two memory tiers"); } - opts.memBindNumaNodes = memoryTierConfigs[0].getMemBind(); + opts.memBindNumaNodes = memoryTierConfigs[tid].getMemBind(); return opts; } +template +std::vector> +CacheAllocator::createPrivateAllocator() { + std::vector> allocators; + + if (isOnShm_) + allocators.emplace_back(std::make_unique( + getAllocatorConfig(config_), + tempShm_->getAddr(), + config_.getCacheSize())); + else + allocators.emplace_back(std::make_unique( + getAllocatorConfig(config_), + config_.getCacheSize())); + + return allocators; +} + template std::unique_ptr -CacheAllocator::createNewMemoryAllocator() { +CacheAllocator::createNewMemoryAllocator(TierId tid) { return std::make_unique( getAllocatorConfig(config_), shmManager_ - ->createShm(detail::kShmCacheName, config_.getCacheSize(), - config_.slabMemoryBaseAddr, createShmCacheOpts()) + ->createShm(detail::kShmCacheName + std::to_string(tid), + config_.getCacheSize(), config_.slabMemoryBaseAddr, + createShmCacheOpts(tid)) .addr, config_.getCacheSize()); } template std::unique_ptr -CacheAllocator::restoreMemoryAllocator() { +CacheAllocator::restoreMemoryAllocator(TierId tid) { return std::make_unique( deserializer_->deserialize(), shmManager_ - ->attachShm(detail::kShmCacheName, config_.slabMemoryBaseAddr, - createShmCacheOpts()) - .addr, + ->attachShm(detail::kShmCacheName + std::to_string(tid), + config_.slabMemoryBaseAddr, createShmCacheOpts(tid)).addr, config_.getCacheSize(), config_.disableFullCoredump); } +template +std::vector> +CacheAllocator::createAllocators() { + std::vector> allocators; + for (int tid = 0; tid < getNumTiers(); tid++) { + allocators.emplace_back(createNewMemoryAllocator(tid)); + } + return allocators; +} + +template +std::vector> +CacheAllocator::restoreAllocators() { + std::vector> allocators; + for (int tid = 0; tid < getNumTiers(); tid++) { + allocators.emplace_back(restoreMemoryAllocator(tid)); + } + return allocators; +} + template std::unique_ptr -CacheAllocator::restoreCCacheManager() { +CacheAllocator::restoreCCacheManager(TierId tid) { return std::make_unique( deserializer_->deserialize(), - *allocator_); + *allocator_[tid]); } template @@ -240,21 +281,15 @@ void CacheAllocator::initWorkers() { } template -std::unique_ptr CacheAllocator::initAllocator( +std::vector> +CacheAllocator::initAllocator( InitMemType type) { if (type == InitMemType::kNone) { - if (isOnShm_ == true) { - return std::make_unique(getAllocatorConfig(config_), - tempShm_->getAddr(), - config_.getCacheSize()); - } else { - return std::make_unique(getAllocatorConfig(config_), - config_.getCacheSize()); - } + return createPrivateAllocator(); } else if (type == InitMemType::kMemNew) { - return createNewMemoryAllocator(); + return createAllocators(); } else if (type == 
InitMemType::kMemAttach) { - return restoreMemoryAllocator(); + return restoreAllocators(); } // Invalid type @@ -323,11 +358,12 @@ CacheAllocator::allocate(PoolId poolId, template typename CacheAllocator::WriteHandle -CacheAllocator::allocateInternal(PoolId pid, - typename Item::Key key, - uint32_t size, - uint32_t creationTime, - uint32_t expiryTime) { +CacheAllocator::allocateInternalTier(TierId tid, + PoolId pid, + typename Item::Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime) { util::LatencyTracker tracker{stats().allocateLatency_}; SCOPE_FAIL { stats_.invalidAllocs.inc(); }; @@ -336,13 +372,14 @@ CacheAllocator::allocateInternal(PoolId pid, const auto requiredSize = Item::getRequiredSize(key, size); // the allocation class in our memory allocator. - const auto cid = allocator_->getAllocationClassId(pid, requiredSize); + const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); + // TODO: per-tier (*stats_.allocAttempts)[pid][cid].inc(); - void* memory = allocator_->allocate(pid, requiredSize); + void* memory = allocator_[tid]->allocate(pid, requiredSize); if (memory == nullptr) { - memory = findEviction(pid, cid); + memory = findEviction(tid, pid, cid); } WriteHandle handle; @@ -353,7 +390,7 @@ CacheAllocator::allocateInternal(PoolId pid, // for example. SCOPE_FAIL { // free back the memory to the allocator since we failed. - allocator_->free(memory); + allocator_[tid]->free(memory); }; handle = acquire(new (memory) Item(key, size, creationTime, expiryTime)); @@ -364,7 +401,7 @@ CacheAllocator::allocateInternal(PoolId pid, } } else { // failed to allocate memory. - (*stats_.allocFailures)[pid][cid].inc(); + (*stats_.allocFailures)[pid][cid].inc(); // TODO: per-tier // wake up rebalancer if (poolRebalancer_) { poolRebalancer_->wakeUp(); @@ -381,6 +418,21 @@ CacheAllocator::allocateInternal(PoolId pid, return handle; } +template +typename CacheAllocator::WriteHandle +CacheAllocator::allocateInternal(PoolId pid, + typename Item::Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime) { + auto tid = 0; /* TODO: consult admission policy */ + for(TierId tid = 0; tid < getNumTiers(); ++tid) { + auto handle = allocateInternalTier(tid, pid, key, size, creationTime, expiryTime); + if (handle) return handle; + } + return {}; +} + template typename CacheAllocator::WriteHandle CacheAllocator::allocateChainedItem(const ReadHandle& parent, @@ -411,21 +463,26 @@ CacheAllocator::allocateChainedItemInternal( // number of bytes required for this item const auto requiredSize = ChainedItem::getRequiredSize(size); - const auto pid = allocator_->getAllocInfo(parent->getMemory()).poolId; - const auto cid = allocator_->getAllocationClassId(pid, requiredSize); + // TODO: is this correct? + auto tid = getTierId(*parent); + const auto pid = allocator_[tid]->getAllocInfo(parent->getMemory()).poolId; + const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); + + // TODO: per-tier? 
Right now stats_ are not used in any public periodic + // worker (*stats_.allocAttempts)[pid][cid].inc(); - void* memory = allocator_->allocate(pid, requiredSize); + void* memory = allocator_[tid]->allocate(pid, requiredSize); if (memory == nullptr) { - memory = findEviction(pid, cid); + memory = findEviction(tid, pid, cid); } if (memory == nullptr) { (*stats_.allocFailures)[pid][cid].inc(); return WriteHandle{}; } - SCOPE_FAIL { allocator_->free(memory); }; + SCOPE_FAIL { allocator_[tid]->free(memory); }; auto child = acquire( new (memory) ChainedItem(compressor_.compress(parent.getInternal()), size, @@ -734,8 +791,8 @@ CacheAllocator::releaseBackToAllocator(Item& it, throw std::runtime_error( folly::sformat("cannot release this item: {}", it.toString())); } - - const auto allocInfo = allocator_->getAllocInfo(it.getMemory()); + const auto tid = getTierId(it); + const auto allocInfo = allocator_[tid]->getAllocInfo(it.getMemory()); if (ctx == RemoveContext::kEviction) { const auto timeNow = util::getCurrentTimeSec(); @@ -759,8 +816,7 @@ CacheAllocator::releaseBackToAllocator(Item& it, folly::sformat("Can not recycle a chained item {}, toRecyle", it.toString(), toRecycle->toString())); } - - allocator_->free(&it); + allocator_[tid]->free(&it); return ReleaseRes::kReleased; } @@ -829,7 +885,7 @@ CacheAllocator::releaseBackToAllocator(Item& it, auto next = head->getNext(compressor_); const auto childInfo = - allocator_->getAllocInfo(static_cast(head)); + allocator_[tid]->getAllocInfo(static_cast(head)); (*stats_.fragmentationSize)[childInfo.poolId][childInfo.classId].sub( util::getFragmentation(*this, *head)); @@ -865,7 +921,7 @@ CacheAllocator::releaseBackToAllocator(Item& it, XDCHECK(ReleaseRes::kReleased != res); res = ReleaseRes::kRecycled; } else { - allocator_->free(head); + allocator_[tid]->free(head); } } @@ -880,7 +936,7 @@ CacheAllocator::releaseBackToAllocator(Item& it, res = ReleaseRes::kRecycled; } else { XDCHECK(it.isDrained()); - allocator_->free(&it); + allocator_[tid]->free(&it); } return res; @@ -1269,8 +1325,8 @@ void CacheAllocator::unlinkItemForEviction(Item& it) { template typename CacheAllocator::Item* -CacheAllocator::findEviction(PoolId pid, ClassId cid) { - auto& mmContainer = getMMContainer(pid, cid); +CacheAllocator::findEviction(TierId tid, PoolId pid, ClassId cid) { + auto& mmContainer = getMMContainer(tid, pid, cid); // Keep searching for a candidate until we were able to evict it // or until the search limit has been exhausted @@ -1624,21 +1680,57 @@ void CacheAllocator::invalidateNvm(Item& item) { } } +template +TierId +CacheAllocator::getTierId(const Item& item) const { + return getTierId(item.getMemory()); +} + +template +TierId +CacheAllocator::getTierId(const void* ptr) const { + for (TierId tid = 0; tid < getNumTiers(); tid++) { + if (allocator_[tid]->isMemoryInAllocator(ptr)) + return tid; + } + + throw std::invalid_argument("Item does not belong to any tier!"); +} + template typename CacheAllocator::MMContainer& CacheAllocator::getMMContainer(const Item& item) const noexcept { + const auto tid = getTierId(item); const auto allocInfo = - allocator_->getAllocInfo(static_cast(&item)); - return getMMContainer(allocInfo.poolId, allocInfo.classId); + allocator_[tid]->getAllocInfo(static_cast(&item)); + return getMMContainer(tid, allocInfo.poolId, allocInfo.classId); } template typename CacheAllocator::MMContainer& -CacheAllocator::getMMContainer(PoolId pid, +CacheAllocator::getMMContainer(TierId tid, + PoolId pid, ClassId cid) const noexcept { - 
XDCHECK_LT(static_cast(pid), mmContainers_.size()); - XDCHECK_LT(static_cast(cid), mmContainers_[pid].size()); - return *mmContainers_[pid][cid]; + XDCHECK_LT(static_cast(tid), mmContainers_.size()); + XDCHECK_LT(static_cast(pid), mmContainers_[tid].size()); + XDCHECK_LT(static_cast(cid), mmContainers_[tid][pid].size()); + return *mmContainers_[tid][pid][cid]; +} + +template +MMContainerStat CacheAllocator::getMMContainerStat( + TierId tid, PoolId pid, ClassId cid) const noexcept { + if(static_cast(tid) >= mmContainers_.size()) { + return MMContainerStat{}; + } + if (static_cast(pid) >= mmContainers_[tid].size()) { + return MMContainerStat{}; + } + if (static_cast(cid) >= mmContainers_[tid][pid].size()) { + return MMContainerStat{}; + } + return mmContainers_[tid][pid][cid] ? mmContainers_[tid][pid][cid]->getStats() + : MMContainerStat{}; } template @@ -1824,8 +1916,9 @@ void CacheAllocator::markUseful(const ReadHandle& handle, template bool CacheAllocator::recordAccessInMMContainer(Item& item, AccessMode mode) { + const auto tid = getTierId(item); const auto allocInfo = - allocator_->getAllocInfo(static_cast(&item)); + allocator_[tid]->getAllocInfo(static_cast(&item)); (*stats_.cacheHits)[allocInfo.poolId][allocInfo.classId].inc(); // track recently accessed items if needed @@ -1833,14 +1926,15 @@ bool CacheAllocator::recordAccessInMMContainer(Item& item, ring_->trackItem(reinterpret_cast(&item), item.getSize()); } - auto& mmContainer = getMMContainer(allocInfo.poolId, allocInfo.classId); + auto& mmContainer = getMMContainer(tid, allocInfo.poolId, allocInfo.classId); return mmContainer.recordAccess(item, mode); } template uint32_t CacheAllocator::getUsableSize(const Item& item) const { + const auto tid = getTierId(item); const auto allocSize = - allocator_->getAllocInfo(static_cast(&item)).allocSize; + allocator_[tid]->getAllocInfo(static_cast(&item)).allocSize; return item.isChainedItem() ? allocSize - ChainedItem::getRequiredSize(0) : allocSize - Item::getRequiredSize(item.getKey(), 0); @@ -1849,8 +1943,10 @@ uint32_t CacheAllocator::getUsableSize(const Item& item) const { template typename CacheAllocator::SampleItem CacheAllocator::getSampleItem() { - size_t nvmCacheSize = nvmCache_ ? nvmCache_->getUsableSize() : 0; - size_t ramCacheSize = allocator_->getMemorySizeInclAdvised(); + // TODO: is using random tier a good idea? + auto tid = folly::Random::rand32() % getNumTiers(); + static size_t nvmCacheSize = nvmCache_ ? 
nvmCache_->getUsableSize() : 0; + static size_t ramCacheSize = allocator_[tid]->getMemorySizeInclAdvised(); bool fromNvm = folly::Random::rand64(0, nvmCacheSize + ramCacheSize) >= ramCacheSize; @@ -1859,19 +1955,18 @@ CacheAllocator::getSampleItem() { } // Sampling from DRAM cache - auto item = reinterpret_cast(allocator_->getRandomAlloc()); + auto item = reinterpret_cast(allocator_[tid]->getRandomAlloc()); if (!item) { return SampleItem{false /* fromNvm */}; } // Check that item returned is the same that was sampled - auto sharedHdl = std::make_shared(findInternal(item->getKey())); if (sharedHdl->get() != item) { return SampleItem{false /* fromNvm */}; } - const auto allocInfo = allocator_->getAllocInfo(item->getMemory()); + const auto allocInfo = allocator_[tid]->getAllocInfo(item->getMemory()); // Convert the Item to IOBuf to make SampleItem auto iobuf = folly::IOBuf{ @@ -1890,27 +1985,34 @@ CacheAllocator::getSampleItem() { template std::vector CacheAllocator::dumpEvictionIterator( - PoolId pid, ClassId cid, size_t numItems) { + PoolId pid, ClassId cid, size_t numItems) { if (numItems == 0) { return {}; } - if (static_cast(pid) >= mmContainers_.size() || - static_cast(cid) >= mmContainers_[pid].size()) { + // Always evict from the lowest layer. + int tid = getNumTiers() - 1; + + if (static_cast(tid) >= mmContainers_.size() || + static_cast(pid) >= mmContainers_[tid].size() || + static_cast(cid) >= mmContainers_[tid][pid].size()) { throw std::invalid_argument( - folly::sformat("Invalid PoolId: {} and ClassId: {}.", pid, cid)); + folly::sformat("Invalid TierId: {} and PoolId: {} and ClassId: {}.", tid, pid, cid)); } std::vector content; - auto& mm = *mmContainers_[pid][cid]; - - mm.withEvictionIterator([&content, numItems](auto&& itr) { - while (itr && content.size() < numItems) { - content.push_back(itr->toString()); - ++itr; - } - }); + size_t i = 0; + while (i < numItems && tid >= 0) { + auto& mm = *mmContainers_[tid][pid][cid]; + mm.withEvictionIterator([&content, numItems](auto&& itr) { + while (itr && content.size() < numItems) { + content.push_back(itr->toString()); + ++itr; + } + }); + --tid; + } return content; } @@ -2087,19 +2189,40 @@ PoolId CacheAllocator::addPool( std::shared_ptr resizeStrategy, bool ensureProvisionable) { folly::SharedMutex::WriteHolder w(poolsResizeAndRebalanceLock_); - auto pid = allocator_->addPool(name, size, allocSizes, ensureProvisionable); + + PoolId pid = 0; + size_t totalCacheSize = 0; + + for (TierId tid = 0; tid < getNumTiers(); tid++) { + totalCacheSize += allocator_[tid]->getMemorySize(); + } + + for (TierId tid = 0; tid < getNumTiers(); tid++) { + auto tierSizeRatio = + static_cast(allocator_[tid]->getMemorySize()) / totalCacheSize; + size_t tierPoolSize = static_cast(tierSizeRatio * size); + + // TODO: what if we manage to add pool only in one tier? 
+ // we should probably remove that on failure + auto res = allocator_[tid]->addPool( + name, tierPoolSize, allocSizes, ensureProvisionable); + XDCHECK(tid == 0 || res == pid); + pid = res; + } + createMMContainers(pid, std::move(config)); setRebalanceStrategy(pid, std::move(rebalanceStrategy)); setResizeStrategy(pid, std::move(resizeStrategy)); + return pid; } template void CacheAllocator::overridePoolRebalanceStrategy( PoolId pid, std::shared_ptr rebalanceStrategy) { - if (static_cast(pid) >= mmContainers_.size()) { + if (static_cast(pid) >= mmContainers_[0].size()) { throw std::invalid_argument(folly::sformat( - "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[0].size())); } setRebalanceStrategy(pid, std::move(rebalanceStrategy)); } @@ -2107,9 +2230,9 @@ void CacheAllocator::overridePoolRebalanceStrategy( template void CacheAllocator::overridePoolResizeStrategy( PoolId pid, std::shared_ptr resizeStrategy) { - if (static_cast(pid) >= mmContainers_.size()) { + if (static_cast(pid) >= mmContainers_[0].size()) { throw std::invalid_argument(folly::sformat( - "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[0].size())); } setResizeStrategy(pid, std::move(resizeStrategy)); } @@ -2121,14 +2244,14 @@ void CacheAllocator::overridePoolOptimizeStrategy( } template -void CacheAllocator::overridePoolConfig(PoolId pid, +void CacheAllocator::overridePoolConfig(TierId tid, PoolId pid, const MMConfig& config) { - if (static_cast(pid) >= mmContainers_.size()) { + // TODO: add generic tier id checking + if (static_cast(pid) >= mmContainers_[tid].size()) { throw std::invalid_argument(folly::sformat( - "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[tid].size())); } - - auto& pool = allocator_->getPool(pid); + auto& pool = allocator_[tid]->getPool(pid); for (unsigned int cid = 0; cid < pool.getNumClassId(); ++cid) { MMConfig mmConfig = config; mmConfig.addExtraConfig( @@ -2136,29 +2259,35 @@ void CacheAllocator::overridePoolConfig(PoolId pid, ? pool.getAllocationClass(static_cast(cid)) .getAllocsPerSlab() : 0); - DCHECK_NOTNULL(mmContainers_[pid][cid].get()); - mmContainers_[pid][cid]->setConfig(mmConfig); + DCHECK_NOTNULL(mmContainers_[tid][pid][cid].get()); + mmContainers_[tid][pid][cid]->setConfig(mmConfig); } } template void CacheAllocator::createMMContainers(const PoolId pid, MMConfig config) { - auto& pool = allocator_->getPool(pid); + // pools on each layer should have the same number of class id, etc. + // TODO: think about deduplication + auto& pool = allocator_[0]->getPool(pid); + for (unsigned int cid = 0; cid < pool.getNumClassId(); ++cid) { config.addExtraConfig( config_.trackTailHits ? 
pool.getAllocationClass(static_cast(cid)) .getAllocsPerSlab() : 0); - mmContainers_[pid][cid].reset(new MMContainer(config, compressor_)); + for (TierId tid = 0; tid < getNumTiers(); tid++) { + mmContainers_[tid][pid][cid].reset(new MMContainer(config, compressor_)); + } } } template PoolId CacheAllocator::getPoolId( folly::StringPiece name) const noexcept { - return allocator_->getPoolId(name.str()); + // each tier has the same pools + return allocator_[0]->getPoolId(name.str()); } // The Function returns a consolidated vector of Release Slab @@ -2201,7 +2330,9 @@ std::set CacheAllocator::filterCompactCachePools( template std::set CacheAllocator::getRegularPoolIds() const { folly::SharedMutex::ReadHolder r(poolsResizeAndRebalanceLock_); - return filterCompactCachePools(allocator_->getPoolIds()); + // TODO - get rid of the duplication - right now, each tier + // holds pool objects with mostly the same info + return filterCompactCachePools(allocator_[0]->getPoolIds()); } template @@ -2226,10 +2357,9 @@ std::set CacheAllocator::getRegularPoolIdsForResize() // getAdvisedMemorySize - then pools may be overLimit even when // all slabs are not allocated. Otherwise, pools may be overLimit // only after all slabs are allocated. - // - return (allocator_->allSlabsAllocated()) || - (allocator_->getAdvisedMemorySize() != 0) - ? filterCompactCachePools(allocator_->getPoolsOverLimit()) + return (allocator_[currentTier()]->allSlabsAllocated()) || + (allocator_[currentTier()]->getAdvisedMemorySize() != 0) + ? filterCompactCachePools(allocator_[currentTier()]->getPoolsOverLimit()) : std::set{}; } @@ -2240,7 +2370,7 @@ const std::string CacheAllocator::getCacheName() const { template PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { - const auto& pool = allocator_->getPool(poolId); + const auto& pool = allocator_[currentTier()]->getPool(poolId); const auto& allocSizes = pool.getAllocSizes(); auto mpStats = pool.getStats(); const auto& classIds = mpStats.classIds; @@ -2259,7 +2389,7 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { if (!isCompactCache) { for (const ClassId cid : classIds) { uint64_t classHits = (*stats_.cacheHits)[poolId][cid].get(); - XDCHECK(mmContainers_[poolId][cid], + XDCHECK(mmContainers_[currentTier()][poolId][cid], folly::sformat("Pid {}, Cid {} not initialized.", poolId, cid)); cacheStats.insert( {cid, @@ -2269,16 +2399,14 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { (*stats_.fragmentationSize)[poolId][cid].get(), classHits, (*stats_.chainedItemEvictions)[poolId][cid].get(), (*stats_.regularItemEvictions)[poolId][cid].get(), - mmContainers_[poolId][cid]->getStats()} - - }); + getMMContainerStat(currentTier(), poolId, cid)}}); totalHits += classHits; } } PoolStats ret; ret.isCompactCache = isCompactCache; - ret.poolName = allocator_->getPoolName(poolId); + ret.poolName = allocator_[currentTier()]->getPoolName(poolId); ret.poolSize = pool.getPoolSize(); ret.poolUsableSize = pool.getPoolUsableSize(); ret.poolAdvisedSize = pool.getPoolAdvisedSize(); @@ -2293,7 +2421,7 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { template ACStats CacheAllocator::getACStats(PoolId poolId, ClassId classId) const { - const auto& pool = allocator_->getPool(poolId); + const auto& pool = allocator_[currentTier()]->getPool(poolId); const auto& ac = pool.getAllocationClass(classId); return ac.getStats(); } @@ -2302,18 +2430,16 @@ template PoolEvictionAgeStats CacheAllocator::getPoolEvictionAgeStats( PoolId pid, unsigned int slabProjectionLength) 
const { PoolEvictionAgeStats stats; - - const auto& pool = allocator_->getPool(pid); + const auto& pool = allocator_[currentTier()]->getPool(pid); const auto& allocSizes = pool.getAllocSizes(); for (ClassId cid = 0; cid < static_cast(allocSizes.size()); ++cid) { - auto& mmContainer = getMMContainer(pid, cid); + auto& mmContainer = getMMContainer(currentTier(), pid, cid); const auto numItemsPerSlab = - allocator_->getPool(pid).getAllocationClass(cid).getAllocsPerSlab(); + allocator_[currentTier()]->getPool(pid).getAllocationClass(cid).getAllocsPerSlab(); const auto projectionLength = numItemsPerSlab * slabProjectionLength; stats.classEvictionAgeStats[cid] = mmContainer.getEvictionAgeStat(projectionLength); } - return stats; } @@ -2352,7 +2478,7 @@ void CacheAllocator::releaseSlab(PoolId pid, } try { - auto releaseContext = allocator_->startSlabRelease( + auto releaseContext = allocator_[currentTier()]->startSlabRelease( pid, victim, receiver, mode, hint, [this]() -> bool { return shutDownInProgress_; }); @@ -2361,15 +2487,15 @@ void CacheAllocator::releaseSlab(PoolId pid, return; } - releaseSlabImpl(releaseContext); - if (!allocator_->allAllocsFreed(releaseContext)) { + releaseSlabImpl(currentTier(), releaseContext); + if (!allocator_[currentTier()]->allAllocsFreed(releaseContext)) { throw std::runtime_error( folly::sformat("Was not able to free all allocs. PoolId: {}, AC: {}", releaseContext.getPoolId(), releaseContext.getClassId())); } - allocator_->completeSlabRelease(releaseContext); + allocator_[currentTier()]->completeSlabRelease(releaseContext); } catch (const exception::SlabReleaseAborted& e) { stats_.numAbortedSlabReleases.inc(); throw exception::SlabReleaseAborted(folly::sformat( @@ -2380,8 +2506,7 @@ void CacheAllocator::releaseSlab(PoolId pid, } template -SlabReleaseStats CacheAllocator::getSlabReleaseStats() - const noexcept { +SlabReleaseStats CacheAllocator::getSlabReleaseStats() const noexcept { std::lock_guard l(workersMutex_); return SlabReleaseStats{stats_.numActiveSlabReleases.get(), stats_.numReleasedForRebalance.get(), @@ -2399,7 +2524,7 @@ SlabReleaseStats CacheAllocator::getSlabReleaseStats() } template -void CacheAllocator::releaseSlabImpl( +void CacheAllocator::releaseSlabImpl(TierId tid, const SlabReleaseContext& releaseContext) { auto startTime = std::chrono::milliseconds(util::getCurrentTimeMs()); bool releaseStuck = false; @@ -2445,7 +2570,7 @@ void CacheAllocator::releaseSlabImpl( if (!isMoved) { evictForSlabRelease(releaseContext, item, throttler); } - XDCHECK(allocator_->isAllocFreed(releaseContext, alloc)); + XDCHECK(allocator_[tid]->isAllocFreed(releaseContext, alloc)); } } @@ -2524,10 +2649,11 @@ bool CacheAllocator::moveForSlabRelease( ctx.getPoolId(), ctx.getClassId()); }); } + auto tid = getTierId(oldItem); auto ref = oldItem.unmarkMoving(); XDCHECK_EQ(ref, 0); - const auto allocInfo = allocator_->getAllocInfo(oldItem.getMemory()); - allocator_->free(&oldItem); + const auto allocInfo = allocator_[tid]->getAllocInfo(oldItem.getMemory()); + allocator_[tid]->free(&oldItem); (*stats_.fragmentationSize)[allocInfo.poolId][allocInfo.classId].sub( util::getFragmentation(*this, oldItem)); @@ -2590,15 +2716,16 @@ CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { } const auto allocInfo = - allocator_->getAllocInfo(static_cast(&oldItem)); + allocator_[getTierId(oldItem)]->getAllocInfo(static_cast(&oldItem)); // Set up the destination for the move. Since oldItem would have the moving // bit set, it won't be picked for eviction. 
- auto newItemHdl = allocateInternal(allocInfo.poolId, - oldItem.getKey(), - oldItem.getSize(), - oldItem.getCreationTime(), - oldItem.getExpiryTime()); + auto newItemHdl = allocateInternalTier(getTierId(oldItem), + allocInfo.poolId, + oldItem.getKey(), + oldItem.getSize(), + oldItem.getCreationTime(), + oldItem.getExpiryTime()); if (!newItemHdl) { return {}; } @@ -2657,7 +2784,7 @@ void CacheAllocator::evictForSlabRelease( if (shutDownInProgress_) { item.unmarkMoving(); - allocator_->abortSlabRelease(ctx); + allocator_[getTierId(item)]->abortSlabRelease(ctx); throw exception::SlabReleaseAborted( folly::sformat("Slab Release aborted while trying to evict" " Item: {} Pool: {}, Class: {}.", @@ -2757,7 +2884,7 @@ void CacheAllocator::evictForSlabRelease( } const auto allocInfo = - allocator_->getAllocInfo(static_cast(evicted)); + allocator_[getTierId(item)]->getAllocInfo(static_cast(evicted)); if (evicted->hasChainedItem()) { (*stats_.chainedItemEvictions)[allocInfo.poolId][allocInfo.classId].inc(); } else { @@ -2808,18 +2935,20 @@ bool CacheAllocator::markMovingForSlabRelease( // At first, we assume this item was already freed bool itemFreed = true; bool markedMoving = false; - const auto fn = [&markedMoving, &itemFreed](void* memory) { + TierId tid = 0; + const auto fn = [&markedMoving, &itemFreed, &tid, this /* TODO - necessary for getTierId */](void* memory) { // Since this callback is executed, the item is not yet freed itemFreed = false; Item* item = static_cast(memory); if (item->markMoving()) { markedMoving = true; } + tid = getTierId(*item); }; auto startTime = util::getCurrentTimeSec(); while (true) { - allocator_->processAllocForRelease(ctx, alloc, fn); + allocator_[tid]->processAllocForRelease(ctx, alloc, fn); // If item is already freed we give up trying to mark the item moving // and return false, otherwise if marked as moving, we return true. @@ -2834,7 +2963,7 @@ bool CacheAllocator::markMovingForSlabRelease( itemFreed = true; if (shutDownInProgress_) { - allocator_->abortSlabRelease(ctx); + allocator_[tid]->abortSlabRelease(ctx); throw exception::SlabReleaseAborted( folly::sformat("Slab Release aborted while still trying to mark" " as moving for Item: {}. Pool: {}, Class: {}.", @@ -2857,12 +2986,15 @@ template CCacheT* CacheAllocator::addCompactCache(folly::StringPiece name, size_t size, Args&&... 
args) { + if (getNumTiers() != 1) + throw std::runtime_error("TODO: compact cache for multi-tier Cache not supported."); + if (!config_.isCompactCacheEnabled()) { throw std::logic_error("Compact cache is not enabled"); } folly::SharedMutex::WriteHolder lock(compactCachePoolsLock_); - auto poolId = allocator_->addPool(name, size, {Slab::kSize}); + auto poolId = allocator_[0]->addPool(name, size, {Slab::kSize}); isCompactCachePool_[poolId] = true; auto ptr = std::make_unique( @@ -2971,12 +3103,15 @@ folly::IOBufQueue CacheAllocator::saveStateToIOBuf() { *metadata_.numChainedChildItems() = stats_.numChainedChildItems.get(); *metadata_.numAbortedSlabReleases() = stats_.numAbortedSlabReleases.get(); + // TODO: implement serialization for multiple tiers auto serializeMMContainers = [](MMContainers& mmContainers) { MMSerializationTypeContainer state; - for (unsigned int i = 0; i < mmContainers.size(); ++i) { + for (unsigned int i = 0; i < 1 /* TODO: */ ; ++i) { for (unsigned int j = 0; j < mmContainers[i].size(); ++j) { - if (mmContainers[i][j]) { - state.pools_ref()[i][j] = mmContainers[i][j]->saveState(); + for (unsigned int k = 0; k < mmContainers[i][j].size(); ++k) { + if (mmContainers[i][j][k]) { + state.pools_ref()[j][k] = mmContainers[i][j][k]->saveState(); + } } } } @@ -2986,7 +3121,8 @@ folly::IOBufQueue CacheAllocator::saveStateToIOBuf() { serializeMMContainers(mmContainers_); AccessSerializationType accessContainerState = accessContainer_->saveState(); - MemoryAllocator::SerializationType allocatorState = allocator_->saveState(); + // TODO: foreach allocator + MemoryAllocator::SerializationType allocatorState = allocator_[0]->saveState(); CCacheManager::SerializationType ccState = compactCacheManager_->saveState(); AccessSerializationType chainedItemAccessContainerState = @@ -3048,6 +3184,8 @@ CacheAllocator::shutDown() { (shmShutDownStatus == ShmShutDownRes::kSuccess); shmManager_.reset(); + // TODO: save per-tier state + if (shmShutDownSucceeded) { if (!nvmShutDownStatusOpt || *nvmShutDownStatusOpt) return ShutDownStatus::kSuccess; @@ -3111,22 +3249,26 @@ CacheAllocator::deserializeMMContainers( const auto container = deserializer.deserialize(); - MMContainers mmContainers; + /* TODO: right now, we create empty containers because deserialization + * only works for a single (topmost) tier. */ + MMContainers mmContainers{getNumTiers()}; for (auto& kvPool : *container.pools_ref()) { auto i = static_cast(kvPool.first); auto& pool = getPool(i); for (auto& kv : kvPool.second) { auto j = static_cast(kv.first); - MMContainerPtr ptr = - std::make_unique(kv.second, - compressor); - auto config = ptr->getConfig(); - config.addExtraConfig(config_.trackTailHits - ? pool.getAllocationClass(j).getAllocsPerSlab() - : 0); - ptr->setConfig(config); - mmContainers[i][j] = std::move(ptr); + for (TierId tid = 0; tid < getNumTiers(); tid++) { + MMContainerPtr ptr = + std::make_unique(kv.second, + compressor); + auto config = ptr->getConfig(); + config.addExtraConfig(config_.trackTailHits + ? pool.getAllocationClass(j).getAllocsPerSlab() + : 0); + ptr->setConfig(config); + mmContainers[tid][i][j] = std::move(ptr); + } } } // We need to drop the unevictableMMContainer in the desierializer. 
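
Note on the hunk above: this patch turns MMContainers from a per-pool/per-class structure into a per-tier collection, so every accessor now indexes by a (tid, pid, cid) triple and deserialization currently only restores state into each tier's containers from the single serialized (topmost-tier) snapshot. Below is a minimal, self-contained sketch of that indexing layout, assuming simplified stand-in types; SimpleMMContainer, kMaxPools and kMaxClassIds are illustrative placeholders, not the actual CacheLib definitions.

#include <array>
#include <memory>
#include <vector>

// Illustrative stand-ins for CacheLib's real types (assumed, not taken from the patch).
struct SimpleMMContainer { /* eviction-order metadata for one (pool, class) pair */ };
constexpr unsigned kMaxPools = 64;     // assumed bound, in the spirit of MemoryPoolManager::kMaxPools
constexpr unsigned kMaxClassIds = 128; // assumed bound

using ContainerPtr = std::unique_ptr<SimpleMMContainer>;
// After this patch the outer dimension is the memory tier; pool and class stay inside.
using MMContainers =
    std::vector<std::array<std::array<ContainerPtr, kMaxClassIds>, kMaxPools>>;

SimpleMMContainer& getMMContainer(MMContainers& mm,
                                  unsigned tid, unsigned pid, unsigned cid) {
  // Mirrors the mmContainers_[tid][pid][cid] lookups introduced above.
  return *mm[tid][pid][cid];
}

int main() {
  unsigned numTiers = 2;
  MMContainers mm{numTiers}; // analogous to MMContainers{getNumTiers()} in the constructor
  mm[0][0][0] = std::make_unique<SimpleMMContainer>();
  getMMContainer(mm, 0, 0, 0); // tier 0, pool 0, class 0
  return 0;
}
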
@@ -3277,11 +3419,11 @@ GlobalCacheStats CacheAllocator::getGlobalCacheStats() const { template CacheMemoryStats CacheAllocator::getCacheMemoryStats() const { - const auto totalCacheSize = allocator_->getMemorySize(); - const auto configuredTotalCacheSize = allocator_->getMemorySizeInclAdvised(); + const auto configuredTotalCacheSize = allocator_[currentTier()]->getMemorySizeInclAdvised(); + const auto totalCacheSize = allocator_[currentTier()]->getMemorySize(); auto addSize = [this](size_t a, PoolId pid) { - return a + allocator_->getPool(pid).getPoolSize(); + return a + allocator_[currentTier()]->getPool(pid).getPoolSize(); }; const auto regularPoolIds = getRegularPoolIds(); const auto ccCachePoolIds = getCCachePoolIds(); @@ -3294,9 +3436,9 @@ CacheMemoryStats CacheAllocator::getCacheMemoryStats() const { configuredTotalCacheSize, configuredRegularCacheSize, configuredCompactCacheSize, - allocator_->getAdvisedMemorySize(), + allocator_[currentTier()]->getAdvisedMemorySize(), memMonitor_ ? memMonitor_->getMaxAdvisePct() : 0, - allocator_->getUnreservedMemorySize(), + allocator_[currentTier()]->getUnreservedMemorySize(), nvmCache_ ? nvmCache_->getSize() : 0, util::getMemAvailable(), util::getRSSBytes()}; @@ -3484,6 +3626,8 @@ bool CacheAllocator::cleanupStrayShmSegments( // cache dir exists. clean up only if there are no other processes // attached. if another process was attached, the following would fail. ShmManager::cleanup(cacheDir, posix); + + // TODO: cleanup per-tier state } catch (const std::exception& e) { XLOGF(ERR, "Error cleaning up {}. Exception: ", cacheDir, e.what()); return false; @@ -3493,7 +3637,8 @@ bool CacheAllocator::cleanupStrayShmSegments( // Any other concurrent process can not be attached to the segments or // even if it does, we want to mark it for destruction. ShmManager::removeByName(cacheDir, detail::kShmInfoName, posix); - ShmManager::removeByName(cacheDir, detail::kShmCacheName, posix); + ShmManager::removeByName(cacheDir, detail::kShmCacheName + + std::to_string(0 /* TODO: per tier */), posix); ShmManager::removeByName(cacheDir, detail::kShmHashTableName, posix); ShmManager::removeByName(cacheDir, detail::kShmChainedItemHashTableName, posix); @@ -3507,14 +3652,16 @@ uint64_t CacheAllocator::getItemPtrAsOffset(const void* ptr) { // the two differ (e.g. Mac OS 12) - causing templating instantiation // errors downstream. + auto tid = getTierId(ptr); + // if this succeeeds, the address is valid within the cache. - allocator_->getAllocInfo(ptr); + allocator_[tid]->getAllocInfo(ptr); if (!isOnShm_ || !shmManager_) { throw std::invalid_argument("Shared memory not used"); } - const auto& shm = shmManager_->getShmByName(detail::kShmCacheName); + const auto& shm = shmManager_->getShmByName(detail::kShmCacheName + std::to_string(tid)); return reinterpret_cast(ptr) - reinterpret_cast(shm.getCurrentMapping().addr); diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index 6cadc34ebb..a169d589e0 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -806,7 +806,7 @@ class CacheAllocator : public CacheBase { // @param config new config for the pool // // @throw std::invalid_argument if the poolId is invalid - void overridePoolConfig(PoolId pid, const MMConfig& config); + void overridePoolConfig(TierId tid, PoolId pid, const MMConfig& config); // update an existing pool's rebalance strategy // @@ -847,8 +847,9 @@ class CacheAllocator : public CacheBase { // @return true if the operation succeeded. 
false if the size of the pool is // smaller than _bytes_ // @throw std::invalid_argument if the poolId is invalid. + // TODO: should call shrinkPool for specific tier? bool shrinkPool(PoolId pid, size_t bytes) { - return allocator_->shrinkPool(pid, bytes); + return allocator_[currentTier()]->shrinkPool(pid, bytes); } // grow an existing pool by _bytes_. This will fail if there is no @@ -857,8 +858,9 @@ class CacheAllocator : public CacheBase { // @return true if the pool was grown. false if the necessary number of // bytes were not available. // @throw std::invalid_argument if the poolId is invalid. + // TODO: should call growPool for specific tier? bool growPool(PoolId pid, size_t bytes) { - return allocator_->growPool(pid, bytes); + return allocator_[currentTier()]->growPool(pid, bytes); } // move bytes from one pool to another. The source pool should be at least @@ -871,7 +873,7 @@ class CacheAllocator : public CacheBase { // correct size to do the transfer. // @throw std::invalid_argument if src or dest is invalid pool bool resizePools(PoolId src, PoolId dest, size_t bytes) override { - return allocator_->resizePools(src, dest, bytes); + return allocator_[currentTier()]->resizePools(src, dest, bytes); } // Add a new compact cache with given name and size @@ -1076,12 +1078,13 @@ class CacheAllocator : public CacheBase { // @throw std::invalid_argument if the memory does not belong to this // cache allocator AllocInfo getAllocInfo(const void* memory) const { - return allocator_->getAllocInfo(memory); + return allocator_[getTierId(memory)]->getAllocInfo(memory); } // return the ids for the set of existing pools in this cache. std::set getPoolIds() const override final { - return allocator_->getPoolIds(); + // all tiers have the same pool ids. TODO: deduplicate + return allocator_[0]->getPoolIds(); } // return a list of pool ids that are backing compact caches. This includes @@ -1093,18 +1096,18 @@ class CacheAllocator : public CacheBase { // return the pool with speicified id. const MemoryPool& getPool(PoolId pid) const override final { - return allocator_->getPool(pid); + return allocator_[currentTier()]->getPool(pid); } // calculate the number of slabs to be advised/reclaimed in each pool PoolAdviseReclaimData calcNumSlabsToAdviseReclaim() override final { auto regularPoolIds = getRegularPoolIds(); - return allocator_->calcNumSlabsToAdviseReclaim(regularPoolIds); + return allocator_[currentTier()]->calcNumSlabsToAdviseReclaim(regularPoolIds); } // update number of slabs to advise in the cache void updateNumSlabsToAdvise(int32_t numSlabsToAdvise) override final { - allocator_->updateNumSlabsToAdvise(numSlabsToAdvise); + allocator_[currentTier()]->updateNumSlabsToAdvise(numSlabsToAdvise); } // returns a valid PoolId corresponding to the name or kInvalidPoolId if the @@ -1112,8 +1115,9 @@ class CacheAllocator : public CacheBase { PoolId getPoolId(folly::StringPiece name) const noexcept; // returns the pool's name by its poolId. - std::string getPoolName(PoolId poolId) const override { - return allocator_->getPoolName(poolId); + std::string getPoolName(PoolId poolId) const { + // all tiers have the same pool names. + return allocator_[0]->getPoolName(poolId); } // get stats related to all kinds of slab release events. 
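
For context on the pool-management hunks above and the addPool() change earlier in this patch: a single requested pool size is split across tiers in proportion to each tier's memory size, and the getPoolSize() helper added later in the series sums those per-tier slices back up. The following is a rough, standalone sketch of that ratio arithmetic; the helper names and the 3:1 tier split are assumptions for illustration, not CacheLib APIs.

#include <cstddef>
#include <iostream>
#include <vector>

// Split a requested pool size across tiers by memory-size ratio, in the spirit of
// the per-tier addPool() loop in this patch (illustrative helper, not a CacheLib API).
std::vector<size_t> splitPoolAcrossTiers(size_t requestedSize,
                                         const std::vector<size_t>& tierMemSizes) {
  size_t total = 0;
  for (size_t s : tierMemSizes) total += s;
  std::vector<size_t> perTier;
  for (size_t s : tierMemSizes) {
    double ratio = static_cast<double>(s) / static_cast<double>(total);
    perTier.push_back(static_cast<size_t>(ratio * requestedSize));
  }
  return perTier;
}

// The externally visible pool size is the sum of the per-tier slices,
// matching the intent of the getPoolSize() added in a later patch.
size_t poolSize(const std::vector<size_t>& perTier) {
  size_t sum = 0;
  for (size_t s : perTier) sum += s;
  return sum;
}

int main() {
  // Assumed example: a 3 GiB + 1 GiB tier layout and a 400 MiB pool request.
  std::vector<size_t> tiers = {size_t{3} << 30, size_t{1} << 30};
  auto slices = splitPoolAcrossTiers(size_t{400} << 20, tiers);
  std::cout << slices[0] << " + " << slices[1] << " = " << poolSize(slices) << "\n";
  return 0;
}
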
@@ -1390,11 +1394,14 @@ class CacheAllocator : public CacheBase { using MMContainerPtr = std::unique_ptr; using MMContainers = - std::array, - MemoryPoolManager::kMaxPools>; + std::vector, + MemoryPoolManager::kMaxPools>>; void createMMContainers(const PoolId pid, MMConfig config); + TierId getTierId(const Item& item) const; + TierId getTierId(const void* ptr) const; + // acquire the MMContainer corresponding to the the Item's class and pool. // // @return pointer to the MMContainer. @@ -1402,7 +1409,12 @@ class CacheAllocator : public CacheBase { // allocation from the memory allocator. MMContainer& getMMContainer(const Item& item) const noexcept; - MMContainer& getMMContainer(PoolId pid, ClassId cid) const noexcept; + MMContainer& getMMContainer(TierId tid, PoolId pid, ClassId cid) const noexcept; + + // Get stats of the specified pid and cid. + // If such mmcontainer is not valid (pool id or cid out of bound) + // or the mmcontainer is not initialized, return an empty stat. + MMContainerStat getMMContainerStat(TierId tid, PoolId pid, ClassId cid) const noexcept; // create a new cache allocation. The allocation can be initialized // appropriately and made accessible through insert or insertOrReplace. @@ -1434,6 +1446,17 @@ class CacheAllocator : public CacheBase { uint32_t creationTime, uint32_t expiryTime); + // create a new cache allocation on specific memory tier. + // For description see allocateInternal. + // + // @param tid id a memory tier + WriteHandle allocateInternalTier(TierId tid, + PoolId id, + Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime); + // Allocate a chained item // // The resulting chained item does not have a parent item and @@ -1519,6 +1542,15 @@ class CacheAllocator : public CacheBase { // not exist. FOLLY_ALWAYS_INLINE WriteHandle findFastImpl(Key key, AccessMode mode); + // Moves a regular item to a different memory tier. + // + // @param oldItem Reference to the item being moved + // @param newItemHdl Reference to the handle of the new item being moved into + // + // @return true If the move was completed, and the containers were updated + // successfully. + bool moveRegularItemOnEviction(Item& oldItem, WriteHandle& newItemHdl); + // Moves a regular item to a different slab. This should only be used during // slab release after the item's exclusive bit has been set. The user supplied // callback is responsible for copying the contents and fixing the semantics @@ -1670,7 +1702,7 @@ class CacheAllocator : public CacheBase { // @param pid the id of the pool to look for evictions inside // @param cid the id of the class to look for evictions inside // @return An evicted item or nullptr if there is no suitable candidate. 
- Item* findEviction(PoolId pid, ClassId cid); + Item* findEviction(TierId tid, PoolId pid, ClassId cid); using EvictionIterator = typename MMContainer::LockedIterator; @@ -1686,7 +1718,7 @@ class CacheAllocator : public CacheBase { const typename Item::PtrCompressor& compressor); unsigned int reclaimSlabs(PoolId id, size_t numSlabs) final { - return allocator_->reclaimSlabsAndGrow(id, numSlabs); + return allocator_[currentTier()]->reclaimSlabsAndGrow(id, numSlabs); } FOLLY_ALWAYS_INLINE EventTracker* getEventTracker() const { @@ -1745,7 +1777,7 @@ class CacheAllocator : public CacheBase { const void* hint = nullptr) final; // @param releaseContext slab release context - void releaseSlabImpl(const SlabReleaseContext& releaseContext); + void releaseSlabImpl(TierId tid, const SlabReleaseContext& releaseContext); // @return true when successfully marked as moving, // fasle when this item has already been freed @@ -1807,7 +1839,7 @@ class CacheAllocator : public CacheBase { // primitives. So we consciously exempt ourselves here from TSAN data race // detection. folly::annotate_ignore_thread_sanitizer_guard g(__FILE__, __LINE__); - auto slabsSkipped = allocator_->forEachAllocation(std::forward(f)); + auto slabsSkipped = allocator_[currentTier()]->forEachAllocation(std::forward(f)); stats().numReaperSkippedSlabs.add(slabsSkipped); } @@ -1851,10 +1883,10 @@ class CacheAllocator : public CacheBase { std::unique_ptr& worker, std::chrono::seconds timeout = std::chrono::seconds{0}); - ShmSegmentOpts createShmCacheOpts(); - std::unique_ptr createNewMemoryAllocator(); - std::unique_ptr restoreMemoryAllocator(); - std::unique_ptr restoreCCacheManager(); + ShmSegmentOpts createShmCacheOpts(TierId tid); + std::unique_ptr createNewMemoryAllocator(TierId tid); + std::unique_ptr restoreMemoryAllocator(TierId tid); + std::unique_ptr restoreCCacheManager(TierId tid); PoolIds filterCompactCachePools(const PoolIds& poolIds) const; @@ -1874,7 +1906,7 @@ class CacheAllocator : public CacheBase { } typename Item::PtrCompressor createPtrCompressor() const { - return allocator_->createPtrCompressor(); + return allocator_[0 /* TODO */]->createPtrCompressor(); } // helper utility to throttle and optionally log. @@ -1897,9 +1929,14 @@ class CacheAllocator : public CacheBase { // @param type the type of initialization // @return nullptr if the type is invalid - // @return pointer to memory allocator + // @return vector of pointers to memory allocator // @throw std::runtime_error if type is invalid - std::unique_ptr initAllocator(InitMemType type); + std::vector> initAllocator(InitMemType type); + + std::vector> createPrivateAllocator(); + std::vector> createAllocators(); + std::vector> restoreAllocators(); + // @param type the type of initialization // @return nullptr if the type is invalid // @return pointer to access container @@ -1957,6 +1994,17 @@ class CacheAllocator : public CacheBase { // BEGIN private members + TierId currentTier() const { + // TODO: every function which calls this method should be refactored. + // We should go case by case and either make such function work on + // all tiers or expose separate parameter to describe the tier ID. + return 0; + } + + unsigned getNumTiers() const { + return memoryTierConfigs.size(); + } + // Whether the memory allocator for this cache allocator was created on shared // memory. 
The hash table, chained item hash table etc is also created on // shared memory except for temporary shared memory mode when they're created @@ -1984,9 +2032,10 @@ class CacheAllocator : public CacheBase { const MMConfig mmConfig_{}; // the memory allocator for allocating out of the available memory. - std::unique_ptr allocator_; + std::vector> allocator_; // compact cache allocator manager + // TODO: per tier? std::unique_ptr compactCacheManager_; // compact cache instances reside here when user "add" or "attach" compact diff --git a/cachelib/allocator/PoolOptimizer.cpp b/cachelib/allocator/PoolOptimizer.cpp index 8d67762be8..d101231a04 100644 --- a/cachelib/allocator/PoolOptimizer.cpp +++ b/cachelib/allocator/PoolOptimizer.cpp @@ -51,6 +51,8 @@ void PoolOptimizer::optimizeRegularPoolSizes() { void PoolOptimizer::optimizeCompactCacheSizes() { try { + // TODO: should optimizer look at each tier individually? + // If yes, then resizePools should be per-tier auto strategy = cache_.getPoolOptimizeStrategy(); if (!strategy) { strategy = strategy_; diff --git a/cachelib/allocator/memory/MemoryAllocator.h b/cachelib/allocator/memory/MemoryAllocator.h index 1ce58857de..625171fd6f 100644 --- a/cachelib/allocator/memory/MemoryAllocator.h +++ b/cachelib/allocator/memory/MemoryAllocator.h @@ -646,6 +646,13 @@ class MemoryAllocator { memoryPoolManager_.updateNumSlabsToAdvise(numSlabs); } + // returns ture if ptr points to memory which is managed by this + // allocator + bool isMemoryInAllocator(const void *ptr) { + return ptr && ptr >= slabAllocator_.getSlabMemoryBegin() + && ptr < slabAllocator_.getSlabMemoryEnd(); + } + private: // @param memory pointer to the memory. // @return the MemoryPool corresponding to the memory. diff --git a/cachelib/allocator/memory/SlabAllocator.h b/cachelib/allocator/memory/SlabAllocator.h index d82cf5b947..9fdb1e60b4 100644 --- a/cachelib/allocator/memory/SlabAllocator.h +++ b/cachelib/allocator/memory/SlabAllocator.h @@ -322,6 +322,17 @@ class SlabAllocator { return PtrCompressor(*this); } + // returns starting address of memory we own. + const Slab* getSlabMemoryBegin() const noexcept { + return reinterpret_cast(memoryStart_); + } + + // returns first byte after the end of memory region we own. + const Slab* getSlabMemoryEnd() const noexcept { + return reinterpret_cast(reinterpret_cast(memoryStart_) + + memorySize_); + } + private: // null Slab* presenttation. With 4M Slab size, a valid slab index would never // reach 2^16 - 1; @@ -339,12 +350,6 @@ class SlabAllocator { // @throw std::invalid_argument if the state is invalid. void checkState() const; - // returns first byte after the end of memory region we own. - const Slab* getSlabMemoryEnd() const noexcept { - return reinterpret_cast(reinterpret_cast(memoryStart_) + - memorySize_); - } - // returns true if we have slabbed all the memory that is available to us. // false otherwise. 
bool allMemorySlabbed() const noexcept { diff --git a/cachelib/allocator/tests/AllocatorResizeTest.h b/cachelib/allocator/tests/AllocatorResizeTest.h index d65205ac74..883dd9c056 100644 --- a/cachelib/allocator/tests/AllocatorResizeTest.h +++ b/cachelib/allocator/tests/AllocatorResizeTest.h @@ -966,23 +966,23 @@ class AllocatorResizeTest : public AllocatorTest { for (i = 1; i <= numItersToMaxAdviseAway + 1; i++) { alloc.memMonitor_->adviseAwaySlabs(); std::this_thread::sleep_for(std::chrono::seconds{2}); - ASSERT_EQ(alloc.allocator_->getAdvisedMemorySize(), i * perIterAdvSize); + ASSERT_EQ(alloc.allocator_[0 /* TODO - extend test */]->getAdvisedMemorySize(), i * perIterAdvSize); } i--; // This should fail alloc.memMonitor_->adviseAwaySlabs(); std::this_thread::sleep_for(std::chrono::seconds{2}); - auto totalAdvisedAwayMemory = alloc.allocator_->getAdvisedMemorySize(); + auto totalAdvisedAwayMemory = alloc.allocator_[0 /* TODO - extend test */]->getAdvisedMemorySize(); ASSERT_EQ(totalAdvisedAwayMemory, i * perIterAdvSize); // Try to reclaim back for (i = 1; i <= numItersToMaxAdviseAway + 1; i++) { alloc.memMonitor_->reclaimSlabs(); std::this_thread::sleep_for(std::chrono::seconds{2}); - ASSERT_EQ(alloc.allocator_->getAdvisedMemorySize(), + ASSERT_EQ(alloc.allocator_[0 /* TODO - extend test */]->getAdvisedMemorySize(), totalAdvisedAwayMemory - i * perIterAdvSize); } - totalAdvisedAwayMemory = alloc.allocator_->getAdvisedMemorySize(); + totalAdvisedAwayMemory = alloc.allocator_[0 /* TODO - extend test */]->getAdvisedMemorySize(); ASSERT_EQ(totalAdvisedAwayMemory, 0); } } diff --git a/cachelib/allocator/tests/BaseAllocatorTest.h b/cachelib/allocator/tests/BaseAllocatorTest.h index afc27e6fbd..13cd4fbf29 100644 --- a/cachelib/allocator/tests/BaseAllocatorTest.h +++ b/cachelib/allocator/tests/BaseAllocatorTest.h @@ -4236,13 +4236,13 @@ class BaseAllocatorTest : public AllocatorTest { // Had a bug: D4799860 where we allocated the wrong size for chained item { const auto parentAllocInfo = - alloc.allocator_->getAllocInfo(itemHandle->getMemory()); + alloc.allocator_[0 /* TODO - extend test */]->getAllocInfo(itemHandle->getMemory()); const auto child1AllocInfo = - alloc.allocator_->getAllocInfo(chainedItemHandle->getMemory()); + alloc.allocator_[0 /* TODO - extend test */]->getAllocInfo(chainedItemHandle->getMemory()); const auto child2AllocInfo = - alloc.allocator_->getAllocInfo(chainedItemHandle2->getMemory()); + alloc.allocator_[0 /* TODO - extend test */]->getAllocInfo(chainedItemHandle2->getMemory()); const auto child3AllocInfo = - alloc.allocator_->getAllocInfo(chainedItemHandle3->getMemory()); + alloc.allocator_[0 /* TODO - extend test */]->getAllocInfo(chainedItemHandle3->getMemory()); const auto parentCid = parentAllocInfo.classId; const auto child1Cid = child1AllocInfo.classId; diff --git a/cachelib/allocator/tests/TestBase-inl.h b/cachelib/allocator/tests/TestBase-inl.h index bf7355c87d..4d45e981bc 100644 --- a/cachelib/allocator/tests/TestBase-inl.h +++ b/cachelib/allocator/tests/TestBase-inl.h @@ -312,7 +312,7 @@ void AllocatorTest::testShmIsRemoved( ASSERT_FALSE(AllocatorT::ShmManager::segmentExists( config.getCacheDir(), detail::kShmHashTableName, config.usePosixShm)); ASSERT_FALSE(AllocatorT::ShmManager::segmentExists( - config.getCacheDir(), detail::kShmCacheName, config.usePosixShm)); + config.getCacheDir(), detail::kShmCacheName + std::to_string(0), config.usePosixShm)); ASSERT_FALSE(AllocatorT::ShmManager::segmentExists( config.getCacheDir(), detail::kShmChainedItemHashTableName, 
config.usePosixShm)); @@ -326,7 +326,7 @@ void AllocatorTest::testShmIsNotRemoved( ASSERT_TRUE(AllocatorT::ShmManager::segmentExists( config.getCacheDir(), detail::kShmHashTableName, config.usePosixShm)); ASSERT_TRUE(AllocatorT::ShmManager::segmentExists( - config.getCacheDir(), detail::kShmCacheName, config.usePosixShm)); + config.getCacheDir(), detail::kShmCacheName + std::to_string(0), config.usePosixShm)); ASSERT_TRUE(AllocatorT::ShmManager::segmentExists( config.getCacheDir(), detail::kShmChainedItemHashTableName, config.usePosixShm)); From 369e55b79ea7f2a0f31cc83c40c854e1acda00b8 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Tue, 17 Jan 2023 10:49:16 -0800 Subject: [PATCH 36/47] AC stats multi-tier --- cachelib/allocator/Cache.h | 2 +- cachelib/allocator/CacheAllocator-inl.h | 5 +++-- cachelib/allocator/CacheAllocator.h | 2 +- cachelib/allocator/tests/CacheBaseTest.cpp | 2 +- cachelib/cachebench/cache/Cache-inl.h | 8 +++++--- cachelib/cachebench/cache/Cache.h | 4 ++-- cachelib/cachebench/cache/CacheStats.h | 18 ++++++++++-------- 7 files changed, 23 insertions(+), 18 deletions(-) diff --git a/cachelib/allocator/Cache.h b/cachelib/allocator/Cache.h index 082db65f7a..fed0abce2f 100644 --- a/cachelib/allocator/Cache.h +++ b/cachelib/allocator/Cache.h @@ -106,7 +106,7 @@ class CacheBase { // // @param poolId the pool id // @param classId the class id - virtual ACStats getACStats(PoolId poolId, ClassId classId) const = 0; + virtual ACStats getACStats(TierId tid, PoolId poolId, ClassId classId) const = 0; // @param poolId the pool id virtual AllSlabReleaseEvents getAllSlabReleaseEvents(PoolId poolId) const = 0; diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h index 8bc794de66..9eb42178de 100644 --- a/cachelib/allocator/CacheAllocator-inl.h +++ b/cachelib/allocator/CacheAllocator-inl.h @@ -2419,9 +2419,10 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { } template -ACStats CacheAllocator::getACStats(PoolId poolId, +ACStats CacheAllocator::getACStats(TierId tid, + PoolId poolId, ClassId classId) const { - const auto& pool = allocator_[currentTier()]->getPool(poolId); + const auto& pool = allocator_[tid]->getPool(poolId); const auto& ac = pool.getAllocationClass(classId); return ac.getStats(); } diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index a169d589e0..7f3d63892e 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -1183,7 +1183,7 @@ class CacheAllocator : public CacheBase { CacheMemoryStats getCacheMemoryStats() const override final; // return stats for Allocation Class - ACStats getACStats(PoolId pid, ClassId cid) const override final; + ACStats getACStats(TierId tid, PoolId pid, ClassId cid) const override final; // return the nvm cache stats map util::StatsMap getNvmCacheStatsMap() const override final; diff --git a/cachelib/allocator/tests/CacheBaseTest.cpp b/cachelib/allocator/tests/CacheBaseTest.cpp index f249786743..e7778d6ccf 100644 --- a/cachelib/allocator/tests/CacheBaseTest.cpp +++ b/cachelib/allocator/tests/CacheBaseTest.cpp @@ -34,7 +34,7 @@ class CacheBaseTest : public CacheBase, public SlabAllocatorTestBase { bool isObjectCache() const override { return false; } const MemoryPool& getPool(PoolId) const override { return memoryPool_; } PoolStats getPoolStats(PoolId) const override { return PoolStats(); } - ACStats getACStats(PoolId, ClassId) const { return ACStats(); }; + ACStats getACStats(TierId, PoolId, ClassId) const { 
return ACStats(); }; AllSlabReleaseEvents getAllSlabReleaseEvents(PoolId) const override { return AllSlabReleaseEvents{}; } diff --git a/cachelib/cachebench/cache/Cache-inl.h b/cachelib/cachebench/cache/Cache-inl.h index 688654de96..cb038d0f1f 100644 --- a/cachelib/cachebench/cache/Cache-inl.h +++ b/cachelib/cachebench/cache/Cache-inl.h @@ -620,12 +620,14 @@ Stats Cache::getStats() const { aggregate += poolStats; } - std::map> allocationClassStats{}; + std::map>> allocationClassStats{}; for (size_t pid = 0; pid < pools_.size(); pid++) { auto cids = cache_->getPoolStats(static_cast(pid)).getClassIds(); - for (auto cid : cids) - allocationClassStats[pid][cid] = cache_->getACStats(pid, cid); + for (TierId tid = 0; tid < cache_->getNumTiers(); tid++) { + for (auto cid : cids) + allocationClassStats[tid][pid][cid] = cache_->getACStats(tid, pid, cid); + } } const auto cacheStats = cache_->getGlobalCacheStats(); diff --git a/cachelib/cachebench/cache/Cache.h b/cachelib/cachebench/cache/Cache.h index 65c70c30c1..a85c1efb66 100644 --- a/cachelib/cachebench/cache/Cache.h +++ b/cachelib/cachebench/cache/Cache.h @@ -325,8 +325,8 @@ class Cache { // return the stats for the pool. PoolStats getPoolStats(PoolId pid) const { return cache_->getPoolStats(pid); } - ACStats getACStats(PoolId pid, ClassId cid) const { - return cache_->getACStats(pid, cid); + ACStats getACStats(TierId tid, PoolId pid, ClassId cid) const { + return cache_->getACStats(tid, pid, cid); } // return the total number of inconsistent operations detected since start. diff --git a/cachelib/cachebench/cache/CacheStats.h b/cachelib/cachebench/cache/CacheStats.h index 38c48aa4c8..7c7137b63d 100644 --- a/cachelib/cachebench/cache/CacheStats.h +++ b/cachelib/cachebench/cache/CacheStats.h @@ -101,7 +101,7 @@ struct Stats { uint64_t invalidDestructorCount{0}; int64_t unDestructedItemCount{0}; - std::map> allocationClassStats; + std::map>> allocationClassStats; // populate the counters related to nvm usage. Cache implementation can decide // what to populate since not all of those are interesting when running @@ -156,24 +156,26 @@ struct Stats { }; auto foreachAC = [&](auto cb) { - for (auto& pidStat : allocationClassStats) { - for (auto& cidStat : pidStat.second) { - cb(pidStat.first, cidStat.first, cidStat.second); + for (auto& tidStat : allocationClassStats) { + for (auto& pidStat : tidStat.second) { + for (auto& cidStat : pidStat.second) { + cb(tidStat.first, pidStat.first, cidStat.first, cidStat.second); + } } } }; - foreachAC([&](auto pid, auto cid, auto stats) { + foreachAC([&](auto tid, auto pid, auto cid, auto stats) { auto [allocSizeSuffix, allocSize] = formatMemory(stats.allocSize); auto [memorySizeSuffix, memorySize] = formatMemory(stats.totalAllocatedSize()); - out << folly::sformat("pid{:2} cid{:4} {:8.2f}{} memorySize: {:8.2f}{}", - pid, cid, allocSize, allocSizeSuffix, memorySize, + out << folly::sformat("tid{:2} pid{:2} cid{:4} {:8.2f}{} memorySize: {:8.2f}{}", + tid, pid, cid, allocSize, allocSizeSuffix, memorySize, memorySizeSuffix) << std::endl; }); - foreachAC([&](auto pid, auto cid, auto stats) { + foreachAC([&](auto tid, auto pid, auto cid, auto stats) { auto [allocSizeSuffix, allocSize] = formatMemory(stats.allocSize); // If the pool is not full, extrapolate usageFraction for AC assuming it From 713c6d959de1d7177f58f50912b47ec2341834a3 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Wed, 8 Feb 2023 08:30:48 -0800 Subject: [PATCH 37/47] This commit contains the additional memory tiers tests for different pool sizes. 
We also use getPoolSize(pid), to get total size from all pools across allocators. It also fixes the tiering sizes (pulls changes from what was issue75 rebased commit that did not make it into upstream commits). Rebased to use ramCacheSize. --- cachelib/allocator/CacheAllocator-inl.h | 36 ++++++-- cachelib/allocator/CacheAllocator.h | 2 + .../tests/AllocatorMemoryTiersTest.cpp | 6 +- .../tests/AllocatorMemoryTiersTest.h | 40 ++++++++- cachelib/allocator/tests/MemoryTiersTest.cpp | 84 ++++++++++++++++++- 5 files changed, 155 insertions(+), 13 deletions(-) diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h index 9eb42178de..b474799c50 100644 --- a/cachelib/allocator/CacheAllocator-inl.h +++ b/cachelib/allocator/CacheAllocator-inl.h @@ -121,6 +121,16 @@ ShmSegmentOpts CacheAllocator::createShmCacheOpts(TierId tid) { return opts; } +template +size_t CacheAllocator::memoryTierSize(TierId tid) const { + auto partitions = std::accumulate(memoryTierConfigs.begin(), memoryTierConfigs.end(), 0UL, + [](const size_t i, const MemoryTierCacheConfig& config){ + return i + config.getRatio(); + }); + + return memoryTierConfigs[tid].calculateTierSize(config_.getCacheSize(), partitions); +} + template std::vector> CacheAllocator::createPrivateAllocator() { @@ -142,14 +152,15 @@ CacheAllocator::createPrivateAllocator() { template std::unique_ptr CacheAllocator::createNewMemoryAllocator(TierId tid) { + size_t tierSize = memoryTierSize(tid); return std::make_unique( getAllocatorConfig(config_), shmManager_ ->createShm(detail::kShmCacheName + std::to_string(tid), - config_.getCacheSize(), config_.slabMemoryBaseAddr, + tierSize, config_.slabMemoryBaseAddr, createShmCacheOpts(tid)) .addr, - config_.getCacheSize()); + tierSize); } template @@ -160,7 +171,7 @@ CacheAllocator::restoreMemoryAllocator(TierId tid) { shmManager_ ->attachShm(detail::kShmCacheName + std::to_string(tid), config_.slabMemoryBaseAddr, createShmCacheOpts(tid)).addr, - config_.getCacheSize(), + memoryTierSize(tid), config_.disableFullCoredump); } @@ -2368,6 +2379,16 @@ const std::string CacheAllocator::getCacheName() const { return config_.cacheName; } +template +size_t CacheAllocator::getPoolSize(PoolId poolId) const { + size_t poolSize = 0; + for (auto& allocator: allocator_) { + const auto& pool = allocator->getPool(poolId); + poolSize += pool.getPoolSize(); + } + return poolSize; +} + template PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { const auto& pool = allocator_[currentTier()]->getPool(poolId); @@ -3420,9 +3441,12 @@ GlobalCacheStats CacheAllocator::getGlobalCacheStats() const { template CacheMemoryStats CacheAllocator::getCacheMemoryStats() const { - const auto configuredTotalCacheSize = allocator_[currentTier()]->getMemorySizeInclAdvised(); - const auto totalCacheSize = allocator_[currentTier()]->getMemorySize(); - + size_t totalCacheSize = 0; + size_t configuredTotalCacheSize = 0; + for(auto& allocator: allocator_) { + totalCacheSize += allocator->getMemorySize(); + configuredTotalCacheSize += allocator->getMemorySizeInclAdvised(); + } auto addSize = [this](size_t a, PoolId pid) { return a + allocator_[currentTier()]->getPool(pid).getPoolSize(); }; diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index 7f3d63892e..d85104bb4d 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -2005,6 +2005,8 @@ class CacheAllocator : public CacheBase { return memoryTierConfigs.size(); } + size_t 
memoryTierSize(TierId tid) const; + // Whether the memory allocator for this cache allocator was created on shared // memory. The hash table, chained item hash table etc is also created on // shared memory except for temporary shared memory mode when they're created diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp index 71059ee496..90ef34be41 100644 --- a/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp +++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp @@ -23,9 +23,9 @@ namespace tests { using LruAllocatorMemoryTiersTest = AllocatorMemoryTiersTest; // TODO(MEMORY_TIER): add more tests with different eviction policies -TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValid1) { - this->testMultiTiersValid1(); -} +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersInvalid) { this->testMultiTiersInvalid(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValid) { this->testMultiTiersValid(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValidMixed) { this->testMultiTiersValidMixed(); } } // end of namespace tests } // end of namespace cachelib diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h index 05d5020b52..682cbb7c80 100644 --- a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h +++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h @@ -27,7 +27,7 @@ namespace tests { template class AllocatorMemoryTiersTest : public AllocatorTest { public: - void testMultiTiersValid1() { + void testMultiTiersInvalid() { typename AllocatorT::Config config; config.setCacheSize(100 * Slab::kSize); ASSERT_NO_THROW(config.configureMemoryTiers( @@ -36,6 +36,44 @@ class AllocatorMemoryTiersTest : public AllocatorTest { MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( std::string("0"))})); } + + void testMultiTiersValid() { + typename AllocatorT::Config config; + config.setCacheSize(100 * Slab::kSize); + config.enableCachePersistence("/tmp"); + ASSERT_NO_THROW(config.configureMemoryTiers( + {MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( + std::string("0")), + MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( + std::string("0"))})); + + auto alloc = std::make_unique(AllocatorT::SharedMemNew, config); + ASSERT(alloc != nullptr); + + auto pool = alloc->addPool("default", alloc->getCacheMemoryStats().ramCacheSize); + auto handle = alloc->allocate(pool, "key", std::string("value").size()); + ASSERT(handle != nullptr); + ASSERT_NO_THROW(alloc->insertOrReplace(handle)); + } + + void testMultiTiersValidMixed() { + typename AllocatorT::Config config; + config.setCacheSize(100 * Slab::kSize); + config.enableCachePersistence("/tmp"); + ASSERT_NO_THROW(config.configureMemoryTiers( + {MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( + std::string("0")), + MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( + std::string("0"))})); + + auto alloc = std::make_unique(AllocatorT::SharedMemNew, config); + ASSERT(alloc != nullptr); + + auto pool = alloc->addPool("default", alloc->getCacheMemoryStats().ramCacheSize); + auto handle = alloc->allocate(pool, "key", std::string("value").size()); + ASSERT(handle != nullptr); + ASSERT_NO_THROW(alloc->insertOrReplace(handle)); + } }; } // namespace tests } // namespace cachelib diff --git a/cachelib/allocator/tests/MemoryTiersTest.cpp b/cachelib/allocator/tests/MemoryTiersTest.cpp index 81eca12e62..298195378c 100644 --- a/cachelib/allocator/tests/MemoryTiersTest.cpp +++ 
b/cachelib/allocator/tests/MemoryTiersTest.cpp @@ -109,7 +109,7 @@ class MemoryTiersTest : public AllocatorTest { void validatePoolSize(PoolId poolId, std::unique_ptr& allocator, size_t expectedSize) { - size_t actualSize = allocator->getPool(poolId).getPoolSize(); + size_t actualSize = allocator->getPoolSize(poolId); EXPECT_EQ(actualSize, expectedSize); } @@ -119,9 +119,9 @@ class MemoryTiersTest : public AllocatorTest { size_t numTiers = 2) { if (isSizeValid) { auto pool = alloc->addPool("validPoolSize", poolSize); - EXPECT_LE(alloc->getPool(pool).getPoolSize(), poolSize); + EXPECT_LE(alloc->getPoolSize(pool), poolSize); if (poolSize >= numTiers * Slab::kSize) - EXPECT_GE(alloc->getPool(pool).getPoolSize(), + EXPECT_GE(alloc->getPoolSize(pool), poolSize - numTiers * Slab::kSize); } else { EXPECT_THROW(alloc->addPool("invalidPoolSize", poolSize), @@ -172,6 +172,84 @@ TEST_F(LruMemoryTiersTest, TestInvalid2TierConfigRatioNotSet) { TEST_F(LruMemoryTiersTest, TestInvalid2TierConfigSizesNeCacheSize) { EXPECT_THROW(createTestCacheConfig({0, 0}), std::invalid_argument); } + +TEST_F(LruMemoryTiersTest, TestPoolAllocations) { + std::vector totalCacheSizes = {8 * GB, 2 * GB}; + + static const size_t numExtraSizes = 4; + static const size_t numExtraSlabs = 20; + + for (size_t i = 0; i < numExtraSizes; i++) { + totalCacheSizes.push_back(totalCacheSizes.back() + + (folly::Random::rand64() % numExtraSlabs) * + Slab::kSize); + } + + size_t min_ratio = 1; + size_t max_ratio = 111; + + static const size_t numCombinations = 10; + + for (auto totalCacheSize : totalCacheSizes) { + for (size_t k = 0; k < numCombinations; k++) { + const size_t i = folly::Random::rand32() % max_ratio + min_ratio; + const size_t j = folly::Random::rand32() % max_ratio + min_ratio; + LruAllocatorConfig cfg = + createTestCacheConfig({i, j}, + /* usePoisx */ true, totalCacheSize); + basicCheck(cfg, totalCacheSize); + + std::unique_ptr alloc = std::unique_ptr( + new LruAllocator(LruAllocator::SharedMemNew, cfg)); + + size_t size = (folly::Random::rand64() % + (alloc->getCacheMemoryStats().ramCacheSize - Slab::kSize)) + + Slab::kSize; + testAddPool(alloc, size, true); + } + } +} + +TEST_F(LruMemoryTiersTest, TestPoolInvalidAllocations) { + std::vector totalCacheSizes = {48 * MB, 51 * MB, 256 * MB, + 1 * GB, 5 * GB, 8 * GB}; + size_t min_ratio = 1; + size_t max_ratio = 111; + + static const size_t numCombinations = 10; + + for (auto totalCacheSize : totalCacheSizes) { + for (size_t k = 0; k < numCombinations; k++) { + const size_t i = folly::Random::rand32() % max_ratio + min_ratio; + const size_t j = folly::Random::rand32() % max_ratio + min_ratio; + LruAllocatorConfig cfg = + createTestCacheConfig({i, j}, + /* usePoisx */ true, totalCacheSize); + + std::unique_ptr alloc = nullptr; + try { + alloc = std::unique_ptr( + new LruAllocator(LruAllocator::SharedMemNew, cfg)); + } catch(...) 
{ + // expection only if cache too small + size_t sum_ratios = std::accumulate( + cfg.getMemoryTierConfigs().begin(), cfg.getMemoryTierConfigs().end(), 0UL, + [](const size_t i, const MemoryTierCacheConfig& config) { + return i + config.getRatio(); + }); + auto tier1slabs = cfg.getMemoryTierConfigs()[0].calculateTierSize(cfg.getCacheSize(), sum_ratios) / Slab::kSize; + auto tier2slabs = cfg.getMemoryTierConfigs()[1].calculateTierSize(cfg.getCacheSize(), sum_ratios) / Slab::kSize; + EXPECT_TRUE(tier1slabs <= 2 || tier2slabs <= 2); + + continue; + } + + size_t size = (folly::Random::rand64() % (100 * GB)) + + alloc->getCacheMemoryStats().ramCacheSize; + testAddPool(alloc, size, false); + } + } +} } // namespace tests } // namespace cachelib } // namespace facebook From d209d78018be20d18c1cba78cfdb849a525f1dfd Mon Sep 17 00:00:00 2001 From: Sounak Gupta Date: Mon, 14 Nov 2022 02:07:57 -0800 Subject: [PATCH 38/47] This is the additional multi-tier support needed for the compressed ptr changes that were introduced upstream. - Includes later cosmetic changes added by sounak 9cb5c29fa493499192900227169050773820d265 --- cachelib/allocator/CacheAllocator.h | 3 +- cachelib/allocator/memory/AllocationClass.cpp | 11 ++-- cachelib/allocator/memory/AllocationClass.h | 2 +- cachelib/allocator/memory/CompressedPtr.h | 65 +++++++++++++++++-- cachelib/allocator/memory/MemoryAllocator.h | 11 ++-- cachelib/allocator/memory/SlabAllocator.h | 4 +- run_tests.sh | 1 + 7 files changed, 77 insertions(+), 20 deletions(-) diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index d85104bb4d..9fb6728c1b 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -1294,6 +1294,7 @@ class CacheAllocator : public CacheBase { sizeof(typename RefcountWithFlags::Value) + sizeof(uint32_t) + sizeof(uint32_t) + sizeof(KAllocation)) == sizeof(Item), "vtable overhead"); + // Check for CompressedPtr single/multi tier support static_assert(32 == sizeof(Item), "item overhead is 32 bytes"); // make sure there is no overhead in ChainedItem on top of a regular Item @@ -1906,7 +1907,7 @@ class CacheAllocator : public CacheBase { } typename Item::PtrCompressor createPtrCompressor() const { - return allocator_[0 /* TODO */]->createPtrCompressor(); + return typename Item::PtrCompressor(allocator_); } // helper utility to throttle and optionally log. diff --git a/cachelib/allocator/memory/AllocationClass.cpp b/cachelib/allocator/memory/AllocationClass.cpp index 71089153e9..512df86bbe 100644 --- a/cachelib/allocator/memory/AllocationClass.cpp +++ b/cachelib/allocator/memory/AllocationClass.cpp @@ -50,7 +50,7 @@ AllocationClass::AllocationClass(ClassId classId, poolId_(poolId), allocationSize_(allocSize), slabAlloc_(s), - freedAllocations_{slabAlloc_.createPtrCompressor()} { + freedAllocations_{slabAlloc_.createSingleTierPtrCompressor()} { checkState(); } @@ -102,7 +102,7 @@ AllocationClass::AllocationClass( currSlab_(s.getSlabForIdx(*object.currSlabIdx())), slabAlloc_(s), freedAllocations_(*object.freedAllocationsObject(), - slabAlloc_.createPtrCompressor()), + slabAlloc_.createSingleTierPtrCompressor()), canAllocate_(*object.canAllocate()) { if (!slabAlloc_.isRestorable()) { throw std::logic_error("The allocation class cannot be restored."); @@ -356,9 +356,10 @@ std::pair> AllocationClass::pruneFreeAllocs( // allocated slab, release any freed allocations belonging to this slab. // Set the bit to true if the corresponding allocation is freed, false // otherwise. 
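// Illustrative aside, not from the diffs above: memoryTierSize() and the
// sum-of-ratios checks in MemoryTiersTest split the configured cache size
// proportionally to the tier ratios. A minimal standalone sketch of that
// arithmetic; the function name and slab rounding are assumptions, not the
// actual MemoryTierCacheConfig::calculateTierSize implementation:
#include <cstddef>
#include <numeric>
#include <vector>

size_t tierSizeByRatio(size_t totalCacheSize,
                       const std::vector<size_t>& ratios,
                       size_t tierIdx,
                       size_t slabSize) {
  const size_t sumRatios =
      std::accumulate(ratios.begin(), ratios.end(), size_t{0});
  // Each tier gets a proportional share, rounded down to whole slabs so the
  // slab allocator can carve it up cleanly.
  const size_t raw = totalCacheSize / sumRatios * ratios[tierIdx];
  return raw - (raw % slabSize);
}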
- FreeList freeAllocs{slabAlloc_.createPtrCompressor()}; - FreeList notInSlab{slabAlloc_.createPtrCompressor()}; - FreeList inSlab{slabAlloc_.createPtrCompressor()}; + FreeList freeAllocs{slabAlloc_.createSingleTierPtrCompressor()}; + FreeList notInSlab{slabAlloc_.createSingleTierPtrCompressor()}; + FreeList inSlab{slabAlloc_.createSingleTierPtrCompressor()}; + lock_->lock_combine([&]() { // Take the allocation class free list offline diff --git a/cachelib/allocator/memory/AllocationClass.h b/cachelib/allocator/memory/AllocationClass.h index d45a45c6cd..269887f207 100644 --- a/cachelib/allocator/memory/AllocationClass.h +++ b/cachelib/allocator/memory/AllocationClass.h @@ -445,7 +445,7 @@ class AllocationClass { struct CACHELIB_PACKED_ATTR FreeAlloc { using CompressedPtr = facebook::cachelib::CompressedPtr; using PtrCompressor = - facebook::cachelib::PtrCompressor; + facebook::cachelib::SingleTierPtrCompressor; SListHook hook_{}; }; diff --git a/cachelib/allocator/memory/CompressedPtr.h b/cachelib/allocator/memory/CompressedPtr.h index 029abd91b9..d664063ea3 100644 --- a/cachelib/allocator/memory/CompressedPtr.h +++ b/cachelib/allocator/memory/CompressedPtr.h @@ -27,9 +27,12 @@ namespace cachelib { class SlabAllocator; +template +class PtrCompressor; + // This CompressedPtr makes decompression fast by staying away from division and -// modulo arithmetic and doing those during the compression time. We most often -// decompress a CompressedPtr than compress a pointer while creating one. This +// modulo arithmetic and doing those during the compression time. We most often +// decompress a CompressedPtr than compress a pointer while creating one. This // is used for pointer compression by the memory allocator. // We compress pointers by storing the tier index, slab index and alloc index of @@ -173,12 +176,14 @@ class CACHELIB_PACKED_ATTR CompressedPtr { } friend SlabAllocator; + template + friend class PtrCompressor; }; template -class PtrCompressor { +class SingleTierPtrCompressor { public: - explicit PtrCompressor(const AllocatorT& allocator) noexcept + explicit SingleTierPtrCompressor(const AllocatorT& allocator) noexcept : allocator_(allocator) {} const CompressedPtr compress(const PtrType* uncompressed) const { @@ -190,11 +195,11 @@ class PtrCompressor { allocator_.unCompress(compressed, false /* isMultiTiered */)); } - bool operator==(const PtrCompressor& rhs) const noexcept { + bool operator==(const SingleTierPtrCompressor& rhs) const noexcept { return &allocator_ == &rhs.allocator_; } - bool operator!=(const PtrCompressor& rhs) const noexcept { + bool operator!=(const SingleTierPtrCompressor& rhs) const noexcept { return !(*this == rhs); } @@ -202,5 +207,53 @@ class PtrCompressor { // memory allocator that does the pointer compression. 
const AllocatorT& allocator_; }; + +template +class PtrCompressor { + public: + explicit PtrCompressor(const AllocatorContainer& allocators) noexcept + : allocators_(allocators) {} + + const CompressedPtr compress(const PtrType* uncompressed) const { + if (uncompressed == nullptr) + return CompressedPtr{}; + + TierId tid; + for (tid = 0; tid < allocators_.size(); tid++) { + if (allocators_[tid]->isMemoryInAllocator( + static_cast(uncompressed))) + break; + } + + bool isMultiTiered = allocators_.size() > 1; + auto cptr = allocators_[tid]->compress(uncompressed, isMultiTiered); + if (isMultiTiered) { // config has multiple tiers + cptr.setTierId(tid); + } + return cptr; + } + + PtrType* unCompress(const CompressedPtr compressed) const { + if (compressed.isNull()) { + return nullptr; + } + bool isMultiTiered = allocators_.size() > 1; + auto& allocator = *allocators_[compressed.getTierId(isMultiTiered)]; + return static_cast( + allocator.unCompress(compressed, isMultiTiered)); + } + + bool operator==(const PtrCompressor& rhs) const noexcept { + return &allocators_ == &rhs.allocators_; + } + + bool operator!=(const PtrCompressor& rhs) const noexcept { + return !(*this == rhs); + } + + private: + // memory allocator that does the pointer compression. + const AllocatorContainer& allocators_; +}; } // namespace cachelib } // namespace facebook diff --git a/cachelib/allocator/memory/MemoryAllocator.h b/cachelib/allocator/memory/MemoryAllocator.h index 625171fd6f..a77d23494c 100644 --- a/cachelib/allocator/memory/MemoryAllocator.h +++ b/cachelib/allocator/memory/MemoryAllocator.h @@ -516,12 +516,13 @@ class MemoryAllocator { using CompressedPtr = facebook::cachelib::CompressedPtr; template using PtrCompressor = - facebook::cachelib::PtrCompressor; - + facebook::cachelib::PtrCompressor>>; + template - PtrCompressor createPtrCompressor() { - return slabAllocator_.createPtrCompressor(); - } + using SingleTierPtrCompressor = + facebook::cachelib::PtrCompressor; // compress a given pointer to a valid allocation made out of this allocator // through an allocate() or nullptr. Calling this otherwise with invalid diff --git a/cachelib/allocator/memory/SlabAllocator.h b/cachelib/allocator/memory/SlabAllocator.h index 9fdb1e60b4..a80a54672c 100644 --- a/cachelib/allocator/memory/SlabAllocator.h +++ b/cachelib/allocator/memory/SlabAllocator.h @@ -318,8 +318,8 @@ class SlabAllocator { } template - PtrCompressor createPtrCompressor() const { - return PtrCompressor(*this); + SingleTierPtrCompressor createSingleTierPtrCompressor() const { + return SingleTierPtrCompressor(*this); } // returns starting address of memory we own. 
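// Illustrative aside, not from the diffs above: the multi-tier PtrCompressor
// stores a tier index next to the per-tier compressed value, matching the
// CompressedPtr description (tier index + slab index + alloc index). A toy
// bit-packing sketch of that idea; the field widths and names here are
// assumptions, not the real CompressedPtr layout:
#include <cassert>
#include <cstdint>

constexpr uint32_t kAllocIdxBits = 12;  // assumed: allocation index width
constexpr uint32_t kTierIdxShift = 31;  // assumed: one bit covers two tiers

uint32_t packCompressed(uint32_t tierIdx, uint32_t slabIdx, uint32_t allocIdx) {
  assert(tierIdx < 2);
  assert(allocIdx < (1u << kAllocIdxBits));
  assert(slabIdx < (1u << (kTierIdxShift - kAllocIdxBits)));
  return (tierIdx << kTierIdxShift) | (slabIdx << kAllocIdxBits) | allocIdx;
}

uint32_t tierIdOf(uint32_t packed) { return packed >> kTierIdxShift; }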
diff --git a/run_tests.sh b/run_tests.sh index 111e218333..e575dbc62a 100755 --- a/run_tests.sh +++ b/run_tests.sh @@ -2,6 +2,7 @@ # Newline separated list of tests to ignore BLACKLIST="allocator-test-NavySetupTest +allocator-test-NvmCacheTests shm-test-test_page_size" if [ "$1" == "long" ]; then From e74fa4059c24a330197152b19db39fb29343cddd Mon Sep 17 00:00:00 2001 From: Sounak Gupta Date: Thu, 21 Jul 2022 02:01:04 -0700 Subject: [PATCH 39/47] added per pool class rolling average latency (upstream PR version) fix for rolling stats (on multi-tier to be followed by multi-tier rolling stats implementation in the following commit) --- cachelib/allocator/CacheAllocator-inl.h | 12 ++- cachelib/allocator/CacheStats.cpp | 4 +- cachelib/allocator/CacheStatsInternal.h | 8 ++ .../allocator/memory/MemoryAllocatorStats.h | 4 + cachelib/cachebench/cache/CacheStats.h | 6 +- cachelib/common/RollingStats.h | 90 +++++++++++++++++++ 6 files changed, 118 insertions(+), 6 deletions(-) create mode 100644 cachelib/common/RollingStats.h diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h index b474799c50..e01f61f800 100644 --- a/cachelib/allocator/CacheAllocator-inl.h +++ b/cachelib/allocator/CacheAllocator-inl.h @@ -384,6 +384,8 @@ CacheAllocator::allocateInternalTier(TierId tid, // the allocation class in our memory allocator. const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); + util::RollingLatencyTracker rollTracker{ + (*stats_.classAllocLatency)[pid][cid]}; // TODO: per-tier (*stats_.allocAttempts)[pid][cid].inc(); @@ -480,8 +482,9 @@ CacheAllocator::allocateChainedItemInternal( const auto pid = allocator_[tid]->getAllocInfo(parent->getMemory()).poolId; const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); - // TODO: per-tier? 
Right now stats_ are not used in any public periodic - // worker + util::RollingLatencyTracker rollTracker{ + (*stats_.classAllocLatency)[pid][cid]}; + (*stats_.allocAttempts)[pid][cid].inc(); void* memory = allocator_[tid]->allocate(pid, requiredSize); @@ -2445,7 +2448,10 @@ ACStats CacheAllocator::getACStats(TierId tid, ClassId classId) const { const auto& pool = allocator_[tid]->getPool(poolId); const auto& ac = pool.getAllocationClass(classId); - return ac.getStats(); + + auto stats = ac.getStats(); + stats.allocLatencyNs = (*stats_.classAllocLatency)[poolId][classId]; + return stats; } template diff --git a/cachelib/allocator/CacheStats.cpp b/cachelib/allocator/CacheStats.cpp index c16149df6b..69cb2366cf 100644 --- a/cachelib/allocator/CacheStats.cpp +++ b/cachelib/allocator/CacheStats.cpp @@ -44,6 +44,8 @@ void Stats::init() { initToZero(*fragmentationSize); initToZero(*chainedItemEvictions); initToZero(*regularItemEvictions); + + classAllocLatency = std::make_unique(); } template @@ -51,7 +53,7 @@ struct SizeVerify {}; void Stats::populateGlobalCacheStats(GlobalCacheStats& ret) const { #ifndef SKIP_SIZE_VERIFY - SizeVerify a = SizeVerify<16176>{}; + SizeVerify a = SizeVerify<16192>{}; std::ignore = a; #endif ret.numCacheGets = numCacheGets.get(); diff --git a/cachelib/allocator/CacheStatsInternal.h b/cachelib/allocator/CacheStatsInternal.h index b2a5f8c469..8f54cd6ecf 100644 --- a/cachelib/allocator/CacheStatsInternal.h +++ b/cachelib/allocator/CacheStatsInternal.h @@ -21,6 +21,7 @@ #include "cachelib/allocator/Cache.h" #include "cachelib/allocator/memory/MemoryAllocator.h" #include "cachelib/common/AtomicCounter.h" +#include "cachelib/common/RollingStats.h" namespace facebook { namespace cachelib { @@ -229,6 +230,13 @@ struct Stats { std::unique_ptr chainedItemEvictions{}; std::unique_ptr regularItemEvictions{}; + using PerPoolClassRollingStats = + std::array, + MemoryPoolManager::kMaxPools>; + + // rolling latency tracking for every alloc class in every pool + std::unique_ptr classAllocLatency{}; + // Eviction failures due to parent cannot be removed from access container AtomicCounter evictFailParentAC{0}; diff --git a/cachelib/allocator/memory/MemoryAllocatorStats.h b/cachelib/allocator/memory/MemoryAllocatorStats.h index 65d82e000d..acda9ee530 100644 --- a/cachelib/allocator/memory/MemoryAllocatorStats.h +++ b/cachelib/allocator/memory/MemoryAllocatorStats.h @@ -20,6 +20,7 @@ #include #include "cachelib/allocator/memory/Slab.h" +#include "cachelib/common/RollingStats.h" namespace facebook { namespace cachelib { @@ -47,6 +48,9 @@ struct ACStats { // true if the allocation class is full. 
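// Illustrative note, not from the diffs: the allocLatencyNs field added just
// below is a snapshot destination; getACStats() (earlier in this patch)
// copies the per-class RollingStats collected at allocation time into it so
// cachebench can report a rolling average per allocation class.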
bool full; + // Rolling allocation latency (in ns) + util::RollingStats allocLatencyNs; + constexpr unsigned long long totalSlabs() const noexcept { return freeSlabs + usedSlabs; } diff --git a/cachelib/cachebench/cache/CacheStats.h b/cachelib/cachebench/cache/CacheStats.h index 7c7137b63d..93bf11a74e 100644 --- a/cachelib/cachebench/cache/CacheStats.h +++ b/cachelib/cachebench/cache/CacheStats.h @@ -185,8 +185,10 @@ struct Stats { : stats.usageFraction(); out << folly::sformat( - "pid{:2} cid{:4} {:8.2f}{} usageFraction: {:4.2f}", pid, cid, - allocSize, allocSizeSuffix, acUsageFraction) + "tid{:2} pid{:2} cid{:4} {:8.2f}{} usageFraction: {:4.2f} " + "rollingAvgAllocLatency: {:8.2f}ns", + tid, pid, cid, allocSize, allocSizeSuffix, acUsageFraction, + stats.allocLatencyNs.estimate()) << std::endl; }); } diff --git a/cachelib/common/RollingStats.h b/cachelib/common/RollingStats.h new file mode 100644 index 0000000000..4d179681ad --- /dev/null +++ b/cachelib/common/RollingStats.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include "cachelib/common/Utils.h" + +namespace facebook { +namespace cachelib { +namespace util { + +class RollingStats { + public: + // track latency by taking the value of duration directly. + void trackValue(double value) { + // This is a highly unlikely scenario where + // cnt_ reaches numerical limits. Skip update + // of the rolling average anymore. + if (cnt_ == std::numeric_limits::max()) { + cnt_ = 0; + return; + } + auto ratio = static_cast(cnt_) / (cnt_ + 1); + avg_ *= ratio; + ++cnt_; + avg_ += value / cnt_; + } + + // Return the rolling average. 
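// Illustrative note, not from the diffs: trackValue() above maintains the
// mean incrementally, avg_{n+1} = avg_n * n / (n + 1) + value / (n + 1), so
// no per-sample history is kept and the estimate() accessor just below can
// simply return avg_.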
+ double estimate() { return avg_; } + + private: + double avg_{0}; + uint64_t cnt_{0}; +}; + +class RollingLatencyTracker { + public: + explicit RollingLatencyTracker(RollingStats& stats) + : stats_(&stats), begin_(std::chrono::steady_clock::now()) {} + RollingLatencyTracker() {} + ~RollingLatencyTracker() { + if (stats_) { + auto tp = std::chrono::steady_clock::now(); + auto diffNanos = + std::chrono::duration_cast(tp - begin_) + .count(); + stats_->trackValue(static_cast(diffNanos)); + } + } + + RollingLatencyTracker(const RollingLatencyTracker&) = delete; + RollingLatencyTracker& operator=(const RollingLatencyTracker&) = delete; + + RollingLatencyTracker(RollingLatencyTracker&& rhs) noexcept + : stats_(rhs.stats_), begin_(rhs.begin_) { + rhs.stats_ = nullptr; + } + + RollingLatencyTracker& operator=(RollingLatencyTracker&& rhs) noexcept { + if (this != &rhs) { + this->~RollingLatencyTracker(); + new (this) RollingLatencyTracker(std::move(rhs)); + } + return *this; + } + + private: + RollingStats* stats_{nullptr}; + std::chrono::time_point begin_; +}; +} // namespace util +} // namespace cachelib +} // namespace facebook From bab780a3d38e5d8ba29a507ac04212addfc0db8a Mon Sep 17 00:00:00 2001 From: Sounak Gupta Date: Thu, 21 Jul 2022 02:01:04 -0700 Subject: [PATCH 40/47] added per tier pool class rolling average latency (based on upstream PR) --- cachelib/allocator/Cache.h | 3 +++ cachelib/allocator/CacheAllocator-inl.h | 10 ++++++---- cachelib/allocator/CacheStats.cpp | 2 +- cachelib/allocator/CacheStats.h | 1 + cachelib/allocator/CacheStatsInternal.h | 7 ++++--- cachelib/cachebench/cache/CacheStats.h | 11 +++-------- 6 files changed, 18 insertions(+), 16 deletions(-) diff --git a/cachelib/allocator/Cache.h b/cachelib/allocator/Cache.h index fed0abce2f..cb2fa83f0d 100644 --- a/cachelib/allocator/Cache.h +++ b/cachelib/allocator/Cache.h @@ -85,6 +85,9 @@ class CacheBase { CacheBase(CacheBase&&) = default; CacheBase& operator=(CacheBase&&) = default; + // TODO: come up with some reasonable number + static constexpr unsigned kMaxTiers = 2; + // Get a string referring to the cache name for this cache virtual const std::string getCacheName() const = 0; diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h index e01f61f800..996ec79822 100644 --- a/cachelib/allocator/CacheAllocator-inl.h +++ b/cachelib/allocator/CacheAllocator-inl.h @@ -385,7 +385,7 @@ CacheAllocator::allocateInternalTier(TierId tid, // the allocation class in our memory allocator. const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); util::RollingLatencyTracker rollTracker{ - (*stats_.classAllocLatency)[pid][cid]}; + (*stats_.classAllocLatency)[tid][pid][cid]}; // TODO: per-tier (*stats_.allocAttempts)[pid][cid].inc(); @@ -483,8 +483,10 @@ CacheAllocator::allocateChainedItemInternal( const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); util::RollingLatencyTracker rollTracker{ - (*stats_.classAllocLatency)[pid][cid]}; - + (*stats_.classAllocLatency)[tid][pid][cid]}; + + // TODO: per-tier? 
Right now stats_ are not used in any public periodic + // worker (*stats_.allocAttempts)[pid][cid].inc(); void* memory = allocator_[tid]->allocate(pid, requiredSize); @@ -2450,7 +2452,7 @@ ACStats CacheAllocator::getACStats(TierId tid, const auto& ac = pool.getAllocationClass(classId); auto stats = ac.getStats(); - stats.allocLatencyNs = (*stats_.classAllocLatency)[poolId][classId]; + stats.allocLatencyNs = (*stats_.classAllocLatency)[tid][poolId][classId]; return stats; } diff --git a/cachelib/allocator/CacheStats.cpp b/cachelib/allocator/CacheStats.cpp index 69cb2366cf..b4770a3480 100644 --- a/cachelib/allocator/CacheStats.cpp +++ b/cachelib/allocator/CacheStats.cpp @@ -45,7 +45,7 @@ void Stats::init() { initToZero(*chainedItemEvictions); initToZero(*regularItemEvictions); - classAllocLatency = std::make_unique(); + classAllocLatency = std::make_unique(); } template diff --git a/cachelib/allocator/CacheStats.h b/cachelib/allocator/CacheStats.h index fb9955b805..9f3674f513 100644 --- a/cachelib/allocator/CacheStats.h +++ b/cachelib/allocator/CacheStats.h @@ -25,6 +25,7 @@ #include "cachelib/allocator/memory/Slab.h" #include "cachelib/common/FastStats.h" #include "cachelib/common/PercentileStats.h" +#include "cachelib/common/RollingStats.h" #include "cachelib/common/Time.h" namespace facebook { diff --git a/cachelib/allocator/CacheStatsInternal.h b/cachelib/allocator/CacheStatsInternal.h index 8f54cd6ecf..19a15fbbd4 100644 --- a/cachelib/allocator/CacheStatsInternal.h +++ b/cachelib/allocator/CacheStatsInternal.h @@ -230,12 +230,13 @@ struct Stats { std::unique_ptr chainedItemEvictions{}; std::unique_ptr regularItemEvictions{}; - using PerPoolClassRollingStats = + using PerTierPoolClassRollingStats = std::array< std::array, - MemoryPoolManager::kMaxPools>; + MemoryPoolManager::kMaxPools>, + CacheBase::kMaxTiers>; // rolling latency tracking for every alloc class in every pool - std::unique_ptr classAllocLatency{}; + std::unique_ptr classAllocLatency{}; // Eviction failures due to parent cannot be removed from access container AtomicCounter evictFailParentAC{0}; diff --git a/cachelib/cachebench/cache/CacheStats.h b/cachelib/cachebench/cache/CacheStats.h index 93bf11a74e..3e3d5da307 100644 --- a/cachelib/cachebench/cache/CacheStats.h +++ b/cachelib/cachebench/cache/CacheStats.h @@ -165,18 +165,11 @@ struct Stats { } }; + foreachAC([&](auto tid, auto pid, auto cid, auto stats) { auto [allocSizeSuffix, allocSize] = formatMemory(stats.allocSize); auto [memorySizeSuffix, memorySize] = formatMemory(stats.totalAllocatedSize()); - out << folly::sformat("tid{:2} pid{:2} cid{:4} {:8.2f}{} memorySize: {:8.2f}{}", - tid, pid, cid, allocSize, allocSizeSuffix, memorySize, - memorySizeSuffix) - << std::endl; - }); - - foreachAC([&](auto tid, auto pid, auto cid, auto stats) { - auto [allocSizeSuffix, allocSize] = formatMemory(stats.allocSize); // If the pool is not full, extrapolate usageFraction for AC assuming it // will grow at the same rate. This value will be the same for all ACs. 
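// Illustrative aside, not from the diffs: a standalone usage of the
// RollingStats/RollingLatencyTracker pair introduced in this series; the
// timed work below is made up for demonstration:
#include <chrono>
#include <thread>

#include "cachelib/common/RollingStats.h"

double averageSleepLatencyNs(int iterations) {
  facebook::cachelib::util::RollingStats stats;
  for (int i = 0; i < iterations; ++i) {
    facebook::cachelib::util::RollingLatencyTracker t{stats};
    std::this_thread::sleep_for(std::chrono::microseconds(10));
  }  // ~RollingLatencyTracker() records the elapsed nanoseconds
  return stats.estimate();
}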
@@ -186,8 +179,10 @@ struct Stats { out << folly::sformat( "tid{:2} pid{:2} cid{:4} {:8.2f}{} usageFraction: {:4.2f} " + "memorySize: {:8.2f}{} " "rollingAvgAllocLatency: {:8.2f}ns", tid, pid, cid, allocSize, allocSizeSuffix, acUsageFraction, + memorySize, memorySizeSuffix, stats.allocLatencyNs.estimate()) << std::endl; }); From f0baeb1185802e9eb52ea6db2aceea955a920bcc Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Tue, 9 Aug 2022 10:45:26 -0400 Subject: [PATCH 41/47] MM2Q promotion iterators (#1) Hot queue iterator for 2Q. Will start at Hot queue and move to Warm queue if hot queue is exhausted. Useful for promotion semantics if using 2Q replacement. rebased on to develop and added some tests. --- cachelib/allocator/MM2Q-inl.h | 10 ++++ cachelib/allocator/MM2Q.h | 5 ++ cachelib/allocator/datastruct/DList.h | 4 ++ .../allocator/datastruct/MultiDList-inl.h | 56 ++++++++++++++++--- cachelib/allocator/datastruct/MultiDList.h | 16 +++++- cachelib/allocator/tests/MM2QTest.cpp | 33 +++++++++++ cachelib/allocator/tests/MMTypeTest.h | 2 + 7 files changed, 116 insertions(+), 10 deletions(-) diff --git a/cachelib/allocator/MM2Q-inl.h b/cachelib/allocator/MM2Q-inl.h index ba388d40a4..07aae775f7 100644 --- a/cachelib/allocator/MM2Q-inl.h +++ b/cachelib/allocator/MM2Q-inl.h @@ -258,6 +258,16 @@ void MM2Q::Container::withEvictionIterator(F&& fun) { } } +// returns the head of the hot queue for promotion +template T::*HookPtr> +template +void +MM2Q::Container::withPromotionIterator(F&& fun) { + lruMutex_->lock_combine([this, &fun]() { + fun(LockedIterator{LockHolder{}, lru_.begin(LruType::Hot)}); + }); +} + template T::*HookPtr> void MM2Q::Container::removeLocked(T& node, bool doRebalance) noexcept { diff --git a/cachelib/allocator/MM2Q.h b/cachelib/allocator/MM2Q.h index 9c5cac834c..62b644f9c6 100644 --- a/cachelib/allocator/MM2Q.h +++ b/cachelib/allocator/MM2Q.h @@ -502,6 +502,11 @@ class MM2Q { // Iterator passed as parameter. template void withEvictionIterator(F&& f); + + // Execute provided function under container lock. Function gets + // iterator passed as parameter. + template + void withPromotionIterator(F&& f); // get the current config as a copy Config getConfig() const; diff --git a/cachelib/allocator/datastruct/DList.h b/cachelib/allocator/datastruct/DList.h index 2e872c8ee0..4d862b1908 100644 --- a/cachelib/allocator/datastruct/DList.h +++ b/cachelib/allocator/datastruct/DList.h @@ -221,6 +221,10 @@ class DList { curr_ = dir_ == Direction::FROM_HEAD ? dlist_->head_ : dlist_->tail_; } + Direction getDirection() noexcept { + return dir_; + } + protected: void goForward() noexcept; void goBackward() noexcept; diff --git a/cachelib/allocator/datastruct/MultiDList-inl.h b/cachelib/allocator/datastruct/MultiDList-inl.h index e20510d4fc..cd79b600c5 100644 --- a/cachelib/allocator/datastruct/MultiDList-inl.h +++ b/cachelib/allocator/datastruct/MultiDList-inl.h @@ -25,12 +25,26 @@ void MultiDList::Iterator::goForward() noexcept { } // Move iterator forward ++currIter_; - // If we land at the rend of this list, move to the previous list. - while (index_ != kInvalidIndex && - currIter_ == mlist_.lists_[index_]->rend()) { - --index_; - if (index_ != kInvalidIndex) { - currIter_ = mlist_.lists_[index_]->rbegin(); + + if (currIter_.getDirection() == DListIterator::Direction::FROM_HEAD) { + // If we land at the rend of this list, move to the previous list. 
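// Illustrative note, not from the diffs: in the FROM_HEAD branch added here,
// exhausting the current list advances ++index_ to the *next* list and
// resumes at its begin(); the retained wording about moving "to the previous
// list" only describes the reverse (rbegin) branch further below.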
+ while (index_ != kInvalidIndex && index_ != mlist_.lists_.size() && + currIter_ == mlist_.lists_[index_]->end()) { + ++index_; + if (index_ != kInvalidIndex && index_ != mlist_.lists_.size()) { + currIter_ = mlist_.lists_[index_]->begin(); + } else { + return; + } + } + } else { + // If we land at the rend of this list, move to the previous list. + while (index_ != kInvalidIndex && + currIter_ == mlist_.lists_[index_]->rend()) { + --index_; + if (index_ != kInvalidIndex) { + currIter_ = mlist_.lists_[index_]->rbegin(); + } } } } @@ -71,6 +85,25 @@ void MultiDList::Iterator::initToValidRBeginFrom( : mlist_.lists_[index_]->rbegin(); } +template T::*HookPtr> +void MultiDList::Iterator::initToValidBeginFrom( + size_t listIdx) noexcept { + // Find the first non-empty list. + index_ = listIdx; + while (index_ != mlist_.lists_.size() && + mlist_.lists_[index_]->size() == 0) { + ++index_; + } + if (index_ == mlist_.lists_.size()) { + //we reached the end - we should get set to + //invalid index + index_ = std::numeric_limits::max(); + } + currIter_ = index_ == std::numeric_limits::max() + ? mlist_.lists_[0]->begin() + : mlist_.lists_[index_]->begin(); +} + template T::*HookPtr> typename MultiDList::Iterator& MultiDList::Iterator::operator++() noexcept { @@ -97,7 +130,16 @@ typename MultiDList::Iterator MultiDList::rbegin( if (listIdx >= lists_.size()) { throw std::invalid_argument("Invalid list index for MultiDList iterator."); } - return MultiDList::Iterator(*this, listIdx); + return MultiDList::Iterator(*this, listIdx, false); +} + +template T::*HookPtr> +typename MultiDList::Iterator MultiDList::begin( + size_t listIdx) const { + if (listIdx >= lists_.size()) { + throw std::invalid_argument("Invalid list index for MultiDList iterator."); + } + return MultiDList::Iterator(*this, listIdx, true); } template T::*HookPtr> diff --git a/cachelib/allocator/datastruct/MultiDList.h b/cachelib/allocator/datastruct/MultiDList.h index 1a59baa715..bd7be00bd4 100644 --- a/cachelib/allocator/datastruct/MultiDList.h +++ b/cachelib/allocator/datastruct/MultiDList.h @@ -110,14 +110,18 @@ class MultiDList { } explicit Iterator(const MultiDList& mlist, - size_t listIdx) noexcept + size_t listIdx, bool head) noexcept : currIter_(mlist.lists_[mlist.lists_.size() - 1]->rbegin()), mlist_(mlist) { XDCHECK_LT(listIdx, mlist.lists_.size()); - initToValidRBeginFrom(listIdx); + if (head) { + initToValidBeginFrom(listIdx); + } else { + initToValidRBeginFrom(listIdx); + } // We should either point to an element or the end() iterator // which has an invalid index_. - XDCHECK(index_ == kInvalidIndex || currIter_.get() != nullptr); + XDCHECK(index_ == kInvalidIndex || index_ == mlist.lists_.size() || currIter_.get() != nullptr); } virtual ~Iterator() = default; @@ -169,6 +173,9 @@ class MultiDList { // reset iterator to the beginning of a speicific queue void initToValidRBeginFrom(size_t listIdx) noexcept; + + // reset iterator to the head of a specific queue + void initToValidBeginFrom(size_t listIdx) noexcept; // Index of current list size_t index_{0}; @@ -184,6 +191,9 @@ class MultiDList { // provides an iterator starting from the tail of a specific list. Iterator rbegin(size_t idx) const; + + // provides an iterator starting from the head of a specific list. + Iterator begin(size_t idx) const; // Iterator to compare against for the end. 
Iterator rend() const noexcept; diff --git a/cachelib/allocator/tests/MM2QTest.cpp b/cachelib/allocator/tests/MM2QTest.cpp index e11dd95f5a..0e01ffa56f 100644 --- a/cachelib/allocator/tests/MM2QTest.cpp +++ b/cachelib/allocator/tests/MM2QTest.cpp @@ -223,6 +223,19 @@ void MMTypeTest::testIterate(std::vector>& nodes, } } +template +void MMTypeTest::testIterateHot(std::vector>& nodes, + Container& c) { + auto it = nodes.rbegin(); + c.withPromotionIterator([&it,&c](auto &&it2q) { + while (it2q && c.isHot(*it2q)) { + ASSERT_EQ(it2q->getId(), (*it)->getId()); + ++it2q; + ++it; + } + }); +} + template void MMTypeTest::testMatch(std::string expected, MMTypeTest::Container& c) { @@ -238,6 +251,23 @@ void MMTypeTest::testMatch(std::string expected, ASSERT_EQ(expected, actual); } +template +void MMTypeTest::testMatchHot(std::string expected, + MMTypeTest::Container& c) { + int index = -1; + std::string actual; + c.withPromotionIterator([&c,&actual,&index](auto &&it2q) { + while (it2q) { + ++index; + actual += folly::stringPrintf( + "%d:%s, ", it2q->getId(), + (c.isHot(*it2q) ? "H" : (c.isCold(*it2q) ? "C" : "W"))); + ++it2q; + } + }); + ASSERT_EQ(expected, actual); +} + TEST_F(MM2QTest, DetailedTest) { MM2Q::Config config; config.lruRefreshTime = 0; @@ -259,8 +289,11 @@ TEST_F(MM2QTest, DetailedTest) { } testIterate(nodes, c); + testIterateHot(nodes, c); testMatch("0:C, 1:C, 2:C, 3:C, 4:H, 5:H, ", c); + testMatchHot("5:H, 4:H, 3:C, 2:C, 1:C, 0:C, ", c); + // Move 3 to top of the hot cache c.recordAccess(*(nodes[4]), AccessMode::kRead); testMatch("0:C, 1:C, 2:C, 3:C, 5:H, 4:H, ", c); diff --git a/cachelib/allocator/tests/MMTypeTest.h b/cachelib/allocator/tests/MMTypeTest.h index d38f6ce2c1..dbc55677ea 100644 --- a/cachelib/allocator/tests/MMTypeTest.h +++ b/cachelib/allocator/tests/MMTypeTest.h @@ -147,7 +147,9 @@ class MMTypeTest : public testing::Test { void testRecordAccessBasic(Config c); void testSerializationBasic(Config c); void testIterate(std::vector>& nodes, Container& c); + void testIterateHot(std::vector>& nodes, Container& c); void testMatch(std::string expected, Container& c); + void testMatchHot(std::string expected, Container& c); size_t getListSize(const Container& c, typename MMType::LruType list); void verifyIterationVariants(Container& c); }; From 8ab8c754c86d19de4e7cac3c6aa3b57abce16729 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Mon, 6 Feb 2023 16:45:18 -0800 Subject: [PATCH 42/47] CS Patch Part 2 for mulit-tier cachelib: - transparent item movement - multi-tier combined locking with exclusive bit (#38) with refactored incRef to support returning the result of markMoving (fail if already moving or exclusvie bit is set) option. 
- add tests (updated for numa bindings - post combined locking) for transparent item movement --- cachelib/allocator/CacheAllocator-inl.h | 405 +++++++++++++++--- cachelib/allocator/CacheAllocator.h | 145 ++++++- cachelib/allocator/CacheItem-inl.h | 4 +- cachelib/allocator/CacheItem.h | 6 +- cachelib/allocator/Handle.h | 9 +- cachelib/allocator/MMLru-inl.h | 12 + cachelib/allocator/MMLru.h | 5 +- cachelib/allocator/Refcount.h | 57 ++- .../tests/AllocatorMemoryTiersTest.cpp | 2 + .../tests/AllocatorMemoryTiersTest.h | 94 ++++ cachelib/allocator/tests/ItemHandleTest.cpp | 8 + cachelib/allocator/tests/ItemTest.cpp | 4 +- cachelib/allocator/tests/RefCountTest.cpp | 22 +- cachelib/cachebench/util/CacheConfig.h | 2 +- 14 files changed, 669 insertions(+), 106 deletions(-) diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h index 996ec79822..e2be55604a 100644 --- a/cachelib/allocator/CacheAllocator-inl.h +++ b/cachelib/allocator/CacheAllocator-inl.h @@ -85,6 +85,8 @@ CacheAllocator::CacheAllocator( config.chainedItemAccessConfig)), chainedItemLocks_(config_.chainedItemsLockPower, std::make_shared()), + movesMap_(kShards), + moveLock_(kShards), cacheCreationTime_{ type != InitMemType::kMemAttach ? util::getCurrentTimeSec() @@ -542,14 +544,15 @@ void CacheAllocator::addChainedItem(WriteHandle& parent, // Count a new child stats_.numChainedChildItems.inc(); - insertInMMContainer(*child); - // Increment refcount since this chained item is now owned by the parent // Parent will decrement the refcount upon release. Since this is an // internal refcount, we dont include it in active handle tracking. - child->incRef(); + auto ret = child->incRef(true); + XDCHECK(ret == RefcountWithFlags::incResult::incOk); XDCHECK_EQ(2u, child->getRefCount()); + insertInMMContainer(*child); + invalidateNvm(*parent); if (auto eventTracker = getEventTracker()) { eventTracker->record(AllocatorApiEvent::ADD_CHAINED, parent->getKey(), @@ -793,7 +796,8 @@ CacheAllocator::replaceChainedItemLocked(Item& oldItem, // Since this is an internal refcount, we dont include it in active handle // tracking. 
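// Illustrative aside, not from the diffs: the movesMap_/moveLock_ members
// initialized above are sharded by key hash so concurrent cross-tier moves on
// different keys do not contend on a single lock. A minimal standalone sketch
// of that sharding idea; kShardsSketch and the map payload are stand-ins, not
// the real MoveCtx map:
#include <array>
#include <functional>
#include <mutex>
#include <string>
#include <unordered_map>

constexpr size_t kShardsSketch = 8;

struct ShardedWaitMap {
  std::array<std::mutex, kShardsSketch> locks;
  std::array<std::unordered_map<std::string, int>, kShardsSketch> maps;

  size_t shardOf(const std::string& key) const {
    return std::hash<std::string>{}(key) % kShardsSketch;
  }

  void put(const std::string& key, int value) {
    const size_t s = shardOf(key);
    std::lock_guard<std::mutex> g(locks[s]);  // only this shard is locked
    maps[s][key] = value;
  }
};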
- newItemHdl->incRef(); + auto ret = newItemHdl->incRef(true); + XDCHECK(ret == RefcountWithFlags::incResult::incOk); return oldItemHdl; } @@ -959,12 +963,12 @@ CacheAllocator::releaseBackToAllocator(Item& it, } template -bool CacheAllocator::incRef(Item& it) { - if (it.incRef()) { - ++handleCount_.tlStats(); - return true; - } - return false; +RefcountWithFlags::incResult CacheAllocator::incRef(Item& it, bool failIfMoving) { + auto ret = it.incRef(failIfMoving); + if (ret == RefcountWithFlags::incResult::incOk) { + ++handleCount_.tlStats(); + } + return ret; } template @@ -984,11 +988,16 @@ CacheAllocator::acquire(Item* it) { SCOPE_FAIL { stats_.numRefcountOverflow.inc(); }; - if (LIKELY(incRef(*it))) { + auto failIfMoving = getNumTiers() > 1; + auto incRes = incRef(*it, failIfMoving); + if (LIKELY(incRes == RefcountWithFlags::incResult::incOk)) { return WriteHandle{it, *this}; - } else { + } else if (incRes == RefcountWithFlags::incResult::incFailedEviction){ // item is being evicted return WriteHandle{}; + } else { + // item is being moved - wait for completion + return handleWithWaitContextForMovingItem(*it); } } @@ -1031,6 +1040,25 @@ bool CacheAllocator::replaceInMMContainer(Item& oldItem, } } +template +bool CacheAllocator::replaceInMMContainer(Item* oldItem, + Item& newItem) { + return replaceInMMContainer(*oldItem, newItem); +} + +template +bool CacheAllocator::replaceInMMContainer(EvictionIterator& oldItemIt, + Item& newItem) { + auto& oldContainer = getMMContainer(*oldItemIt); + auto& newContainer = getMMContainer(newItem); + + // This function is used for eviction across tiers + XDCHECK(&oldContainer != &newContainer); + oldContainer.remove(oldItemIt); + + return newContainer.add(newItem); +} + template bool CacheAllocator::replaceChainedItemInMMContainer( Item& oldItem, Item& newItem) { @@ -1176,6 +1204,143 @@ CacheAllocator::insertOrReplace(const WriteHandle& handle) { return replaced; } +/* Next two methods are used to asynchronously move Item between memory tiers. + * + * The thread, which moves Item, allocates new Item in the tier we are moving to + * and calls moveRegularItemWithSync() method. This method does the following: + * 1. Update the access container with the new item from the tier we are + * moving to. This Item has moving flag set. + * 2. Copy data from the old Item to the new one. + * + * Concurrent threads which are getting handle to the same key: + * 1. When a handle is created it checks if the moving flag is set + * 2. If so, Handle implementation creates waitContext and adds it to the + * MoveCtx by calling handleWithWaitContextForMovingItem() method. + * 3. Wait until the moving thread will complete its job. 
+ */ +template +typename CacheAllocator::WriteHandle +CacheAllocator::handleWithWaitContextForMovingItem(Item& item) { + auto shard = getShardForKey(item.getKey()); + auto& movesMap = getMoveMapForShard(shard); + { + auto lock = getMoveLockForShard(shard); + + WriteHandle hdl{*this}; + auto waitContext = hdl.getItemWaitContext(); + + auto ret = movesMap.try_emplace(item.getKey(), std::make_unique()); + ret.first->second->addWaiter(std::move(waitContext)); + + return hdl; + } +} + +template +size_t CacheAllocator::wakeUpWaitersLocked(folly::StringPiece key, + WriteHandle&& handle) { + std::unique_ptr ctx; + auto shard = getShardForKey(key); + auto& movesMap = getMoveMapForShard(shard); + { + auto lock = getMoveLockForShard(shard); + movesMap.eraseInto(key, [&](auto &&key, auto &&value) { + ctx = std::move(value); + }); + } + + if (ctx) { + ctx->setItemHandle(std::move(handle)); + return ctx->numWaiters(); + } + + return 0; +} + +template +void CacheAllocator::moveRegularItemWithSync( + Item& oldItem, WriteHandle& newItemHdl) { + XDCHECK(oldItem.isMoving()); + XDCHECK(!oldItem.isExpired()); + // TODO: should we introduce new latency tracker. E.g. evictRegularLatency_ + // ??? util::LatencyTracker tracker{stats_.evictRegularLatency_}; + + XDCHECK_EQ(newItemHdl->getSize(), oldItem.getSize()); + + // take care of the flags before we expose the item to be accessed. this + // will ensure that when another thread removes the item from RAM, we issue + // a delete accordingly. See D7859775 for an example + if (oldItem.isNvmClean()) { + newItemHdl->markNvmClean(); + } + + // mark new item as moving to block readers until the data is copied + // (moveCb is called). Mark item in MMContainer temporarily (TODO: should + // we remove markMoving requirement for the item to be linked?) + newItemHdl->markInMMContainer(); + auto marked = newItemHdl->markMoving(false /* there is already a handle */); + newItemHdl->unmarkInMMContainer(); + XDCHECK(marked); + + auto predicate = [&](const Item& item){ + // we rely on moving flag being set (it should block all readers) + XDCHECK(item.getRefCount() == 0); + return true; + }; + + auto replaced = accessContainer_->replaceIf(oldItem, *newItemHdl, + predicate); + + if (config_.moveCb) { + // Execute the move callback. We cannot make any guarantees about the + // consistency of the old item beyond this point, because the callback can + // do more than a simple memcpy() e.g. update external references. If there + // are any remaining handles to the old item, it is the caller's + // responsibility to invalidate them. The move can only fail after this + // statement if the old item has been removed or replaced, in which case it + // should be fine for it to be left in an inconsistent state. 
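// Illustrative note, not from the diffs: handleWithWaitContextForMovingItem()
// above parks a reader by attaching a wait context to the per-key MoveCtx;
// wakeUpWaitersLocked() later hands every parked reader the handle produced
// by the mover (or an empty handle if the item was dropped), so readers never
// observe a half-copied item.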
+ config_.moveCb(oldItem, *newItemHdl, nullptr); + } else { + std::memcpy(newItemHdl->getMemory(), oldItem.getMemory(), + oldItem.getSize()); + } + + // Adding the item to mmContainer has to succeed since no one can remove the item + auto& newContainer = getMMContainer(*newItemHdl); + auto mmContainerAdded = newContainer.add(*newItemHdl); + XDCHECK(mmContainerAdded); + + // no one can add or remove chained items at this point + if (oldItem.hasChainedItem()) { + // safe to acquire handle for a moving Item + auto incRes = incRef(oldItem, false); + XDCHECK(incRes == RefcountWithFlags::incResult::incOk); + auto oldHandle = WriteHandle{&oldItem,*this}; + XDCHECK_EQ(1u, oldHandle->getRefCount()) << oldHandle->toString(); + XDCHECK(!newItemHdl->hasChainedItem()) << newItemHdl->toString(); + try { + auto l = chainedItemLocks_.lockExclusive(oldItem.getKey()); + transferChainLocked(oldHandle, newItemHdl); + } catch (const std::exception& e) { + // this should never happen because we drained all the handles. + XLOGF(DFATAL, "{}", e.what()); + throw; + } + + XDCHECK(!oldItem.hasChainedItem()); + XDCHECK(newItemHdl->hasChainedItem()); + } + newItemHdl.unmarkNascent(); + auto ref = newItemHdl->unmarkMoving(); + //remove because there is a chance the new item was not + //added to the access container + if (UNLIKELY(ref == 0)) { + const auto res = + releaseBackToAllocator(*newItemHdl, RemoveContext::kNormal, false); + XDCHECK(res == ReleaseRes::kReleased); + } +} + template bool CacheAllocator::moveRegularItem(Item& oldItem, WriteHandle& newItemHdl) { @@ -1207,10 +1372,10 @@ bool CacheAllocator::moveRegularItem(Item& oldItem, config_.moveCb(oldItem, *newItemHdl, nullptr); // Inside the access container's lock, this checks if the old item is - // accessible and its refcount is zero. If the item is not accessible, + // accessible and its refcount is one. If the item is not accessible, // there is no point to replace it since it had already been removed // or in the process of being removed. If the item is in cache but the - // refcount is non-zero, it means user could be attempting to remove + // refcount is non-one, it means user could be attempting to remove // this item through an API such as remove(itemHandle). In this case, // it is unsafe to replace the old item with a new one, so we should // also abort. @@ -1236,13 +1401,12 @@ bool CacheAllocator::moveRegularItem(Item& oldItem, // no one can add or remove chained items at this point if (oldItem.hasChainedItem()) { - // safe to acquire handle for a moving Item - auto oldHandle = acquire(&oldItem); - XDCHECK_EQ(1u, oldHandle->getRefCount()) << oldHandle->toString(); + auto oldItemHdl = acquire(&oldItem); + XDCHECK_EQ(1u, oldItemHdl->getRefCount()) << oldItemHdl->toString(); XDCHECK(!newItemHdl->hasChainedItem()) << newItemHdl->toString(); try { auto l = chainedItemLocks_.lockExclusive(oldItem.getKey()); - transferChainLocked(oldHandle, newItemHdl); + transferChainLocked(oldItemHdl, newItemHdl); } catch (const std::exception& e) { // this should never happen because we drained all the handles. 
XLOGF(DFATAL, "{}", e.what()); @@ -1343,18 +1507,19 @@ template typename CacheAllocator::Item* CacheAllocator::findEviction(TierId tid, PoolId pid, ClassId cid) { auto& mmContainer = getMMContainer(tid, pid, cid); - + bool lastTier = tid+1 >= getNumTiers(); // Keep searching for a candidate until we were able to evict it // or until the search limit has been exhausted unsigned int searchTries = 0; while ((config_.evictionSearchTries == 0 || config_.evictionSearchTries > searchTries)) { + Item* toRecycle = nullptr; Item* candidate = nullptr; typename NvmCacheT::PutToken token; mmContainer.withEvictionIterator([this, pid, cid, &candidate, &toRecycle, - &searchTries, &mmContainer, + &searchTries, &mmContainer, &lastTier, &token](auto&& itr) { if (!itr) { ++searchTries; @@ -1374,15 +1539,20 @@ CacheAllocator::findEviction(TierId tid, PoolId pid, ClassId cid) { ? &toRecycle_->asChainedItem().getParentItem(compressor_) : toRecycle_; - token = createPutToken(*candidate_); + if (lastTier) { + // if it's last tier, the item will be evicted + // need to create put token before marking it exclusive + token = createPutToken(*candidate_); + } - if (shouldWriteToNvmCache(*candidate_) && !token.isValid()) { + if (lastTier && shouldWriteToNvmCache(*candidate_) && !token.isValid()) { stats_.evictFailConcurrentFill.inc(); - } else if (candidate_->markForEviction()) { - XDCHECK(candidate_->isMarkedForEviction()); + } else if ( (lastTier && candidate_->markForEviction()) || + (!lastTier && candidate_->markMoving(true)) ) { + XDCHECK(candidate_->isMoving() || candidate_->isMarkedForEviction()); // markForEviction to make sure no other thead is evicting the item - // nor holding a handle to that item - + // nor holding a handle to that item if this is last tier + // since we won't be moving the item to the next tier toRecycle = toRecycle_; candidate = candidate_; @@ -1415,16 +1585,45 @@ CacheAllocator::findEviction(TierId tid, PoolId pid, ClassId cid) { XDCHECK(toRecycle); XDCHECK(candidate); - // for chained items, the ownership of the parent can change. We try to - // evict what we think as parent and see if the eviction of parent - // recycles the child we intend to. - unlinkItemForEviction(*candidate); - XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); + auto evictedToNext = lastTier ? nullptr + : tryEvictToNextMemoryTier(*candidate); + if (!evictedToNext) { + if (!token.isValid()) { + token = createPutToken(*candidate); + } + // tryEvictToNextMemoryTier should only fail if allocation of the new item fails + // in that case, it should be still possible to mark item as exclusive. + // + // in case that we are on the last tier, we whould have already marked + // as exclusive since we will not be moving the item to the next tier + // but rather just evicting all together, no need to + // markExclusiveWhenMoving + auto ret = lastTier ? 
true : candidate->markForEvictionWhenMoving(); + XDCHECK(ret); + + unlinkItemForEviction(*candidate); + // wake up any readers that wait for the move to complete + // it's safe to do now, as we have the item marked exclusive and + // no other reader can be added to the waiters list + wakeUpWaiters(*candidate, {}); + + if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate)) { + nvmCache_->put(*candidate, std::move(token)); + } + } else { + XDCHECK(!evictedToNext->isMarkedForEviction() && !evictedToNext->isMoving()); + XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); + XDCHECK(!candidate->isAccessible()); + XDCHECK(candidate->getKey() == evictedToNext->getKey()); - if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate)) { - nvmCache_->put(*candidate, std::move(token)); + wakeUpWaiters(*candidate, std::move(evictedToNext)); } + XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); + + // recycle the item. it's safe to do so, even if toReleaseHandle was + // NULL. If `ref` == 0 then it means that we are the last holder of + // that item. if (candidate->hasChainedItem()) { (*stats_.chainedItemEvictions)[pid][cid].inc(); } else { @@ -1496,6 +1695,49 @@ bool CacheAllocator::shouldWriteToNvmCacheExclusive( return true; } +template +typename CacheAllocator::WriteHandle +CacheAllocator::tryEvictToNextMemoryTier( + TierId tid, PoolId pid, Item& item) { + XDCHECK(item.isMoving()); + XDCHECK(item.getRefCount() == 0); + if(item.hasChainedItem()) return WriteHandle{}; // TODO: We do not support ChainedItem yet + if(item.isExpired()) { + accessContainer_->remove(item); + item.unmarkMoving(); + return acquire(&item); + } + + TierId nextTier = tid; // TODO - calculate this based on some admission policy + while (++nextTier < getNumTiers()) { // try to evict down to the next memory tiers + // allocateInternal might trigger another eviction + auto newItemHdl = allocateInternalTier(nextTier, pid, + item.getKey(), + item.getSize(), + item.getCreationTime(), + item.getExpiryTime()); + + if (newItemHdl) { + XDCHECK_EQ(newItemHdl->getSize(), item.getSize()); + moveRegularItemWithSync(item, newItemHdl); + item.unmarkMoving(); + return newItemHdl; + } else { + return WriteHandle{}; + } + } + + return {}; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::tryEvictToNextMemoryTier(Item& item) { + auto tid = getTierId(item); + auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId; + return tryEvictToNextMemoryTier(tid, pid, item); +} + template typename CacheAllocator::RemoveRes CacheAllocator::remove(typename Item::Key key) { @@ -2018,8 +2260,7 @@ std::vector CacheAllocator::dumpEvictionIterator( std::vector content; - size_t i = 0; - while (i < numItems && tid >= 0) { + while (tid >= 0) { auto& mm = *mmContainers_[tid][pid][cid]; mm.withEvictionIterator([&content, numItems](auto&& itr) { while (itr && content.size() < numItems) { @@ -2029,7 +2270,6 @@ std::vector CacheAllocator::dumpEvictionIterator( }); --tid; } - return content; } @@ -2614,6 +2854,14 @@ void CacheAllocator::throttleWith(util::Throttler& t, } } +template +typename RefcountWithFlags::Value CacheAllocator::unmarkMovingAndWakeUpWaiters(Item &item, WriteHandle handle) +{ + auto ret = item.unmarkMoving(); + wakeUpWaiters(item, std::move(handle)); + return ret; +} + template bool CacheAllocator::moveForSlabRelease( const SlabReleaseContext& ctx, Item& oldItem, util::Throttler& throttler) { @@ -2632,7 +2880,8 @@ bool CacheAllocator::moveForSlabRelease( // Nothing to 
move and the key is likely also bogus for chained items. if (oldItem.isOnlyMoving()) { - oldItem.unmarkMoving(); + auto ret = unmarkMovingAndWakeUpWaiters(oldItem, {}); + XDCHECK(ret == 0); const auto res = releaseBackToAllocator(oldItem, RemoveContext::kNormal, false); XDCHECK(res == ReleaseRes::kReleased); @@ -2680,8 +2929,9 @@ bool CacheAllocator::moveForSlabRelease( }); } auto tid = getTierId(oldItem); - auto ref = oldItem.unmarkMoving(); - XDCHECK_EQ(ref, 0); + auto ref = unmarkMovingAndWakeUpWaiters(oldItem, std::move(newItemHdl)); + XDCHECK(ref == 0); + const auto allocInfo = allocator_[tid]->getAllocInfo(oldItem.getMemory()); allocator_[tid]->free(&oldItem); @@ -2731,12 +2981,12 @@ CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { // Set up the destination for the move. Since oldChainedItem would // be marked as moving, it won't be picked for eviction. auto newItemHdl = - allocateChainedItemInternal(parentHandle, oldChainedItem.getSize()); + allocateChainedItemInternal(parentHandle, oldItem.getSize()); if (!newItemHdl) { return {}; } - XDCHECK_EQ(newItemHdl->getSize(), oldChainedItem.getSize()); + XDCHECK_EQ(newItemHdl->getSize(), oldItem.getSize()); auto parentPtr = parentHandle.getInternal(); XDCHECK_EQ(reinterpret_cast(parentPtr), reinterpret_cast( @@ -2799,21 +3049,44 @@ bool CacheAllocator::tryMovingForSlabRelease( } } - return oldItem.isChainedItem() - ? moveChainedItem(oldItem.asChainedItem(), newItemHdl) - : moveRegularItem(oldItem, newItemHdl); + // TODO: we can unify move*Item and move*ItemWithSync by always + // using the moving bit to block readers. + if (getNumTiers() == 1) { + return oldItem.isChainedItem() + ? moveChainedItem(oldItem.asChainedItem(), newItemHdl) + : moveRegularItem(oldItem, newItemHdl); + } else { + if (oldItem.isChainedItem() || oldItem.hasChainedItem()) { + // TODO: add support for chained items + return false; + } else { + moveRegularItemWithSync(oldItem, newItemHdl); + removeFromMMContainer(oldItem); + return true; + } + } +} + +template +void CacheAllocator::wakeUpWaiters(Item& item, WriteHandle handle) +{ + // readers do not block on 'moving' items in case there is only one tier + if (getNumTiers() > 1) { + wakeUpWaitersLocked(item.getKey(), std::move(handle)); + } } template void CacheAllocator::evictForSlabRelease( const SlabReleaseContext& ctx, Item& item, util::Throttler& throttler) { auto startTime = util::getCurrentTimeSec(); + while (true) { XDCHECK(item.isMoving()); stats_.numEvictionAttempts.inc(); if (shutDownInProgress_) { - item.unmarkMoving(); + auto ref = unmarkMovingAndWakeUpWaiters(item, {}); allocator_[getTierId(item)]->abortSlabRelease(ctx); throw exception::SlabReleaseAborted( folly::sformat("Slab Release aborted while trying to evict" @@ -2833,12 +3106,12 @@ void CacheAllocator::evictForSlabRelease( .toString()) : ""); }); - // if the item is already in a state where only the exclusive bit is set, // nothing needs to be done. We simply need to call unmarkMoving and free // the item. 
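// Illustrative note, not from the diffs: in the isOnlyMoving() fast path
// below, wakeUpWaiters() is handed an empty handle on purpose; any reader
// that queued while the item was marked moving then completes with a miss
// instead of waiting on an item that is about to be freed.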
if (item.isOnlyMoving()) { - item.unmarkMoving(); + auto ref = unmarkMovingAndWakeUpWaiters(item, {}); + XDCHECK(ref == 0); const auto res = releaseBackToAllocator(item, RemoveContext::kNormal, false); XDCHECK(ReleaseRes::kReleased == res); @@ -2895,6 +3168,10 @@ void CacheAllocator::evictForSlabRelease( // unmark the child so it will be freed item.unmarkMoving(); unlinkItemForEviction(*evicted); + // wake up any readers that wait for the move to complete + // it's safe to do now, as we have the item marked exclusive and + // no other reader can be added to the waiters list + wakeUpWaiters(*evicted, {}); } else { continue; } @@ -2904,6 +3181,7 @@ void CacheAllocator::evictForSlabRelease( token = createPutToken(*evicted); if (evicted->markForEvictionWhenMoving()) { unlinkItemForEviction(*evicted); + wakeUpWaiters(*evicted, {}); } else { continue; } @@ -2914,7 +3192,7 @@ void CacheAllocator::evictForSlabRelease( } const auto allocInfo = - allocator_[getTierId(item)]->getAllocInfo(static_cast(evicted)); + allocator_[getTierId(*evicted)]->getAllocInfo(static_cast(evicted)); if (evicted->hasChainedItem()) { (*stats_.chainedItemEvictions)[allocInfo.poolId][allocInfo.classId].inc(); } else { @@ -2931,6 +3209,21 @@ void CacheAllocator::evictForSlabRelease( } } +template +template +typename CacheAllocator::WriteHandle +CacheAllocator::removeIf(Item& item, Fn&& predicate) { + auto handle = accessContainer_->removeIf(item, std::forward(predicate)); + + if (handle) { + XDCHECK_EQ(reinterpret_cast(handle.get()), + reinterpret_cast(&item)); + removeFromMMContainer(item); + } + + return handle; +} + template bool CacheAllocator::removeIfExpired(const ReadHandle& handle) { if (!handle) { @@ -2939,14 +3232,7 @@ bool CacheAllocator::removeIfExpired(const ReadHandle& handle) { // We remove the item from both access and mm containers. // We want to make sure the caller is the only one holding the handle. 
- auto removedHandle = - accessContainer_->removeIf(*(handle.getInternal()), itemExpiryPredicate); - if (removedHandle) { - removeFromMMContainer(*(handle.getInternal())); - return true; - } - - return false; + return (bool)removeIf(*(handle.getInternal()), itemExpiryPredicate); } template @@ -2965,15 +3251,14 @@ bool CacheAllocator::markMovingForSlabRelease( // At first, we assume this item was already freed bool itemFreed = true; bool markedMoving = false; - TierId tid = 0; - const auto fn = [&markedMoving, &itemFreed, &tid, this /* TODO - necessary for getTierId */](void* memory) { + TierId tid = getTierId(alloc); + const auto fn = [&markedMoving, &itemFreed](void* memory) { // Since this callback is executed, the item is not yet freed itemFreed = false; Item* item = static_cast(memory); - if (item->markMoving()) { + if (item->markMoving(false)) { markedMoving = true; } - tid = getTierId(*item); }; auto startTime = util::getCurrentTimeSec(); diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index 9fb6728c1b..a845efb645 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include @@ -1319,7 +1321,7 @@ class CacheAllocator : public CacheBase { private: // wrapper around Item's refcount and active handle tracking - FOLLY_ALWAYS_INLINE bool incRef(Item& it); + FOLLY_ALWAYS_INLINE RefcountWithFlags::incResult incRef(Item& it, bool failIfMoving); FOLLY_ALWAYS_INLINE RefcountWithFlags::Value decRef(Item& it); // drops the refcount and if needed, frees the allocation back to the memory @@ -1550,7 +1552,7 @@ class CacheAllocator : public CacheBase { // // @return true If the move was completed, and the containers were updated // successfully. - bool moveRegularItemOnEviction(Item& oldItem, WriteHandle& newItemHdl); + void moveRegularItemWithSync(Item& oldItem, WriteHandle& newItemHdl); // Moves a regular item to a different slab. This should only be used during // slab release after the item's exclusive bit has been set. The user supplied @@ -1637,6 +1639,10 @@ class CacheAllocator : public CacheBase { // false if the item is not in MMContainer bool removeFromMMContainer(Item& item); + using EvictionIterator = typename MMContainer::LockedIterator; + + WriteHandle acquire(EvictionIterator& it) { return acquire(it.get()); } + // Replaces an item in the MMContainer with another item, at the same // position. // @@ -1647,6 +1653,8 @@ class CacheAllocator : public CacheBase { // destination item did not exist in the container, or if the // source item already existed. bool replaceInMMContainer(Item& oldItem, Item& newItem); + bool replaceInMMContainer(Item* oldItem, Item& newItem); + bool replaceInMMContainer(EvictionIterator& oldItemIt, Item& newItem); // Replaces an item in the MMContainer with another item, at the same // position. Or, if the two chained items belong to two different MM @@ -1705,7 +1713,35 @@ class CacheAllocator : public CacheBase { // @return An evicted item or nullptr if there is no suitable candidate. Item* findEviction(TierId tid, PoolId pid, ClassId cid); - using EvictionIterator = typename MMContainer::LockedIterator; + // Try to move the item down to the next memory tier + // + // @param tid current tier ID of the item + // @param pid the pool ID the item belong to. + // @param item the item to evict + // + // @return valid handle to the item. This will be the last + // handle to the item. On failure an empty handle. 
+ WriteHandle tryEvictToNextMemoryTier(TierId tid, PoolId pid, Item& item); + + // Wakes up waiters if there are any + // + // @param item wakes waiters that are waiting on that item + // @param handle handle to pass to the waiters + void wakeUpWaiters(Item& item, WriteHandle handle); + + // Unmarks item as moving and wakes up any waiters waiting on that item + // + // @param item wakes waiters that are waiting on that item + // @param handle handle to pass to the waiters + typename RefcountWithFlags::Value unmarkMovingAndWakeUpWaiters(Item &item, WriteHandle handle); + + // Try to move the item down to the next memory tier + // + // @param item the item to evict + // + // @return valid handle to the item. This will be the last + // handle to the item. On failure an empty handle. + WriteHandle tryEvictToNextMemoryTier(Item& item); // Deserializer CacheAllocatorMetadata and verify the version // @@ -1823,6 +1859,12 @@ class CacheAllocator : public CacheBase { typename NvmCacheT::PutToken createPutToken(Item& item); + // Helper function to remove a item if predicates is true. + // + // @return last handle to the item on success. empty handle on failure. + template + WriteHandle removeIf(Item& item, Fn&& predicate); + // Helper function to remove a item if expired. // // @return true if it item expire and removed successfully. @@ -2008,6 +2050,87 @@ class CacheAllocator : public CacheBase { size_t memoryTierSize(TierId tid) const; + WriteHandle handleWithWaitContextForMovingItem(Item& item); + + size_t wakeUpWaitersLocked(folly::StringPiece key, WriteHandle&& handle); + + class MoveCtx { + public: + MoveCtx() {} + + ~MoveCtx() { + // prevent any further enqueue to waiters + // Note: we don't need to hold locks since no one can enqueue + // after this point. + wakeUpWaiters(); + } + + // record the item handle. Upon destruction we will wake up the waiters + // and pass a clone of the handle to the callBack. By default we pass + // a null handle + void setItemHandle(WriteHandle _it) { it = std::move(_it); } + + // enqueue a waiter into the waiter list + // @param waiter WaitContext + void addWaiter(std::shared_ptr> waiter) { + XDCHECK(waiter); + waiters.push_back(std::move(waiter)); + } + + size_t numWaiters() const { return waiters.size(); } + + private: + // notify all pending waiters that are waiting for the fetch. + void wakeUpWaiters() { + bool refcountOverflowed = false; + for (auto& w : waiters) { + // If refcount overflowed earlier, then we will return miss to + // all subsequent waitors. + if (refcountOverflowed) { + w->set(WriteHandle{}); + continue; + } + + try { + w->set(it.clone()); + } catch (const exception::RefcountOverflow&) { + // We'll return a miss to the user's pending read, + // so we should enqueue a delete via NvmCache. 
+ // TODO: cache.remove(it); + refcountOverflowed = true; + } + } + } + + WriteHandle it; // will be set when Context is being filled + std::vector>> waiters; // list of + // waiters + }; + using MoveMap = + folly::F14ValueMap, + folly::HeterogeneousAccessHash>; + + static size_t getShardForKey(folly::StringPiece key) { + return folly::Hash()(key) % kShards; + } + + MoveMap& getMoveMapForShard(size_t shard) { + return movesMap_[shard].movesMap_; + } + + MoveMap& getMoveMap(folly::StringPiece key) { + return getMoveMapForShard(getShardForKey(key)); + } + + std::unique_lock getMoveLockForShard(size_t shard) { + return std::unique_lock(moveLock_[shard].moveLock_); + } + + std::unique_lock getMoveLock(folly::StringPiece key) { + return getMoveLockForShard(getShardForKey(key)); + } + // Whether the memory allocator for this cache allocator was created on shared // memory. The hash table, chained item hash table etc is also created on // shared memory except for temporary shared memory mode when they're created @@ -2100,6 +2223,22 @@ class CacheAllocator : public CacheBase { // poolResizer_, poolOptimizer_, memMonitor_, reaper_ mutable std::mutex workersMutex_; + static constexpr size_t kShards = 8192; // TODO: need to define right value + + struct MovesMapShard { + alignas(folly::hardware_destructive_interference_size) MoveMap movesMap_; + }; + + struct MoveLock { + alignas(folly::hardware_destructive_interference_size) std::mutex moveLock_; + }; + + // a map of all pending moves + std::vector movesMap_; + + // a map of move locks for each shard + std::vector moveLock_; + // time when the ram cache was first created const uint32_t cacheCreationTime_{0}; diff --git a/cachelib/allocator/CacheItem-inl.h b/cachelib/allocator/CacheItem-inl.h index 0028e2776a..b33c1ea28a 100644 --- a/cachelib/allocator/CacheItem-inl.h +++ b/cachelib/allocator/CacheItem-inl.h @@ -238,8 +238,8 @@ bool CacheItem::markForEvictionWhenMoving() { } template -bool CacheItem::markMoving() { - return ref_.markMoving(); +bool CacheItem::markMoving(bool failIfRefNotZero) { + return ref_.markMoving(failIfRefNotZero); } template diff --git a/cachelib/allocator/CacheItem.h b/cachelib/allocator/CacheItem.h index afee315cbb..b4fa339b57 100644 --- a/cachelib/allocator/CacheItem.h +++ b/cachelib/allocator/CacheItem.h @@ -309,9 +309,9 @@ class CACHELIB_PACKED_ATTR CacheItem { // // @return true on success, failure if item is marked as exclusive // @throw exception::RefcountOverflow on ref count overflow - FOLLY_ALWAYS_INLINE bool incRef() { + FOLLY_ALWAYS_INLINE RefcountWithFlags::incResult incRef(bool failIfMoving) { try { - return ref_.incRef(); + return ref_.incRef(failIfMoving); } catch (exception::RefcountOverflow& e) { throw exception::RefcountOverflow( folly::sformat("{} item: {}", e.what(), toString())); @@ -378,7 +378,7 @@ class CACHELIB_PACKED_ATTR CacheItem { * Unmarking moving will also return the refcount at the moment of * unmarking. 
*/ - bool markMoving(); + bool markMoving(bool failIfRefNotZero); RefcountWithFlags::Value unmarkMoving() noexcept; bool isMoving() const noexcept; bool isOnlyMoving() const noexcept; diff --git a/cachelib/allocator/Handle.h b/cachelib/allocator/Handle.h index 11d2bed2be..06c21bffe4 100644 --- a/cachelib/allocator/Handle.h +++ b/cachelib/allocator/Handle.h @@ -400,6 +400,12 @@ struct ReadHandleImpl { } } + protected: + friend class ReadHandleImpl; + // Method used only by ReadHandleImpl ctor + void discard() { + it_.store(nullptr, std::memory_order_relaxed); + } private: // we are waiting on Item* to be set to a value. One of the valid values is // nullptr. So choose something that we dont expect to indicate a ptr @@ -479,7 +485,8 @@ struct ReadHandleImpl { // Handle which has the item already FOLLY_ALWAYS_INLINE ReadHandleImpl(Item* it, CacheT& alloc) noexcept - : alloc_(&alloc), it_(it) {} + : alloc_(&alloc), it_(it) { + } // handle that has a wait context allocated. Used for async handles // In this case, the it_ will be filled in asynchronously and mulitple diff --git a/cachelib/allocator/MMLru-inl.h b/cachelib/allocator/MMLru-inl.h index d35759f212..842d87ddb8 100644 --- a/cachelib/allocator/MMLru-inl.h +++ b/cachelib/allocator/MMLru-inl.h @@ -229,6 +229,18 @@ void MMLru::Container::withEvictionIterator(F&& fun) { } } +//template T::*HookPtr> +//template +//void +//MMLru::Container::withPromotionIterator(F&& fun) { +// if (config_.useCombinedLockForIterators) { +// lruMutex_->lock_combine([this, &fun]() { fun(Iterator{lru_.begin()}); }); +// } else { +// LockHolder lck{*lruMutex_}; +// fun(Iterator{lru_.begin()}); +// } +//} + template T::*HookPtr> void MMLru::Container::ensureNotInsertionPoint(T& node) noexcept { // If we are removing the insertion point node, grow tail before we remove diff --git a/cachelib/allocator/MMLru.h b/cachelib/allocator/MMLru.h index 29c6d02689..645b8f0e86 100644 --- a/cachelib/allocator/MMLru.h +++ b/cachelib/allocator/MMLru.h @@ -230,12 +230,13 @@ class MMLru { // lruInsertionPointSpec = 2, we insert at a point 1/4th from tail uint8_t lruInsertionPointSpec{0}; + // Whether to use combined locking for withEvictionIterator. + bool useCombinedLockForIterators{true}; + // Minimum interval between reconfigurations. If 0, reconfigure is never // called. std::chrono::seconds mmReconfigureIntervalSecs{}; - // Whether to use combined locking for withEvictionIterator. - bool useCombinedLockForIterators{false}; }; // The container object which can be used to keep track of objects of type diff --git a/cachelib/allocator/Refcount.h b/cachelib/allocator/Refcount.h index 44a3facd3a..8251ef15ba 100644 --- a/cachelib/allocator/Refcount.h +++ b/cachelib/allocator/Refcount.h @@ -130,30 +130,41 @@ class FOLLY_PACK_ATTR RefcountWithFlags { RefcountWithFlags& operator=(const RefcountWithFlags&) = delete; RefcountWithFlags(RefcountWithFlags&&) = delete; RefcountWithFlags& operator=(RefcountWithFlags&&) = delete; - + enum incResult { + incOk, + incFailedMoving, + incFailedEviction + }; // Bumps up the reference count only if the new count will be strictly less // than or equal to the maxCount and the item is not exclusive // @return true if refcount is bumped. 
false otherwise (if item is exclusive) // @throw exception::RefcountOverflow if new count would be greater than // maxCount - FOLLY_ALWAYS_INLINE bool incRef() { - auto predicate = [](const Value curValue) { - Value bitMask = getAdminRef(); - - const bool exlusiveBitIsSet = curValue & bitMask; - if (UNLIKELY((curValue & kAccessRefMask) == (kAccessRefMask))) { - throw exception::RefcountOverflow("Refcount maxed out."); - } - - // Check if the item is not marked for eviction - return !exlusiveBitIsSet || ((curValue & kAccessRefMask) != 0); - }; - - auto newValue = [](const Value curValue) { - return (curValue + static_cast(1)); - }; - - return atomicUpdateValue(predicate, newValue); + FOLLY_ALWAYS_INLINE incResult incRef(bool failIfMoving) { + incResult res = incOk; + auto predicate = [failIfMoving, &res](const Value curValue) { + Value bitMask = getAdminRef(); + + const bool exlusiveBitIsSet = curValue & bitMask; + if (UNLIKELY((curValue & kAccessRefMask) == (kAccessRefMask))) { + throw exception::RefcountOverflow("Refcount maxed out."); + } else if (exlusiveBitIsSet && (curValue & kAccessRefMask) == 0) { + res = incFailedEviction; + return false; + } else if (exlusiveBitIsSet && failIfMoving) { + res = incFailedMoving; + return false; + } + res = incOk; + return true; + }; + + auto newValue = [](const Value curValue) { + return (curValue + static_cast(1)); + }; + + atomicUpdateValue(predicate, newValue); + return res; } // Bumps down the reference count @@ -309,12 +320,14 @@ class FOLLY_PACK_ATTR RefcountWithFlags { * * Unmarking moving does not depend on `isInMMContainer` */ - bool markMoving() { - auto predicate = [](const Value curValue) { + bool markMoving(bool failIfRefNotZero) { + auto predicate = [failIfRefNotZero](const Value curValue) { Value conditionBitMask = getAdminRef(); const bool flagSet = curValue & conditionBitMask; const bool alreadyExclusive = curValue & getAdminRef(); - + if (failIfRefNotZero && (curValue & kAccessRefMask) != 0) { + return false; + } if (!flagSet || alreadyExclusive) { return false; } diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp index 90ef34be41..a8fb952b71 100644 --- a/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp +++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp @@ -26,6 +26,8 @@ using LruAllocatorMemoryTiersTest = AllocatorMemoryTiersTest; TEST_F(LruAllocatorMemoryTiersTest, MultiTiersInvalid) { this->testMultiTiersInvalid(); } TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValid) { this->testMultiTiersValid(); } TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValidMixed) { this->testMultiTiersValidMixed(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersRemoveDuringEviction) { this->testMultiTiersRemoveDuringEviction(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersReplaceDuringEviction) { this->testMultiTiersReplaceDuringEviction(); } } // end of namespace tests } // end of namespace cachelib diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h index 682cbb7c80..244a56ed83 100644 --- a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h +++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h @@ -20,12 +20,42 @@ #include "cachelib/allocator/MemoryTierCacheConfig.h" #include "cachelib/allocator/tests/TestBase.h" +#include + namespace facebook { namespace cachelib { namespace tests { template class AllocatorMemoryTiersTest : public AllocatorTest { + private: + template + void 
testMultiTiersAsyncOpDuringMove(std::unique_ptr& alloc, + PoolId& pool, bool& quit, MvCallback&& moveCb) { + typename AllocatorT::Config config; + config.setCacheSize(4 * Slab::kSize); + config.enableCachePersistence("/tmp"); + config.configureMemoryTiers({ + MemoryTierCacheConfig::fromShm() + .setRatio(1).setMemBind(std::string("0")), + MemoryTierCacheConfig::fromShm() + .setRatio(1).setMemBind(std::string("0")) + }); + + config.enableMovingOnSlabRelease(moveCb, {} /* ChainedItemsMoveSync */, + -1 /* movingAttemptsLimit */); + + alloc = std::make_unique(AllocatorT::SharedMemNew, config); + ASSERT(alloc != nullptr); + pool = alloc->addPool("default", alloc->getCacheMemoryStats().ramCacheSize); + + int i = 0; + while(!quit) { + auto handle = alloc->allocate(pool, std::to_string(++i), std::string("value").size()); + ASSERT(handle != nullptr); + ASSERT_NO_THROW(alloc->insertOrReplace(handle)); + } + } public: void testMultiTiersInvalid() { typename AllocatorT::Config config; @@ -74,6 +104,70 @@ class AllocatorMemoryTiersTest : public AllocatorTest { ASSERT(handle != nullptr); ASSERT_NO_THROW(alloc->insertOrReplace(handle)); } + + void testMultiTiersRemoveDuringEviction() { + std::unique_ptr alloc; + PoolId pool; + std::unique_ptr t; + folly::Latch latch(1); + bool quit = false; + + auto moveCb = [&] (typename AllocatorT::Item& oldItem, + typename AllocatorT::Item& newItem, + typename AllocatorT::Item* /* parentPtr */) { + + auto key = oldItem.getKey(); + t = std::make_unique([&](){ + // remove() function is blocked by wait context + // till item is moved to next tier. So that, we should + // notify latch before calling remove() + latch.count_down(); + alloc->remove(key); + }); + // wait till async thread is running + latch.wait(); + memcpy(newItem.getMemory(), oldItem.getMemory(), oldItem.getSize()); + quit = true; + }; + + testMultiTiersAsyncOpDuringMove(alloc, pool, quit, moveCb); + + t->join(); + } + + void testMultiTiersReplaceDuringEviction() { + std::unique_ptr alloc; + PoolId pool; + std::unique_ptr t; + folly::Latch latch(1); + bool quit = false; + + auto moveCb = [&] (typename AllocatorT::Item& oldItem, + typename AllocatorT::Item& newItem, + typename AllocatorT::Item* /* parentPtr */) { + auto key = oldItem.getKey(); + if(!quit) { + // we need to replace only once because subsequent allocate calls + // will cause evictions recursevly + quit = true; + t = std::make_unique([&](){ + auto handle = alloc->allocate(pool, key, std::string("new value").size()); + // insertOrReplace() function is blocked by wait context + // till item is moved to next tier. 
So that, we should + // notify latch before calling insertOrReplace() + latch.count_down(); + ASSERT_NO_THROW(alloc->insertOrReplace(handle)); + }); + // wait till async thread is running + latch.wait(); + } + memcpy(newItem.getMemory(), oldItem.getMemory(), oldItem.getSize()); + }; + + testMultiTiersAsyncOpDuringMove(alloc, pool, quit, moveCb); + + t->join(); + } }; } // namespace tests } // namespace cachelib diff --git a/cachelib/allocator/tests/ItemHandleTest.cpp b/cachelib/allocator/tests/ItemHandleTest.cpp index d992a84011..5213166816 100644 --- a/cachelib/allocator/tests/ItemHandleTest.cpp +++ b/cachelib/allocator/tests/ItemHandleTest.cpp @@ -39,6 +39,8 @@ struct TestItem { using ChainedItem = int; void reset() {} + + folly::StringPiece getKey() const { return folly::StringPiece(); } }; struct TestNvmCache; @@ -80,6 +82,12 @@ struct TestAllocator { void adjustHandleCountForThread_private(int i) { tlRef_.tlStats() += i; } + bool addWaitContextForMovingItem( + folly::StringPiece key, + std::shared_ptr> waiter) { + return false; + } + util::FastStats tlRef_; }; } // namespace diff --git a/cachelib/allocator/tests/ItemTest.cpp b/cachelib/allocator/tests/ItemTest.cpp index 70dd1277fe..2f25cc07f0 100644 --- a/cachelib/allocator/tests/ItemTest.cpp +++ b/cachelib/allocator/tests/ItemTest.cpp @@ -82,8 +82,10 @@ TEST(ItemTest, ExpiryTime) { EXPECT_TRUE(result); EXPECT_EQ(tenMins, item->getConfiguredTTL()); + // So that exclusive bit will be set + item->markAccessible(); // Test that writes fail while the item is moving - result = item->markMoving(); + result = item->markMoving(true); EXPECT_TRUE(result); result = item->updateExpiryTime(0); EXPECT_FALSE(result); diff --git a/cachelib/allocator/tests/RefCountTest.cpp b/cachelib/allocator/tests/RefCountTest.cpp index d05be08c31..862271b03d 100644 --- a/cachelib/allocator/tests/RefCountTest.cpp +++ b/cachelib/allocator/tests/RefCountTest.cpp @@ -52,7 +52,7 @@ void RefCountTest::testMultiThreaded() { nLocalRef--; ref.markAccessible(); } else { - ref.incRef(); + ref.incRef(true); nLocalRef++; ref.unmarkAccessible(); } @@ -101,12 +101,12 @@ void RefCountTest::testBasic() { ASSERT_FALSE(ref.template isFlagSet()); for (uint32_t i = 0; i < RefcountWithFlags::kAccessRefMask; i++) { - ASSERT_TRUE(ref.incRef()); + ASSERT_EQ(ref.incRef(true),RefcountWithFlags::incOk); } // Incrementing past the max will fail auto rawRef = ref.getRaw(); - ASSERT_THROW(ref.incRef(), std::overflow_error); + ASSERT_THROW(ref.incRef(true), std::overflow_error); ASSERT_EQ(rawRef, ref.getRaw()); // Bumping up access ref shouldn't affect admin ref and flags @@ -152,11 +152,11 @@ void RefCountTest::testBasic() { ASSERT_FALSE(ref.template isFlagSet()); // conditionally set flags - ASSERT_FALSE((ref.markMoving())); + ASSERT_FALSE(ref.markMoving(true)); ref.markInMMContainer(); // only first one succeeds - ASSERT_TRUE((ref.markMoving())); - ASSERT_FALSE((ref.markMoving())); + ASSERT_TRUE(ref.markMoving(true)); + ASSERT_FALSE(ref.markMoving(true)); ref.unmarkInMMContainer(); ref.template setFlag(); @@ -202,7 +202,7 @@ void RefCountTest::testMarkForEvictionAndMoving() { ref.markInMMContainer(); ref.markAccessible(); - ASSERT_TRUE(ref.markMoving()); + ASSERT_TRUE(ref.markMoving(true)); ASSERT_FALSE(ref.markForEviction()); ref.unmarkInMMContainer(); @@ -218,7 +218,7 @@ void RefCountTest::testMarkForEvictionAndMoving() { ref.markAccessible(); ASSERT_TRUE(ref.markForEviction()); - ASSERT_FALSE(ref.markMoving()); + ASSERT_FALSE(ref.markMoving(true)); ref.unmarkInMMContainer(); 
ref.unmarkAccessible(); @@ -232,9 +232,9 @@ void RefCountTest::testMarkForEvictionAndMoving() { ref.markInMMContainer(); ref.markAccessible(); - ref.incRef(); + ref.incRef(true); - ASSERT_TRUE(ref.markMoving()); + ASSERT_TRUE(ref.markMoving(false)); ref.unmarkInMMContainer(); ref.unmarkAccessible(); @@ -248,7 +248,7 @@ void RefCountTest::testMarkForEvictionAndMoving() { ref.markInMMContainer(); ref.markAccessible(); - ref.incRef(); + ref.incRef(true); ASSERT_FALSE(ref.markForEviction()); } } diff --git a/cachelib/cachebench/util/CacheConfig.h b/cachelib/cachebench/util/CacheConfig.h index a1b8f52011..d86ef1f620 100644 --- a/cachelib/cachebench/util/CacheConfig.h +++ b/cachelib/cachebench/util/CacheConfig.h @@ -92,7 +92,7 @@ struct CacheConfig : public JSONConfig { bool lruUpdateOnWrite{false}; bool lruUpdateOnRead{true}; bool tryLockUpdate{false}; - bool useCombinedLockForIterators{false}; + bool useCombinedLockForIterators{true}; // LRU param uint64_t lruIpSpec{0}; From 21526393d1096025b49a5194fa21de12822dbf80 Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Thu, 30 Dec 2021 17:18:29 -0500 Subject: [PATCH 43/47] basic multi-tier test based on numa bindings --- .../allocator/tests/AllocatorTypeTest.cpp | 1 + cachelib/allocator/tests/BaseAllocatorTest.h | 80 +++++++++++++++++++ 2 files changed, 81 insertions(+) diff --git a/cachelib/allocator/tests/AllocatorTypeTest.cpp b/cachelib/allocator/tests/AllocatorTypeTest.cpp index 1e98af29f2..a572485e8d 100644 --- a/cachelib/allocator/tests/AllocatorTypeTest.cpp +++ b/cachelib/allocator/tests/AllocatorTypeTest.cpp @@ -409,6 +409,7 @@ TYPED_TEST(BaseAllocatorTest, RateMap) { this->testRateMap(); } TYPED_TEST(BaseAllocatorTest, StatSnapshotTest) { this->testStatSnapshotTest(); } +TYPED_TEST(BaseAllocatorTest, BasicMultiTier) {this->testBasicMultiTier(); } namespace { // the tests that cannot be done by TYPED_TEST. 
diff --git a/cachelib/allocator/tests/BaseAllocatorTest.h b/cachelib/allocator/tests/BaseAllocatorTest.h index 13cd4fbf29..5733798e98 100644 --- a/cachelib/allocator/tests/BaseAllocatorTest.h +++ b/cachelib/allocator/tests/BaseAllocatorTest.h @@ -6295,6 +6295,86 @@ class BaseAllocatorTest : public AllocatorTest { }); EXPECT_EQ(intervalNameExists, 4); } + + void testSingleTierMemoryAllocatorSize() { + typename AllocatorT::Config config; + static constexpr size_t cacheSize = 100 * 1024 * 1024; /* 100 MB */ + config.setCacheSize(cacheSize); + config.enableCachePersistence(folly::sformat("/tmp/single-tier-test/{}", ::getpid())); + + AllocatorT alloc(AllocatorT::SharedMemNew, config); + + EXPECT_LE(alloc.allocator_[0]->getMemorySize(), cacheSize); + } + + void testSingleTierMemoryAllocatorSizeAnonymous() { + typename AllocatorT::Config config; + static constexpr size_t cacheSize = 100 * 1024 * 1024; /* 100 MB */ + config.setCacheSize(cacheSize); + + AllocatorT alloc(config); + + EXPECT_LE(alloc.allocator_[0]->getMemorySize(), cacheSize); + } + + void testBasicMultiTier() { + using Item = typename AllocatorT::Item; + const static std::string data = "data"; + + std::set movedKeys; + auto moveCb = [&](const Item& oldItem, Item& newItem, Item* /* parentPtr */) { + std::memcpy(newItem.getMemory(), oldItem.getMemory(), oldItem.getSize()); + movedKeys.insert(oldItem.getKey().str()); + }; + + typename AllocatorT::Config config; + static constexpr size_t cacheSize = 100 * 1024 * 1024; /* 100 MB */ + config.setCacheSize(100 * 1024 * 1024); /* 100 MB */ + config.enableCachePersistence(folly::sformat("/tmp/multi-tier-test/{}", ::getpid())); + config.configureMemoryTiers({ + MemoryTierCacheConfig::fromShm().setRatio(1) + .setMemBind(std::string("0")), + MemoryTierCacheConfig::fromShm().setRatio(1) + .setMemBind(std::string("0")), + }); + config.enableMovingOnSlabRelease(moveCb); + + AllocatorT alloc(AllocatorT::SharedMemNew, config); + + EXPECT_EQ(alloc.allocator_.size(), 2); + EXPECT_LE(alloc.allocator_[0]->getMemorySize(), cacheSize / 2); + EXPECT_LE(alloc.allocator_[1]->getMemorySize(), cacheSize / 2); + + const size_t numBytes = alloc.getCacheMemoryStats().ramCacheSize; + auto pid = alloc.addPool("default", numBytes); + + static constexpr size_t numOps = cacheSize / 1024; + for (int i = 0; i < numOps; i++) { + std::string key = std::to_string(i); + auto h = alloc.allocate(pid, key, 1024); + EXPECT_TRUE(h); + + std::memcpy(h->getMemory(), data.data(), data.size()); + + alloc.insertOrReplace(h); + } + + EXPECT_TRUE(movedKeys.size() > 0); + + size_t movedButStillInMemory = 0; + for (const auto &k : movedKeys) { + auto h = alloc.find(k); + + if (h) { + movedButStillInMemory++; + /* All moved elements should be in the second tier. 
*/ + EXPECT_TRUE(alloc.allocator_[1]->isMemoryInAllocator(h->getMemory())); + EXPECT_EQ(data, std::string((char*)h->getMemory(), data.size())); + } + } + + EXPECT_TRUE(movedButStillInMemory > 0); + } }; } // namespace tests } // namespace cachelib From e30310470e55efeb9b05c9f5ea8f17be16de2fac Mon Sep 17 00:00:00 2001 From: Sergei Vinogradov Date: Thu, 27 Jan 2022 05:27:20 -0800 Subject: [PATCH 44/47] Aadding new configs to hit_ratio/graph_cache_leader_fobj -updated configs for numa bindings --- .../config-4GB-DRAM-4GB-PMEM.json | 42 +++++++++++++++++++ .../config-8GB-DRAM.json | 32 ++++++++++++++ .../config-8GB-PMEM.json | 38 +++++++++++++++++ .../test_configs/simple_tiers_test.json | 12 ++++-- 4 files changed, 120 insertions(+), 4 deletions(-) create mode 100644 cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-4GB-DRAM-4GB-PMEM.json create mode 100644 cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-DRAM.json create mode 100644 cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-PMEM.json diff --git a/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-4GB-DRAM-4GB-PMEM.json b/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-4GB-DRAM-4GB-PMEM.json new file mode 100644 index 0000000000..d9acdf7c6c --- /dev/null +++ b/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-4GB-DRAM-4GB-PMEM.json @@ -0,0 +1,42 @@ +{ + "cache_config": { + "cacheSizeMB": 8192, + "poolRebalanceIntervalSec": 0, + "cacheDir": "/tmp/mem-tiers", + "memoryTiers" : [ + { + "ratio": 1, + "memBindNodes": 0 + }, + { + "ratio": 1, + "memBindNodes": 0 + } + ] + }, + "test_config": + { + "addChainedRatio": 0.0, + "delRatio": 0.0, + "enableLookaside": true, + "getRatio": 0.7684563460126871, + "keySizeRange": [ + 1, + 8, + 64 + ], + "keySizeRangeProbability": [ + 0.3, + 0.7 + ], + "loneGetRatio": 0.2315436539873129, + "numKeys": 71605574, + "numOps": 5000000, + "numThreads": 24, + "popDistFile": "pop.json", + + "setRatio": 0.0, + "valSizeDistFile": "sizes.json" + } + +} diff --git a/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-DRAM.json b/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-DRAM.json new file mode 100644 index 0000000000..6d47e08b74 --- /dev/null +++ b/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-DRAM.json @@ -0,0 +1,32 @@ +{ + "cache_config": { + "cacheSizeMB": 8192, + "poolRebalanceIntervalSec": 0, + "cacheDir": "/tmp/mem-tier" + }, + "test_config": + { + "addChainedRatio": 0.0, + "delRatio": 0.0, + "enableLookaside": true, + "getRatio": 0.7684563460126871, + "keySizeRange": [ + 1, + 8, + 64 + ], + "keySizeRangeProbability": [ + 0.3, + 0.7 + ], + "loneGetRatio": 0.2315436539873129, + "numKeys": 71605574, + "numOps": 5000000, + "numThreads": 24, + "popDistFile": "pop.json", + + "setRatio": 0.0, + "valSizeDistFile": "sizes.json" + } + +} diff --git a/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-PMEM.json b/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-PMEM.json new file mode 100644 index 0000000000..4feab55154 --- /dev/null +++ b/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-PMEM.json @@ -0,0 +1,38 @@ +{ + "cache_config": { + "cacheSizeMB": 8192, + "poolRebalanceIntervalSec": 0, + "cacheDir": "/tmp/mem-tier", + "memoryTiers" : [ + { + "ratio": 1, + "memBindNodes": 0 + } 
+ ] + }, + "test_config": + { + "addChainedRatio": 0.0, + "delRatio": 0.0, + "enableLookaside": true, + "getRatio": 0.7684563460126871, + "keySizeRange": [ + 1, + 8, + 64 + ], + "keySizeRangeProbability": [ + 0.3, + 0.7 + ], + "loneGetRatio": 0.2315436539873129, + "numKeys": 71605574, + "numOps": 5000000, + "numThreads": 24, + "popDistFile": "pop.json", + + "setRatio": 0.0, + "valSizeDistFile": "sizes.json" + } + +} diff --git a/cachelib/cachebench/test_configs/simple_tiers_test.json b/cachelib/cachebench/test_configs/simple_tiers_test.json index 182bb514cb..58302b9f20 100644 --- a/cachelib/cachebench/test_configs/simple_tiers_test.json +++ b/cachelib/cachebench/test_configs/simple_tiers_test.json @@ -1,14 +1,18 @@ // @nolint instantiates a small cache and runs a quick run of basic operations. { "cache_config" : { - "cacheSizeMB" : 512, - "usePosixShm" : false, + "cacheSizeMB" : 1024, "cacheDir" : "/tmp/mem-tiers", "memoryTiers" : [ + { + "ratio": 1, + "memBindNodes": "0" + }, { "ratio": 1, "memBindNodes": "0" } + ], "poolRebalanceIntervalSec" : 1, "moveOnSlabRelease" : false, @@ -19,7 +23,7 @@ "test_config" : { "numOps" : 100000, "numThreads" : 32, - "numKeys" : 1000000, + "numKeys" : 2000000, "keySizeRange" : [1, 8, 64], "keySizeRangeProbability" : [0.3, 0.7], @@ -33,4 +37,4 @@ "keyPoolDistribution": [0.4, 0.6], "opPoolDistribution" : [0.5, 0.5] } - } \ No newline at end of file + } From 1e40a00e58c9265fe960fad804f3c6f20d01aa3f Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Mon, 19 Dec 2022 08:43:35 -0800 Subject: [PATCH 45/47] Do not block reader if a child item is moving This would lead to deadlock (.e.g in forEachChainedItem) if the child is moving (e.g. marked by Slab Release thread). Instead treat moving bit only to prevent freeing the item and do all synchronization on parent. --- cachelib/allocator/CacheAllocator-inl.h | 93 +++++++++++++++---------- 1 file changed, 58 insertions(+), 35 deletions(-) diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h index e2be55604a..74bb6a588f 100644 --- a/cachelib/allocator/CacheAllocator-inl.h +++ b/cachelib/allocator/CacheAllocator-inl.h @@ -988,7 +988,8 @@ CacheAllocator::acquire(Item* it) { SCOPE_FAIL { stats_.numRefcountOverflow.inc(); }; - auto failIfMoving = getNumTiers() > 1; + // TODO: do not block incRef for child items to avoid deadlock + auto failIfMoving = getNumTiers() > 1 && !it->isChainedItem(); auto incRes = incRef(*it, failIfMoving); if (LIKELY(incRes == RefcountWithFlags::incResult::incOk)) { return WriteHandle{it, *this}; @@ -3024,7 +3025,8 @@ bool CacheAllocator::tryMovingForSlabRelease( // a regular item or chained item is synchronized with any potential // user-side mutation. std::unique_ptr syncObj; - if (config_.movingSync) { + if (config_.movingSync && getNumTiers() == 1) { + // TODO: use moving-bit synchronization for single tier as well if (!oldItem.isChainedItem()) { syncObj = config_.movingSync(oldItem.getKey()); } else { @@ -3122,47 +3124,51 @@ void CacheAllocator::evictForSlabRelease( Item* evicted; if (item.isChainedItem()) { auto& expectedParent = item.asChainedItem().getParentItem(compressor_); - const std::string parentKey = expectedParent.getKey().str(); - auto l = chainedItemLocks_.lockExclusive(parentKey); - - // check if the child is still in mmContainer and the expected parent is - // valid under the chained item lock. 
- if (expectedParent.getKey() != parentKey || !item.isInMMContainer() || - item.isOnlyMoving() || - &expectedParent != &item.asChainedItem().getParentItem(compressor_) || - !expectedParent.isAccessible() || !expectedParent.hasChainedItem()) { - continue; - } - // search if the child is present in the chain - { - auto parentHandle = findInternal(parentKey); - if (!parentHandle || parentHandle != &expectedParent) { + if (getNumTiers() == 1) { + // TODO: unify this with multi-tier implementation + // right now, taking a chained item lock here would lead to deadlock + const std::string parentKey = expectedParent.getKey().str(); + auto l = chainedItemLocks_.lockExclusive(parentKey); + + // check if the child is still in mmContainer and the expected parent is + // valid under the chained item lock. + if (expectedParent.getKey() != parentKey || !item.isInMMContainer() || + item.isOnlyMoving() || + &expectedParent != &item.asChainedItem().getParentItem(compressor_) || + !expectedParent.isAccessible() || !expectedParent.hasChainedItem()) { continue; } - ChainedItem* head = nullptr; - { // scope for the handle - auto headHandle = findChainedItem(expectedParent); - head = headHandle ? &headHandle->asChainedItem() : nullptr; - } + // search if the child is present in the chain + { + auto parentHandle = findInternal(parentKey); + if (!parentHandle || parentHandle != &expectedParent) { + continue; + } - bool found = false; - while (head) { - if (head == &item) { - found = true; - break; + ChainedItem* head = nullptr; + { // scope for the handle + auto headHandle = findChainedItem(expectedParent); + head = headHandle ? &headHandle->asChainedItem() : nullptr; } - head = head->getNext(compressor_); - } - if (!found) { - continue; + bool found = false; + while (head) { + if (head == &item) { + found = true; + break; + } + head = head->getNext(compressor_); + } + + if (!found) { + continue; + } } } evicted = &expectedParent; - token = createPutToken(*evicted); if (evicted->markForEviction()) { // unmark the child so it will be freed @@ -3173,6 +3179,9 @@ void CacheAllocator::evictForSlabRelease( // no other reader can be added to the waiters list wakeUpWaiters(*evicted, {}); } else { + // TODO: potential deadlock with markUseful for parent item + // for now, we do not block any reader on child items but this + // should probably be fixed continue; } } else { @@ -3204,7 +3213,17 @@ void CacheAllocator::evictForSlabRelease( XDCHECK(evicted->getRefCount() == 0); const auto res = releaseBackToAllocator(*evicted, RemoveContext::kEviction, false); - XDCHECK(res == ReleaseRes::kReleased); + + if (getNumTiers() == 1) { + XDCHECK(res == ReleaseRes::kReleased); + } else { + const bool isAlreadyFreed = + !markMovingForSlabRelease(ctx, &item, throttler); + if (!isAlreadyFreed) { + continue; + } + } + return; } } @@ -3252,11 +3271,15 @@ bool CacheAllocator::markMovingForSlabRelease( bool itemFreed = true; bool markedMoving = false; TierId tid = getTierId(alloc); - const auto fn = [&markedMoving, &itemFreed](void* memory) { + auto numTiers = getNumTiers(); + const auto fn = [&markedMoving, &itemFreed, numTiers](void* memory) { // Since this callback is executed, the item is not yet freed itemFreed = false; Item* item = static_cast(memory); - if (item->markMoving(false)) { + // TODO: for chained items, moving bit is only used to avoid + // freeing the item prematurely + auto failIfRefNotZero = numTiers > 1 && !item->isChainedItem(); + if (item->markMoving(failIfRefNotZero)) { markedMoving = true; } }; From 
b99bb9da2278526f42be8012f21e5ff5ceb69d7e Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Fri, 21 Oct 2022 12:27:47 -0400 Subject: [PATCH 46/47] Background data movement (#20) Background data movement using periodic workers. Attempts to evict/promote items per given thresholds for each class. These reduce p99 latency since there is a higher chance that an allocation slot is free in the tier we are allocating in. --- MultiTierDataMovement.md | 90 ++++++++ cachelib/allocator/BackgroundMover-inl.h | 112 +++++++++ cachelib/allocator/BackgroundMover.h | 103 +++++++++ cachelib/allocator/BackgroundMoverStrategy.h | 42 ++++ cachelib/allocator/CMakeLists.txt | 1 + cachelib/allocator/Cache.h | 6 + cachelib/allocator/CacheAllocator-inl.h | 188 +++++++++++++++- cachelib/allocator/CacheAllocator.h | 213 +++++++++++++++++- cachelib/allocator/CacheAllocatorConfig.h | 71 ++++++ cachelib/allocator/CacheStats.h | 26 +++ cachelib/allocator/FreeThresholdStrategy.cpp | 74 ++++++ cachelib/allocator/FreeThresholdStrategy.h | 46 ++++ cachelib/allocator/MMLru-inl.h | 22 +- cachelib/allocator/MMLru.h | 3 + cachelib/allocator/MMTinyLFU-inl.h | 7 + cachelib/allocator/MMTinyLFU.h | 3 + cachelib/allocator/PromotionStrategy.h | 84 +++++++ .../tests/AllocatorMemoryTiersTest.cpp | 1 + .../tests/AllocatorMemoryTiersTest.h | 67 ++++++ cachelib/allocator/tests/CacheBaseTest.cpp | 2 + cachelib/cachebench/cache/Cache-inl.h | 34 +++ cachelib/cachebench/cache/CacheStats.h | 59 +++++ cachelib/cachebench/util/CacheConfig.cpp | 38 +++- cachelib/cachebench/util/CacheConfig.h | 27 +++ 24 files changed, 1293 insertions(+), 26 deletions(-) create mode 100644 MultiTierDataMovement.md create mode 100644 cachelib/allocator/BackgroundMover-inl.h create mode 100644 cachelib/allocator/BackgroundMover.h create mode 100644 cachelib/allocator/BackgroundMoverStrategy.h create mode 100644 cachelib/allocator/FreeThresholdStrategy.cpp create mode 100644 cachelib/allocator/FreeThresholdStrategy.h create mode 100644 cachelib/allocator/PromotionStrategy.h diff --git a/MultiTierDataMovement.md b/MultiTierDataMovement.md new file mode 100644 index 0000000000..cccc14b947 --- /dev/null +++ b/MultiTierDataMovement.md @@ -0,0 +1,90 @@ +# Background Data Movement + +In order to reduce the number of online evictions and support asynchronous +promotion - we have added two periodic workers to handle eviction and promotion. + +The diagram below shows a simplified version of how the background evictor +thread (green) is integrated to the CacheLib architecture. + +

+ [Figure: BackgroundEvictor, a diagram of how the background evictor thread integrates with the CacheLib architecture]
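+The options documented in the sections below are meant to be set through the cache configuration. As a rough
+illustration only, a two-tier cachebench-style `cache_config` sketch is shown here; it assumes the documented
+option names are accepted verbatim as JSON keys (they may instead have to be set through `CacheAllocatorConfig`),
+and the values simply restate the documented defaults rather than tuned recommendations.
+
+```json
+{
+  "cache_config": {
+    "cacheSizeMB": 8192,
+    "cacheDir": "/tmp/mem-tiers",
+    "memoryTiers": [
+      { "ratio": 1, "memBindNodes": 0 },
+      { "ratio": 1, "memBindNodes": 0 }
+    ],
+    "backgroundEvictorIntervalMilSec": 10,
+    "evictorThreads": 1,
+    "lowEvictionAcWatermark": 2.0,
+    "highEvictionAcWatermark": 5.0,
+    "backgroundPromoterIntervalMilSec": 10,
+    "promoterThreads": 1,
+    "maxEvictionBatch": 40,
+    "maxPromotionBatch": 40
+  }
+}
+```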

+ +## Background Evictors + +The background evictors scan each class to see if there are objects to move to the next (lower) +tier using a given strategy. Here we document the parameters for the different +strategies and general parameters. + +- `backgroundEvictorIntervalMilSec`: The interval that this thread runs for - by default +the background evictor threads will wake up every 10 ms to scan the AllocationClasses. Also, +the background evictor thread will be woken up every time there is a failed allocation (from +a request handling thread) and the current percentage of free memory for the +AllocationClass is lower than `lowEvictionAcWatermark`. This may render the interval parameter +less important when there are many allocations occurring from request handling threads. + +- `evictorThreads`: The number of background evictors to run - each thread is assigned +a set of AllocationClasses to scan and evict objects from. Currently, each thread gets +an equal number of classes to scan - but as object size distribution may be unequal - future +versions will attempt to balance the classes among threads. The range is 1 to the number of AllocationClasses. +The default is 1. + +- `maxEvictionBatch`: The number of objects to remove in a given eviction call. The +default is 40. The lower range is 10 and the upper range is 1000. Too low and we might not +remove objects at a reasonable rate; too high and it might increase contention with user threads. + +- `minEvictionBatch`: Minimum number of items to evict at any time (if there are any +candidates). + +- `maxEvictionPromotionHotness`: Maximum candidates to consider for eviction. This is similar to `maxEvictionBatch` +but it specifies how many candidates will be taken into consideration, not the actual number of items to evict. +This option can be used to configure the duration of the critical section on the LRU lock. + + +### FreeThresholdStrategy (default) + +- `lowEvictionAcWatermark`: Triggers the background eviction thread to run +when less than this percentage of the AllocationClass is free. +The default is `2.0`; to avoid wasting capacity we don't set this above `10.0`. + +- `highEvictionAcWatermark`: Stop the evictions from an AllocationClass when this +percentage of the AllocationClass is free. The default is `5.0`; to avoid wasting capacity we +don't set this above `10`. + + +## Background Promoters + +The background promoters scan each class to see if there are objects to promote to the previous (upper, faster) +tier using a given strategy. Here we document the parameters for the different +strategies and general parameters. + +- `backgroundPromoterIntervalMilSec`: The interval that this thread runs for - by default +the background promoter threads will wake up every 10 ms to scan the AllocationClasses for +objects to promote. + +- `promoterThreads`: The number of background promoters to run - each thread is assigned +a set of AllocationClasses to scan and promote objects from. Currently, each thread gets +an equal number of classes to scan - but as object size distribution may be unequal - future +versions will attempt to balance the classes among threads. The range is `1` to the number of AllocationClasses. The default is `1`. + +- `maxPromotionBatch`: The number of objects to promote in a given promotion call. The +default is 40. The lower range is 10 and the upper range is 1000. Too low and we might not +promote objects at a reasonable rate; too high and it might increase contention with user threads. 
+ +- `minPromotionBatch`: Minimum number of items to promote at any time (if there are any +candidates) + +- `numDuplicateElements`: This allows us to promote items that have existing handles (read-only) since +we won't need to modify the data when a user is done with the data. Therefore, for a short time +the data could reside in both tiers until it is evicted from its current tier. The default is to +not allow this (0). Setting the value to 100 will enable duplicate elements in tiers. + +### Background Promotion Strategy (only one currently) + +- `promotionAcWatermark`: Promote items if there is at least this +percent of free AllocationClasses. Promotion thread will attempt to move `maxPromotionBatch` number of objects +to that tier. The objects are chosen from the head of the LRU. The default is `4.0`. +This value should correlate with `lowEvictionAcWatermark`, `highEvictionAcWatermark`, `minAcAllocationWatermark`, `maxAcAllocationWatermark`. +- `maxPromotionBatch`: The number of objects to promote in batch during BG promotion. Analogous to +`maxEvictionBatch`. It's value should be lower to decrease contention on hot items. + diff --git a/cachelib/allocator/BackgroundMover-inl.h b/cachelib/allocator/BackgroundMover-inl.h new file mode 100644 index 0000000000..b77436635f --- /dev/null +++ b/cachelib/allocator/BackgroundMover-inl.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) Intel and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +namespace facebook { +namespace cachelib { + +template +BackgroundMover::BackgroundMover( + Cache& cache, + std::shared_ptr strategy, + MoverDir direction) + : cache_(cache), strategy_(strategy), direction_(direction) { + if (direction_ == MoverDir::Evict) { + moverFunc = BackgroundMoverAPIWrapper::traverseAndEvictItems; + + } else if (direction_ == MoverDir::Promote) { + moverFunc = BackgroundMoverAPIWrapper::traverseAndPromoteItems; + } +} + +template +BackgroundMover::~BackgroundMover() { + stop(std::chrono::seconds(0)); +} + +template +void BackgroundMover::work() { + try { + checkAndRun(); + } catch (const std::exception& ex) { + XLOGF(ERR, "BackgroundMover interrupted due to exception: {}", ex.what()); + } +} + +template +void BackgroundMover::setAssignedMemory( + std::vector&& assignedMemory) { + XLOG(INFO, "Class assigned to background worker:"); + for (auto [tid, pid, cid] : assignedMemory) { + XLOGF(INFO, "Tid: {}, Pid: {}, Cid: {}", tid, pid, cid); + } + + mutex.lock_combine([this, &assignedMemory] { + this->assignedMemory_ = std::move(assignedMemory); + }); +} + +// Look for classes that exceed the target memory capacity +// and return those for eviction +template +void BackgroundMover::checkAndRun() { + auto assignedMemory = mutex.lock_combine([this] { return assignedMemory_; }); + + unsigned int moves = 0; + std::set classes{}; + auto batches = strategy_->calculateBatchSizes(cache_, assignedMemory); + + for (size_t i = 0; i < batches.size(); i++) { + const auto [tid, pid, cid] = assignedMemory[i]; + const auto batch = batches[i]; + + classes.insert(cid); + const auto& mpStats = cache_.getPoolByTid(pid, tid).getStats(); + + if (!batch) { + continue; + } + + // try moving BATCH items from the class in order to reach free target + auto moved = moverFunc(cache_, tid, pid, cid, batch); + moves += moved; + moves_per_class_[tid][pid][cid] += moved; + totalBytesMoved.add(moved * mpStats.acStats.at(cid).allocSize); + } + + numTraversals.inc(); + numMovedItems.add(moves); + totalClasses.add(classes.size()); +} + +template +BackgroundMoverStats BackgroundMover::getStats() const noexcept { + BackgroundMoverStats stats; + stats.numMovedItems = numMovedItems.get(); + stats.runCount = numTraversals.get(); + stats.totalBytesMoved = totalBytesMoved.get(); + stats.totalClasses = totalClasses.get(); + + return stats; +} + +template +std::map>> +BackgroundMover::getClassStats() const noexcept { + return moves_per_class_; +} + +} // namespace cachelib +} // namespace facebook diff --git a/cachelib/allocator/BackgroundMover.h b/cachelib/allocator/BackgroundMover.h new file mode 100644 index 0000000000..1246676d6e --- /dev/null +++ b/cachelib/allocator/BackgroundMover.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) Intel and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "cachelib/allocator/BackgroundMoverStrategy.h" +#include "cachelib/allocator/CacheStats.h" +#include "cachelib/common/AtomicCounter.h" +#include "cachelib/common/PeriodicWorker.h" + +namespace facebook { +namespace cachelib { + +// wrapper that exposes the private APIs of CacheType that are specifically +// needed for the cache api +template +struct BackgroundMoverAPIWrapper { + static size_t traverseAndEvictItems(C& cache, + unsigned int tid, + unsigned int pid, + unsigned int cid, + size_t batch) { + return cache.traverseAndEvictItems(tid, pid, cid, batch); + } + + static size_t traverseAndPromoteItems(C& cache, + unsigned int tid, + unsigned int pid, + unsigned int cid, + size_t batch) { + return cache.traverseAndPromoteItems(tid, pid, cid, batch); + } +}; + +enum class MoverDir { Evict = 0, Promote }; + +// Periodic worker that evicts items from tiers in batches +// The primary aim is to reduce insertion times for new items in the +// cache +template +class BackgroundMover : public PeriodicWorker { + public: + using Cache = CacheT; + // @param cache the cache interface + // @param strategy the stragey class that defines how objects are + // moved, + // (promoted vs. evicted and how much) + BackgroundMover(Cache& cache, + std::shared_ptr strategy, + MoverDir direction_); + + ~BackgroundMover() override; + + BackgroundMoverStats getStats() const noexcept; + std::map>> + getClassStats() const noexcept; + + void setAssignedMemory( + std::vector&& assignedMemory); + + private: + std::map>> + moves_per_class_; + // cache allocator's interface for evicting + using Item = typename Cache::Item; + + Cache& cache_; + std::shared_ptr strategy_; + MoverDir direction_; + + std::function + moverFunc; + + // implements the actual logic of running the background evictor + void work() override final; + void checkAndRun(); + + AtomicCounter numMovedItems{0}; + AtomicCounter numTraversals{0}; + AtomicCounter totalClasses{0}; + AtomicCounter totalBytesMoved{0}; + + std::vector assignedMemory_; + folly::DistributedMutex mutex; +}; +} // namespace cachelib +} // namespace facebook + +#include "cachelib/allocator/BackgroundMover-inl.h" diff --git a/cachelib/allocator/BackgroundMoverStrategy.h b/cachelib/allocator/BackgroundMoverStrategy.h new file mode 100644 index 0000000000..7706a625a5 --- /dev/null +++ b/cachelib/allocator/BackgroundMoverStrategy.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cachelib/allocator/Cache.h" + + +namespace facebook { +namespace cachelib { + +struct MemoryDescriptorType { + MemoryDescriptorType(TierId tid, PoolId pid, ClassId cid) : + tid_(tid), pid_(pid), cid_(cid) {} + TierId tid_; + PoolId pid_; + ClassId cid_; +}; + +// Base class for background eviction strategy. 
+class BackgroundMoverStrategy { + public: + virtual std::vector calculateBatchSizes( + const CacheBase& cache, + std::vector acVec) = 0; +}; + +} // namespace cachelib +} // namespace facebook diff --git a/cachelib/allocator/CMakeLists.txt b/cachelib/allocator/CMakeLists.txt index f94c8c90c7..6103cdc823 100644 --- a/cachelib/allocator/CMakeLists.txt +++ b/cachelib/allocator/CMakeLists.txt @@ -35,6 +35,7 @@ add_library (cachelib_allocator CCacheManager.cpp ContainerTypes.cpp FreeMemStrategy.cpp + FreeThresholdStrategy.cpp HitsPerSlabStrategy.cpp LruTailAgeStrategy.cpp MarginalHitsOptimizeStrategy.cpp diff --git a/cachelib/allocator/Cache.h b/cachelib/allocator/Cache.h index cb2fa83f0d..c871358189 100644 --- a/cachelib/allocator/Cache.h +++ b/cachelib/allocator/Cache.h @@ -98,6 +98,12 @@ class CacheBase { // // @param poolId The pool id to query virtual const MemoryPool& getPool(PoolId poolId) const = 0; + + // Get the reference to a memory pool using a tier id, for stats purposes + // + // @param poolId The pool id to query + // @param tierId The tier of the pool id + virtual const MemoryPool& getPoolByTid(PoolId poolId, TierId tid) const = 0; // Get Pool specific stats (regular pools). This includes stats from the // Memory Pool and also the cache. diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h index 74bb6a588f..3caf4b4b0b 100644 --- a/cachelib/allocator/CacheAllocator-inl.h +++ b/cachelib/allocator/CacheAllocator-inl.h @@ -291,6 +291,18 @@ void CacheAllocator::initWorkers() { config_.poolOptimizeStrategy, config_.ccacheOptimizeStepSizePercent); } + + if (config_.backgroundEvictorEnabled()) { + startNewBackgroundEvictor(config_.backgroundEvictorInterval, + config_.backgroundEvictorStrategy, + config_.backgroundEvictorThreads); + } + + if (config_.backgroundPromoterEnabled()) { + startNewBackgroundPromoter(config_.backgroundPromoterInterval, + config_.backgroundPromoterStrategy, + config_.backgroundPromoterThreads); + } } template @@ -369,6 +381,22 @@ CacheAllocator::allocate(PoolId poolId, ttlSecs == 0 ? 0 : creationTime + ttlSecs); } +template +bool CacheAllocator::shouldWakeupBgEvictor(TierId tid, PoolId pid, ClassId cid) { + // TODO: should we also work on lower tiers? should we have separate set of params? 
+ if (tid == 1) return false; + return (1-getACStats(tid, pid, cid).usageFraction())*100 <= config_.lowEvictionAcWatermark; +} + +template +size_t CacheAllocator::backgroundWorkerId(TierId tid, PoolId pid, ClassId cid, size_t numWorkers) { + XDCHECK(numWorkers); + + // TODO: came up with some better sharding (use some hashing) + return (tid + pid + cid) % numWorkers; +} + + template typename CacheAllocator::WriteHandle CacheAllocator::allocateInternalTier(TierId tid, @@ -376,7 +404,8 @@ CacheAllocator::allocateInternalTier(TierId tid, typename Item::Key key, uint32_t size, uint32_t creationTime, - uint32_t expiryTime) { + uint32_t expiryTime, + bool fromBgThread) { util::LatencyTracker tracker{stats().allocateLatency_}; SCOPE_FAIL { stats_.invalidAllocs.inc(); }; @@ -391,8 +420,13 @@ CacheAllocator::allocateInternalTier(TierId tid, // TODO: per-tier (*stats_.allocAttempts)[pid][cid].inc(); - + void* memory = allocator_[tid]->allocate(pid, requiredSize); + + if (backgroundEvictor_.size() && !fromBgThread && (memory == nullptr || shouldWakeupBgEvictor(tid, pid, cid))) { + backgroundEvictor_[backgroundWorkerId(tid, pid, cid, backgroundEvictor_.size())]->wakeUp(); + } + if (memory == nullptr) { memory = findEviction(tid, pid, cid); } @@ -439,10 +473,11 @@ CacheAllocator::allocateInternal(PoolId pid, typename Item::Key key, uint32_t size, uint32_t creationTime, - uint32_t expiryTime) { + uint32_t expiryTime, + bool fromBgThread) { auto tid = 0; /* TODO: consult admission policy */ for(TierId tid = 0; tid < getNumTiers(); ++tid) { - auto handle = allocateInternalTier(tid, pid, key, size, creationTime, expiryTime); + auto handle = allocateInternalTier(tid, pid, key, size, creationTime, expiryTime, fromBgThread); if (handle) return handle; } return {}; @@ -1587,7 +1622,7 @@ CacheAllocator::findEviction(TierId tid, PoolId pid, ClassId cid) { XDCHECK(candidate); auto evictedToNext = lastTier ? 
nullptr - : tryEvictToNextMemoryTier(*candidate); + : tryEvictToNextMemoryTier(*candidate, false); if (!evictedToNext) { if (!token.isValid()) { token = createPutToken(*candidate); @@ -1699,7 +1734,7 @@ bool CacheAllocator::shouldWriteToNvmCacheExclusive( template typename CacheAllocator::WriteHandle CacheAllocator::tryEvictToNextMemoryTier( - TierId tid, PoolId pid, Item& item) { + TierId tid, PoolId pid, Item& item, bool fromBgThread) { XDCHECK(item.isMoving()); XDCHECK(item.getRefCount() == 0); if(item.hasChainedItem()) return WriteHandle{}; // TODO: We do not support ChainedItem yet @@ -1716,7 +1751,8 @@ CacheAllocator::tryEvictToNextMemoryTier( item.getKey(), item.getSize(), item.getCreationTime(), - item.getExpiryTime()); + item.getExpiryTime(), + fromBgThread); if (newItemHdl) { XDCHECK_EQ(newItemHdl->getSize(), item.getSize()); @@ -1733,10 +1769,49 @@ CacheAllocator::tryEvictToNextMemoryTier( template typename CacheAllocator::WriteHandle -CacheAllocator::tryEvictToNextMemoryTier(Item& item) { +CacheAllocator::tryEvictToNextMemoryTier(Item& item, bool fromBgThread) { auto tid = getTierId(item); auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId; - return tryEvictToNextMemoryTier(tid, pid, item); + return tryEvictToNextMemoryTier(tid, pid, item, fromBgThread); +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::tryPromoteToNextMemoryTier( + TierId tid, PoolId pid, Item& item, bool fromBgThread) { + if(item.isExpired()) { return {}; } + TierId nextTier = tid; + while (nextTier > 0) { // try to evict down to the next memory tiers + auto toPromoteTier = nextTier - 1; + --nextTier; + + // allocateInternal might trigger another eviction + auto newItemHdl = allocateInternalTier(toPromoteTier, pid, + item.getKey(), + item.getSize(), + item.getCreationTime(), + item.getExpiryTime(), + fromBgThread); + + if (newItemHdl) { + XDCHECK_EQ(newItemHdl->getSize(), item.getSize()); + moveRegularItemWithSync(item, newItemHdl); + item.unmarkMoving(); + return newItemHdl; + } else { + return WriteHandle{}; + } + } + + return {}; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::tryPromoteToNextMemoryTier(Item& item, bool fromBgThread) { + auto tid = getTierId(item); + auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId; + return tryPromoteToNextMemoryTier(tid, pid, item, fromBgThread); } template @@ -2471,6 +2546,16 @@ PoolId CacheAllocator::addPool( setRebalanceStrategy(pid, std::move(rebalanceStrategy)); setResizeStrategy(pid, std::move(resizeStrategy)); + if (backgroundEvictor_.size()) { + for (size_t id = 0; id < backgroundEvictor_.size(); id++) + backgroundEvictor_[id]->setAssignedMemory(getAssignedMemoryToBgWorker(id, backgroundEvictor_.size(), 0)); + } + + if (backgroundPromoter_.size()) { + for (size_t id = 0; id < backgroundPromoter_.size(); id++) + backgroundPromoter_[id]->setAssignedMemory(getAssignedMemoryToBgWorker(id, backgroundPromoter_.size(), 1)); + } + return pid; } @@ -3006,7 +3091,8 @@ CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { oldItem.getKey(), oldItem.getSize(), oldItem.getCreationTime(), - oldItem.getExpiryTime()); + oldItem.getExpiryTime(), + false); if (!newItemHdl) { return {}; } @@ -3485,6 +3571,8 @@ bool CacheAllocator::stopWorkers(std::chrono::seconds timeout) { success &= stopPoolResizer(timeout); success &= stopMemMonitor(timeout); success &= stopReaper(timeout); + success &= stopBackgroundEvictor(timeout); + success &= stopBackgroundPromoter(timeout); return success; } @@ -3746,6 
+3834,8 @@ GlobalCacheStats CacheAllocator::getGlobalCacheStats() const { ret.nvmUpTime = currTime - nvmCacheState_.getCreationTime(); ret.nvmCacheEnabled = nvmCache_ ? nvmCache_->isEnabled() : false; ret.reaperStats = getReaperStats(); + ret.evictionStats = getBackgroundMoverStats(MoverDir::Evict); + ret.promotionStats = getBackgroundMoverStats(MoverDir::Promote); ret.numActiveHandles = getNumActiveHandles(); ret.isNewRamCache = cacheCreationTime_ == cacheInstanceCreationTime_; @@ -3932,6 +4022,64 @@ bool CacheAllocator::startNewReaper( return true; } +template +auto CacheAllocator::getAssignedMemoryToBgWorker(size_t evictorId, size_t numWorkers, TierId tid) +{ + std::vector asssignedMemory; + // TODO: for now, only evict from tier 0 + auto pools = filterCompactCachePools(allocator_[tid]->getPoolIds()); + for (const auto pid : pools) { + const auto& mpStats = getPoolByTid(pid,tid).getStats(); + for (const auto cid : mpStats.classIds) { + if (backgroundWorkerId(tid, pid, cid, numWorkers) == evictorId) { + asssignedMemory.emplace_back(tid, pid, cid); + } + } + } + return asssignedMemory; +} + +template +bool CacheAllocator::startNewBackgroundEvictor( + std::chrono::milliseconds interval, + std::shared_ptr strategy, + size_t threads) { + XDCHECK(threads > 0); + backgroundEvictor_.resize(threads); + bool result = true; + + for (size_t i = 0; i < threads; i++) { + auto ret = startNewWorker("BackgroundEvictor" + std::to_string(i), backgroundEvictor_[i], interval, strategy, MoverDir::Evict); + result = result && ret; + + if (result) { + backgroundEvictor_[i]->setAssignedMemory(getAssignedMemoryToBgWorker(i, backgroundEvictor_.size(), 0)); + } + } + return result; +} + +template +bool CacheAllocator::startNewBackgroundPromoter( + std::chrono::milliseconds interval, + std::shared_ptr strategy, + size_t threads) { + XDCHECK(threads > 0); + XDCHECK(getNumTiers() > 1); + backgroundPromoter_.resize(threads); + bool result = true; + + for (size_t i = 0; i < threads; i++) { + auto ret = startNewWorker("BackgroundPromoter" + std::to_string(i), backgroundPromoter_[i], interval, strategy, MoverDir::Promote); + result = result && ret; + + if (result) { + backgroundPromoter_[i]->setAssignedMemory(getAssignedMemoryToBgWorker(i, backgroundPromoter_.size(), 1)); + } + } + return result; +} + template bool CacheAllocator::stopPoolRebalancer( std::chrono::seconds timeout) { @@ -3959,6 +4107,26 @@ bool CacheAllocator::stopReaper(std::chrono::seconds timeout) { return stopWorker("Reaper", reaper_, timeout); } +template +bool CacheAllocator::stopBackgroundEvictor(std::chrono::seconds timeout) { + bool result = true; + for (size_t i = 0; i < backgroundEvictor_.size(); i++) { + auto ret = stopWorker("BackgroundEvictor", backgroundEvictor_[i], timeout); + result = result && ret; + } + return result; +} + +template +bool CacheAllocator::stopBackgroundPromoter(std::chrono::seconds timeout) { + bool result = true; + for (size_t i = 0; i < backgroundPromoter_.size(); i++) { + auto ret = stopWorker("BackgroundPromoter", backgroundPromoter_[i], timeout); + result = result && ret; + } + return result; +} + template bool CacheAllocator::cleanupStrayShmSegments( const std::string& cacheDir, bool posix) { diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index a845efb645..d32a8c991c 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -40,6 +40,7 @@ #include #pragma GCC diagnostic pop +#include "cachelib/allocator/BackgroundMover.h" #include 
"cachelib/allocator/CCacheManager.h" #include "cachelib/allocator/Cache.h" #include "cachelib/allocator/CacheAllocatorConfig.h" @@ -712,6 +713,11 @@ class CacheAllocator : public CacheBase { // @return the full usable size for this item uint32_t getUsableSize(const Item& item) const; + // gets the allocation class assigned to BG worker + auto getAssignedMemoryToBgWorker(size_t evictorId, size_t numWorkers, TierId tid); + bool shouldWakeupBgEvictor(TierId tid, PoolId pid, ClassId cid); + size_t backgroundWorkerId(TierId tid, PoolId pid, ClassId cid, size_t numWorkers); + // Get a random item from memory // This is useful for profiling and sampling cachelib managed memory // @@ -1057,6 +1063,11 @@ class CacheAllocator : public CacheBase { // @param reaperThrottleConfig throttling config bool startNewReaper(std::chrono::milliseconds interval, util::Throttler::Config reaperThrottleConfig); + + bool startNewBackgroundPromoter(std::chrono::milliseconds interval, + std::shared_ptr strategy, size_t threads); + bool startNewBackgroundEvictor(std::chrono::milliseconds interval, + std::shared_ptr strategy, size_t threads); // Stop existing workers with a timeout bool stopPoolRebalancer(std::chrono::seconds timeout = std::chrono::seconds{ @@ -1066,6 +1077,8 @@ class CacheAllocator : public CacheBase { 0}); bool stopMemMonitor(std::chrono::seconds timeout = std::chrono::seconds{0}); bool stopReaper(std::chrono::seconds timeout = std::chrono::seconds{0}); + bool stopBackgroundEvictor(std::chrono::seconds timeout = std::chrono::seconds{0}); + bool stopBackgroundPromoter(std::chrono::seconds timeout = std::chrono::seconds{0}); // Set pool optimization to either true or false // @@ -1101,6 +1114,10 @@ class CacheAllocator : public CacheBase { return allocator_[currentTier()]->getPool(pid); } + const MemoryPool& getPoolByTid(PoolId pid, TierId tid) const override final { + return allocator_[tid]->getPool(pid); + } + // calculate the number of slabs to be advised/reclaimed in each pool PoolAdviseReclaimData calcNumSlabsToAdviseReclaim() override final { auto regularPoolIds = getRegularPoolIds(); @@ -1151,6 +1168,52 @@ class CacheAllocator : public CacheBase { auto stats = reaper_ ? 
reaper_->getStats() : ReaperStats{}; return stats; } + + // returns the background mover stats + BackgroundMoverStats getBackgroundMoverStats(MoverDir direction) const { + + auto stats = BackgroundMoverStats{}; + if (direction == MoverDir::Evict) { + for (auto &bg : backgroundEvictor_) + stats += bg->getStats(); + } else if (direction == MoverDir::Promote) { + for (auto &bg : backgroundPromoter_) + stats += bg->getStats(); + } + return stats; + + } + + + std::map>> + getBackgroundMoverClassStats(MoverDir direction) const { + std::map>> stats; + + if (direction == MoverDir::Evict) { + for (auto &bg : backgroundEvictor_) { + for (auto &tid : bg->getClassStats()) { + for (auto &pid : tid.second) { + for (auto &cid : pid.second) { + stats[tid.first][pid.first][cid.first] += cid.second; + } + } + } + } + } else if (direction == MoverDir::Promote) { + for (auto &bg : backgroundPromoter_) { + for (auto &tid : bg->getClassStats()) { + for (auto &pid : tid.second) { + for (auto &cid : pid.second) { + stats[tid.first][pid.first][cid.first] += cid.second; + } + } + } + } + } + + return stats; + } + // return the LruType of an item typename MMType::LruType getItemLruType(const Item& item) const; @@ -1447,7 +1510,8 @@ class CacheAllocator : public CacheBase { Key key, uint32_t size, uint32_t creationTime, - uint32_t expiryTime); + uint32_t expiryTime, + bool fromBgThread = false); // create a new cache allocation on specific memory tier. // For description see allocateInternal. @@ -1458,7 +1522,8 @@ class CacheAllocator : public CacheBase { Key key, uint32_t size, uint32_t creationTime, - uint32_t expiryTime); + uint32_t expiryTime, + bool fromBgThread); // Allocate a chained item // @@ -1721,7 +1786,11 @@ class CacheAllocator : public CacheBase { // // @return valid handle to the item. This will be the last // handle to the item. On failure an empty handle. - WriteHandle tryEvictToNextMemoryTier(TierId tid, PoolId pid, Item& item); + WriteHandle tryEvictToNextMemoryTier(TierId tid, PoolId pid, Item& item, bool fromBgThread); + + WriteHandle tryPromoteToNextMemoryTier(TierId tid, PoolId pid, Item& item, bool fromBgThread); + + WriteHandle tryPromoteToNextMemoryTier(Item& item, bool fromBgThread); // Wakes up waiters if there are any // @@ -1741,7 +1810,7 @@ class CacheAllocator : public CacheBase { // // @return valid handle to the item. This will be the last // handle to the item. On failure an empty handle. - WriteHandle tryEvictToNextMemoryTier(Item& item); + WriteHandle tryEvictToNextMemoryTier(Item& item, bool fromBgThread); // Deserializer CacheAllocatorMetadata and verify the version // @@ -1886,6 +1955,137 @@ class CacheAllocator : public CacheBase { stats().numReaperSkippedSlabs.add(slabsSkipped); } + // exposed for the background evictor to iterate through the memory and evict + // in batch. 
This should improve insertion path for tiered memory config + size_t traverseAndEvictItems(unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) { +auto& mmContainer = getMMContainer(tid, pid, cid); + size_t evictions = 0; + size_t evictionCandidates = 0; + std::vector candidates; + candidates.reserve(batch); + + size_t tries = 0; + mmContainer.withEvictionIterator([&tries, &candidates, &batch, &mmContainer, this](auto &&itr) { + while (candidates.size() < batch && + (config_.maxEvictionPromotionHotness == 0 || tries < config_.maxEvictionPromotionHotness) && + itr) { + tries++; + Item* candidate = itr.get(); + XDCHECK(candidate); + + if (candidate->isChainedItem()) { + throw std::runtime_error("Not supported for chained items"); + } + + if (candidate->markMoving(true)) { + mmContainer.remove(itr); + candidates.push_back(candidate); + } else { + ++itr; + } + } + }); + + for (Item *candidate : candidates) { + auto evictedToNext = tryEvictToNextMemoryTier(*candidate, true /* from BgThread */); + if (!evictedToNext) { + auto token = createPutToken(*candidate); + + auto ret = candidate->markForEvictionWhenMoving(); + XDCHECK(ret); + + unlinkItemForEviction(*candidate); + // wake up any readers that wait for the move to complete + // it's safe to do now, as we have the item marked exclusive and + // no other reader can be added to the waiters list + wakeUpWaiters(*candidate, WriteHandle{}); + + if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate)) { + nvmCache_->put(*candidate, std::move(token)); + } + } else { + evictions++; + XDCHECK(!evictedToNext->isMarkedForEviction() && !evictedToNext->isMoving()); + XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); + XDCHECK(!candidate->isAccessible()); + XDCHECK(candidate->getKey() == evictedToNext->getKey()); + + wakeUpWaiters(*candidate, std::move(evictedToNext)); + } + XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); + + if (candidate->hasChainedItem()) { + (*stats_.chainedItemEvictions)[pid][cid].inc(); + } else { + (*stats_.regularItemEvictions)[pid][cid].inc(); + } + + // it's safe to recycle the item here as there are no more + // references and the item could not been marked as moving + // by other thread since it's detached from MMContainer. + auto res = releaseBackToAllocator(*candidate, RemoveContext::kEviction, + /* isNascent */ false); + XDCHECK(res == ReleaseRes::kReleased); + } + return evictions; + } + + size_t traverseAndPromoteItems(unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) { +auto& mmContainer = getMMContainer(tid, pid, cid); + size_t promotions = 0; + std::vector candidates; + candidates.reserve(batch); + + size_t tries = 0; + + mmContainer.withPromotionIterator([&tries, &candidates, &batch, &mmContainer, this](auto &&itr){ + while (candidates.size() < batch && (config_.maxEvictionPromotionHotness == 0 || tries < config_.maxEvictionPromotionHotness) && itr) { + tries++; + Item* candidate = itr.get(); + XDCHECK(candidate); + + if (candidate->isChainedItem()) { + throw std::runtime_error("Not supported for chained items"); + } + + // TODO: only allow it for read-only items? 
+ // or implement mvcc + if (candidate->markMoving(true)) { + candidates.push_back(candidate); + } + + ++itr; + } + }); + + for (Item *candidate : candidates) { + auto promoted = tryPromoteToNextMemoryTier(*candidate, true); + if (promoted) { + promotions++; + removeFromMMContainer(*candidate); + XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); + // it's safe to recycle the item here as there are no more + // references and the item could not been marked as moving + // by other thread since it's detached from MMContainer. + auto res = releaseBackToAllocator(*candidate, RemoveContext::kEviction, + /* isNascent */ false); + XDCHECK(res == ReleaseRes::kReleased); + wakeUpWaiters(*candidate, std::move(promoted)); + } else { + // we failed to allocate a new item, this item is no longer moving + auto ref = unmarkMovingAndWakeUpWaiters(*candidate, {}); + if (UNLIKELY(ref == 0)) { + const auto res = + releaseBackToAllocator(*candidate, + RemoveContext::kNormal, false); + XDCHECK(res == ReleaseRes::kReleased); + } + } + + } + return promotions; + } + // returns true if nvmcache is enabled and we should write this item to // nvmcache. bool shouldWriteToNvmCache(const Item& item); @@ -2211,6 +2411,10 @@ class CacheAllocator : public CacheBase { // free memory monitor std::unique_ptr memMonitor_; + + // background evictor + std::vector>> backgroundEvictor_; + std::vector>> backgroundPromoter_; // check whether a pool is a slabs pool std::array isCompactCachePool_{}; @@ -2272,6 +2476,7 @@ class CacheAllocator : public CacheBase { // Make this friend to give access to acquire and release friend ReadHandle; friend ReaperAPIWrapper; + friend BackgroundMoverAPIWrapper; friend class CacheAPIWrapperForNvm; friend class FbInternalRuntimeUpdateWrapper; friend class objcache2::ObjectCache; diff --git a/cachelib/allocator/CacheAllocatorConfig.h b/cachelib/allocator/CacheAllocatorConfig.h index 74cd34c6a2..a089e754c0 100644 --- a/cachelib/allocator/CacheAllocatorConfig.h +++ b/cachelib/allocator/CacheAllocatorConfig.h @@ -31,6 +31,7 @@ #include "cachelib/allocator/MemoryTierCacheConfig.h" #include "cachelib/allocator/NvmAdmissionPolicy.h" #include "cachelib/allocator/PoolOptimizeStrategy.h" +#include "cachelib/allocator/BackgroundMoverStrategy.h" #include "cachelib/allocator/RebalanceStrategy.h" #include "cachelib/allocator/Util.h" #include "cachelib/common/EventInterface.h" @@ -265,6 +266,16 @@ class CacheAllocatorConfig { std::chrono::seconds regularInterval, std::chrono::seconds ccacheInterval, uint32_t ccacheStepSizePercent); + + // Enable the background evictor - scans a tier to look for objects + // to evict to the next tier + CacheAllocatorConfig& enableBackgroundEvictor( + std::shared_ptr backgroundMoverStrategy, + std::chrono::milliseconds regularInterval, size_t threads); + + CacheAllocatorConfig& enableBackgroundPromoter( + std::shared_ptr backgroundMoverStrategy, + std::chrono::milliseconds regularInterval, size_t threads); // This enables an optimization for Pool rebalancing and resizing. 
// The rough idea is to ensure only the least useful items are evicted when @@ -340,6 +351,17 @@ class CacheAllocatorConfig { poolOptimizeStrategy != nullptr; } + // @return whether background evictor thread is enabled + bool backgroundEvictorEnabled() const noexcept { + return backgroundEvictorInterval.count() > 0 && + backgroundEvictorStrategy != nullptr; + } + + bool backgroundPromoterEnabled() const noexcept { + return backgroundPromoterInterval.count() > 0 && + backgroundPromoterStrategy != nullptr; + } + // @return whether memory monitor is enabled bool memMonitoringEnabled() const noexcept { return memMonitorConfig.mode != MemoryMonitor::Disabled && @@ -450,6 +472,16 @@ class CacheAllocatorConfig { // The slab release process is considered as being stuck if it does not // make any progress for the below threshold std::chrono::milliseconds slabReleaseStuckThreshold{std::chrono::seconds(60)}; + + // rebalance to avoid alloc fialures. + std::shared_ptr backgroundEvictorStrategy; + std::shared_ptr backgroundPromoterStrategy; + // time interval to sleep between runs of the background evictor + std::chrono::milliseconds backgroundEvictorInterval{std::chrono::milliseconds{1000}}; + std::chrono::milliseconds backgroundPromoterInterval{std::chrono::milliseconds{1000}}; + + size_t backgroundEvictorThreads{1}; + size_t backgroundPromoterThreads{1}; // time interval to sleep between iterations of pool size optimization, // for regular pools and compact caches @@ -589,6 +621,25 @@ class CacheAllocatorConfig { // If true, we will delay worker start until user explicitly calls // CacheAllocator::startCacheWorkers() bool delayCacheWorkersStart{false}; + + // see MultiTierDataMovement.md + double promotionAcWatermark{4.0}; + double lowEvictionAcWatermark{2.0}; + double highEvictionAcWatermark{5.0}; + double numDuplicateElements{0.0}; // inclusivness of the cache + double syncPromotion{0.0}; // can promotion be done synchronously in user thread + + uint64_t evictorThreads{1}; + uint64_t promoterThreads{1}; + + uint64_t maxEvictionBatch{40}; + uint64_t maxPromotionBatch{10}; + + uint64_t minEvictionBatch{1}; + uint64_t minPromotionBatch{1}; + + uint64_t maxEvictionPromotionHotness{60}; + friend CacheT; @@ -926,6 +977,26 @@ CacheAllocatorConfig& CacheAllocatorConfig::enablePoolRebalancing( return *this; } +template +CacheAllocatorConfig& CacheAllocatorConfig::enableBackgroundEvictor( + std::shared_ptr strategy, + std::chrono::milliseconds interval, size_t evictorThreads) { + backgroundEvictorStrategy = strategy; + backgroundEvictorInterval = interval; + backgroundEvictorThreads = evictorThreads; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::enableBackgroundPromoter( + std::shared_ptr strategy, + std::chrono::milliseconds interval, size_t promoterThreads) { + backgroundPromoterStrategy = strategy; + backgroundPromoterInterval = interval; + backgroundPromoterThreads = promoterThreads; + return *this; +} + template CacheAllocatorConfig& CacheAllocatorConfig::enablePoolResizing( std::shared_ptr resizeStrategy, diff --git a/cachelib/allocator/CacheStats.h b/cachelib/allocator/CacheStats.h index 9f3674f513..46c051be14 100644 --- a/cachelib/allocator/CacheStats.h +++ b/cachelib/allocator/CacheStats.h @@ -290,6 +290,27 @@ struct ReaperStats { uint64_t avgTraversalTimeMs{0}; }; +// Mover Stats +struct BackgroundMoverStats { + // the number of items this worker moved by looking at pools/classes stats + uint64_t numMovedItems{0}; + // number of times we went executed the thread 
//TODO: is this def correct? + uint64_t runCount{0}; + // total number of classes + uint64_t totalClasses{0}; + // eviction size + uint64_t totalBytesMoved{0}; + + BackgroundMoverStats& operator+=(const BackgroundMoverStats& rhs) { + numMovedItems += rhs.numMovedItems; + runCount += rhs.runCount; + totalClasses += rhs.totalClasses; + totalBytesMoved += rhs.totalBytesMoved; + return *this; + } +}; + + // CacheMetadata type to export struct CacheMetadata { // allocator_version @@ -310,6 +331,11 @@ struct Stats; // Stats that apply globally in cache and // the ones that are aggregated over all pools struct GlobalCacheStats { + // background eviction stats + BackgroundMoverStats evictionStats; + + BackgroundMoverStats promotionStats; + // number of calls to CacheAllocator::find uint64_t numCacheGets{0}; diff --git a/cachelib/allocator/FreeThresholdStrategy.cpp b/cachelib/allocator/FreeThresholdStrategy.cpp new file mode 100644 index 0000000000..4a900c2cb1 --- /dev/null +++ b/cachelib/allocator/FreeThresholdStrategy.cpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) Intel and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cachelib/allocator/FreeThresholdStrategy.h" + +#include + +namespace facebook { +namespace cachelib { + +FreeThresholdStrategy::FreeThresholdStrategy(double lowEvictionAcWatermark, + double highEvictionAcWatermark, + uint64_t maxEvictionBatch, + uint64_t minEvictionBatch) + : lowEvictionAcWatermark(lowEvictionAcWatermark), + highEvictionAcWatermark(highEvictionAcWatermark), + maxEvictionBatch(maxEvictionBatch), + minEvictionBatch(minEvictionBatch) {} + +std::vector FreeThresholdStrategy::calculateBatchSizes( + const CacheBase& cache, + std::vector acVec) { + std::vector batches{}; + for (auto [tid, pid, cid] : acVec) { + auto stats = cache.getACStats(tid, pid, cid); + if ((1-stats.usageFraction())*100 >= highEvictionAcWatermark) { + batches.push_back(0); + } else { + auto toFreeMemPercent = highEvictionAcWatermark - (1-stats.usageFraction())*100; + auto toFreeItems = static_cast( + toFreeMemPercent * (stats.totalSlabs() * Slab::kSize) / stats.allocSize); + batches.push_back(toFreeItems); + } + } + + if (batches.size() == 0) { + return batches; + } + + auto maxBatch = *std::max_element(batches.begin(), batches.end()); + if (maxBatch == 0) + return batches; + + std::transform( + batches.begin(), batches.end(), batches.begin(), [&](auto numItems) { + if (numItems == 0) { + return 0UL; + } + + auto cappedBatchSize = maxEvictionBatch * numItems / maxBatch; + if (cappedBatchSize < minEvictionBatch) + return minEvictionBatch; + else + return cappedBatchSize; + }); + + return batches; +} + +} // namespace cachelib +} // namespace facebook diff --git a/cachelib/allocator/FreeThresholdStrategy.h b/cachelib/allocator/FreeThresholdStrategy.h new file mode 100644 index 0000000000..94316bfe82 --- /dev/null +++ b/cachelib/allocator/FreeThresholdStrategy.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cachelib/allocator/BackgroundMoverStrategy.h" +#include "cachelib/allocator/Cache.h" + +namespace facebook { +namespace cachelib { + +// Base class for background mover strategy. +class FreeThresholdStrategy : public BackgroundMoverStrategy { + public: + FreeThresholdStrategy(double lowEvictionAcWatermark, + double highEvictionAcWatermark, + uint64_t maxEvictionBatch, + uint64_t minEvictionBatch); + ~FreeThresholdStrategy() {} + + std::vector calculateBatchSizes( + const CacheBase& cache, + std::vector acVecs); + + private: + double lowEvictionAcWatermark{2.0}; + double highEvictionAcWatermark{5.0}; + uint64_t maxEvictionBatch{40}; + uint64_t minEvictionBatch{5}; +}; + +} // namespace cachelib +} // namespace facebook diff --git a/cachelib/allocator/MMLru-inl.h b/cachelib/allocator/MMLru-inl.h index 842d87ddb8..751bcca5c1 100644 --- a/cachelib/allocator/MMLru-inl.h +++ b/cachelib/allocator/MMLru-inl.h @@ -229,17 +229,17 @@ void MMLru::Container::withEvictionIterator(F&& fun) { } } -//template T::*HookPtr> -//template -//void -//MMLru::Container::withPromotionIterator(F&& fun) { -// if (config_.useCombinedLockForIterators) { -// lruMutex_->lock_combine([this, &fun]() { fun(Iterator{lru_.begin()}); }); -// } else { -// LockHolder lck{*lruMutex_}; -// fun(Iterator{lru_.begin()}); -// } -//} +template T::*HookPtr> +template +void +MMLru::Container::withPromotionIterator(F&& fun) { + if (config_.useCombinedLockForIterators) { + lruMutex_->lock_combine([this, &fun]() { fun(Iterator{lru_.begin()}); }); + } else { + LockHolder lck{*lruMutex_}; + fun(Iterator{lru_.begin()}); + } +} template T::*HookPtr> void MMLru::Container::ensureNotInsertionPoint(T& node) noexcept { diff --git a/cachelib/allocator/MMLru.h b/cachelib/allocator/MMLru.h index 645b8f0e86..cf3253349a 100644 --- a/cachelib/allocator/MMLru.h +++ b/cachelib/allocator/MMLru.h @@ -377,6 +377,9 @@ class MMLru { template void withEvictionIterator(F&& f); + template + void withPromotionIterator(F&& f); + // get copy of current config Config getConfig() const; diff --git a/cachelib/allocator/MMTinyLFU-inl.h b/cachelib/allocator/MMTinyLFU-inl.h index 46640b24ca..9203a54dd6 100644 --- a/cachelib/allocator/MMTinyLFU-inl.h +++ b/cachelib/allocator/MMTinyLFU-inl.h @@ -227,6 +227,13 @@ void MMTinyLFU::Container::withEvictionIterator(F&& fun) { fun(getEvictionIterator()); } +template T::*HookPtr> +template +void +MMTinyLFU::Container::withPromotionIterator(F&& fun) { + throw std::runtime_error("Not supported"); +} + template T::*HookPtr> void MMTinyLFU::Container::removeLocked(T& node) noexcept { if (isTiny(node)) { diff --git a/cachelib/allocator/MMTinyLFU.h b/cachelib/allocator/MMTinyLFU.h index c8f2699264..eb45cefd22 100644 --- a/cachelib/allocator/MMTinyLFU.h +++ b/cachelib/allocator/MMTinyLFU.h @@ -496,6 +496,9 @@ class MMTinyLFU { // iterator passed as parameter. 
template void withEvictionIterator(F&& f); + + template + void withPromotionIterator(F&& f); // for saving the state of the lru // diff --git a/cachelib/allocator/PromotionStrategy.h b/cachelib/allocator/PromotionStrategy.h new file mode 100644 index 0000000000..1022aca0f8 --- /dev/null +++ b/cachelib/allocator/PromotionStrategy.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cachelib/allocator/BackgroundMoverStrategy.h" +#include "cachelib/allocator/Cache.h" + +namespace facebook { +namespace cachelib { + +// Base class for background eviction strategy. +class PromotionStrategy : public BackgroundMoverStrategy { + public: + PromotionStrategy(uint64_t promotionAcWatermark, + uint64_t maxPromotionBatch, + uint64_t minPromotionBatch) + : promotionAcWatermark(promotionAcWatermark), + maxPromotionBatch(maxPromotionBatch), + minPromotionBatch(minPromotionBatch) {} + ~PromotionStrategy() {} + + std::vector calculateBatchSizes( + const CacheBase& cache, + std::vector acVec) { + std::vector batches{}; + for (auto [tid, pid, cid] : acVec) { + XDCHECK(tid > 0); + auto stats = cache.getACStats(tid - 1, pid, cid); + if ((1-stats.usageFraction())*100 < promotionAcWatermark) + batches.push_back(0); + else { + auto maxPossibleItemsToPromote = static_cast( + (promotionAcWatermark - (1-stats.usageFraction())*100) * + (stats.totalSlabs() * Slab::kSize) / stats.allocSize); + batches.push_back(maxPossibleItemsToPromote); + } + } + + if (batches.size() == 0) { + return batches; + } + + auto maxBatch = *std::max_element(batches.begin(), batches.end()); + if (maxBatch == 0) + return batches; + + std::transform( + batches.begin(), batches.end(), batches.begin(), [&](auto numItems) { + if (numItems == 0) { + return 0UL; + } + + auto cappedBatchSize = maxPromotionBatch * numItems / maxBatch; + if (cappedBatchSize < minPromotionBatch) + return minPromotionBatch; + else + return cappedBatchSize; + }); + + return batches; + } + + private: + double promotionAcWatermark{4.0}; + uint64_t maxPromotionBatch{40}; + uint64_t minPromotionBatch{5}; +}; + +} // namespace cachelib +} // namespace facebook diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp index a8fb952b71..6e2de814da 100644 --- a/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp +++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp @@ -26,6 +26,7 @@ using LruAllocatorMemoryTiersTest = AllocatorMemoryTiersTest; TEST_F(LruAllocatorMemoryTiersTest, MultiTiersInvalid) { this->testMultiTiersInvalid(); } TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValid) { this->testMultiTiersValid(); } TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValidMixed) { this->testMultiTiersValidMixed(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersBackgroundMovers ) { this->testMultiTiersBackgroundMovers(); } TEST_F(LruAllocatorMemoryTiersTest, MultiTiersRemoveDuringEviction) { 
this->testMultiTiersRemoveDuringEviction(); } TEST_F(LruAllocatorMemoryTiersTest, MultiTiersReplaceDuringEviction) { this->testMultiTiersReplaceDuringEviction(); } diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h index 244a56ed83..b559649aa6 100644 --- a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h +++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h @@ -19,6 +19,8 @@ #include "cachelib/allocator/CacheAllocatorConfig.h" #include "cachelib/allocator/MemoryTierCacheConfig.h" #include "cachelib/allocator/tests/TestBase.h" +#include "cachelib/allocator/FreeThresholdStrategy.h" +#include "cachelib/allocator/PromotionStrategy.h" #include @@ -85,6 +87,71 @@ class AllocatorMemoryTiersTest : public AllocatorTest { ASSERT(handle != nullptr); ASSERT_NO_THROW(alloc->insertOrReplace(handle)); } + + void testMultiTiersBackgroundMovers() { + typename AllocatorT::Config config; + config.setCacheSize(10 * Slab::kSize); + config.enableCachePersistence("/tmp"); + config.usePosixForShm(); + config.configureMemoryTiers({ + MemoryTierCacheConfig::fromShm() + .setRatio(1).setMemBind(std::string("0")), + MemoryTierCacheConfig::fromShm() + .setRatio(1).setMemBind(std::string("0")) + }); + config.enableBackgroundEvictor(std::make_shared(2, 10, 100, 40), + std::chrono::milliseconds(10),1); + config.enableBackgroundPromoter(std::make_shared(5, 4, 2), + std::chrono::milliseconds(10),1); + + auto allocator = std::make_unique(AllocatorT::SharedMemNew, config); + ASSERT(allocator != nullptr); + const size_t numBytes = allocator->getCacheMemoryStats().ramCacheSize; + + auto poolId = allocator->addPool("default", numBytes); + + const unsigned int keyLen = 100; + const unsigned int size = 100; + unsigned int allocs = 0; + + //we should work on pool stats because filluppooluntil evictions + //will finish once we evict an item from tier 0 to tier 1 and + //there will be unallocated memory left. 
+ while (allocs < 174760) { + const auto key = this->getRandomNewKey(*allocator, keyLen); + ASSERT_EQ(allocator->find(key), nullptr); + auto handle = util::allocateAccessible(*allocator, poolId, key, size); + allocs++; + } + + const auto key = this->getRandomNewKey(*allocator, keyLen); + auto handle = util::allocateAccessible(*allocator, poolId, key, size); + ASSERT_NE(nullptr, handle); + const uint8_t cid = allocator->getAllocInfo(handle->getMemory()).classId; + ASSERT_EQ(cid,5); + auto stats = allocator->getGlobalCacheStats(); + auto slabStats = allocator->getACStats(0,0,cid); + const auto& mpStats = allocator->getPoolByTid(poolId, 0).getStats(); + //cache is 10MB should move about 1MB to reach 10% free + uint32_t approxEvict = (1024*1024)/mpStats.acStats.at(cid).allocSize; + while (stats.evictionStats.numMovedItems < approxEvict*0.95 && (1-slabStats.usageFraction()) >= 0.095) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + stats = allocator->getGlobalCacheStats(); + slabStats = allocator->getACStats(0,0,cid); + } + ASSERT_GE(1-slabStats.usageFraction(),0.095); + + auto perclassEstats = allocator->getBackgroundMoverClassStats(MoverDir::Evict); + auto perclassPstats = allocator->getBackgroundMoverClassStats(MoverDir::Promote); + + ASSERT_GE(stats.evictionStats.numMovedItems,1); + ASSERT_GE(stats.evictionStats.runCount,1); + ASSERT_GE(stats.promotionStats.numMovedItems,1); + + ASSERT_GE(perclassEstats[0][0][cid], 1); + ASSERT_GE(perclassPstats[1][0][cid], 1); + + } void testMultiTiersValidMixed() { typename AllocatorT::Config config; diff --git a/cachelib/allocator/tests/CacheBaseTest.cpp b/cachelib/allocator/tests/CacheBaseTest.cpp index e7778d6ccf..dae14c5335 100644 --- a/cachelib/allocator/tests/CacheBaseTest.cpp +++ b/cachelib/allocator/tests/CacheBaseTest.cpp @@ -33,6 +33,8 @@ class CacheBaseTest : public CacheBase, public SlabAllocatorTestBase { const std::string getCacheName() const override { return cacheName; } bool isObjectCache() const override { return false; } const MemoryPool& getPool(PoolId) const override { return memoryPool_; } + //TODO: support tiers + const MemoryPool& getPoolByTid(PoolId, TierId tid) const override { return memoryPool_; } PoolStats getPoolStats(PoolId) const override { return PoolStats(); } ACStats getACStats(TierId, PoolId, ClassId) const { return ACStats(); }; AllSlabReleaseEvents getAllSlabReleaseEvents(PoolId) const override { diff --git a/cachelib/cachebench/cache/Cache-inl.h b/cachelib/cachebench/cache/Cache-inl.h index cb038d0f1f..a73763bcc4 100644 --- a/cachelib/cachebench/cache/Cache-inl.h +++ b/cachelib/cachebench/cache/Cache-inl.h @@ -46,6 +46,16 @@ Cache::Cache(const CacheConfig& config, config_.getRebalanceStrategy(), std::chrono::seconds(config_.poolRebalanceIntervalSec)); + allocatorConfig_.enableBackgroundEvictor( + config_.getBackgroundEvictorStrategy(), + std::chrono::milliseconds(config_.backgroundEvictorIntervalMilSec), + config_.evictorThreads); + + allocatorConfig_.enableBackgroundPromoter( + config_.getBackgroundPromoterStrategy(), + std::chrono::milliseconds(config_.backgroundPromoterIntervalMilSec), + config_.promoterThreads); + if (config_.moveOnSlabRelease && movingSync != nullptr) { allocatorConfig_.enableMovingOnSlabRelease( [](Item& oldItem, Item& newItem, Item* parentPtr) { @@ -98,6 +108,12 @@ Cache::Cache(const CacheConfig& config, } }); + allocatorConfig_.maxEvictionBatch = config_.maxEvictionBatch; + allocatorConfig_.maxPromotionBatch = config_.maxPromotionBatch; + allocatorConfig_.minEvictionBatch = 
config_.minEvictionBatch; + allocatorConfig_.minPromotionBatch = config_.minPromotionBatch; + allocatorConfig_.maxEvictionPromotionHotness = config_.maxEvictionPromotionHotness; + if (config_.enableItemDestructorCheck) { auto removeCB = [&](const typename Allocator::DestructorData& data) { if (!itemRecords_.validate(data)) { @@ -635,6 +651,21 @@ Stats Cache::getStats() const { const auto navyStats = cache_->getNvmCacheStatsMap().toMap(); ret.allocationClassStats = allocationClassStats; + + ret.backgndEvicStats.nEvictedItems = + cacheStats.evictionStats.numMovedItems; + ret.backgndEvicStats.nTraversals = + cacheStats.evictionStats.runCount; + ret.backgndEvicStats.nClasses = + cacheStats.evictionStats.totalClasses; + ret.backgndEvicStats.evictionSize = + cacheStats.evictionStats.totalBytesMoved; + + ret.backgndPromoStats.nPromotedItems = + cacheStats.promotionStats.numMovedItems; + ret.backgndPromoStats.nTraversals = + cacheStats.promotionStats.runCount; + ret.numEvictions = aggregate.numEvictions(); ret.numItems = aggregate.numItems(); ret.evictAttempts = cacheStats.evictionAttempts; @@ -688,6 +719,9 @@ Stats Cache::getStats() const { ret.nvmCounters = cache_->getNvmCacheStatsMap().toMap(); } + ret.backgroundEvictionClasses = cache_->getBackgroundMoverClassStats(MoverDir::Evict); + ret.backgroundPromotionClasses = cache_->getBackgroundMoverClassStats(MoverDir::Promote); + // nvm stats from navy if (!isRamOnly() && !navyStats.empty()) { auto lookup = [&navyStats](const std::string& key) { diff --git a/cachelib/cachebench/cache/CacheStats.h b/cachelib/cachebench/cache/CacheStats.h index 3e3d5da307..993f3845a3 100644 --- a/cachelib/cachebench/cache/CacheStats.h +++ b/cachelib/cachebench/cache/CacheStats.h @@ -26,7 +26,33 @@ DECLARE_string(report_ac_memory_usage_stats); namespace facebook { namespace cachelib { namespace cachebench { + +struct BackgroundEvictionStats { + // the number of items this worker evicted by looking at pools/classes stats + uint64_t nEvictedItems{0}; + + // number of times we went executed the thread //TODO: is this def correct? + uint64_t nTraversals{0}; + + // number of classes + uint64_t nClasses{0}; + + // size of evicted items + uint64_t evictionSize{0}; +}; + +struct BackgroundPromotionStats { + // the number of items this worker evicted by looking at pools/classes stats + uint64_t nPromotedItems{0}; + + // number of times we went executed the thread //TODO: is this def correct? + uint64_t nTraversals{0}; +}; + struct Stats { + BackgroundEvictionStats backgndEvicStats; + BackgroundPromotionStats backgndPromoStats; + uint64_t numEvictions{0}; uint64_t numItems{0}; @@ -108,6 +134,9 @@ struct Stats { // cachebench. std::unordered_map nvmCounters; + std::map>> backgroundEvictionClasses; + std::map>> backgroundPromotionClasses; + // errors from the nvm engine. 
std::unordered_map nvmErrors; @@ -128,6 +157,16 @@ struct Stats { << std::endl; out << folly::sformat("RAM Evictions : {:,}", numEvictions) << std::endl; + auto foreachAC = [&](auto &map, auto cb) { + for (auto &tidStats : map) { + for (auto &pidStat : tidStats.second) { + for (auto &cidStat : pidStat.second) { + cb(tidStats.first, pidStat.first, cidStat.first, cidStat.second); + } + } + } + }; + for (auto pid = 0U; pid < poolUsageFraction.size(); pid++) { out << folly::sformat("Fraction of pool {:,} used : {:.2f}", pid, poolUsageFraction[pid]) @@ -188,6 +227,10 @@ struct Stats { }); } + out << folly::sformat("Tier 0 Background Evicted Items : {:,}", + backgndEvicStats.nEvictedItems) << std::endl; + out << folly::sformat("Tier 0 Background Traversals : {:,}", + backgndEvicStats.nTraversals) << std::endl; if (numCacheGets > 0) { out << folly::sformat("Cache Gets : {:,}", numCacheGets) << std::endl; out << folly::sformat("Hit Ratio : {:6.2f}%", overallHitRatio) @@ -218,6 +261,22 @@ struct Stats { } } + if (!backgroundEvictionClasses.empty() && backgndEvicStats.nEvictedItems > 0 ) { + out << "== Class Background Eviction Counters Map ==" << std::endl; + foreachAC(backgroundEvictionClasses, [&](auto tid, auto pid, auto cid, auto evicted){ + out << folly::sformat("tid{:2} pid{:2} cid{:4} evicted: {:4}", + tid, pid, cid, evicted) << std::endl; + }); + } + + if (!backgroundPromotionClasses.empty() && backgndPromoStats.nPromotedItems > 0) { + out << "== Class Background Promotion Counters Map ==" << std::endl; + foreachAC(backgroundPromotionClasses, [&](auto tid, auto pid, auto cid, auto promoted){ + out << folly::sformat("tid{:2} pid{:2} cid{:4} promoted: {:4}", + tid, pid, cid, promoted) << std::endl; + }); + } + if (numNvmGets > 0 || numNvmDeletes > 0 || numNvmPuts > 0) { const double ramHitRatio = invertPctFn(numCacheGetMiss, numCacheGets); const double nvmHitRatio = invertPctFn(numNvmGetMiss, numNvmGets); diff --git a/cachelib/cachebench/util/CacheConfig.cpp b/cachelib/cachebench/util/CacheConfig.cpp index b9ba839218..fc37e5cc30 100644 --- a/cachelib/cachebench/util/CacheConfig.cpp +++ b/cachelib/cachebench/util/CacheConfig.cpp @@ -19,6 +19,8 @@ #include "cachelib/allocator/HitsPerSlabStrategy.h" #include "cachelib/allocator/LruTailAgeStrategy.h" #include "cachelib/allocator/RandomStrategy.h" +#include "cachelib/allocator/FreeThresholdStrategy.h" +#include "cachelib/allocator/PromotionStrategy.h" namespace facebook { namespace cachelib { @@ -28,6 +30,9 @@ CacheConfig::CacheConfig(const folly::dynamic& configJson) { JSONSetVal(configJson, cacheDir); JSONSetVal(configJson, cacheSizeMB); JSONSetVal(configJson, poolRebalanceIntervalSec); + JSONSetVal(configJson, backgroundEvictorIntervalMilSec); + JSONSetVal(configJson, backgroundPromoterIntervalMilSec); + JSONSetVal(configJson, backgroundEvictorStrategy); JSONSetVal(configJson, moveOnSlabRelease); JSONSetVal(configJson, rebalanceStrategy); JSONSetVal(configJson, rebalanceMinSlabs); @@ -101,10 +106,27 @@ CacheConfig::CacheConfig(const folly::dynamic& configJson) { JSONSetVal(configJson, nvmAdmissionRetentionTimeThreshold); JSONSetVal(configJson, customConfigJson); + + //Background related configs + JSONSetVal(configJson, lowEvictionAcWatermark); + JSONSetVal(configJson, highEvictionAcWatermark); + JSONSetVal(configJson, minAcAllocationWatermark); + JSONSetVal(configJson, maxAcAllocationWatermark); + JSONSetVal(configJson, numDuplicateElements); + JSONSetVal(configJson, syncPromotion); + JSONSetVal(configJson, evictorThreads); + 
JSONSetVal(configJson, promoterThreads); + JSONSetVal(configJson, promotionAcWatermark); + JSONSetVal(configJson, maxEvictionBatch); + JSONSetVal(configJson, maxPromotionBatch); + JSONSetVal(configJson, minEvictionBatch); + JSONSetVal(configJson, minPromotionBatch); + JSONSetVal(configJson, maxEvictionPromotionHotness); + // if you added new fields to the configuration, update the JSONSetVal // to make them available for the json configs and increment the size // below - checkCorrectSize(); + checkCorrectSize(); if (numPools != poolSizes.size()) { throw std::invalid_argument(folly::sformat( @@ -140,6 +162,20 @@ MemoryTierConfig::MemoryTierConfig(const folly::dynamic& configJson) { checkCorrectSize(); } + +std::shared_ptr CacheConfig::getBackgroundEvictorStrategy() const { + if (backgroundEvictorIntervalMilSec == 0) { + return nullptr; + } + return std::make_shared(lowEvictionAcWatermark, highEvictionAcWatermark, maxEvictionBatch, minEvictionBatch); +} + +std::shared_ptr CacheConfig::getBackgroundPromoterStrategy() const { + if (backgroundPromoterIntervalMilSec == 0) { + return nullptr; + } + return std::make_shared(promotionAcWatermark, maxPromotionBatch, minPromotionBatch); +} } // namespace cachebench } // namespace cachelib } // namespace facebook diff --git a/cachelib/cachebench/util/CacheConfig.h b/cachelib/cachebench/util/CacheConfig.h index d86ef1f620..a4713a1cc4 100644 --- a/cachelib/cachebench/util/CacheConfig.h +++ b/cachelib/cachebench/util/CacheConfig.h @@ -20,6 +20,7 @@ #include "cachelib/allocator/CacheAllocator.h" #include "cachelib/allocator/RebalanceStrategy.h" +#include "cachelib/allocator/BackgroundMoverStrategy.h" #include "cachelib/cachebench/util/JSONConfig.h" #include "cachelib/common/Ticker.h" #include "cachelib/navy/common/Device.h" @@ -71,7 +72,10 @@ struct CacheConfig : public JSONConfig { uint64_t cacheSizeMB{0}; uint64_t poolRebalanceIntervalSec{0}; + uint64_t backgroundEvictorIntervalMilSec{0}; + uint64_t backgroundPromoterIntervalMilSec{0}; std::string rebalanceStrategy; + std::string backgroundEvictorStrategy; uint64_t rebalanceMinSlabs{1}; double rebalanceDiffRatio{0.25}; bool moveOnSlabRelease{false}; @@ -249,6 +253,27 @@ struct CacheConfig : public JSONConfig { // eviction-age is more than this threshold. 
0 means no threshold uint32_t nvmAdmissionRetentionTimeThreshold{0}; + // See BackgroundMovers.md for complete description + double promotionAcWatermark{4.0}; + double lowEvictionAcWatermark{2.0}; + double highEvictionAcWatermark{5.0}; + double minAcAllocationWatermark{0.0}; + double maxAcAllocationWatermark{0.0}; + + double numDuplicateElements{0.0}; // inclusivness of the cache + double syncPromotion{0.0}; // can promotion be done synchronously in user thread + + uint64_t evictorThreads{1}; + uint64_t promoterThreads{1}; + + uint64_t maxEvictionBatch{40}; + uint64_t maxPromotionBatch{10}; + + uint64_t minEvictionBatch{5}; + uint64_t minPromotionBatch{5}; + + uint64_t maxEvictionPromotionHotness{60}; + // // Options below are not to be populated with JSON // @@ -284,6 +309,8 @@ struct CacheConfig : public JSONConfig { CacheConfig() {} std::shared_ptr getRebalanceStrategy() const; + std::shared_ptr getBackgroundEvictorStrategy() const; + std::shared_ptr getBackgroundPromoterStrategy() const; }; } // namespace cachebench } // namespace cachelib From 08bf0b4e8510eb1646ccad057e6209b937b6fe89 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Thu, 16 Feb 2023 14:19:21 -0800 Subject: [PATCH 47/47] fix race in moveRegularItemWith sync where insertOrReplace can cause move to fail - updated slab release logic for move failure, but there is still an issue with slab movement. currently investigating. --- cachelib/allocator/CacheAllocator-inl.h | 99 +++++++++++--- cachelib/allocator/CacheAllocator.h | 2 +- cachelib/allocator/CacheItem.h | 5 + .../tests/AllocatorMemoryTiersTest.cpp | 3 +- .../tests/AllocatorMemoryTiersTest.h | 128 +++++++++++++++++- 5 files changed, 212 insertions(+), 25 deletions(-) diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h index 3caf4b4b0b..01a5f5e49e 100644 --- a/cachelib/allocator/CacheAllocator-inl.h +++ b/cachelib/allocator/CacheAllocator-inl.h @@ -1294,8 +1294,21 @@ size_t CacheAllocator::wakeUpWaitersLocked(folly::StringPiece key, } template -void CacheAllocator::moveRegularItemWithSync( +bool CacheAllocator::moveRegularItemWithSync( Item& oldItem, WriteHandle& newItemHdl) { + //on function exit - the new item handle is no longer moving + //and other threads may access it - but in case where + //we failed to replace in access container we can give the + //new item back to the allocator + auto guard = folly::makeGuard([&]() { + auto ref = newItemHdl->unmarkMoving(); + if (UNLIKELY(ref == 0)) { + const auto res = + releaseBackToAllocator(*newItemHdl, RemoveContext::kNormal, false); + XDCHECK(res == ReleaseRes::kReleased); + } + }); + XDCHECK(oldItem.isMoving()); XDCHECK(!oldItem.isExpired()); // TODO: should we introduce new latency tracker. E.g. evictRegularLatency_ @@ -1326,6 +1339,22 @@ void CacheAllocator::moveRegularItemWithSync( auto replaced = accessContainer_->replaceIf(oldItem, *newItemHdl, predicate); + // another thread may have called insertOrReplace which could have + // marked this item as unaccessible causing the replaceIf + // in the access container to fail - in this case we want + // to abort the move since the item is no longer valid + if (!replaced) { + return false; + } + // what if another thread calls insertOrReplace now when + // the item is moving and already replaced in the hash table? + // 1. it succeeds in updating the hash table - so there is + // no guarentee that isAccessible() is true + // 2. 
it will then try to remove from MM container + // - this operation will wait for newItemHdl to + // be unmarkedMoving via the waitContext + // 3. replaced handle is returned and eventually drops + // ref to 0 and the item is recycled back to allocator. if (config_.moveCb) { // Execute the move callback. We cannot make any guarantees about the @@ -1367,14 +1396,7 @@ void CacheAllocator::moveRegularItemWithSync( XDCHECK(newItemHdl->hasChainedItem()); } newItemHdl.unmarkNascent(); - auto ref = newItemHdl->unmarkMoving(); - //remove because there is a chance the new item was not - //added to the access container - if (UNLIKELY(ref == 0)) { - const auto res = - releaseBackToAllocator(*newItemHdl, RemoveContext::kNormal, false); - XDCHECK(res == ReleaseRes::kReleased); - } + return true; } template @@ -1529,7 +1551,6 @@ template void CacheAllocator::unlinkItemForEviction(Item& it) { XDCHECK(it.isMarkedForEviction()); XDCHECK(it.getRefCount() == 0); - accessContainer_->remove(it); removeFromMMContainer(it); @@ -1624,28 +1645,43 @@ CacheAllocator::findEviction(TierId tid, PoolId pid, ClassId cid) { auto evictedToNext = lastTier ? nullptr : tryEvictToNextMemoryTier(*candidate, false); if (!evictedToNext) { - if (!token.isValid()) { + //if insertOrReplace was called during move + //then candidate will not be accessible (failed replace during tryEvict) + // - therefore this was why we failed to + // evict to the next tier and insertOrReplace + // will remove from NVM cache + //however, if candidate is accessible + //that means the allocation in the next + //tier failed - so we will continue to + //evict the item to NVM cache + bool failedToReplace = !candidate->isAccessible(); + if (!token.isValid() && !failedToReplace) { token = createPutToken(*candidate); } - // tryEvictToNextMemoryTier should only fail if allocation of the new item fails - // in that case, it should be still possible to mark item as exclusive. + // tryEvictToNextMemoryTier can fail if: + // a) allocation of the new item fails in that case, + // it should be still possible to mark item for eviction. + // b) another thread calls insertOrReplace and the item + // is no longer accessible // // in case that we are on the last tier, we whould have already marked // as exclusive since we will not be moving the item to the next tier // but rather just evicting all together, no need to - // markExclusiveWhenMoving + // markForEvictionWhenMoving auto ret = lastTier ? 
true : candidate->markForEvictionWhenMoving(); XDCHECK(ret); unlinkItemForEviction(*candidate); + + if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate) + && !failedToReplace) { + nvmCache_->put(*candidate, std::move(token)); + } // wake up any readers that wait for the move to complete // it's safe to do now, as we have the item marked exclusive and // no other reader can be added to the waiters list wakeUpWaiters(*candidate, {}); - if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate)) { - nvmCache_->put(*candidate, std::move(token)); - } } else { XDCHECK(!evictedToNext->isMarkedForEviction() && !evictedToNext->isMoving()); XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); @@ -1756,7 +1792,10 @@ CacheAllocator::tryEvictToNextMemoryTier( if (newItemHdl) { XDCHECK_EQ(newItemHdl->getSize(), item.getSize()); - moveRegularItemWithSync(item, newItemHdl); + if (!moveRegularItemWithSync(item, newItemHdl)) { + return WriteHandle{}; + } + XDCHECK_EQ(newItemHdl->getKey(),item.getKey()); item.unmarkMoving(); return newItemHdl; } else { @@ -1795,7 +1834,9 @@ CacheAllocator::tryPromoteToNextMemoryTier( if (newItemHdl) { XDCHECK_EQ(newItemHdl->getSize(), item.getSize()); - moveRegularItemWithSync(item, newItemHdl); + if (!moveRegularItemWithSync(item, newItemHdl)) { + return WriteHandle{}; + } item.unmarkMoving(); return newItemHdl; } else { @@ -3148,9 +3189,23 @@ bool CacheAllocator::tryMovingForSlabRelease( // TODO: add support for chained items return false; } else { - moveRegularItemWithSync(oldItem, newItemHdl); - removeFromMMContainer(oldItem); - return true; + //move can fail if another thread calls insertOrReplace + //in this case oldItem is no longer valid (not accessible, + //it gets removed from MMContainer and evictForSlabRelease + //will send it back to the allocator + bool ret = moveRegularItemWithSync(oldItem, newItemHdl); + if (!ret) { + //we failed to move - newItemHdl was released back to allocator + //by the moveRegularItemWithSync but oldItem is not accessible + //and no longer valid - we need to clean it up here + XDCHECK(!oldItem.isAccessible()); + oldItem.markForEvictionWhenMoving(); + unlinkItemForEviction(oldItem); + wakeUpWaiters(oldItem, {}); + } else { + removeFromMMContainer(oldItem); + } + return ret; } } } diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index d32a8c991c..39a1c4881b 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -1617,7 +1617,7 @@ class CacheAllocator : public CacheBase { // // @return true If the move was completed, and the containers were updated // successfully. - void moveRegularItemWithSync(Item& oldItem, WriteHandle& newItemHdl); + bool moveRegularItemWithSync(Item& oldItem, WriteHandle& newItemHdl); // Moves a regular item to a different slab. This should only be used during // slab release after the item's exclusive bit has been set. 
The user supplied diff --git a/cachelib/allocator/CacheItem.h b/cachelib/allocator/CacheItem.h index b4fa339b57..6728b654eb 100644 --- a/cachelib/allocator/CacheItem.h +++ b/cachelib/allocator/CacheItem.h @@ -46,6 +46,9 @@ class BaseAllocatorTest; template class AllocatorHitStatsTest; +template +class AllocatorMemoryTiersTest; + template class MapTest; @@ -473,6 +476,8 @@ class CACHELIB_PACKED_ATTR CacheItem { FRIEND_TEST(ItemTest, NonStringKey); template friend class facebook::cachelib::tests::AllocatorHitStatsTest; + template + friend class facebook::cachelib::tests::AllocatorMemoryTiersTest; }; // A chained item has a hook pointing to the next chained item. The hook is diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp index 6e2de814da..ad845d94df 100644 --- a/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp +++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp @@ -21,7 +21,7 @@ namespace cachelib { namespace tests { using LruAllocatorMemoryTiersTest = AllocatorMemoryTiersTest; - +//using LruTestAllocatorMemoryTiersTest = AllocatorMemoryTiersTest; // TODO(MEMORY_TIER): add more tests with different eviction policies TEST_F(LruAllocatorMemoryTiersTest, MultiTiersInvalid) { this->testMultiTiersInvalid(); } TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValid) { this->testMultiTiersValid(); } @@ -29,6 +29,7 @@ TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValidMixed) { this->testMultiTiers TEST_F(LruAllocatorMemoryTiersTest, MultiTiersBackgroundMovers ) { this->testMultiTiersBackgroundMovers(); } TEST_F(LruAllocatorMemoryTiersTest, MultiTiersRemoveDuringEviction) { this->testMultiTiersRemoveDuringEviction(); } TEST_F(LruAllocatorMemoryTiersTest, MultiTiersReplaceDuringEviction) { this->testMultiTiersReplaceDuringEviction(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersReplaceDuringEvictionWithReader) { this->testMultiTiersReplaceDuringEvictionWithReader(); } } // end of namespace tests } // end of namespace cachelib diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h index b559649aa6..c724cd9617 100644 --- a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h +++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h @@ -22,6 +22,10 @@ #include "cachelib/allocator/FreeThresholdStrategy.h" #include "cachelib/allocator/PromotionStrategy.h" +#include +#include +#include +#include #include namespace facebook { @@ -58,6 +62,7 @@ class AllocatorMemoryTiersTest : public AllocatorTest { ASSERT_NO_THROW(alloc->insertOrReplace(handle)); } } + public: void testMultiTiersInvalid() { typename AllocatorT::Config config; @@ -201,7 +206,7 @@ class AllocatorMemoryTiersTest : public AllocatorTest { t->join(); } - + void testMultiTiersReplaceDuringEviction() { std::unique_ptr alloc; PoolId pool; @@ -234,6 +239,127 @@ class AllocatorMemoryTiersTest : public AllocatorTest { testMultiTiersAsyncOpDuringMove(alloc, pool, quit, moveCb); t->join(); + + } + + + void gdb_sync1() {} + void gdb_sync2() {} + void gdb_sync3() {} + using ReadHandle = typename AllocatorT::ReadHandle; + void testMultiTiersReplaceDuringEvictionWithReader() { + sem_unlink ("/gdb1_sem"); + sem_t *sem = sem_open ("/gdb1_sem", O_CREAT | O_EXCL, S_IRUSR | S_IWUSR, 0); + int gdbfd = open("/tmp/gdb1.gdb",O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR); + char gdbcmds[] = + "set attached=1\n" + "break gdb_sync1\n" + "break gdb_sync2\n" + "break moveRegularItemWithSync\n" + "c\n" + "set 
scheduler-locking on\n" + "thread 1\n" + "c\n" + "thread 4\n" + "c\n" + "thread 5\n" + "break nativeFutexWaitImpl thread 5\n" + "c\n" + "thread 4\n" + "break nativeFutexWaitImpl thread 4\n" + "c\n" + "thread 1\n" + "break releaseBackToAllocator\n" + "c\n" + "c\n" + "thread 5\n" + "c\n" + "thread 4\n" + "c\n" + "thread 1\n" + "break gdb_sync3\n" + "c\n" + "quit\n"; + int ret = write(gdbfd,gdbcmds,strlen(gdbcmds)); + int ppid = getpid(); //parent pid + //int pid = 0; + int pid = fork(); + if (pid == 0) { + sem_wait(sem); + sem_close(sem); + sem_unlink("/gdb1_sem"); + char cmdpid[256]; + sprintf(cmdpid,"%d",ppid); + int f = execlp("gdb","gdb","--pid",cmdpid,"--batch-silent","--command=/tmp/gdb1.gdb",(char*) 0); + ASSERT(f != -1); + } + sem_post(sem); + //wait for gdb to run + int attached = 0; + while (attached == 0); + + std::unique_ptr alloc; + PoolId pool; + bool quit = false; + + typename AllocatorT::Config config; + config.setCacheSize(4 * Slab::kSize); + config.enableCachePersistence("/tmp"); + config.configureMemoryTiers({ + MemoryTierCacheConfig::fromShm() + .setRatio(1).setMemBind(std::string("0")), + MemoryTierCacheConfig::fromShm() + .setRatio(1).setMemBind(std::string("0")) + }); + + alloc = std::make_unique(AllocatorT::SharedMemNew, config); + ASSERT(alloc != nullptr); + pool = alloc->addPool("default", alloc->getCacheMemoryStats().ramCacheSize); + + int i = 0; + typename AllocatorT::Item* evicted; + std::unique_ptr t; + std::unique_ptr r; + while(!quit) { + auto handle = alloc->allocate(pool, std::to_string(++i), std::string("value").size()); + ASSERT(handle != nullptr); + if (i == 1) { + evicted = static_cast(handle.get()); + folly::Latch latch_t(1); + t = std::make_unique([&](){ + auto handleNew = alloc->allocate(pool, std::to_string(1), std::string("new value").size()); + ASSERT(handleNew != nullptr); + latch_t.count_down(); + //first breakpoint will be this one because + //thread 1 still has more items to fill up the + //cache before an evict is evicted + gdb_sync1(); + ASSERT(evicted->isMoving()); + //need to suspend thread 1 - who is doing the eviction + //gdb will do this for us + folly::Latch latch(1); + r = std::make_unique([&](){ + ASSERT(evicted->isMoving()); + latch.count_down(); + auto handleEvict = alloc->find(std::to_string(1)); + //does find block until done moving?? yes + while (evicted->isMarkedForEviction()); //move will fail + XDCHECK(handleEvict == nullptr) << handleEvict->toString(); + ASSERT(handleEvict == nullptr); + }); + latch.wait(); + gdb_sync2(); + alloc->insertOrReplace(handleNew); + ASSERT(!evicted->isAccessible()); //move failed + quit = true; + }); + latch_t.wait(); + } + ASSERT_NO_THROW(alloc->insertOrReplace(handle)); + } + t->join(); + r->join(); + gdb_sync3(); } }; } // namespace tests
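
The allocation path changes in CacheAllocator-inl.h above wake a background evictor when an allocation class either fails to allocate or falls below the low free-space watermark, and shard allocation classes across background workers with a simple modulo. A minimal stand-alone sketch of that logic follows; AcId, workerFor, and shouldWakeEvictor are hypothetical names standing in for the patch's backgroundWorkerId() and shouldWakeupBgEvictor(), and the per-tier handling is simplified.

#include <cstddef>

struct AcId {
  unsigned tid;  // memory tier
  unsigned pid;  // pool id
  unsigned cid;  // allocation class id
};

// Same round-robin-by-sum sharding as backgroundWorkerId() in the patch;
// the TODO there suggests replacing it with a proper hash.
size_t workerFor(const AcId& ac, size_t numWorkers) {
  return (ac.tid + ac.pid + ac.cid) % numWorkers;
}

// Mirrors shouldWakeupBgEvictor(): wake the evictor for a class once its free
// space drops to the configured low watermark (a percentage). The patch also
// skips the last tier, since there is nowhere further down to evict to.
bool shouldWakeEvictor(double usageFraction, double lowEvictionAcWatermarkPct) {
  return (1.0 - usageFraction) * 100.0 <= lowEvictionAcWatermarkPct;
}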
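
FreeThresholdStrategy::calculateBatchSizes above turns the gap between a class's current free percentage and highEvictionAcWatermark into a per-class eviction batch, then caps batches between minEvictionBatch and maxEvictionBatch. A simplified sketch of the core conversion, assuming CacheLib's 4MB slab size and normalizing the percentage explicitly; the patch's exact scaling and the batch-capping step are omitted, and ClassUsage is a hypothetical stand-in for the ACStats fields the strategy reads.

#include <cstdint>

struct ClassUsage {
  double usageFraction;  // fraction of the class' memory currently allocated
  uint64_t totalSlabs;   // slabs owned by this allocation class
  uint32_t allocSize;    // allocation size of items in this class
};

constexpr uint64_t kSlabSize = 4ULL * 1024 * 1024;  // CacheLib's 4MB slab

// Eviction batch for one class: the shortfall against the high free-percentage
// watermark, expressed as an item count for that class.
uint64_t evictionBatchFor(const ClassUsage& s, double highEvictionAcWatermarkPct) {
  const double freePct = (1.0 - s.usageFraction) * 100.0;
  if (freePct >= highEvictionAcWatermarkPct) {
    return 0;  // enough head room already; the strategy reports a zero batch
  }
  const double toFreePct = highEvictionAcWatermarkPct - freePct;
  const double classBytes = static_cast<double>(s.totalSlabs) * kSlabSize;
  return static_cast<uint64_t>(toFreePct / 100.0 * classBytes / s.allocSize);
}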
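
For reference, enabling the movers from application code follows the same pattern as testMultiTiersBackgroundMovers above. The sketch below reuses that test's thresholds, batch limits, and 10ms interval; they are illustrative values rather than tuned recommendations, and the enableBackgroundEvictor / enableBackgroundPromoter config hooks exist only with this patch series applied.

#include <chrono>
#include <memory>

#include "cachelib/allocator/CacheAllocator.h"
#include "cachelib/allocator/FreeThresholdStrategy.h"
#include "cachelib/allocator/PromotionStrategy.h"

using namespace facebook::cachelib;

void enableBackgroundMovers(LruAllocator::Config& config) {
  // Evictor: low watermark 2%, high watermark 10%, batch capped to [40, 100],
  // one thread woken every 10ms (the values used by the unit test above).
  config.enableBackgroundEvictor(
      std::make_shared<FreeThresholdStrategy>(2, 10, 100, 40),
      std::chrono::milliseconds(10), 1);

  // Promoter: promote into the upper tier only while it has at least 5% free
  // space, batch capped to [2, 4], one thread.
  config.enableBackgroundPromoter(
      std::make_shared<PromotionStrategy>(5, 4, 2),
      std::chrono::milliseconds(10), 1);
}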