diff --git a/src/globals/utils.h b/src/globals/utils.h index 7bc76ef5..515e4c03 100644 --- a/src/globals/utils.h +++ b/src/globals/utils.h @@ -282,6 +282,9 @@ #define MIN4(v0, v1, v2, v3) (MIN2(MIN2((v0), (v1)), MIN2((v2), (v3)))) #define MAX4(v0, v1, v2, v3) (MAX2(MAX2((v0), (v1)), MAX2((v2), (v3)))) +// a is the original addr, num is the shift amt before interleaving (usually +// the cacheline), int is the interleave factor. The bank idx computed here +// is simply the lower bits #define BANK(a, num, int) ((a) >> LOG2(int) & N_BIT_MASK(LOG2(num))) #define CHANNEL(bank, num) ((bank) >> LOG2(num)) #define BANK_IN_CHANNEL(bank, num) ((bank)&N_BIT_MASK(LOG2(num))) diff --git a/src/libs/cache_lib.c b/src/libs/cache_lib.c index 9a1f541b..adbda1ed 100644 --- a/src/libs/cache_lib.c +++ b/src/libs/cache_lib.c @@ -70,12 +70,13 @@ char rand_repl_state[31]; /**************************************************************************************/ /** - * @brief Return set index of the addr - * As a side-effect, the tag and line_addr will be populated - * @param cache + * @brief Return set index of the addr + * As a side-effect, the tag and line_addr will be populated + * @param cache * @param addr The access addr (input) * @param tag The tag of the access (output) - * @param line_addr The base address of the cache blk corresponding to the access (output) + * @param line_addr The base address of the cache blk corresponding to the + * access (output) * @return uns The set index of the access */ static inline uns cache_index(Cache* cache, Addr addr, Addr* tag, @@ -207,11 +208,11 @@ void init_cache(Cache* cache, const char* name, uns cache_size, uns assoc, /** * @brief access the address. - * - * @param cache - * @param addr the request addr - * @param line_addr - * @param update_repl + * + * @param cache + * @param addr the request addr + * @param line_addr + * @param update_repl * @return void* data field of the blk or NULL if cache miss */ void* cache_access(Cache* cache, Addr addr, Addr* line_addr, Flag update_repl) { @@ -223,7 +224,7 @@ void* cache_access(Cache* cache, Addr addr, Addr* line_addr, Flag update_repl) { return access_ideal_storage(cache, set, tag, addr); } - //search the ways + // search the ways for(ii = 0; ii < cache->assoc; ii++) { Cache_Entry* line = &cache->entries[set][ii]; @@ -266,22 +267,22 @@ void* cache_access(Cache* cache, Addr addr, Addr* line_addr, Flag update_repl) { /** * @brief Insert new addr to the cache - * + * * This function is a wrapper of cache_insert_replpos, see below - * - * Note cache_insert is intrusive, for a non-instusive function - * (which only pick out the victim but not doing the insertion), + * + * Note cache_insert is intrusive, for a non-instusive function + * (which only pick out the victim but not doing the insertion), * see get_next_repl_line, both of these functions calls find_repl_entry * internally - * + * * DON'T call this unless you are sure that the line is not in the * cache (call after cache_access returned NULL) - * - * @param cache - * @param proc_id - * @param addr - * @param line_addr - * @param repl_line_addr + * + * @param cache + * @param proc_id + * @param addr + * @param line_addr + * @param repl_line_addr * @return void* The data field of the inserted blk */ void* cache_insert(Cache* cache, uns8 proc_id, Addr addr, Addr* line_addr, @@ -295,16 +296,16 @@ void* cache_insert(Cache* cache, uns8 proc_id, Addr addr, Addr* line_addr, * returns a pointer to the data section of the new cache line. 
* Sets line_addr to the address of the first block of the new line. Sets * repl_line_addr to the address of the first block that was replaced - * + * * Note this func won't do the WB if the victim is dirty, the info of the * victim blk is returned and WB is handled by the caller of this func - * + * * DON'T call this unless you are sure that the line is *not* in the * cache (call after cache_access returned NULL) - * @param cache - * @param proc_id + * @param cache + * @param proc_id * @param addr The addr of the blk to be inserted - * @param line_addr The base addr of the blk to be insert (input) + * @param line_addr The base addr of the blk to be insert (input) * @param repl_line_addr The base addr of the blk got evicted (output) * @return void* The data field of the inserted blk */ @@ -321,13 +322,13 @@ void* cache_insert_replpos(Cache* cache, uns8 proc_id, Addr addr, new_line = insert_sure_line(cache, set, tag); *repl_line_addr = 0; } else { - //new_line points to the victim, repl_index is the way id for the victim + // new_line points to the victim, repl_index is the way id for the victim new_line = find_repl_entry(cache, proc_id, set, &repl_index); /* before insert the data into cache, if the cache has shadow entry */ /* insert that entry to the shadow cache */ if((cache->repl_policy == REPL_SHADOW_IDEAL) && new_line->valid) shadow_cache_insert(cache, set, new_line->tag, new_line->base); - if(new_line->valid){ + if(new_line->valid) { // bug fixed. 4/26/04 if the entry is not valid, // repl_line_addr should be set to 0 *repl_line_addr = new_line->base; @@ -443,13 +444,13 @@ void* cache_insert_replpos(Cache* cache, uns8 proc_id, Addr addr, /** - * @brief Invalid the blk by address if presented, no wb even the blk + * @brief Invalidate the blk by address if presented, no wb even the blk * is dirty - * - * @param cache - * @param addr - * @param line_addr - * @param True on find in cache, False on no present + * + * @param cache + * @param addr + * @param line_addr + * @param True on find in cache, False on no present */ void cache_invalidate(Cache* cache, Addr addr, Addr* line_addr) { Addr tag; @@ -472,9 +473,9 @@ void cache_invalidate(Cache* cache, Addr addr, Addr* line_addr) { /** * @brief Return a pointer to the victim to be replaced - * + * * The caller of this func is supposed to handle the possible - * writeback correctly, otherwise the correctness of simulation + * writeback correctly, otherwise the correctness of simulation * is compromised * * @param cache diff --git a/src/memory/mem_req.h b/src/memory/mem_req.h index 4b26261f..e73f1fb4 100644 --- a/src/memory/mem_req.h +++ b/src/memory/mem_req.h @@ -131,7 +131,15 @@ struct Mem_Req_struct { uns op_count; /* number of ops that are waiting for the miss */ uns req_count; /* number of requests coalesced into this one */ Flag (*done_func)(struct Mem_Req_struct*); /* pointer to function to call when - the memory request is finished + the memory request is finished, + this is the mechanism scarab + used to implement a "callback". + i.e. when a req is finally + returned from the mem system, + continue with the rest of the + process. 
This is mostly used by + I$ and D$ to fill the line when + req returned from uncore/mem */ Flag mlc_miss; /* did this request miss in MLC */ Flag mlc_miss_satisfied; /* did this request miss in MLC and it is already diff --git a/src/memory/memory.c b/src/memory/memory.c index 23302e18..17e16007 100644 --- a/src/memory/memory.c +++ b/src/memory/memory.c @@ -291,7 +291,7 @@ void init_mem_req_type_priorities() { /** * @brief Init memory - * + * */ void init_memory() { int ii; @@ -392,26 +392,29 @@ void init_memory() { } /** - * @brief this function should only be called once in warmup mode + * @brief Instantiate all the parts in uncore + * + * Note: this function should only be called once in warmup mode * */ void init_uncores(void) { mem->uncores = (Uncore*)malloc(sizeof(Uncore) * NUM_CORES); /* Initialize MLC cache */ - if(PRIVATE_MLC){ - for(uns proc_id = 0; proc_id < NUM_CORES; proc_id++){ + if(PRIVATE_MLC) { + for(uns proc_id = 0; proc_id < NUM_CORES; proc_id++) { Ported_Cache* mlc = (Ported_Cache*)malloc(sizeof(Ported_Cache)); - char buf[MAX_STR_LENGTH+1]; + char buf[MAX_STR_LENGTH + 1]; sprintf(buf, "MLC[%d]", proc_id); - init_cache(&mlc->cache, buf, MLC_SIZE, MLC_ASSOC, MLC_LINE_SIZE, - sizeof(MLC_Data), MLC_CACHE_REPL_POLICY); + init_cache(&mlc->cache, buf, MLC_SIZE, MLC_ASSOC, MLC_LINE_SIZE, + sizeof(MLC_Data), MLC_CACHE_REPL_POLICY); mlc->num_banks = MLC_BANKS; - mlc->ports = (Ports*)malloc(sizeof(Ports) * mlc->num_banks); - for(uns ii=0; ii < mlc->num_banks; ii++){ - char name[MAX_STR_LENGTH+1]; + mlc->ports = (Ports*)malloc(sizeof(Ports) * mlc->num_banks); + for(uns ii = 0; ii < mlc->num_banks; ii++) { + char name[MAX_STR_LENGTH + 1]; snprintf(name, MAX_STR_LENGTH, "MLC[%d] BANK %d PORTS", proc_id, ii); - init_ports(&mlc->ports[ii], name, MLC_READ_PORTS, MLC_WRITE_PORTS, FALSE); + init_ports(&mlc->ports[ii], name, MLC_READ_PORTS, MLC_WRITE_PORTS, + FALSE); } MLC(proc_id) = mlc; } @@ -887,8 +890,9 @@ void update_on_chip_memory_stats() { /** * @brief simulate the memory system for one cycle - * functions are called in reverse order, that's fill queues (req going back to - * cpu), first, then ramulator (DRAM), then request queues (reg going down to + * + * Note: updates happen in reverse order, that's fill queues (reqs going back to + * cpu), first, then ramulator (DRAM), then request queues (reqs going out to * mem) * */ @@ -916,11 +920,12 @@ void update_memory() { if(freq_is_ready(FREQ_DOMAIN_L1)) { cycle_count = freq_cycle_count(FREQ_DOMAIN_L1); - mem_process_bus_out_reqs(); // obsolete code, nothing will be executed + mem_process_bus_out_reqs(); // obsolete code, nothing will be executed mem_process_l1_reqs(); mem_process_mlc_reqs(); } + // WQ: why is this not called before mlc_fill??? for(uns proc_id = 0; proc_id < NUM_CORES; proc_id++) { if(freq_is_ready(FREQ_DOMAIN_CORES[proc_id])) { cycle_count = freq_cycle_count(FREQ_DOMAIN_CORES[proc_id]); @@ -946,13 +951,18 @@ int mem_compare_priority(const void* a, const void* b) { return 0; } -/**************************************************************************************/ -/* mem_start_mlc_access: */ +/** + * @brief Try obtain MLC ports, transit state into MLC_WAIT on success + * + * @param req + */ void mem_start_mlc_access(Mem_Req* req) { Flag avail = FALSE; /* FIXME: Only WB reqs try to get a write port? How about stores? 
*/ + // WQ: store definately need read port first, but potentially need to obtain + // write port subsequently, not sure how to model this Flag need_wp = ((req->type == MRT_WB) || (req->type == MRT_WB_NODIRTY)); Flag need_rp = !need_wp; if((need_wp && get_write_port(&MLC(req->proc_id)->ports[req->mlc_bank])) || @@ -1020,19 +1030,20 @@ void mem_start_l1_access(Mem_Req* req) { } /** - * @brief post_process after LLC hit, return the req upwards (to MLC and Core) - * Note regardless of the req type, the req need to be returned upwards after - * get resolved in LLC + * @brief post_process after LLC hit. * - * Does bunch of STAT, and mark the dirty bit of the hit block + * Resp the req either to core's fill_queue or MLC fill_queue + * If need to WB (write through cache), it will be handled outside this func + * + * Update the state of the data blk (dirty bit) on hit * - * Returns TRUE if l1 access is complete and needs to be removed from l1_queue * @param req * @param l1_queue_entry * @param line_addr * @param data * @param lru_position - * @return Flag + * @return Flag Returns TRUE if l1 access is complete and needs to be removed + * from l1_queue */ Flag mem_process_l1_hit_access(Mem_Req* req, Mem_Queue_Entry* l1_queue_entry, Addr* line_addr, L1_Data* data, @@ -1091,7 +1102,8 @@ Flag mem_process_l1_hit_access(Mem_Req* req, Mem_Queue_Entry* l1_queue_entry, STAT_EVENT(req->proc_id, L1_WB_HIT); STAT_EVENT(req->proc_id, CORE_L1_WB_HIT); } - data->dirty |= (req->type == MRT_WB); + // mark the blk dirty on WB + data->dirty |= (req->type == MRT_WB || req->type == MRT_DSTORE); } DEBUG(req->proc_id, @@ -1126,15 +1138,26 @@ Flag mem_process_l1_hit_access(Mem_Req* req, Mem_Queue_Entry* l1_queue_entry, DEBUG(req->proc_id, "Req index:%d no longer a chip demand\n", req->id); } - // write port related stuff, currently only stat collection + // collect stat for wrong path accesses wp_process_l1_hit(data, req); + ///////////////////////////////////////////// + // main logic for handling a hit: + // Case 0: req is WB and L1 is write through, propogate downwards + // WQ: case 0 is deprecated, WB will be sent to ramulator from the caller if(L1_WRITE_THROUGH && (req->type == MRT_WB)) { req->state = MRS_BUS_NEW; req->rdy_cycle = cycle_count + L1Q_TO_FSB_TRANSFER_LATENCY; + // Case 1: propagate upwards to MLC } else if(fill_mlc) { req->state = MRS_FILL_MLC; req->rdy_cycle = cycle_count + 1; + // borrow the dirty_l0 field of req to mark the fill contains dirty data + // true for both inclusive & exclusive L1 + if(data->dirty) { + req->dirty_l0 = TRUE; + } + // insert into mlc queue req->queue = &(mem->mlc_fill_queue); if(!ORDER_BEYOND_BUS) @@ -1145,30 +1168,47 @@ Flag mem_process_l1_hit_access(Mem_Req* req, Mem_Queue_Entry* l1_queue_entry, mem_insert_req_into_queue(req, req->queue, ALL_FIFO_QUEUES ? 
mlc_fill_seq_num : 0); mlc_fill_seq_num++; + // maintain exclusivity, evict from L1 + if(EXCLUSIVE_L1) { + // no need to WB since dirtyness will be propogated upwards too + ASSERT(0, MLC_PRESENT); + Addr dummy; + cache_invalidate(&L1(req->proc_id)->cache, req->addr, &dummy); + } + + // Case 2: if done_func is not bound (usually is a prefetch), terminate the + // req } else if(!req->done_func) { req->state = MRS_L1_HIT_DONE; // Free the request buffer mem_free_reqbuf(req); + // Case 3: propgate upwards, directly to the core } else { + // this case should only be called when no MLC in sys + // WQ: seems the dirtyness is not propogate upward here req->state = MRS_L1_HIT_DONE; req->rdy_cycle = freq_cycle_count( FREQ_DOMAIN_CORES[req->proc_id]); // no +1 to match old performance // insert into core fill queue req->queue = &(mem->core_fill_queues[req->proc_id]); - if(!ORDER_BEYOND_BUS) + + if(data->dirty) + req->dirty_l0 = TRUE; + if(!ORDER_BEYOND_BUS) { mem_insert_req_into_queue(req, req->queue, ALL_FIFO_QUEUES ? core_fill_seq_num[req->proc_id] : l1_queue_entry->priority); - else + } else { mem_insert_req_into_queue( req, req->queue, ALL_FIFO_QUEUES ? core_fill_seq_num[req->proc_id] : 0); - core_fill_seq_num[req->proc_id]++; + core_fill_seq_num[req->proc_id]++; + } } /* Set the priority so that this entry will be removed from the l1_queue */ l1_queue_entry->priority = Mem_Req_Priority_Offset[MRT_MIN_PRIORITY]; - + // wq todo: the count is not set correctly? if(L2L1PREF_ON) l2l1pref_mem(req); @@ -1176,92 +1216,117 @@ Flag mem_process_l1_hit_access(Mem_Req* req, Mem_Queue_Entry* l1_queue_entry, return TRUE; } -/**************************************************************************************/ -/* mem_process_mlc_hit_access: */ -/* Returns TRUE if mlc access is complete and needs to be removed from mlc_queue +/** + * @brief + * + * @param req + * @param mlc_queue_entry + * @param line_addr + * @param data + * @param lru_position + * @return Flag Returns TRUE if mlc access is complete and needs to be removed + * from mlc_queue */ - Flag mem_process_mlc_hit_access(Mem_Req* req, Mem_Queue_Entry* mlc_queue_entry, Addr* line_addr, MLC_Data* data, int lru_position) { - if(!req->done_func || - req->done_func(req)) { /* If done_func is not complete we will keep - accessing MLC until done_func returns TRUE */ - - if(data) { /* not perfect mlc */ - if((req->type == MRT_DFETCH) || (req->type == MRT_DSTORE) || - (req->type == MRT_IFETCH)) { - if(data->prefetch) { // prefetch hit - DEBUG(req->proc_id, "%7lld mlc prefetch hit %d\n", cycle_count, - (int)(req->addr)); - STAT_EVENT(req->proc_id, MLC_PREF_HIT); - if(!data->seen_prefetch) { - data->seen_prefetch = TRUE; - - STAT_EVENT(req->proc_id, MLC_PREF_UNIQUE_HIT); - STAT_EVENT(req->proc_id, PREF_MLC_TOTAL_USED); - STAT_EVENT(req->proc_id, CORE_PREF_MLC_USED); - STAT_EVENT(req->proc_id, CORE_MLC_PREF_FILL_USED); - } - } - } - - if(req->type == MRT_DPRF || req->type == MRT_IPRF || - req->demand_match_prefetch) { - STAT_EVENT(req->proc_id, MLC_PREF_REQ_HIT); - STAT_EVENT(req->proc_id, CORE_MLC_PREF_REQ_HIT); - } else if((req->type == MRT_DFETCH) || (req->type == MRT_DSTORE) || - (req->type == MRT_IFETCH)) { - STAT_EVENT(req->proc_id, MLC_DEMAND_HIT); - STAT_EVENT(req->proc_id, CORE_MLC_DEMAND_HIT); - } else { // CMP Watch out RA - STAT_EVENT(req->proc_id, MLC_WB_HIT); - STAT_EVENT(req->proc_id, CORE_MLC_WB_HIT); - } - data->dirty |= (req->type == MRT_WB); - } - + if(data) { /* not perfect mlc */ if((req->type == MRT_DFETCH) || (req->type == MRT_DSTORE) || 
(req->type == MRT_IFETCH)) { - STAT_EVENT(req->proc_id, MLC_HIT); - STAT_EVENT(req->proc_id, CORE_MLC_HIT); - STAT_EVENT(req->proc_id, MLC_HIT_ONPATH + req->off_path); - if(0 && DEBUG_EXC_INSERTS) { - printf("addr:%s hit in MLC type:%s\n", hexstr64s(req->addr), - Mem_Req_Type_str(req->type)); + if(data->prefetch) { // prefetch hit + DEBUG(req->proc_id, "%7lld mlc prefetch hit %d\n", cycle_count, + (int)(req->addr)); + STAT_EVENT(req->proc_id, MLC_PREF_HIT); + if(!data->seen_prefetch) { + data->seen_prefetch = TRUE; + + STAT_EVENT(req->proc_id, MLC_PREF_UNIQUE_HIT); + STAT_EVENT(req->proc_id, PREF_MLC_TOTAL_USED); + STAT_EVENT(req->proc_id, CORE_PREF_MLC_USED); + STAT_EVENT(req->proc_id, CORE_MLC_PREF_FILL_USED); + } } } + data->dirty |= (req->type == MRT_WB || req->type == MRT_DSTORE); + } - STAT_EVENT_ALL(MLC_HIT_ALL); - STAT_EVENT_ALL(MLC_HIT_ALL_ONPATH + req->off_path); + if(req->type == MRT_DPRF || req->type == MRT_IPRF || + req->demand_match_prefetch) { + STAT_EVENT(req->proc_id, MLC_PREF_REQ_HIT); + STAT_EVENT(req->proc_id, CORE_MLC_PREF_REQ_HIT); + } else if((req->type == MRT_DFETCH) || (req->type == MRT_DSTORE) || + (req->type == MRT_IFETCH)) { + STAT_EVENT(req->proc_id, MLC_DEMAND_HIT); + STAT_EVENT(req->proc_id, CORE_MLC_DEMAND_HIT); + } else { // CMP Watch out RA + STAT_EVENT(req->proc_id, MLC_WB_HIT); + STAT_EVENT(req->proc_id, CORE_MLC_WB_HIT); + } - // cmp IGNORE - if(req->off_path) - STAT_EVENT(req->proc_id, MLC_HIT_OFFPATH_IFETCH + MIN2(req->type, 6)); - else - STAT_EVENT(req->proc_id, MLC_HIT_ONPATH_IFETCH + MIN2(req->type, 6)); - - if(MLC_WRITE_THROUGH && (req->type == MRT_WB)) { - req->state = MRS_L1_NEW; - req->rdy_cycle = cycle_count + MLCQ_TO_L1Q_TRANSFER_LATENCY; - } else { // writeback done - /* Remove the entry from request buffer */ - req->state = MRS_MLC_HIT_DONE; - mem_free_reqbuf(req); + if((req->type == MRT_DFETCH) || (req->type == MRT_DSTORE) || + (req->type == MRT_IFETCH)) { + STAT_EVENT(req->proc_id, MLC_HIT); + STAT_EVENT(req->proc_id, CORE_MLC_HIT); + STAT_EVENT(req->proc_id, MLC_HIT_ONPATH + req->off_path); + if(0 && DEBUG_EXC_INSERTS) { + printf("addr:%s hit in MLC type:%s\n", hexstr64s(req->addr), + Mem_Req_Type_str(req->type)); } + } - /* Set the priority so that this entry will be removed from the mlc_queue */ - mlc_queue_entry->priority = Mem_Req_Priority_Offset[MRT_MIN_PRIORITY]; + STAT_EVENT_ALL(MLC_HIT_ALL); + STAT_EVENT_ALL(MLC_HIT_ALL_ONPATH + req->off_path); - return TRUE; - } else { - return FALSE; + // cmp IGNORE + if(req->off_path) + STAT_EVENT(req->proc_id, MLC_HIT_OFFPATH_IFETCH + MIN2(req->type, 6)); + else + STAT_EVENT(req->proc_id, MLC_HIT_ONPATH_IFETCH + MIN2(req->type, 6)); + + ///////////////////////////////////////////////////////////////// + // main logic for handling mlc hit + // Case 0, deprecated: wb + if(MLC_WRITE_THROUGH && (req->type == MRT_WB)) { + req->state = MRS_L1_NEW; + req->rdy_cycle = cycle_count + MLCQ_TO_L1Q_TRANSFER_LATENCY; + } + // Case 1: is a prefetch, free the req here + else if(!req->done_func) { + /* Remove the entry from request buffer */ + req->state = MRS_MLC_HIT_DONE; + mem_free_reqbuf(req); } + // Case 2: steer the req to cores' fill queue + else { + req->state = MRS_MLC_HIT_DONE; + req->rdy_cycle = freq_cycle_count(FREQ_DOMAIN_CORES[req->proc_id]); + req->queue = &(mem->core_fill_queues[req->proc_id]); + if(data->dirty) + req->dirty_l0 = TRUE; + if(!ORDER_BEYOND_BUS) { + mem_insert_req_into_queue(req, req->queue, + ALL_FIFO_QUEUES ? 
+ core_fill_seq_num[req->proc_id] : + mlc_queue_entry->priority); + } else { + mem_insert_req_into_queue( + req, req->queue, ALL_FIFO_QUEUES ? core_fill_seq_num[req->proc_id] : 0); + core_fill_seq_num[req->proc_id]++; + } + } + + /* Set the priority so that this entry will be removed from the mlc_queue */ + mlc_queue_entry->priority = Mem_Req_Priority_Offset[MRT_MIN_PRIORITY]; + + return TRUE; } /** * @brief Miss path for LLC access * + * if WB, inserted into L1 array. Otherwise, send to ramulator (handled by the + * caller) + * * * @param req * @param l1_queue_entry @@ -1327,15 +1392,16 @@ static Flag mem_process_l1_miss_access(Mem_Req* req, } /* - * Case 1: if the request is a write back request then the processor just insert - * the request to the L1 cache + * Case 1: if the request is a write back request then the processor just + * insert the request to the L1 cache */ if((req->type == MRT_WB) || (req->type == MRT_WB_NODIRTY)) { - if(req->type == MRT_WB_NODIRTY) + if(!EXCLUSIVE_L1 && req->type == MRT_WB_NODIRTY) WARNING(0, "CMP: A WB_NODIRTY request found! Check it out!"); // install the blk and descruct the req if(req->done_func) { + // this should be rare, wb usually don't bind done_func ASSERT(req->proc_id, ALLOW_TYPE_MATCHES); ASSERT(req->proc_id, req->wb_requested_back); if(req->done_func(req)) { @@ -1355,6 +1421,8 @@ static Flag mem_process_l1_miss_access(Mem_Req* req, } else { STAT_EVENT(req->proc_id, WB_L1_MISS_FILL_L1); // CMP remove this later if(!l1_fill_line(req)) { + // if cannot insert the blk, need to inform the caller not to delete + // the req from the queue (and retry later) req->rdy_cycle = cycle_count + 1; return FALSE; } @@ -1385,43 +1453,30 @@ static Flag mem_process_l1_miss_access(Mem_Req* req, } /** - * Case 3: propogate the miss downwards, marks the req as L1_miss + * Case 3: just need to propogate the miss downwards (handled in the caller of + * this func) */ req->l1_miss = TRUE; req->l1_miss_cycle = cycle_count; if((CONSTANT_MEMORY_LATENCY && !queue_full(&mem->l1fill_queue)) || - //(!CONSTANT_MEMORY_LATENCY && !queue_full(&mem->bus_out_queue))) { (!CONSTANT_MEMORY_LATENCY)) { - // Ramulator: moving the lines below to where ramulator_send() is called - - //// cmp FIXME - // if (TRACK_L1_MISS_DEPS || MARK_L1_MISSES) - // mark_ops_as_l1_miss(req); - - // req->state = MRS_BUS_NEW; // FIXME? 
- // req->rdy_cycle = cycle_count + L1Q_TO_FSB_TRANSFER_LATENCY; /* this req - // will be ready to be sent to memory in the next cycle */ - - //// cmp FIXME - // if (STREAM_PREFETCH_ON) - // stream_ul1_miss (req); - - ///* Set the priority so that this entry will be removed from the l1_queue - ///*/ - // l1_queue_entry->priority = Mem_Req_Priority_Offset[MRT_MIN_PRIORITY]; - - // STAT_EVENT(req->proc_id, SEND_MISS_REQ_QUEUE); return TRUE; } else { - // STAT_EVENT(req->proc_id, REJECTED_QUEUE_BUS_OUT); return FALSE; } } -/**************************************************************************************/ -/* mem_process_mlc_miss_access: */ +/** + * @brief Miss path for MLC accesses + * + * @param req + * @param mlc_queue_entry + * @param line_addr + * @param data + * @return Flag + */ static Flag mem_process_mlc_miss_access(Mem_Req* req, Mem_Queue_Entry* mlc_queue_entry, Addr* line_addr, MLC_Data* data) { @@ -1466,12 +1521,14 @@ static Flag mem_process_mlc_miss_access(Mem_Req* req, req->mlc_miss = TRUE; req->mlc_miss_cycle = cycle_count; + // Case 0: if WB, directly insert to MLC if((req->type == MRT_WB) || (req->type == MRT_WB_NODIRTY)) { // if the request is a write back request then the processor just insert the // request to the MLC cache if(req->type == MRT_WB_NODIRTY) WARNING(0, "CMP: A WB_NODIRTY request found! Check it out!"); + // WQ: WB with a done func should be rare (dc miss won't bind func_done) if(req->done_func) { ASSERT(req->proc_id, ALLOW_TYPE_MATCHES); ASSERT(req->proc_id, req->wb_requested_back); @@ -1488,6 +1545,7 @@ static Flag mem_process_mlc_miss_access(Mem_Req* req, } } else { STAT_EVENT(req->proc_id, WB_MLC_MISS_FILL_MLC); // CMP remove this later + // WQ TODO: check if mlc_fill can potentially fail mlc_fill_line(req); if(MLC_WRITE_THROUGH && req->type == MRT_WB) { req->state = MRS_L1_NEW; @@ -1501,7 +1559,7 @@ static Flag mem_process_mlc_miss_access(Mem_Req* req, return TRUE; } } - + // Case 1: otherwise, send req to downwards (l1) if(!queue_full(&mem->l1_queue)) { req->state = MRS_L1_NEW; req->rdy_cycle = cycle_count + @@ -1519,13 +1577,16 @@ static Flag mem_process_mlc_miss_access(Mem_Req* req, /** * @brief Process the L1 reg already obtain the port - * - * If hit in L1, send req back upwards. Otherwise try sending it out to bus (ramulator) + * + * If hit in L1, send req back upwards. 
Otherwise try sending it out to bus + * (ramulator) + * * @param req * @param l1_queue_entry * @param out_queue_insertion_count * @param reserved_entry_count - * @return Flag TRUE if l1 access is complete and needs to be removed from l1_queue + * @return Flag TRUE if l1 access is complete and needs to be removed from + * l1_queue */ static Flag mem_complete_l1_access(Mem_Req* req, Mem_Queue_Entry* l1_queue_entry, @@ -1597,11 +1658,11 @@ static Flag mem_complete_l1_access(Mem_Req* req, if(!PREFETCH_UPDATE_LRU_L1 && (req->type == MRT_DPRF || req->type == MRT_IPRF)) update_l1_lru = FALSE; - - //lookup LLC, data set to NULL on miss + + // lookup LLC, data set to NULL on miss data = (L1_Data*)cache_access(&L1(req->proc_id)->cache, req->addr, &line_addr, - update_l1_lru); - //update the shadow cache + update_l1_lru); + // update the shadow cache cache_part_l1_access(req); if(FORCE_L1_MISS) data = NULL; @@ -1626,16 +1687,13 @@ static Flag mem_complete_l1_access(Mem_Req* req, !data) /* do not put into L2 if this is a prefetch or off-path */ data = l1_pref_cache_access(req); - Flag access_done = TRUE; + Flag access_done = TRUE; // This flag tells whether to remove the req from + // L1_queue if(data || PERFECT_L1) { /* l1 hit */ - // if exclusive cache, invalidate the line in L2 if there is a done function - // to transfer the data to L1 -- also need to propagate the dirty to L1 - // TODO: check here for adding MLC - - // return the req upwards Flag l1_hit_access = mem_process_l1_hit_access( req, l1_queue_entry, &line_addr, data, lru_position); if(!l1_hit_access) + // WQ: this should not happen access_done = FALSE; else { if(!PREF_ORACLE_TRAIN_ON && @@ -1650,12 +1708,9 @@ static Flag mem_complete_l1_access(Mem_Req* req, } // propogate to dram for writethrough cache regardless of hit/miss - if(L1_WRITE_THROUGH && (req->type == MRT_WB) && + if(L1_WRITE_THROUGH && + (req->type == MRT_WB || req->type == MRT_WB_NODIRTY) && !CONSTANT_MEMORY_LATENCY) { - // req->queue = &(mem->bus_out_queue); - - // mem_insert_req_into_queue (req, req->queue, ALL_FIFO_QUEUES ? - // bus_out_seq_num : 0); ASSERT(req->proc_id, MRS_L1_WAIT == req->state); req->state = MRS_MEM_NEW; l1_hit_access = ramulator_send(req); @@ -1673,13 +1728,8 @@ static Flag mem_complete_l1_access(Mem_Req* req, // perf_pred_mem_req_start(req); mem_free_reqbuf(req); } - - // bus_out_seq_num++; - //(*out_queue_insertion_count) += 1; - // STAT_EVENT(req->proc_id, BUS_ACCESS); } } - // CMP IGNORE } else { /* l1 miss */ /* if req is wb then either fill l1 or try again */ Flag l1_miss_send_bus = (L1_WRITE_THROUGH && (req->type == MRT_WB)) || @@ -1689,6 +1739,9 @@ static Flag mem_complete_l1_access(Mem_Req* req, l1_miss_send_bus = FALSE; Flag l1_miss_access = mem_process_l1_miss_access(req, l1_queue_entry, &line_addr, data); + // send a miss req downwards + // WQ: to be consistent, this blk need to be moved in + // mem_process_l1_miss_access if(l1_miss_access && l1_miss_send_bus) { if(CONSTANT_MEMORY_LATENCY) { mem->uncores[req->proc_id].num_outstanding_l1_misses++; @@ -1710,11 +1763,6 @@ static Flag mem_complete_l1_access(Mem_Req* req, STAT_EVENT(req->proc_id, POWER_DRAM_ACTIVATE); STAT_EVENT(req->proc_id, POWER_DRAM_READ); } else { - // Ramulator remove - // req->queue = &(mem->bus_out_queue); - // mem_insert_req_into_queue (req, req->queue, ALL_FIFO_QUEUES ? 
- // bus_out_seq_num : 0); - ASSERT(req->proc_id, MRS_L1_WAIT == req->state); req->state = MRS_MEM_NEW; l1_miss_access = ramulator_send(req); @@ -1722,7 +1770,7 @@ static Flag mem_complete_l1_access(Mem_Req* req, // Fail to send req to dram req->state = MRS_L1_WAIT; access_done = FALSE; - } else { //send to dram succeed + } else { // send to dram succeed ASSERT(req->proc_id, req->mem_queue_cycle >= req->rdy_cycle); req->queue = NULL; @@ -1734,10 +1782,6 @@ static Flag mem_complete_l1_access(Mem_Req* req, if(TRACK_L1_MISS_DEPS || MARK_L1_MISSES) mark_ops_as_l1_miss(req); - // req->state = MRS_BUS_NEW; // FIXME? - // req->rdy_cycle = cycle_count + L1Q_TO_FSB_TRANSFER_LATENCY; /* this - // req will be ready to be sent to memory in the next cycle */ - // cmp FIXME if(STREAM_PREFETCH_ON) stream_ul1_miss(req); @@ -1812,13 +1856,14 @@ static Flag mem_complete_l1_access(Mem_Req* req, /** - * @brief - * - * @param req - * @param mlc_queue_entry - * @param l1_queue_insertion_count - * @param reserved_entry_count - * @return Flag Returns TRUE if mlc access is complete and needs to be removed from mlc_queue + * @brief Access MLC array + * + * @param req + * @param mlc_queue_entry + * @param l1_queue_insertion_count + * @param reserved_entry_count + * @return Flag Returns TRUE if mlc access is complete and needs to be removed + * from mlc_queue */ static Flag mem_complete_mlc_access(Mem_Req* req, Mem_Queue_Entry* mlc_queue_entry, @@ -1836,11 +1881,9 @@ static Flag mem_complete_mlc_access(Mem_Req* req, &line_addr, update_mlc_lru); // access MLC if(data || PERFECT_MLC) { /* mlc hit */ - // if exclusive cache, invalidate the line in L2 if there is a done function - // to transfer the data to MLC -- also need to propagate the dirty to MLC Flag mlc_hit_access = mem_process_mlc_hit_access( req, mlc_queue_entry, &line_addr, data, lru_position); - if(!mlc_hit_access) { + if(!mlc_hit_access) { // not gonna happen return FALSE; } else { if(!PREF_ORACLE_TRAIN_ON && @@ -1854,7 +1897,9 @@ static Flag mem_complete_mlc_access(Mem_Req* req, pref_umlc_hit(req->proc_id, req->addr, req->loadPC, req->global_hist); } - if(MLC_WRITE_THROUGH && (req->type == MRT_WB)) { + // wb for write through cache + if(MLC_WRITE_THROUGH && + (req->type == MRT_WB || req->type == MRT_WB_NODIRTY)) { req->queue = &(mem->l1_queue); mem_insert_req_into_queue(req, req->queue, ALL_FIFO_QUEUES ? 
l1_seq_num : 0); @@ -1899,21 +1944,25 @@ static Flag mem_complete_mlc_access(Mem_Req* req, // Train the Data prefetcher pref_umlc_miss(req->proc_id, req->addr, req->loadPC, req->global_hist); } - return TRUE; } else if(!mlc_miss_access) { + // miss process is not ready return FALSE; + } else { + return TRUE; } - return TRUE; } ASSERT(req->proc_id, 0); } -/**************************************************************************************/ -/* mem_process_new_reqs: */ -/* Access L1 if port is ready - If L1 miss, then put the request into miss queue - */ +/** + * @brief Access path for all reqs from upward into L1 + * + * WQ: modeling for DSTORE is largely off now: obtain port & handling of + * write miss + * + */ static void mem_process_l1_reqs() { Mem_Req* req = NULL; int ii; @@ -1923,7 +1972,6 @@ static void mem_process_l1_reqs() { int l1_queue_reserve_entry_count = 0; /* Go thru the l1_queue and try to access L1 for each request */ - for(ii = 0; ii < mem->l1_queue.entry_count; ii++) { reqbuf_id = mem->l1_queue.base[ii].reqbuf; req = &(mem->req_buffer[reqbuf_id]); @@ -1947,7 +1995,7 @@ static void mem_process_l1_reqs() { /* Request is ready: see what state it is in */ if(req->state == MRS_L1_NEW) { - mem_start_l1_access(req); //obtain port for req + mem_start_l1_access(req); // obtain port for req, change req->state STAT_EVENT(req->proc_id, L1_ACCESS); if(req->type == MRT_DPRF || req->type == MRT_IPRF) STAT_EVENT(req->proc_id, L1_PREF_ACCESS); @@ -1960,7 +2008,7 @@ static void mem_process_l1_reqs() { mem->req_count, mem->l1_queue.entry_count, mem->bus_out_queue.entry_count, mem->l1fill_queue.entry_count); - // heavy work in done this in func + // actual logic for accessing L1 array if(mem_complete_l1_access(req, &(mem->l1_queue.base[ii]), &out_queue_insertion_count, &l1_queue_reserve_entry_count)) @@ -2002,9 +2050,7 @@ static void mem_process_l1_reqs() { /** - * @brief Access MLC if port is ready - If MLC miss, then put the request into miss - * queue - * + * @brief Access path for all req coming from core side into MLC */ static void mem_process_mlc_reqs() { Mem_Req* req = NULL; @@ -2087,15 +2133,14 @@ static void mem_process_mlc_reqs() { } } -/**************************************************************************************/ -/* mem_process_bus_out_reqs: */ -/* FIXME: need to busy the bus for the time a line is being sent */ - /** + * @deprecated * @brief Obsolete, bus_out is repalced by ramulator. The function * will still be called but since bus_out_queue is supposed to always * be 0, the first return will take the execution out of the function - * + * + * To send req to DRAM, use ramulator_send() + * */ static void mem_process_bus_out_reqs() { Mem_Req* req; @@ -2116,7 +2161,7 @@ static void mem_process_bus_out_reqs() { // return; // VEYNU: if there is no room in the mem queue do nothing return; // Ramulator: early return if bus_out_queue is empty } - //WQ: will this ever be executed? + ASSERTM(0, FALSE, "ERROR: bus_out_queue should always be empty\n"); // Ramulator // Ramulator handles off-chip communication latency itself. So we @@ -2423,12 +2468,16 @@ static void mem_process_bus_out_reqs() { /** - * @brief Add req into uncore - * - * Ramulator call this func to return a serviced req - * - * @param req - * @param priority + * @brief Add req into uncore queues + * + * Depends on uncore config, this func will either steer reqs + * into l1fill_queue or to mlc_fill_queue, and change the req->state + * correspondingly. 
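+ * (concretely: a serviced req is steered to the mlc_fill_queue only when
+ * MLC_PRESENT and EXCLUSIVE_L1 are set and its destination is not DEST_L1;
+ * all other reqs are filled into L1 first)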
+ * + * Ramulator call this func to return serviced reqs + * + * @param req + * @param priority */ void mem_complete_bus_in_access(Mem_Req* req, Counter priority) { DEBUG(req->proc_id, @@ -2437,22 +2486,32 @@ void mem_complete_bus_in_access(Mem_Req* req, Counter priority) { (long int)(req - mem->req_buffer), Mem_Req_Type_str(req->type), hexstr64s(req->addr), req->size, mem_req_state_names[req->state]); - req->state = MRS_FILL_L1; - - /* Crossing frequency domain boundary between the chip and memory controller - */ - req->rdy_cycle = freq_cycle_count(FREQ_DOMAIN_L1) + 1; - - req->queue = &(mem->l1fill_queue); + // usually the dest are either L1 (prefetchers) or None (demandings) + // WQ TODO: MLC prefetch + Counter* fill_seq_num; + if(req->destination == DEST_L1 || !MLC_PRESENT || !EXCLUSIVE_L1) { + req->state = MRS_FILL_L1; + // Crossing frequency domain boundary between the chip and memory controller + req->rdy_cycle = freq_cycle_count(FREQ_DOMAIN_L1) + 1; + req->queue = &(mem->l1fill_queue); + fill_seq_num = &l1fill_seq_num; + } else { + req->state = MRS_FILL_MLC; + req->rdy_cycle = freq_cycle_count(FREQ_DOMAIN_L1) + 1; + req->queue = &(mem->mlc_fill_queue); + fill_seq_num = &mlc_fill_seq_num; + } - if(!ORDER_BEYOND_BUS) + if(!ORDER_BEYOND_BUS) { mem_insert_req_into_queue(req, req->queue, - ALL_FIFO_QUEUES ? l1fill_seq_num : priority); - else + ALL_FIFO_QUEUES ? *fill_seq_num : priority); + } else { mem_insert_req_into_queue(req, req->queue, - ALL_FIFO_QUEUES ? l1fill_seq_num : 0); + ALL_FIFO_QUEUES ? *fill_seq_num : 0); + } + (*fill_seq_num)++; - l1fill_seq_num++; + // WQ TODO: currently there is no dedicate counter for mlc, reuse l1 counters ASSERT(req->proc_id, mem->uncores[req->proc_id].num_outstanding_l1_misses > 0); mem->uncores[req->proc_id].num_outstanding_l1_misses--; @@ -2460,6 +2519,7 @@ void mem_complete_bus_in_access(Mem_Req* req, Counter priority) { if(!CONSTANT_MEMORY_LATENCY && !PERF_PRED_REQS_FINISH_AT_FILL) perf_pred_mem_req_done(req); + // collect stats if(req->type != MRT_WB_NODIRTY && req->type != MRT_WB) { INC_STAT_EVENT_ALL(TOTAL_MEM_LATENCY, req->rdy_cycle - req->mem_queue_cycle); @@ -2485,12 +2545,25 @@ void mem_complete_bus_in_access(Mem_Req* req, Counter priority) { } } +/** + * @brief Remove req from l1_fill_queue by change the l1fill_queue.entry_count + * + * the removal_count should be pre-populated before this call. 
The # of + * removal_count req with lowest priority will be removed + * + * removal_count will be reset after the call + * + * @param proc_id + * @param p_l1fill_queue_removal_count + */ static void remove_from_l1_fill_queue(uns proc_id, int* p_l1fill_queue_removal_count) { /* Remove requests from l1 fill queue */ if(*p_l1fill_queue_removal_count > 0) { /* After this sort requests that should be removed will be at the tail of * the l1_queue */ + // WQ TODO: assert the num of req with MIN_PROORITY is always equal to the + // removal_count DEBUG(0, "l1fill_queue removal\n"); qsort(mem->l1fill_queue.base, mem->l1fill_queue.entry_count, sizeof(Mem_Queue_Entry), mem_compare_priority); @@ -2514,7 +2587,12 @@ static void mem_process_l1_fill_reqs() { Mem_Req* req = NULL; int ii; int reqbuf_id; - int l1fill_queue_removal_count = 0; + + // control the num of req in fill_queue to remove in each call + // main logic will update this var, and the final call of + // remove_from_l1_fill_queue will take this var and remove the exact + // number from the tail of the queue (sorted by priority) + int l1fill_queue_removal_count = 0; /* Go thru the l1fill_queue */ for(ii = 0; ii < mem->l1fill_queue.entry_count; ii++) { @@ -2529,6 +2607,15 @@ static void mem_process_l1_fill_reqs() { if(cycle_count < req->rdy_cycle) continue; + // reqs in L1_FILL_QUEUE will be in one of 3 states at any given time: + // 1) Fill_l1 + // 2) Fill_mlc + // 3) Done + // Initially when mem_complete_bus_in_access steer reqs to here, the req + // state should be Fill_l1, after the req got filled into L1, depends on + // whether the L1 is exclusive and req's dest, the state either transit + // into FILL_MLC or FILL_DONE + // req will be removed (controlled by l1fill_queue_removal_count) if(req->state == MRS_FILL_L1) { DEBUG(req->proc_id, "Mem request about to fill L1 index:%ld type:%s addr:0x%s " @@ -2540,7 +2627,7 @@ static void mem_process_l1_fill_reqs() { ASSERT(0, req->type != MRT_WB && req->type != MRT_WB_NODIRTY); if(CONSTANT_MEMORY_LATENCY) perf_pred_mem_req_done(req); - if(MLC_PRESENT && req->destination != DEST_L1) { + if(MLC_PRESENT && !EXCLUSIVE_L1 && req->destination != DEST_L1) { req->state = MRS_FILL_MLC; req->rdy_cycle = cycle_count + 1; } else { @@ -2568,15 +2655,20 @@ static void mem_process_l1_fill_reqs() { mem_insert_req_into_queue(req, req->queue, ALL_FIFO_QUEUES ? mlc_fill_seq_num : 0); mlc_fill_seq_num++; - // remove from l1fill queue - how do we handle this now? - if(HIER_MSHR_ON) + if(HIER_MSHR_ON) { req->reserved_entry_count -= 1; + } + // remove from l1fill queue l1fill_queue_removal_count++; + // MIN_PRIOTITY will guarantte this req be moved, WQ: is this always true? 
+ // feels like a dangerous design mem->l1fill_queue.base[ii].priority = Mem_Req_Priority_Offset[MRT_MIN_PRIORITY]; - } else { // cleanup the req + } else { ASSERT(req->proc_id, req->state == MRS_FILL_DONE); - if(!req->done_func) { + // The existance of done_func indicates whether this req should + // ultimately be sent back to core + if(!req->done_func) { // req should be resolved at here (l1) if(HIER_MSHR_ON) req->reserved_entry_count -= 1; @@ -2588,8 +2680,11 @@ static void mem_process_l1_fill_reqs() { mem->l1fill_queue.base[ii].priority = Mem_Req_Priority_Offset[MRT_MIN_PRIORITY]; - remove_from_l1_fill_queue(req->proc_id, &l1fill_queue_removal_count); - } else { + // WQ: seems uncessary to call here + // remove_from_l1_fill_queue(req->proc_id, &l1fill_queue_removal_count); + } else { // steer req to core's fill queue + // this should not happen when MLC is present (regardless of incl/excl) + // WQ TODO: put an assert here req->rdy_cycle = freq_cycle_count( FREQ_DOMAIN_CORES[req->proc_id]); // no +1 to match old performance // insert into core fill queue @@ -2612,16 +2707,18 @@ static void mem_process_l1_fill_reqs() { } } - if(req) { + if(req) { // predicate is only to make sure req->proc_id is valid remove_from_l1_fill_queue(req->proc_id, &l1fill_queue_removal_count); } } - /** - * @brief - * + * @brief Fill resp into mlc + * + * Pending reqs in the MLC_QUEUE_FILL will be inserted and the + * req state will turn MRS_FILL_DONE. Next time this func got + * called, all reqs of MRS_FILL_DONE will be freed. */ static void mem_process_mlc_fill_reqs() { Mem_Req* req; @@ -2644,6 +2741,7 @@ static void mem_process_mlc_fill_reqs() { if(cycle_count < req->rdy_cycle) continue; + // either from DRAM or L1, depends on the cache inclusivitity if(req->state == MRS_FILL_MLC) { DEBUG(req->proc_id, "Mem request about to fill MLC index:%ld type:%s addr:0x%s " @@ -2651,26 +2749,45 @@ static void mem_process_mlc_fill_reqs() { (long int)(req - mem->req_buffer), Mem_Req_Type_str(req->type), hexstr64s(req->addr), req->size, mem_req_state_names[req->state]); if(mlc_fill_line(req)) { + // mark done if req successfully writen into the array req->state = MRS_FILL_DONE; req->rdy_cycle = cycle_count + 1; } } else { ASSERT(req->proc_id, req->state == MRS_FILL_DONE); - if(!req->done_func || req->done_func(req)) { + // WQ: this looks wrong, done_func is not supposed to be called here + // if(!req->done_func || req->done_func(req)) { + if(!req->done_func) { // reqs supposed to be resolved here if(HIER_MSHR_ON) req->reserved_entry_count -= 1; - // Free the request buffer mem_free_reqbuf(req); - - // remove from mlc_fill queue - how do we handle this now? - mlc_fill_queue_removal_count++; - mem->mlc_fill_queue.base[ii].priority = - Mem_Req_Priority_Offset[MRT_MIN_PRIORITY]; + } else { // needs to be delivered to core's fill_queue + req->rdy_cycle = freq_cycle_count( + FREQ_DOMAIN_CORES[req->proc_id]); // no +1 to match old performance + // insert into core fill queue + req->queue = &(mem->core_fill_queues[req->proc_id]); + if(!ORDER_BEYOND_BUS) + mem_insert_req_into_queue(req, req->queue, + ALL_FIFO_QUEUES ? + core_fill_seq_num[req->proc_id] : + mem->l1fill_queue.base[ii].priority); + else + mem_insert_req_into_queue( + req, req->queue, + ALL_FIFO_QUEUES ? core_fill_seq_num[req->proc_id] : 0); + core_fill_seq_num[req->proc_id]++; } + + // remove from mlc_fill queue - how do we handle this now? 
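+      // (same mechanism as the l1fill_queue: tag the entry with MIN_PRIORITY,
+      // then the queue is sorted by priority below and the tail entries are
+      // dropped)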
+ mlc_fill_queue_removal_count++; + mem->mlc_fill_queue.base[ii].priority = + Mem_Req_Priority_Offset[MRT_MIN_PRIORITY]; } } + // WQ: this is really inconsistent, this functionality for L1 is packed into + // a separate func... /* Remove requests from mlc access queue */ if(mlc_fill_queue_removal_count > 0) { /* After this sort requests that should be removed will be at the tail of @@ -2688,9 +2805,15 @@ static void mem_process_mlc_fill_reqs() { } } -/**************************************************************************************/ -/* mem_process_core_fill_reqs: */ - +/** + * @brief Upwards interface between memsys and queue + * + * reqs reach here should have a callback (done_func), req + * is fed back into core by calling this done_func. And then + * req will be marked as freed + * + * @param proc_id + */ static void mem_process_core_fill_reqs(uns proc_id) { Mem_Req* req; int ii; @@ -2709,11 +2832,13 @@ static void mem_process_core_fill_reqs(uns proc_id) { ASSERT(req->proc_id, (req->type != MRT_WB) || req->wb_requested_back); ASSERT(req->proc_id, req->type != MRT_WB_NODIRTY); ASSERT(req->proc_id, cycle_count >= req->rdy_cycle); - ASSERT(proc_id, - req->state == MRS_L1_HIT_DONE || req->state == MRS_FILL_DONE); + ASSERT(proc_id, req->state == MRS_L1_HIT_DONE || + req->state == MRS_MLC_HIT_DONE || + req->state == MRS_FILL_DONE); ASSERT(proc_id, req->done_func); // requests w/o done_func() should be done by now + // common used done func are i/dcache_fill_line if(req->done_func(req)) { // Free the request buffer mem_free_reqbuf(req); @@ -2882,23 +3007,22 @@ static inline Mem_Req* mem_search_queue( /** * @brief Search given queues to check whether a req of addr exist - * - * @param proc_id - * @param addr - * @param type - * @param size + * + * @param proc_id + * @param addr + * @param type + * @param size * @param demand_hit_prefetch set if the matching req is a prefetch and a demand hits it - * @param demand_hit_writeback - * @param queues_to_search - * @param queue_entry - * @param ramulator_match - * @return Mem_Req* + * @param demand_hit_writeback + * @param queues_to_search + * @param queue_entry + * @param ramulator_match + * @return Mem_Req* */ static inline Mem_Req* mem_search_reqbuf( uns8 proc_id, Addr addr, Mem_Req_Type type, uns size, - Flag* demand_hit_prefetch, - Flag* demand_hit_writeback, uns queues_to_search, + Flag* demand_hit_prefetch, Flag* demand_hit_writeback, uns queues_to_search, Mem_Queue_Entry** queue_entry, Flag* ramulator_match) { Mem_Req* req; ASSERTM(proc_id, proc_id == get_proc_id_from_cmp_addr(addr), @@ -3279,10 +3403,15 @@ Flag mem_can_allocate_req_buffer(uns proc_id, Mem_Req_Type type, return TRUE; } -/**************************************************************************************/ -/* mem_allocate_req_buffer: */ -/* If queue is specified, only allocates if its entry_count < size */ - +/** + * @brief alloc new req from the request buffer + * + * If queue is specified, only allocates if its entry_count < size + * @param proc_id + * @param type + * @param for_l1_writeback + * @return Mem_Req* + */ static inline Mem_Req* mem_allocate_req_buffer(uns proc_id, Mem_Req_Type type, Flag for_l1_writeback) { if(!mem_can_allocate_req_buffer(proc_id, type, for_l1_writeback)) @@ -3481,14 +3610,27 @@ static Mem_Req* mem_kick_out_oldest_first_prefetch_from_queues( return NULL; } -/**************************************************************************************/ -/* mem_init_new_req: */ - -static void mem_init_new_req( - Mem_Req* new_req, Mem_Req_Type 
type, Mem_Queue_Type queue_type, uns8 proc_id, - Addr addr, uns size, uns delay, Op* op, Flag done_func(Mem_Req*), - Counter unique_num, /* This counter is used when op is NULL */ - Flag kicked_out_another, Counter new_priority) { +/** + * @brief Populate the newly generated req + * + * @param new_req + * @param type + * @param queue_type + * @param proc_id + * @param addr + * @param size + * @param delay + * @param op + * @param done_func + * @param unique_num This counter is used when op is NULL + * @param kicked_out_another + * @param new_priority + */ +static void mem_init_new_req(Mem_Req* new_req, Mem_Req_Type type, + Mem_Queue_Type queue_type, uns8 proc_id, Addr addr, + uns size, uns delay, Op* op, + Flag done_func(Mem_Req*), Counter unique_num, + Flag kicked_out_another, Counter new_priority) { ASSERT(0, queue_type & (QUEUE_L1 | QUEUE_MLC)); Flag to_mlc = (queue_type == QUEUE_MLC); @@ -3530,9 +3672,12 @@ static void mem_init_new_req( new_req->mem_channel = CHANNEL(new_req->mem_flat_bank, RAMULATOR_BANKS); new_req->mem_bank = BANK_IN_CHANNEL(new_req->mem_flat_bank, RAMULATOR_BANKS); */ + + // WQ TODO: add support for configurable (bank) hashing schemes new_req->mlc_bank = BANK(addr, MLC(proc_id)->num_banks, MLC_INTERLEAVE_FACTOR); new_req->l1_bank = BANK(addr, L1(proc_id)->num_banks, L1_INTERLEAVE_FACTOR); + new_req->start_cycle = freq_cycle_count(FREQ_DOMAIN_L1) + delay; new_req->rdy_cycle = freq_cycle_count(FREQ_DOMAIN_L1) + delay; new_req->first_stalling_cycle = mem_req_type_is_stalling(type) ? @@ -3621,9 +3766,14 @@ static void mem_init_new_req( } -/**************************************************************************************/ -/* mem_insert_req_into_queue: */ - +/** + * @brief Put req into corresponding queues. Queue entry holds idx of req_buf + * + * @param new_req + * @param queue + * @param priority + * @return Mem_Queue_Entry* + */ static inline Mem_Queue_Entry* mem_insert_req_into_queue(Mem_Req* new_req, Mem_Queue* queue, Counter priority) { @@ -3689,15 +3839,16 @@ void mem_insert_req_round_robin() { } - /** - * @brief Create new req. Req is the starter of all memory activities and has 3 - * major sources: - * 1. core - * 2. prefetchers - * 3. WBs in cache hierarchies - * However, this func is only supposed to be called within in the core (demand/prefetch) - * + * @brief Create new req and insert into correct queue + * + * Note this is one of the four sources where new_req will be generated + * in scarab simulation, the other three are: new_mem_dc/mlc/l1_wb_req and + * are used specific for write back reqs + * + * This func is used to create req when core has demanding LD/ST or for + * prefetchers (done_func is usually bound to d/icache_fill_line()) + * * Returns TRUE if the request is successfully entered into the memory system */ Flag new_mem_req(Mem_Req_Type type, uns8 proc_id, Addr addr, uns size, @@ -3715,6 +3866,10 @@ Flag new_mem_req(Mem_Req_Type type, uns8 proc_id, Addr addr, uns size, Counter priority_offset = freq_cycle_count(FREQ_DOMAIN_L1); Counter new_priority; Flag to_mlc = MLC_PRESENT && (!pref_info || pref_info->dest != DEST_L1); + + // Demand reqs will have dest of DEST_NONE, which means req will return to + // the core ultimately. Prefetch reqs can has different dest depends on + // pref_info Destination destination = (pref_info ? pref_info->dest : DEST_NONE); ASSERTM(proc_id, proc_id == get_proc_id_from_cmp_addr(addr), @@ -3769,7 +3924,6 @@ Flag new_mem_req(Mem_Req_Type type, uns8 proc_id, Addr addr, uns size, } /* Step 2: Found matching request. 
Adjust it based on the current request */ - if(matching_req) { // Simulation inaccuracy: an L2-destined request can match a request in the // MLC queue, not the other way around @@ -3871,6 +4025,7 @@ Flag new_mem_req(Mem_Req_Type type, uns8 proc_id, Addr addr, uns size, } } + // Use oracle info (look into cache hit/miss) to train prefetchers /* we model this more accurately by training the prefetcher when we actually * hit/miss if PREF_ORACLE_TRAIN_ON is off */ // cmp FIXME What can I do for the prefetcher? @@ -3953,9 +4108,13 @@ Flag new_mem_req(Mem_Req_Type type, uns8 proc_id, Addr addr, uns size, return insert_new_req_into_l1_queue(proc_id, new_req); } -/**************************************************************************************/ -/* insert_new_req_into_l1_queue: */ - +/** + * @brief + * + * @param proc_id + * @param new_req + * @return Flag + */ static Flag insert_new_req_into_l1_queue(uns proc_id, Mem_Req* new_req) { if(!ROUND_ROBIN_TO_L1) { if(queue_full(&mem->l1_queue)) { @@ -3974,9 +4133,14 @@ static Flag insert_new_req_into_l1_queue(uns proc_id, Mem_Req* new_req) { return TRUE; } -/**************************************************************************************/ -/* insert_new_req_into_mlc_queue: */ +/** + * @brief + * + * @param proc_id + * @param new_req + * @return Flag + */ static Flag insert_new_req_into_mlc_queue(uns proc_id, Mem_Req* new_req) { if(queue_full(&mem->mlc_queue)) { ASSERT(proc_id, 0); @@ -3988,10 +4152,22 @@ static Flag insert_new_req_into_mlc_queue(uns proc_id, Mem_Req* new_req) { return TRUE; } -/**************************************************************************************/ -/* new_mem_dc_wb_req: */ -/* Returns TRUE if the request is successfully entered into the memory system */ +/** + * @brief New dcache write back req + * + * @param type + * @param proc_id + * @param addr + * @param size + * @param delay + * @param op + * @param done_func + * @param unique_num + * @param used_onpath + * @return Flag TRUE if the request is successfully entered into the memory + * system + */ Flag new_mem_dc_wb_req(Mem_Req_Type type, uns8 proc_id, Addr addr, uns size, uns delay, Op* op, Flag done_func(Mem_Req*), Counter unique_num, Flag used_onpath) { @@ -4080,6 +4256,7 @@ Flag new_mem_dc_wb_req(Mem_Req_Type type, uns8 proc_id, Addr addr, uns size, new_req->wb_used_onpath = used_onpath; // DC WB requests carry this flag /* Step 6: Insert the request into the l1 queue if it is not already there */ + // WQ: note the WB is steered into req queues not fill queues if(MLC_PRESENT) insert_new_req_into_mlc_queue(proc_id, new_req); else @@ -4090,16 +4267,17 @@ Flag new_mem_dc_wb_req(Mem_Req_Type type, uns8 proc_id, Addr addr, uns size, /** * @brief Create req for wb and insert into L1 req queue - * - * @param type - * @param proc_id - * @param addr - * @param size - * @param delay - * @param op - * @param done_func - * @param unique_num - * @return Flag Returns TRUE if the request is successfully entered into the memory system + * + * @param type + * @param proc_id + * @param addr + * @param size + * @param delay + * @param op + * @param done_func + * @param unique_num + * @return Flag Returns TRUE if the request is successfully entered into the + * memory system */ static Flag new_mem_mlc_wb_req(Mem_Req_Type type, uns8 proc_id, Addr addr, uns size, uns delay, Op* op, @@ -4190,12 +4368,22 @@ static Flag new_mem_mlc_wb_req(Mem_Req_Type type, uns8 proc_id, Addr addr, } +/** + * @brief + * + * @param type + * @param proc_id + * @param addr + * @param size + * 
@param delay + * @param op + * @param done_func + * @param unique_num This counter is used when op is NULL + * @return Flag + */ static Flag new_mem_l1_wb_req(Mem_Req_Type type, uns8 proc_id, Addr addr, uns size, uns delay, Op* op, - Flag done_func(Mem_Req*), - Counter unique_num) /* This counter is used when - op is NULL */ -{ + Flag done_func(Mem_Req*), Counter unique_num) { Mem_Req* new_req = NULL; Mem_Req* matching_req = NULL; Mem_Queue_Entry* queue_entry = NULL; @@ -4406,7 +4594,7 @@ Flag l1_fill_line(Mem_Req* req) { // WQ: seems scarab is not modelling the mem sys correctly, there could be // cases when two req of the same block is propogating at the same time in // the system results in when fill the cache line, it was already presented - // there so, + // there so, // Temporarily fix starts here: Addr dummy_line_addr; if(cache_access(&L1(req->proc_id)->cache, req->addr, &dummy_line_addr, @@ -4724,12 +4912,11 @@ Flag l1_fill_line(Mem_Req* req) { } - /** * @brief Fill line into MLC, handle possible WBs - * - * @param req - * @return Flag + * + * @param req + * @return Flag Return 1 on successfully put req into the cache, 0 otherwise */ Flag mlc_fill_line(Mem_Req* req) { MLC_Data* data; @@ -4754,6 +4941,7 @@ Flag mlc_fill_line(Mem_Req* req) { (req->op_count ? &(top->unique_num) : 0x0)); + // WQ: why is this commented out?? /* if it can't get a write port, fail */ /* if (!get_write_port(&MLC(req->proc_id)->ports[req->mlc_bank])) return * FAILURE; */ @@ -4770,17 +4958,21 @@ Flag mlc_fill_line(Mem_Req* req) { /* If we are replacing anything, check if we need to write it back */ if(repl_line_valid) { - if(!MLC_WRITE_THROUGH && data->dirty) { - /* need to do a write-back */ + /* write-back on dirty victim or exclusive hierarchy */ + if((!MLC_WRITE_THROUGH && data->dirty) || (EXCLUSIVE_L1)) { DEBUG(req->proc_id, "Scheduling writeback of addr:0x%s\n", hexstr64s(repl_line_addr)); if(0 && DEBUG_EXC_INSERTS) printf("Scheduling L2 writeback of addr:0x%s ins addr:0x%s\n", hexstr64s(repl_line_addr), hexstr64s(req->addr)); - if(!new_mem_mlc_wb_req(MRT_WB, data->proc_id, repl_line_addr, + + Mem_Req_Type wbtype = data->dirty ? MRT_WB : MRT_WB_NODIRTY; + if(!new_mem_mlc_wb_req(wbtype, data->proc_id, repl_line_addr, MLC_LINE_SIZE, 1, NULL, NULL, unique_count)) return FAILURE; - STAT_EVENT(req->proc_id, MLC_FILL_DIRTY); + + // WQ:this stat looks wrong.. 
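+      // (presumably because with EXCLUSIVE_L1 this path now also writes back
+      // clean victims, so counting every replacement here as a dirty fill
+      // would overcount)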
+ // STAT_EVENT(req->proc_id, MLC_FILL_DIRTY); } if(data->prefetch) { @@ -4923,6 +5115,8 @@ Flag mlc_fill_line(Mem_Req* req) { (req->state != MRS_FILL_MLC)); // write back can fill mlc // directly - reqs filling core // should not dirty the line + data->dirty |= req->dirty_l0; // for exclusive L1, pull dirty blk + // from L1 to MLC data->prefetch = req->type == MRT_DPRF || req->type == MRT_IPRF || req->demand_match_prefetch; data->seen_prefetch = req->demand_match_prefetch; /* If demand matches @@ -5331,9 +5525,13 @@ Flag is_final_state(Mem_Req_State state) { (state == MRS_MEM_DONE) || (state == MRS_FILL_DONE); } -/**************************************************************************************/ -/* wp_process_l1_hit: */ +/** + * @brief Wrong path stat collect for l1 hit + * + * @param line + * @param req + */ void wp_process_l1_hit(L1_Data* line, Mem_Req* req) { if(!line) { ASSERT(req->proc_id, PERFECT_L1); @@ -5411,9 +5609,12 @@ void wp_process_l1_hit(L1_Data* line, Mem_Req* req) { } -/**************************************************************************************/ -/* wp_process_l1_fill: */ - +/** + * @brief wrong path stat collect for l1_fill + * + * @param line + * @param req + */ void wp_process_l1_fill(L1_Data* line, Mem_Req* req) { if(!WP_COLLECT_STATS) return; @@ -5515,7 +5716,7 @@ static void update_mem_req_occupancy_counter(Mem_Req_Type type, int delta) { counter = &mem_req_wb_entries; break; default: - FATAL_ERROR(0, "Unknown mem req state\n"); + FATAL_ERROR(0, "Unknown mem req type\n"); break; } *counter += delta; diff --git a/src/memory/memory.h b/src/memory/memory.h index 5e9e216d..9ab58e24 100644 --- a/src/memory/memory.h +++ b/src/memory/memory.h @@ -87,7 +87,7 @@ typedef enum Mem_Queue_Type_enum { } Mem_Queue_Type; typedef struct Mem_Queue_Entry_struct { - int reqbuf; /* request buffer num, a pointer(idx) to the global req_buffer */ + int reqbuf; /* request buffer num, a pointer(idx) to the global req_buffer */ Counter priority; /* priority of the miss */ Counter rdy_cycle; } Mem_Queue_Entry; @@ -132,11 +132,13 @@ typedef struct Uncore_struct { typedef struct Memory_struct { /* miss buffer */ - Mem_Req* req_buffer; - List req_buffer_free_list; - List* l1_in_buffer_core; - uns total_mem_req_buffers; - uns* num_req_buffers_per_core; + Mem_Req* req_buffer; // global buffer holds all the real reqs, the entries + // from + // various queues below points to the reqs in this buffer (with idx) + List req_buffer_free_list; + List* l1_in_buffer_core; + uns total_mem_req_buffers; + uns* num_req_buffers_per_core; int req_count; @@ -147,6 +149,9 @@ typedef struct Memory_struct { Cache pref_l1_cache; /* various queues (arrays) */ + /* reg comes from upward goes to the queue (includes WBs) + reg comes from downward goes to fill_queue + */ Mem_Queue mlc_queue; Mem_Queue mlc_fill_queue; Mem_Queue l1_queue; diff --git a/src/memory/memory.param.def b/src/memory/memory.param.def index 13cff708..30db15d0 100644 --- a/src/memory/memory.param.def +++ b/src/memory/memory.param.def @@ -80,6 +80,7 @@ DEF_PARAM(l1_line_size, L1_LINE_SIZE, uns, uns, DEF_PARAM(l1_cycles, L1_CYCLES, uns, uns, 24, ) DEF_PARAM(perfect_l1, PERFECT_L1, Flag, Flag, FALSE, ) DEF_PARAM(private_l1, PRIVATE_L1, Flag, Flag, FALSE, ) +DEF_PARAM(exclusive_l1, EXCLUSIVE_L1, Flag, Flag, TRUE, ) DEF_PARAM(l1_read_ports, L1_READ_PORTS, uns, uns, 1, ) DEF_PARAM(l1_write_ports, L1_WRITE_PORTS, uns, uns, 1, ) DEF_PARAM(l1_banks, L1_BANKS, uns, uns, 8, ) diff --git a/src/ramulator.cc b/src/ramulator.cc index 
3b0e2492..ffc0f512 100644 --- a/src/ramulator.cc +++ b/src/ramulator.cc @@ -168,6 +168,14 @@ void init_configs() { } +/** + * @brief Send req to ramulator + * + * This func is the interface between scarab and ramulator + * + * @param scarab_req + * @return int + */ int ramulator_send(Mem_Req* scarab_req) { Request req;