diff --git a/Makefile b/Makefile index efc5b8950..eb33b01dc 100644 --- a/Makefile +++ b/Makefile @@ -36,7 +36,7 @@ SRCS-y := main/main.c SRCS-y += config/static.c config/dynamic.c SRCS-y += cps/main.c cps/kni.c cps/elf.c SRCS-y += ggu/main.c -SRCS-y += gk/main.c gk/fib.c gk/bpf.c +SRCS-y += gk/main.c gk/fib.c gk/bpf.c gk/co.c SRCS-y += gt/main.c gt/lua_lpm.c SRCS-y += lls/main.c lls/cache.c lls/arp.c lls/nd.c SRCS-y += sol/main.c @@ -44,12 +44,12 @@ SRCS-y += sol/main.c # Libraries. SRCS-y += lib/mailbox.c lib/net.c lib/flow.c lib/ipip.c \ lib/luajit-ffi-cdata.c lib/launch.c lib/lpm.c lib/acl.c lib/varip.c \ - lib/l2.c lib/ratelimit.c lib/memblock.c lib/log_ratelimit.c + lib/l2.c lib/ratelimit.c lib/memblock.c lib/log_ratelimit.c lib/coro.c LDLIBS += $(LDIR) -Bstatic -lluajit-5.1 -Bdynamic -lm -lmnl -lkmod CFLAGS += $(WERROR_FLAGS) -I${GATEKEEPER}/include -I/usr/local/include/luajit-2.0/ EXTRA_CFLAGS += -O3 -g -Wfatal-errors -DALLOW_EXPERIMENTAL_API \ - -Wno-deprecated-declarations + -Wno-deprecated-declarations -DCORO_ASM include $(RTE_SDK)/mk/rte.extapp.mk diff --git a/dependencies/dpdk b/dependencies/dpdk index bcc1e4fce..c637f7cd4 160000 --- a/dependencies/dpdk +++ b/dependencies/dpdk @@ -1 +1 @@ -Subproject commit bcc1e4fce82336ca39108ed4d54fb501af4a1b5a +Subproject commit c637f7cd452d750d6eb51bb2abf9de92a111fe60 diff --git a/gk/bpf.c b/gk/bpf.c index 16b09963b..2ffcdd913 100644 --- a/gk/bpf.c +++ b/gk/bpf.c @@ -106,12 +106,13 @@ static const struct rte_bpf_xsym flow_handler_init_xsym[] = { }; struct gk_bpf_pkt_frame { - uint64_t password; - struct flow_entry *fe; - struct ipacket *packet; - struct gk_config *gk_conf; - bool ready_to_tx; - struct gk_bpf_pkt_ctx ctx; + uint64_t password; + struct flow_entry *fe; + struct ipacket *packet; + struct gk_co *this_co; + bool pkt_part2_prefetched; + bool ready_to_tx; + struct gk_bpf_pkt_ctx ctx; }; static const uint64_t pkt_password = 0xa2e329ba8b15af05; @@ -199,6 +200,7 @@ gk_bpf_prep_for_tx(struct gk_bpf_pkt_ctx *ctx, int priority, int direct_if_possible) { int ret; + struct gatekeeper_if *back; struct gk_bpf_pkt_frame *frame = pkt_ctx_to_frame(ctx); if (unlikely(frame == NULL)) return -EINVAL; @@ -208,11 +210,18 @@ gk_bpf_prep_for_tx(struct gk_bpf_pkt_ctx *ctx, int priority, if (unlikely(priority < 0 || priority > PRIORITY_MAX)) return -EINVAL; + /* Prepare packet for transmission if needed. */ + if (likely(!frame->pkt_part2_prefetched)) { + frame->pkt_part2_prefetched = true; + if (likely(rte_mbuf_prefetch_part2_non_temporal( + frame->packet->pkt))) + gk_yield_next(frame->this_co); + } + + back = &frame->this_co->work->gk_conf->net->back; ret = (direct_if_possible != 0 && priority == PRIORITY_GRANTED) - ? update_pkt_priority(frame->packet, priority, - &frame->gk_conf->net->back) - : encapsulate(frame->packet->pkt, priority, - &frame->gk_conf->net->back, + ? 
update_pkt_priority(frame->packet, priority, back) + : encapsulate(frame->packet->pkt, priority, back, &frame->fe->grantor_fib->u.grantor.gt_addr); frame->ready_to_tx = ret == 0; @@ -486,7 +495,7 @@ parse_packet_further(struct ipacket *packet, struct gk_bpf_pkt_ctx *ctx) } int -gk_bpf_decide_pkt(struct gk_config *gk_conf, uint8_t program_index, +gk_bpf_decide_pkt(struct gk_co *this_co, uint8_t program_index, struct flow_entry *fe, struct ipacket *packet, uint64_t now, uint64_t *p_bpf_ret) { @@ -494,7 +503,8 @@ gk_bpf_decide_pkt(struct gk_config *gk_conf, uint8_t program_index, .password = pkt_password, .fe = fe, .packet = packet, - .gk_conf = gk_conf, + .this_co = this_co, + .pkt_part2_prefetched = false, .ready_to_tx = false, .ctx = { .now = now, @@ -502,7 +512,7 @@ gk_bpf_decide_pkt(struct gk_config *gk_conf, uint8_t program_index, }, }; const struct gk_bpf_flow_handler *handler = - &gk_conf->flow_handlers[program_index]; + &this_co->work->gk_conf->flow_handlers[program_index]; if (unlikely(handler->f_pkt == NULL)) { GK_LOG(WARNING, diff --git a/gk/bpf.h b/gk/bpf.h index f5c93e9ec..05cfd7f6d 100644 --- a/gk/bpf.h +++ b/gk/bpf.h @@ -20,6 +20,7 @@ #define _GATEKEEPER_GK_BPF_H_ #include "gatekeeper_gk.h" +#include "co.h" /* * Load the BPF program that handles flows into @gk_conf at @@ -32,7 +33,7 @@ int gk_load_bpf_flow_handler(struct gk_config *gk_conf, unsigned int index, const char *filename, int jit); -int gk_bpf_decide_pkt(struct gk_config *gk_conf, uint8_t program_index, +int gk_bpf_decide_pkt(struct gk_co *this_co, uint8_t program_index, struct flow_entry *fe, struct ipacket *packet, uint64_t now, uint64_t *p_bpf_ret); diff --git a/gk/co.c b/gk/co.c new file mode 100644 index 000000000..35ad7d941 --- /dev/null +++ b/gk/co.c @@ -0,0 +1,1121 @@ +/* + * Gatekeeper - DoS protection system. + * Copyright (C) 2016 Digirati LTDA. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include + +#include "gatekeeper_lls.h" + +#include "bpf.h" +#include "co.h" + +static struct gk_co * +get_next_co(struct gk_co *this_co) +{ + /* + * It is unlikely because as long as there is more than + * one working coroutine, there is at least 50% chance that + * @this_co is not the last working coroutine. + */ + if (unlikely(this_co->co_list.next == &this_co->work->working_cos)) { + /* @this_co is the last working co. */ + return list_first_entry(&this_co->work->working_cos, + struct gk_co, co_list); + } + return list_next_entry(this_co, co_list); +} + +void +gk_yield_next(struct gk_co *this_co) +{ + struct gk_co *next_co = get_next_co(this_co); + if (unlikely(this_co == next_co)) + return; + coro_transfer(&this_co->coro, &next_co->coro); +} + +/* + * If @task is added to @this_co->task_queue without a proper @task->task_hash, + * @task must be rescheduled once the proper @task->task_hash becomes known + * in order to avoid race conditions related to the proper @task->task_hash. 
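To make the new coroutine primitives easier to follow: gk_yield_next() above is what turns a pending cache miss into useful work, and gk_bpf_prep_for_tx() in gk/bpf.c now wraps it around rte_mbuf_prefetch_part2_non_temporal(). Below is a minimal sketch of the idiom; example_task() and do_real_work() are hypothetical names, only the helpers and fields come from this patch.

static void
example_task(struct gk_co *this_co, struct gk_co_task *task)
{
	struct flow_entry *fe = task->task_arg;

	/* Start pulling @fe into cache without polluting it. */
	rte_prefetch_non_temporal(fe);
	/* Run a sibling coroutine while the prefetch is in flight. */
	gk_yield_next(this_co);
	/* By the time control returns here, @fe is likely cached. */
	do_real_work(this_co, fe);
}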
+ * + * NOTICE: while a task is running without a proper @task->task_hash, + * the task must not use the leftover available because the task is likely + * running under a task hash that is different of its proper @task->task_hash. + */ +static void +reschedule_task(struct gk_co *this_co, struct gk_co_task *task) +{ + struct gk_co_work *work = this_co->work; + struct gk_co *task_owner_co = get_task_owner_co(work, task); + + __schedule_task(task_owner_co, task); + + if (list_poison(&task_owner_co->co_list)) + list_add_tail(&task_owner_co->co_list, &work->working_cos); +} + +static int +extract_packet_info(struct rte_mbuf *pkt, struct ipacket *packet) +{ + int ret = 0; + uint16_t ether_type; + size_t ether_len; + struct rte_ether_hdr *eth_hdr; + struct rte_ipv4_hdr *ip4_hdr; + struct rte_ipv6_hdr *ip6_hdr; + uint16_t pkt_len = rte_pktmbuf_data_len(pkt); + + eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *); + ether_type = rte_be_to_cpu_16(pkt_in_skip_l2(pkt, eth_hdr, + &packet->l3_hdr)); + ether_len = pkt_in_l2_hdr_len(pkt); + + switch (ether_type) { + case RTE_ETHER_TYPE_IPV4: + if (pkt_len < ether_len + sizeof(*ip4_hdr)) { + packet->flow.proto = 0; + GK_LOG(NOTICE, + "Packet is too short to be IPv4 (%" PRIu16 ")\n", + pkt_len); + ret = -1; + goto out; + } + + ip4_hdr = packet->l3_hdr; + packet->flow.proto = RTE_ETHER_TYPE_IPV4; + packet->flow.f.v4.src.s_addr = ip4_hdr->src_addr; + packet->flow.f.v4.dst.s_addr = ip4_hdr->dst_addr; + break; + + case RTE_ETHER_TYPE_IPV6: + if (pkt_len < ether_len + sizeof(*ip6_hdr)) { + packet->flow.proto = 0; + GK_LOG(NOTICE, + "Packet is too short to be IPv6 (%" PRIu16 ")\n", + pkt_len); + ret = -1; + goto out; + } + + ip6_hdr = packet->l3_hdr; + packet->flow.proto = RTE_ETHER_TYPE_IPV6; + rte_memcpy(packet->flow.f.v6.src.s6_addr, ip6_hdr->src_addr, + sizeof(packet->flow.f.v6.src.s6_addr)); + rte_memcpy(packet->flow.f.v6.dst.s6_addr, ip6_hdr->dst_addr, + sizeof(packet->flow.f.v6.dst.s6_addr)); + break; + + case RTE_ETHER_TYPE_ARP: + packet->flow.proto = RTE_ETHER_TYPE_ARP; + ret = -1; + break; + + default: + packet->flow.proto = 0; + log_unknown_l2("gk", ether_type); + ret = -1; + break; + } +out: + packet->pkt = pkt; + return ret; +} + +static int +drop_packet_front(struct rte_mbuf *pkt, struct gk_instance *instance) +{ + instance->traffic_stats.tot_pkts_num_dropped++; + instance->traffic_stats.tot_pkts_size_dropped += + rte_pktmbuf_pkt_len(pkt); + + return drop_packet(pkt); +} + +static int +parse_front_pkt(struct gk_co *this_co, + struct ipacket *packet, struct rte_mbuf *pkt) +{ + struct gk_co_work *work = this_co->work; + int ret; + + /* TODO Does this prefetch improve performance? + rte_mbuf_prefetch_part1_non_temporal(pkt); + gk_yield_next(this_co); + */ + /* + * This prefetch is enough to load Ethernet header (14 bytes), + * optional Ethernet VLAN header (8 bytes), and either + * an IPv4 header without options (20 bytes), or + * an IPv6 header without options (40 bytes). + * IPv4: 14 + 8 + 20 = 42 + * IPv6: 14 + 8 + 40 = 62 + rte_prefetch_non_temporal(rte_pktmbuf_mtod_offset(pkt, void *, 0)); + gk_yield_next(this_co); + */ + + ret = extract_packet_info(pkt, packet); + if (ret < 0) { + if (likely(packet->flow.proto == RTE_ETHER_TYPE_ARP)) { + struct gk_measurement_metrics *stats = + &work->instance->traffic_stats; + + stats->tot_pkts_num_distributed++; + stats->tot_pkts_size_distributed += + rte_pktmbuf_pkt_len(pkt); + + work->front_arp_bufs[work->front_num_arp++] = pkt; + return -1; + } + + /* Drop non-IP and non-ARP packets. 
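The rescheduling contract above is easiest to see with a task split into two halves, the same shape gk_co_process_front_pkt_software_rss() takes later in this file. The example_* names below are hypothetical; the helpers and fields are the ones introduced by this patch.

static void example_finish_with_hash(struct gk_co *this_co,
	struct gk_co_task *task);

static void
example_start_without_hash(struct gk_co *this_co, struct gk_co_task *task)
{
	struct ipacket *packet = task->task_arg;

	/* Hash-independent work only: parsing, validation, counters. */

	/* The proper hash is now known; finish under its owner coroutine. */
	task->task_hash = rss_ip_flow_hf(&packet->flow, 0, 0);
	task->task_func = example_finish_with_hash;
	reschedule_task(this_co, task);
}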
*/ + drop_packet_front(pkt, work->instance); + return -1; + } + + if (unlikely((packet->flow.proto == RTE_ETHER_TYPE_IPV4 && + !work->front_ipv4_configured) || + (packet->flow.proto == RTE_ETHER_TYPE_IPV6 && + !work->front_ipv6_configured))) { + drop_packet_front(pkt, work->instance); + return -1; + } + + return 0; +} + +#define START_PRIORITY (38) +/* Set @START_ALLOWANCE as the double size of a large DNS reply. */ +#define START_ALLOWANCE (8) + +static void +initialize_flow_entry(struct flow_entry *fe, struct ip_flow *flow, + uint32_t flow_hash_val, struct gk_fib *grantor_fib) +{ + /* + * The flow table is a critical data structure, so, + * whenever the size of entries grow too much, + * one must look for alternatives before increasing + * the limit below. + */ + RTE_BUILD_BUG_ON(sizeof(*fe) > 128); + + rte_memcpy(&fe->flow, flow, sizeof(*flow)); + + fe->in_use = true; + fe->flow_hash_val = flow_hash_val; + fe->state = GK_REQUEST; + fe->u.request.last_packet_seen_at = rte_rdtsc(); + fe->u.request.last_priority = START_PRIORITY; + fe->u.request.allowance = START_ALLOWANCE - 1; + fe->grantor_fib = grantor_fib; +} + +static inline void +reinitialize_flow_entry(struct flow_entry *fe, uint64_t now) +{ + fe->state = GK_REQUEST; + fe->u.request.last_packet_seen_at = now; + fe->u.request.last_priority = START_PRIORITY; + fe->u.request.allowance = START_ALLOWANCE - 1; +} + +static inline void +prefetch_flow_entry(struct flow_entry *fe) +{ +#if RTE_CACHE_LINE_SIZE == 64 + RTE_BUILD_BUG_ON(sizeof(*fe) <= RTE_CACHE_LINE_SIZE); + RTE_BUILD_BUG_ON(sizeof(*fe) > 2 * RTE_CACHE_LINE_SIZE); + rte_prefetch0(fe); + rte_prefetch0(((char *)fe) + RTE_CACHE_LINE_SIZE); +#elif RTE_CACHE_LINE_SIZE == 128 + RTE_BUILD_BUG_ON(sizeof(*fe) > RTE_CACHE_LINE_SIZE); + rte_prefetch0(fe); +#else +#error "Unsupported cache line size" +#endif +} + +/* We should avoid calling integer_log_base_2() with zero. */ +static inline uint8_t +integer_log_base_2(uint64_t delta_time) +{ +#if __WORDSIZE == 64 + return (8 * sizeof(uint64_t) - 1) - __builtin_clzl(delta_time); +#else + return (8 * sizeof(uint64_t) - 1) - __builtin_clzll(delta_time); +#endif +} + +/* + * It converts the difference of time between the current packet and + * the last seen packet into a given priority. + */ +static uint8_t +priority_from_delta_time(uint64_t present, uint64_t past) +{ + uint64_t delta_time; + + if (unlikely(present < past)) { + /* + * This should never happen, but we handle it gracefully here + * in order to keep going. + */ + GK_LOG(ERR, "The present time smaller than the past time\n"); + return 0; + } + + delta_time = (present - past) * picosec_per_cycle; + if (unlikely(delta_time < 1)) + return 0; + + return integer_log_base_2(delta_time); +} + +/* + * When a flow entry is at request state, all the GK block processing + * that entry does is to: + * (1) compute the priority of the packet. + * (2) encapsulate the packet as a request. + * (3) put this encapsulated packet in the request queue. 
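As a worked example of priority_from_delta_time() above (numbers purely illustrative): a request packet arriving 1 microsecond after the previous one gives delta_time = 10^6 picoseconds, and floor(log2(10^6)) = 19, so the packet is assigned priority 19 before the DSCP adjustment applied below; a source has to wait roughly twice as long between packets to move up by one priority level.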
+ */ +static void +gk_process_request(struct gk_co *this_co, struct flow_entry *fe, + struct ipacket *packet) +{ + int ret; + uint64_t now = rte_rdtsc(); + uint8_t priority = priority_from_delta_time(now, + fe->u.request.last_packet_seen_at); + struct rte_mbuf *pkt = packet->pkt; + struct gk_co_work *work = this_co->work; + struct gatekeeper_if *back = &work->gk_conf->net->back; + struct gk_fib *fib = fe->grantor_fib; + struct ether_cache *eth_cache; + + fe->u.request.last_packet_seen_at = now; + + /* + * The reason for using "<" instead of "<=" is that the equal case + * means that the source has waited enough time to have the same + * last priority, so it should be awarded with the allowance. + */ + if (priority < fe->u.request.last_priority && + fe->u.request.allowance > 0) { + fe->u.request.allowance--; + priority = fe->u.request.last_priority; + } else { + fe->u.request.last_priority = priority; + fe->u.request.allowance = START_ALLOWANCE - 1; + } + + /* + * Adjust @priority for the DSCP field. + * DSCP 0 for legacy packets; 1 for granted packets; + * 2 for capability renew; 3-63 for requests. + */ + priority += PRIORITY_REQ_MIN; + if (unlikely(priority > PRIORITY_MAX)) + priority = PRIORITY_MAX; + + /* The assigned priority is @priority. */ + + /* Prepare packet for transmission. */ + if (likely(rte_mbuf_prefetch_part2_non_temporal(pkt))) + gk_yield_next(this_co); + + /* Encapsulate the packet as a request. */ + ret = encapsulate(pkt, priority, back, &fib->u.grantor.gt_addr); + if (ret < 0) + goto drop_pkt; + + eth_cache = fib->u.grantor.eth_cache; + RTE_VERIFY(eth_cache != NULL); + /* If needed, packet header space was adjusted by encapsulate(). */ + if (pkt_copy_cached_eth_header(pkt, eth_cache, back->l2_len_out)) + goto drop_pkt; + + pkt->udata64 = priority; + work->front_req_bufs[work->front_num_req++] = pkt; + return; + +drop_pkt: + drop_packet_front(pkt, work->instance); +} + +static void +gk_process_granted(struct gk_co *this_co, struct flow_entry *fe, + struct ipacket *packet) +{ + int ret; + bool renew_cap; + uint8_t priority = PRIORITY_GRANTED; + uint64_t now = rte_rdtsc(); + struct rte_mbuf *pkt = packet->pkt; + struct gk_fib *fib = fe->grantor_fib; + struct gk_co_work *work = this_co->work; + struct gatekeeper_if *back = &work->gk_conf->net->back; + struct gk_measurement_metrics *stats; + struct ether_cache *eth_cache; + uint32_t pkt_len; + + if (now >= fe->u.granted.cap_expire_at) { + reinitialize_flow_entry(fe, now); + return gk_process_request(this_co, fe, packet); + } + + if (now >= fe->u.granted.budget_renew_at) { + fe->u.granted.budget_renew_at = now + cycles_per_sec; + fe->u.granted.budget_byte = + (uint64_t)fe->u.granted.tx_rate_kib_cycle * 1024; + } + + stats = &work->instance->traffic_stats; + + pkt_len = rte_pktmbuf_pkt_len(pkt); + if (pkt_len > fe->u.granted.budget_byte) { + stats->pkts_num_declined++; + stats->pkts_size_declined += pkt_len; + goto drop_pkt; + } + + fe->u.granted.budget_byte -= pkt_len; + renew_cap = now >= fe->u.granted.send_next_renewal_at; + if (renew_cap) { + fe->u.granted.send_next_renewal_at = now + + fe->u.granted.renewal_step_cycle; + priority = PRIORITY_RENEW_CAP; + } + + /* Prepare packet for transmission. */ + if (likely(rte_mbuf_prefetch_part2_non_temporal(pkt))) + gk_yield_next(this_co); + + /* + * Encapsulate packet as a granted packet, + * mark it as a capability renewal request if @renew_cap is true, + * enter destination according to @fe->grantor_fib. 
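A worked example of the allowance logic above (values illustrative): a flow with last_priority = 20 and allowance = 3 that sends a packet computing to priority 18 keeps priority 20 and drops its allowance to 2; a packet computing to 20 or more keeps the newly computed priority, stores it as last_priority, and resets the allowance to START_ALLOWANCE - 1 = 7. The DSCP carried on the wire is then that priority shifted by PRIORITY_REQ_MIN and capped at PRIORITY_MAX.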
+ */ + ret = encapsulate(pkt, priority, back, &fib->u.grantor.gt_addr); + if (ret < 0) + goto drop_pkt; + + eth_cache = fib->u.grantor.eth_cache; + RTE_VERIFY(eth_cache != NULL); + /* If needed, packet header space was adjusted by encapsulate(). */ + if (pkt_copy_cached_eth_header(pkt, eth_cache, back->l2_len_out)) + goto drop_pkt; + + stats->pkts_num_granted++; + stats->pkts_size_granted += pkt_len; + work->tx_back_pkts[work->tx_back_num_pkts++] = pkt; + return; + +drop_pkt: + drop_packet_front(pkt, work->instance); +} + +static void +gk_process_declined(struct gk_co *this_co, struct flow_entry *fe, + struct ipacket *packet) +{ + uint64_t now = rte_rdtsc(); + struct gk_co_work *work = this_co->work; + struct gk_measurement_metrics *stats; + + if (unlikely(now >= fe->u.declined.expire_at)) { + reinitialize_flow_entry(fe, now); + return gk_process_request(this_co, fe, packet); + } + + stats = &work->instance->traffic_stats; + stats->pkts_num_declined++; + stats->pkts_size_declined += rte_pktmbuf_pkt_len(packet->pkt); + drop_packet_front(packet->pkt, work->instance); +} + +static void +gk_process_bpf(struct gk_co *this_co, struct flow_entry *fe, + struct ipacket *packet) +{ + struct rte_mbuf *pkt = packet->pkt; + struct gk_co_work *work = this_co->work; + struct gk_measurement_metrics *stats; + uint64_t bpf_ret; + int program_index, rc; + uint64_t now = rte_rdtsc(); + + if (unlikely(now >= fe->u.bpf.expire_at)) + goto expired; + + program_index = fe->program_index; + rc = gk_bpf_decide_pkt(this_co, program_index, fe, packet, now, + &bpf_ret); + if (unlikely(rc != 0)) { + GK_LOG(WARNING, + "The BPF program at index %u failed to run its function pkt\n", + program_index); + goto expired; + } + + stats = &work->instance->traffic_stats; + switch (bpf_ret) { + case GK_BPF_PKT_RET_FORWARD: { + struct ether_cache *eth_cache = + fe->grantor_fib->u.grantor.eth_cache; + RTE_VERIFY(eth_cache != NULL); + /* + * If needed, encapsulate() already adjusted + * packet header space. + */ + if (pkt_copy_cached_eth_header(pkt, eth_cache, + work->gk_conf->net->back.l2_len_out)) + goto drop_pkt; + + stats->pkts_num_granted++; + stats->pkts_size_granted += rte_pktmbuf_pkt_len(pkt); + work->tx_back_pkts[work->tx_back_num_pkts++] = pkt; + return; + } + case GK_BPF_PKT_RET_DECLINE: + stats->pkts_num_declined++; + stats->pkts_size_declined += rte_pktmbuf_pkt_len(pkt); + goto drop_pkt; + case GK_BPF_PKT_RET_ERROR: + GK_LOG(WARNING, + "The function pkt of the BPF program at index %u returned GK_BPF_PKT_RET_ERROR\n", + program_index); + goto drop_pkt; + default: + GK_LOG(WARNING, + "The function pkt of the BPF program at index %u returned an invalid return: %" PRIu64 "\n", + program_index, bpf_ret); + goto drop_pkt; + } + + rte_panic("Unexpected condition at %s()", __func__); + +expired: + reinitialize_flow_entry(fe, now); + return gk_process_request(this_co, fe, packet); + +drop_pkt: + drop_packet_front(pkt, work->instance); +} + +static void +process_flow_entry(struct gk_co *this_co, struct flow_entry *fe, + struct ipacket *packet) +{ + /* + * Some notes regarding flow rates and units: + * + * Flows in the GK_REQUEST state are bandwidth limited + * to an overall rate relative to the link. Therefore, + * the Ethernet frame overhead is counted toward the + * credits used by requests. The request channel rate + * is measured in megabits (base 10) per second to + * match the units used by hardware specifications. 
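To make the granted-flow budget arithmetic in gk_process_granted() above concrete (illustrative numbers): with tx_rate_kib_cycle = 100, each renewal sets budget_byte = 100 * 1024 = 102,400 bytes and pushes budget_renew_at one second of TSC cycles ahead; a 1,500-byte packet then leaves 100,900 bytes for the remainder of that second, and a packet larger than the remaining budget is declined outright rather than partially charged.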
+ * + * Granted flows (in state GK_GRANTED or sometimes + * GK_BPF) are allocated budgets that are intended + * to reflect the max throughput of the flow, and + * therefore do not include the Ethernet frame overhead. + * The budgets of granted flows are measured in + * kibibytes (base 2). + */ + switch (fe->state) { + case GK_REQUEST: + return gk_process_request(this_co, fe, packet); + + case GK_GRANTED: + return gk_process_granted(this_co, fe, packet); + + case GK_DECLINED: + return gk_process_declined(this_co, fe, packet); + + case GK_BPF: + return gk_process_bpf(this_co, fe, packet); + + default: + GK_LOG(ERR, "Unknown flow state: %d\n", fe->state); + drop_packet_front(packet->pkt, this_co->work->instance); + return; + } + + rte_panic("Unexpected condition at %s()\n", __func__); +} + +typedef int (*packet_drop_cb_func)(struct rte_mbuf *pkt, + struct gk_instance *instance); + +static void +xmit_icmp(struct gatekeeper_if *iface, struct ipacket *packet, + uint16_t *num_pkts, struct rte_mbuf **icmp_bufs, + struct gk_instance *instance, packet_drop_cb_func cb_f) +{ + struct rte_ether_addr eth_addr_tmp; + struct rte_ether_hdr *icmp_eth; + struct rte_ipv4_hdr *icmp_ipv4; + struct rte_icmp_hdr *icmph; + struct rte_mbuf *pkt = packet->pkt; + int icmp_pkt_len = iface->l2_len_out + sizeof(struct rte_ipv4_hdr) + + sizeof(struct rte_icmp_hdr); + if (pkt->data_len >= icmp_pkt_len) { + int ret = rte_pktmbuf_trim(pkt, pkt->data_len - icmp_pkt_len); + if (ret < 0) { + GK_LOG(ERR, + "Failed to remove %d bytes of data at the end of the mbuf at %s", + pkt->data_len - icmp_pkt_len, __func__); + cb_f(pkt, instance); + return; + } + + icmp_eth = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *); + } else { + icmp_eth = (struct rte_ether_hdr *)rte_pktmbuf_append(pkt, + icmp_pkt_len - pkt->data_len); + if (icmp_eth == NULL) { + GK_LOG(ERR, + "Failed to append %d bytes of new data: not enough headroom space in the first segment at %s\n", + icmp_pkt_len - pkt->data_len, __func__); + cb_f(pkt, instance); + return; + } + } + + rte_ether_addr_copy(&icmp_eth->s_addr, ð_addr_tmp); + rte_ether_addr_copy(&icmp_eth->d_addr, &icmp_eth->s_addr); + rte_ether_addr_copy(ð_addr_tmp, &icmp_eth->d_addr); + if (iface->vlan_insert) { + fill_vlan_hdr(icmp_eth, iface->vlan_tag_be, + RTE_ETHER_TYPE_IPV4); + } + + icmp_ipv4 = (struct rte_ipv4_hdr *)pkt_out_skip_l2(iface, icmp_eth); + icmp_ipv4->version_ihl = IP_VHL_DEF; + icmp_ipv4->type_of_service = 0; + icmp_ipv4->packet_id = 0; + icmp_ipv4->fragment_offset = IP_DN_FRAGMENT_FLAG; + icmp_ipv4->time_to_live = IP_DEFTTL; + icmp_ipv4->next_proto_id = IPPROTO_ICMP; + icmp_ipv4->src_addr = packet->flow.f.v4.dst.s_addr; + icmp_ipv4->dst_addr = packet->flow.f.v4.src.s_addr; + icmp_ipv4->total_length = rte_cpu_to_be_16(pkt->data_len - + iface->l2_len_out); + /* + * The IP header checksum filed must be set to 0 + * in order to offload the checksum calculation. 
+ */ + icmp_ipv4->hdr_checksum = 0; + pkt->l2_len = iface->l2_len_out; + pkt->l3_len = sizeof(struct rte_ipv4_hdr); + pkt->ol_flags |= PKT_TX_IPV4 | PKT_TX_IP_CKSUM; + + icmph = (struct rte_icmp_hdr *)&icmp_ipv4[1]; + icmph->icmp_type = ICMP_TIME_EXCEEDED; + icmph->icmp_code = ICMP_EXC_TTL; + icmph->icmp_cksum = 0; + icmph->icmp_ident = 0; + icmph->icmp_seq_nb = 0; + icmph->icmp_cksum = icmp_cksum(icmph, sizeof(*icmph)); + + icmp_bufs[*num_pkts] = pkt; + (*num_pkts)++; +} + +static void +xmit_icmpv6(struct gatekeeper_if *iface, struct ipacket *packet, + uint16_t *num_pkts, struct rte_mbuf **icmp_bufs, + struct gk_instance *instance, packet_drop_cb_func cb_f) +{ + struct rte_ether_addr eth_addr_tmp; + struct rte_ether_hdr *icmp_eth; + struct rte_ipv6_hdr *icmp_ipv6; + struct icmpv6_hdr *icmpv6_hdr; + struct rte_mbuf *pkt = packet->pkt; + int icmpv6_pkt_len = iface->l2_len_out + sizeof(struct rte_ipv6_hdr) + + sizeof(struct icmpv6_hdr); + if (pkt->data_len >= icmpv6_pkt_len) { + int ret = rte_pktmbuf_trim(pkt, + pkt->data_len - icmpv6_pkt_len); + if (ret < 0) { + GK_LOG(ERR, + "Failed to remove %d bytes of data at the end of the mbuf at %s", + pkt->data_len - icmpv6_pkt_len, __func__); + cb_f(pkt, instance); + return; + } + + icmp_eth = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *); + } else { + icmp_eth = (struct rte_ether_hdr *)rte_pktmbuf_append(pkt, + icmpv6_pkt_len - pkt->data_len); + if (icmp_eth == NULL) { + GK_LOG(ERR, + "Failed to append %d bytes of new data: not enough headroom space in the first segment at %s\n", + icmpv6_pkt_len - pkt->data_len, __func__); + cb_f(pkt, instance); + return; + } + } + + rte_ether_addr_copy(&icmp_eth->s_addr, ð_addr_tmp); + rte_ether_addr_copy(&icmp_eth->d_addr, &icmp_eth->s_addr); + rte_ether_addr_copy(ð_addr_tmp, &icmp_eth->d_addr); + if (iface->vlan_insert) { + fill_vlan_hdr(icmp_eth, iface->vlan_tag_be, + RTE_ETHER_TYPE_IPV6); + } + + /* Set-up IPv6 header. */ + icmp_ipv6 = (struct rte_ipv6_hdr *)pkt_out_skip_l2(iface, icmp_eth); + icmp_ipv6->vtc_flow = rte_cpu_to_be_32(IPv6_DEFAULT_VTC_FLOW); + icmp_ipv6->payload_len = rte_cpu_to_be_16(sizeof(*icmpv6_hdr)); + icmp_ipv6->proto = IPPROTO_ICMPV6; + /* + * The IP Hop Limit field must be 255 as required by + * RFC 4861, sections 7.1.1 and 7.1.2. + */ + icmp_ipv6->hop_limits = 255; + rte_memcpy(icmp_ipv6->src_addr, packet->flow.f.v6.dst.s6_addr, + sizeof(icmp_ipv6->src_addr)); + rte_memcpy(icmp_ipv6->dst_addr, packet->flow.f.v6.src.s6_addr, + sizeof(icmp_ipv6->dst_addr)); + + /* Set-up ICMPv6 header. */ + icmpv6_hdr = (struct icmpv6_hdr *)&icmp_ipv6[1]; + icmpv6_hdr->type = ICMPV6_TIME_EXCEED; + icmpv6_hdr->code = ICMPV6_EXC_HOPLIMIT; + icmpv6_hdr->cksum = 0; /* Calculated below. */ + + icmpv6_hdr->cksum = rte_ipv6_icmpv6_cksum(icmp_ipv6, icmpv6_hdr); + + icmp_bufs[*num_pkts] = pkt; + (*num_pkts)++; +} + +/* + * For IPv4, according to the RFC 1812 section 5.3.1 Time to Live (TTL), + * if the TTL is reduced to zero (or less), the packet MUST be + * discarded, and if the destination is not a multicast address the + * router MUST send an ICMP Time Exceeded message, Code 0 (TTL Exceeded + * in Transit) message to the source. + * + * For IPv6, according to the RFC 1883 section 4.4, + * if the IPv6 Hop Limit is less than or equal to 1, then the router needs to + * send an ICMP Time Exceeded -- Hop Limit Exceeded in Transit message to + * the Source Address and discard the packet. 
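A detail of update_ip_hop_count() below that is easy to miss: when the IPv4 TTL is decremented, the header checksum is patched in place with ++hdr_checksum instead of being recomputed. A rough justification, stated as an assumption about little-endian hosts and glossing over one's-complement corner cases (RFC 1624):

/*
 * TTL shares a 16-bit header word with the protocol field and sits in
 * its high-order byte, so --time_to_live lowers that big-endian word by
 * 0x0100. On a little-endian host, ++hdr_checksum raises the stored
 * big-endian checksum by 0x0100, compensating for the change; rare
 * one's-complement wrap-around cases are ignored by this shortcut.
 */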
+ */ +static int +update_ip_hop_count(struct gatekeeper_if *iface, struct ipacket *packet, + uint16_t *num_pkts, struct rte_mbuf **icmp_bufs, + struct token_bucket_ratelimit_state *rs, struct gk_instance *instance, + packet_drop_cb_func cb_f) +{ + if (packet->flow.proto == RTE_ETHER_TYPE_IPV4) { + struct rte_ipv4_hdr *ipv4_hdr = packet->l3_hdr; + if (ipv4_hdr->time_to_live <= 1) { + if (tb_ratelimit_allow(rs)) { + xmit_icmp(iface, packet, num_pkts, + icmp_bufs, instance, cb_f); + } else + cb_f(packet->pkt, instance); + return -ETIMEDOUT; + } + + --(ipv4_hdr->time_to_live); + ++(ipv4_hdr->hdr_checksum); + } else if (likely(packet->flow.proto == RTE_ETHER_TYPE_IPV6)) { + struct rte_ipv6_hdr *ipv6_hdr = packet->l3_hdr; + if (ipv6_hdr->hop_limits <= 1) { + if (tb_ratelimit_allow(rs)) { + xmit_icmpv6(iface, packet, num_pkts, + icmp_bufs, instance, cb_f); + } else + cb_f(packet->pkt, instance); + return -ETIMEDOUT; + } + + --(ipv6_hdr->hop_limits); + } else { + GK_LOG(WARNING, + "Unexpected condition at %s: unknown flow type %hu\n", + __func__, packet->flow.proto); + cb_f(packet->pkt, instance); + return -EINVAL; + } + + return 0; +} + +static void +forward_pkt_to_back(struct ipacket *packet, struct ether_cache *eth_cache, + struct gk_co_work *work) +{ + struct rte_mbuf *pkt = packet->pkt; + struct gatekeeper_if *front = &work->gk_conf->net->front; + struct gatekeeper_if *back = &work->gk_conf->net->back; + + if (adjust_pkt_len(pkt, back, 0) == NULL || + pkt_copy_cached_eth_header(pkt, eth_cache, + back->l2_len_out)) { + drop_packet_front(pkt, work->instance); + return; + } + + if (update_ip_hop_count(front, packet, + &work->tx_front_num_pkts, work->tx_front_pkts, + &work->instance->front_icmp_rs, work->instance, + drop_packet_front) < 0) + return; + + work->tx_back_pkts[work->tx_back_num_pkts++] = pkt; +} + +static struct gk_fib * +look_up_fib(struct gk_lpm *ltbl, struct ip_flow *flow) +{ + int fib_id; + + if (flow->proto == RTE_ETHER_TYPE_IPV4) { + fib_id = lpm_lookup_ipv4(ltbl->lpm, flow->f.v4.dst.s_addr); + if (fib_id < 0) + return NULL; + return <bl->fib_tbl[fib_id]; + } + + if (likely(flow->proto == RTE_ETHER_TYPE_IPV6)) { + fib_id = lpm_lookup_ipv6(ltbl->lpm6, &flow->f.v6.dst); + if (fib_id < 0) + return NULL; + return <bl->fib_tbl6[fib_id]; + } + + rte_panic("Unexpected condition at %s: unknown flow type %hu\n", + __func__, flow->proto); + + return NULL; /* Unreachable. */ +} + +static struct flow_entry * +lookup_fe_from_lpm(struct ipacket *packet, uint32_t ip_flow_hash_val, + struct gk_co_work *work) +{ + struct rte_mbuf *pkt = packet->pkt; + + /* + * A prefetch is not needed here because current deployments of + * Gatekeeper servers have only a couple of FIB entries forwarding + * traffic from front to back interfaces. 
+ */ + struct gk_fib *fib = look_up_fib(&work->gk_conf->lpm_tbl, + &packet->flow); + + if (fib == NULL || fib->action == GK_FWD_NEIGHBOR_FRONT_NET) { + struct gk_measurement_metrics *stats = + &work->instance->traffic_stats; + if (packet->flow.proto == RTE_ETHER_TYPE_IPV4) { + stats->tot_pkts_num_distributed++; + stats->tot_pkts_size_distributed += + rte_pktmbuf_pkt_len(pkt); + add_pkt_acl(&work->front_acl4, pkt); + } else if (likely(packet->flow.proto == + RTE_ETHER_TYPE_IPV6)) { + stats->tot_pkts_num_distributed++; + stats->tot_pkts_size_distributed += + rte_pktmbuf_pkt_len(pkt); + add_pkt_acl(&work->front_acl6, pkt); + } else { + print_flow_err_msg(&packet->flow, + "gk: failed to get the fib entry"); + drop_packet_front(pkt, work->instance); + } + return NULL; + } + + switch (fib->action) { + case GK_FWD_GRANTOR: { + struct flow_entry *fe = &work->temp_fes[work->temp_fes_num++]; + initialize_flow_entry(fe, &packet->flow, ip_flow_hash_val, fib); + return fe; + } + + case GK_FWD_GATEWAY_BACK_NET: { + /* + * The entry instructs to forward its packets to + * the gateway in the back network. + */ + struct ether_cache *eth_cache = fib->u.gateway.eth_cache; + RTE_VERIFY(eth_cache != NULL); + forward_pkt_to_back(packet, eth_cache, work); + return NULL; + } + + case GK_FWD_NEIGHBOR_BACK_NET: { + /* + * The entry instructs to forward its packets to + * the neighbor in the back network. + */ + struct ether_cache *eth_cache = + (packet->flow.proto == RTE_ETHER_TYPE_IPV4) + ? lookup_ether_cache(&fib->u.neigh, + &packet->flow.f.v4.dst) + : lookup_ether_cache(&fib->u.neigh6, + &packet->flow.f.v6.dst); + RTE_VERIFY(eth_cache != NULL); + forward_pkt_to_back(packet, eth_cache, work); + return NULL; + } + + case GK_DROP: + /* FALLTHROUGH */ + default: + drop_packet_front(pkt, work->instance); + return NULL; + } + + return NULL; +} + +static void +prefetch_and_yield(void *addr, void *this_co) +{ + rte_prefetch_non_temporal(addr); + gk_yield_next(this_co); +} + +static void +gk_co_process_front_pkt_final(struct gk_co *this_co, struct gk_co_task *task) +{ + struct ipacket *packet = task->task_arg; + struct gk_co_work *work = this_co->work; + uint32_t ip_flow_hash_val = task->task_hash; + struct flow_entry *fe_leftover = + get_fe_leftover(work, ip_flow_hash_val); + struct flow_entry *fe; + int ret; + + /* Is leftover useful? */ + if (fe_leftover != NULL && + fe_leftover->flow_hash_val == ip_flow_hash_val && + ip_flow_cmp_eq(&fe_leftover->flow, + &packet->flow, 0) == 0) { + /* Jackpot! Deal with @pkt right away. */ + process_flow_entry(this_co, fe_leftover, packet); + return; + } + + /* Look up flow entry. */ + ret = rte_hash_lookup_and_yield_with_hash( + work->instance->ip_flow_hash_table, &packet->flow, + ip_flow_hash_val, prefetch_and_yield, this_co); + if (ret >= 0) { + fe = &work->instance->ip_flow_entry_table[ret]; + /* TODO Break this prefetch into part1 and part2. 
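Stepping back, gk_co_process_front_pkt_final() above is a three-level search: the per-batch leftover cache, then the flow hash table (prefetched bucket by bucket while yielding to sibling coroutines), then the LPM/FIB via lookup_fe_from_lpm(). The leftover cache is deliberately tiny: with an illustrative leftover_mask of 7 it has 8 buckets, an entry lands in bucket flow_hash_val & 7, and save_fe_leftover() lets a newer entry silently evict an older one.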
*/ + prefetch_flow_entry(fe); + gk_yield_next(this_co); + process_flow_entry(this_co, fe, packet); + save_fe_leftover(work, fe); + return; + } + if (unlikely(ret != -ENOENT)) { + char err_msg[1024]; + + ret = snprintf(err_msg, sizeof(err_msg), + "gk: failed to look up flow state at %s with lcore %u: %i\n", + __func__, rte_lcore_id(), ret); + + RTE_VERIFY(ret > 0 && ret < (int)sizeof(err_msg)); + print_flow_err_msg(&packet->flow, err_msg); + return; + } + + fe = lookup_fe_from_lpm(packet, ip_flow_hash_val, work); + if (fe == NULL) + return; + process_flow_entry(this_co, fe, packet); + save_fe_leftover(work, fe); +} + +void +gk_co_process_front_pkt_software_rss(struct gk_co *this_co, + struct gk_co_task *task) +{ + struct ipacket *packet = task->task_arg; + + if (parse_front_pkt(this_co, packet, packet->pkt) != 0) + return; + + /* Finish up the work with the correct hash value. */ + task->task_hash = rss_ip_flow_hf(&packet->flow, 0, 0); + task->task_func = gk_co_process_front_pkt_final; + reschedule_task(this_co, task); +} + +void +gk_co_process_front_pkt(struct gk_co *this_co, struct gk_co_task *task) +{ + struct ipacket packet; + + if (parse_front_pkt(this_co, &packet, task->task_arg) != 0) + return; + task->task_arg = &packet; + gk_co_process_front_pkt_final(this_co, task); +} + +static void +gk_co_scan_flow_table_final(struct gk_co *this_co, struct gk_co_task *task) +{ + struct gk_co_work *work = this_co->work; + struct flow_entry *fe = task->task_arg; + struct flow_entry **leftover_bucket = get_fe_leftover_bucket(work, fe); + + RTE_VERIFY(work->del_fe == NULL); + work->del_fe = fe; + + /* Deal with the leftover. */ + if (unlikely(*leftover_bucket == fe)) { + /* One does not need to look up again. */ + return; + } + *leftover_bucket = fe; + + /* Prefetch buckets to remove the flow entry later. */ + rte_hash_lookup_and_yield_with_hash(work->instance->ip_flow_hash_table, + &fe->flow, fe->flow_hash_val, prefetch_and_yield, this_co); +} + +static bool +is_flow_expired(struct flow_entry *fe, uint64_t now) +{ + switch(fe->state) { + case GK_REQUEST: + if (fe->u.request.last_packet_seen_at > now) { + char err_msg[128]; + int ret = snprintf(err_msg, sizeof(err_msg), + "gk: buggy condition at %s: wrong timestamp", + __func__); + RTE_VERIFY(ret > 0 && ret < (int)sizeof(err_msg)); + print_flow_err_msg(&fe->flow, err_msg); + return true; + } + + /* + * A request entry is considered expired if it is not + * doubling its waiting time. We use +2 instead of +1 in + * the test below to account for random delays in the network. + */ + return priority_from_delta_time(now, + fe->u.request.last_packet_seen_at) > + fe->u.request.last_priority + 2; + case GK_GRANTED: + return now >= fe->u.granted.cap_expire_at; + case GK_DECLINED: + return now >= fe->u.declined.expire_at; + case GK_BPF: + return now >= fe->u.bpf.expire_at; + default: + return true; + } +} + +void +gk_co_scan_flow_table(struct gk_co *this_co, struct gk_co_task *task) +{ + struct flow_entry *fe = task->task_arg; + + /* + * Only one prefetch is needed here because one only needs + * the beginning of a struct flow_entry to + * check if it's expired. + */ + rte_prefetch_non_temporal(fe); + gk_yield_next(this_co); + + if (!fe->in_use || !is_flow_expired(fe, rte_rdtsc())) + return; + + /* Finish up the work with the correct hash value. 
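A concrete reading of the request-state expiry rule in is_flow_expired() above (numbers illustrative): a request flow whose last packet earned priority 19 is only treated as expired once priority_from_delta_time() exceeds 21, i.e. once the source has stayed silent for at least four times the spacing that earned it priority 19; the "+2" slack absorbs ordinary network jitter.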
*/ + task->task_hash = fe->flow_hash_val; + task->task_func = gk_co_scan_flow_table_final; + reschedule_task(this_co, task); +} + +static struct gk_co_task * +next_task(struct gk_co *this_co) +{ + while (true) { + struct gk_co *next_co; + + /* + * This test is likely because if @this_co has at least + * one task, there's at least 50% that it will be true because + * this function is called twice. + */ + if (likely(!list_empty(&this_co->task_queue))) { + /* + * @this_co has assigned tasks. + * Return the first assigned task. + */ + struct gk_co_task *task = list_first_entry( + &this_co->task_queue, struct gk_co_task, + task_list); + list_del(&task->task_list); + return task; + } + + /* There is no more tasks assigned to @this_co. */ + + next_co = get_next_co(this_co); + + /* Make @this_co idle. */ + list_del(&this_co->co_list); + + /* Transfer control to another coroutine. */ + if (likely(this_co != next_co)) { + /* + * @this_co is NOT the last working coroutine. + * Yield to the next coroutine. + */ + coro_transfer(&this_co->coro, &next_co->coro); + } else { + /* + * No more work and no more working coroutines; + * @this_co is the last working coroutine. + * Return to the main coroutine. + */ + coro_transfer(&this_co->coro, + &this_co->work->instance->coro_root); + } + } +} + +void +gk_co_main(void *arg) +{ + struct gk_co *this_co = arg; + struct gk_co_task *task = next_task(this_co); + + while (likely(task != NULL)) { + task->task_func(this_co, task); + task = next_task(this_co); + } + + rte_panic("%s() terminated\n", __func__); +} diff --git a/gk/co.h b/gk/co.h new file mode 100644 index 000000000..6ed27033a --- /dev/null +++ b/gk/co.h @@ -0,0 +1,290 @@ +/* + * Gatekeeper - DoS protection system. + * Copyright (C) 2016 Digirati LTDA. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef _GATEKEEPER_GK_CO_H_ +#define _GATEKEEPER_GK_CO_H_ + +#include +#include +#include +#include + +#include "gatekeeper_gk.h" +#include "gatekeeper_acl.h" + +struct gk_co { + /* + * Attach this coroutine to work->working_cos while + * this coroutine is working. + */ + struct list_head co_list; + /* structs from libcoro. */ + struct coro_stack stack; + struct coro_context coro; + /* Task assigned to this coroutine. */ + struct list_head task_queue; + struct gk_co_work *work; +}; + +struct gk_co_task *task; + +typedef void (*gk_co_task_func_t)(struct gk_co *this_co, + struct gk_co_task *task); + +struct gk_co_task { + /* + * Once the task is assigned to a coroutine, + * attach this task to co->task_queue. + */ + struct list_head task_list; + /* + * @task_hash is used to assign the task to a coroutine. + * + * This is important to avoid race conditions between coroutines. 
+ * For example, assume that two tasks that are going to work on + * the same flow entry are assigned to two different coroutines, and + * that the corresponding flow entry is not available in + * the flow table, both coroutines may try to add the same flow entry. + * If these two tasks share the same task hash, both tasks are going to + * be assigned to the same coroutine. + */ + uint32_t task_hash; + void *task_arg; + gk_co_task_func_t task_func; +}; + +struct gk_co_work { + /* The coroutines working on the tasks. */ + struct list_head working_cos; + /* Coroutines available to do the work. */ + struct gk_co *cos; + /* Number of coroutines available for the next batch of work. */ + uint16_t co_num; + /* Total number of coroutines available at field @cos. */ + uint16_t co_max_num; + /* Index of the next coroutine to use when a task has no task hash. */ + uint16_t any_co_index; + /* How field @co_num will change for the next batch of work. */ + int16_t co_delta_num; + /* + * Previous value of field @co_num. + * When the value of this field is zero, an invalid value for @co_num, + * the value of field @avg_cycles_per_task is not meaningful. + */ + uint16_t co_prv_num; + /* + * Average number of cycles per task when @co_num was equal to + * @co_prv_num. + */ + double avg_cycles_per_task; + + struct gk_config *gk_conf; + struct gk_instance *instance; + + /* All preallocated tasks available to do work. */ + struct gk_co_task *all_tasks; + /* The total number of taks available at field @all_tasks. */ + const uint32_t task_total; + /* Current number of tasks used at field @all_tasks. */ + uint32_t task_num; + + /* Fields for front packets and mailbox messages. */ + /* + * This is a single-entry-per-bucket hash table. + * This flow entries are reused between tasks assigned to + * the same coroutine. + */ + struct flow_entry ** const leftover; + /* + * Flow entries that has not been inserted in the flow table, but + * they may be present in @leftover. + */ + struct flow_entry * const temp_fes; + /* Number of entries in used in @temp_fes. */ + uint16_t temp_fes_num; + /* + * Mask for the hash table @leftover. + * It must be of the form (2^n - 1) for any n >= 0. + */ + const uint32_t leftover_mask; + /* + * The following fields release the coroutines of acquiring + * a writer lock on the flow table. + */ + /* If different of NULL, free this entry in flush_work(). */ + struct flow_entry *del_fe; + + /* Fields for front and back packets. */ + uint16_t tx_front_num_pkts; + uint16_t tx_back_num_pkts; + struct rte_mbuf ** const tx_front_pkts; + struct rte_mbuf ** const tx_back_pkts; + /* + * The following field is only needed when the RSS hash is not + * available. + */ + struct ipacket * const packets; + + /* Fields for the front packets only. */ + uint16_t front_num_req; + uint16_t front_num_arp; + struct rte_mbuf ** const front_req_bufs; + struct rte_mbuf ** const front_arp_bufs; + struct acl_search front_acl4; + struct acl_search front_acl6; + bool front_ipv4_configured; + bool front_ipv6_configured; + + /* Fields for the front packets only. */ + uint16_t back_num_arp; + struct rte_mbuf ** const back_arp_bufs; + struct acl_search back_acl4; + struct acl_search back_acl6; +}; + +/* Declare and initialize a struct gk_co_work. 
*/ +#define DEFINE_GK_CO_WORK(name, max_front_pkts, max_back_pkts, \ + max_mailbox, lo_mask, task_extra) \ + struct gk_co_task name##_all_tasks_array[(max_front_pkts) + \ + (max_back_pkts) + (max_mailbox) + (task_extra)]; \ + struct flow_entry *name##_leftover_array[(lo_mask) + 1]; \ + struct flow_entry name##_temp_fes_array[ \ + (max_front_pkts) + (max_mailbox)]; \ + struct rte_mbuf *name##_tx_front_pkts_array[ \ + (max_front_pkts) + (max_back_pkts)]; \ + struct rte_mbuf *name##_tx_back_pkts_array[ \ + (max_front_pkts) + (max_back_pkts)]; \ + struct ipacket name##_packets_array[ \ + (max_front_pkts) + (max_back_pkts)]; \ + struct rte_mbuf *name##_front_req_bufs_array[(max_front_pkts)]; \ + struct rte_mbuf *name##_front_arp_bufs_array[(max_front_pkts)]; \ + DECLARE_ACL_SEARCH_VARIABLE_PART(front_acl4, (max_front_pkts)); \ + DECLARE_ACL_SEARCH_VARIABLE_PART(front_acl6, (max_front_pkts)); \ + struct rte_mbuf *name##_back_arp_bufs_array[(max_back_pkts)]; \ + DECLARE_ACL_SEARCH_VARIABLE_PART(back_acl4, (max_back_pkts)); \ + DECLARE_ACL_SEARCH_VARIABLE_PART(back_acl6, (max_back_pkts)); \ + struct gk_co_work name = { \ + .working_cos = LIST_HEAD_INIT(name.working_cos), \ + .cos = NULL, \ + .co_num = 0, \ + .co_max_num = 0, \ + .any_co_index = 0, \ + .co_delta_num = 1, \ + .co_prv_num = 0, \ + .avg_cycles_per_task = 0, \ + .gk_conf = NULL, \ + .instance = NULL, \ + .all_tasks = name##_all_tasks_array, \ + .task_total = (max_front_pkts) + (max_back_pkts) + \ + (max_mailbox) + (task_extra), \ + .task_num = 0, \ + .leftover = memset(name##_leftover_array, 0, \ + sizeof(name##_leftover_array)), \ + .temp_fes = name##_temp_fes_array, \ + .temp_fes_num = 0, \ + .leftover_mask = (lo_mask), \ + .del_fe = NULL, \ + .tx_front_num_pkts = 0, \ + .tx_back_num_pkts = 0, \ + .tx_front_pkts = name##_tx_front_pkts_array, \ + .tx_back_pkts = name##_tx_back_pkts_array, \ + .packets = name##_packets_array, \ + .front_num_req = 0, \ + .front_num_arp = 0, \ + .front_req_bufs = name##_front_req_bufs_array, \ + .front_arp_bufs = name##_front_arp_bufs_array, \ + .front_acl4 = ACL_SEARCH_INIT(front_acl4), \ + .front_acl6 = ACL_SEARCH_INIT(front_acl6), \ + .front_ipv4_configured = false, \ + .front_ipv6_configured = false, \ + .back_num_arp = 0, \ + .back_arp_bufs = name##_back_arp_bufs_array, \ + .back_acl4 = ACL_SEARCH_INIT(back_acl4), \ + .back_acl6 = ACL_SEARCH_INIT(back_acl6), \ + } + +static inline struct gk_co * +get_task_owner_co(struct gk_co_work *work, struct gk_co_task *task) +{ + return &work->cos[task->task_hash % work->co_num]; +} + +static inline void +__schedule_task(struct gk_co *task_owner_co, struct gk_co_task *task) +{ + list_add_tail(&task->task_list, &task_owner_co->task_queue); +} + +static inline void +schedule_task(struct gk_co_work *work, struct gk_co_task *task) +{ + __schedule_task(get_task_owner_co(work, task), task); +} + +/* Uniformly distribuite tasks with no task hash among coroutines. 
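A hypothetical caller-side sketch of how DEFINE_GK_CO_WORK() and the scheduling helpers above fit together. The burst size, leftover mask, use of the NIC-provided RSS hash, and the function itself are assumptions for illustration; the real wiring lives in gk/main.c outside this excerpt.

static void
example_front_batch(struct gk_instance *instance, struct gk_config *gk_conf,
	struct rte_mbuf **bufs, uint16_t num_bufs)
{
	/* 64 front slots, 32 back slots, 16 mailbox slots,
	 * 16 leftover buckets (mask 15), no extra tasks. */
	DEFINE_GK_CO_WORK(work, 64, 32, 16, 15, 0);
	uint16_t i;

	work.gk_conf = gk_conf;
	work.instance = instance;
	work.cos = instance->cos;
	work.co_num = gk_conf->co_max_num;

	for (i = 0; i < num_bufs; i++) {
		struct gk_co_task *task = &work.all_tasks[work.task_num++];

		/* Assume the NIC already computed the RSS hash. */
		task->task_hash = bufs[i]->hash.rss;
		task->task_arg = bufs[i];
		task->task_func = gk_co_process_front_pkt;
		schedule_task(&work, task);
	}

	/* ... add the owner coroutines to work.working_cos, transfer
	 * control to the first one, then flush the TX/ACL arrays ... */
}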
*/ +static inline void +schedule_task_to_any_co(struct gk_co_work *work, struct gk_co_task *task) +{ + __schedule_task(&work->cos[work->any_co_index], task); + work->any_co_index = (work->any_co_index + 1) % work->co_num; +} + +static inline struct flow_entry ** +__get_fe_leftover_bucket(struct gk_co_work *work, uint32_t hash) +{ + return &work->leftover[hash & work->leftover_mask]; +} + +static inline struct flow_entry ** +get_fe_leftover_bucket(struct gk_co_work *work, struct flow_entry *fe) +{ + return __get_fe_leftover_bucket(work, fe->flow_hash_val); +} + +static inline struct flow_entry * +get_fe_leftover(struct gk_co_work *work, uint32_t hash) +{ + return *__get_fe_leftover_bucket(work, hash); +} + +/* + * Notice that if the bucket is not empty, that reference will be lost. + * That is, the code favors the newer entry over the older entry. + */ +static inline void +save_fe_leftover(struct gk_co_work *work, struct flow_entry *fe) +{ + *get_fe_leftover_bucket(work, fe) = fe; +} + +void +gk_co_main(void *arg); + +void +gk_co_scan_flow_table(struct gk_co *this_co, struct gk_co_task *task); + +void +gk_co_process_front_pkt(struct gk_co *this_co, struct gk_co_task *task); +void +gk_co_process_front_pkt_software_rss(struct gk_co *this_co, + struct gk_co_task *task); + +void +gk_yield_next(struct gk_co *this_co); + +#endif /* _GATEKEEPER_GK_CO_H_ */ diff --git a/gk/main.c b/gk/main.c index bce203456..bb38be214 100644 --- a/gk/main.c +++ b/gk/main.c @@ -44,14 +44,14 @@ #include "gatekeeper_sol.h" #include "gatekeeper_flow_bpf.h" -#include "bpf.h" - -#define START_PRIORITY (38) -/* Set @START_ALLOWANCE as the double size of a large DNS reply. */ -#define START_ALLOWANCE (8) +#include "co.h" int gk_logtype; +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. + */ /* We should avoid calling integer_log_base_2() with zero. */ static inline uint8_t integer_log_base_2(uint64_t delta_time) @@ -63,18 +63,22 @@ integer_log_base_2(uint64_t delta_time) #endif } -/* - * It converts the difference of time between the current packet and - * the last seen packet into a given priority. +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. */ -static uint8_t +/* + * It converts the difference of time between the current packet and + * the last seen packet into a given priority. + */ +static uint8_t priority_from_delta_time(uint64_t present, uint64_t past) { uint64_t delta_time; if (unlikely(present < past)) { /* - * This should never happen, but we handle it gracefully here + * This should never happen, but we handle it gracefully here * in order to keep going. */ GK_LOG(ERR, "The present time smaller than the past time\n"); @@ -84,10 +88,14 @@ priority_from_delta_time(uint64_t present, uint64_t past) delta_time = (present - past) * picosec_per_cycle; if (unlikely(delta_time < 1)) return 0; - + return integer_log_base_2(delta_time); } +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. + */ static struct gk_fib * look_up_fib(struct gk_lpm *ltbl, struct ip_flow *flow) { @@ -113,6 +121,10 @@ look_up_fib(struct gk_lpm *ltbl, struct ip_flow *flow) return NULL; /* Unreachable. */ } +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. 
+ */ static int extract_packet_info(struct rte_mbuf *pkt, struct ipacket *packet) { @@ -180,41 +192,17 @@ extract_packet_info(struct rte_mbuf *pkt, struct ipacket *packet) return ret; } -static inline void -initialize_flow_entry(struct flow_entry *fe, struct ip_flow *flow, - uint32_t flow_hash_val, struct gk_fib *grantor_fib) -{ - /* - * The flow table is a critical data structure, so, - * whenever the size of entries grow too much, - * one must look for alternatives before increasing - * the limit below. - */ - RTE_BUILD_BUG_ON(sizeof(*fe) > 128); - - rte_memcpy(&fe->flow, flow, sizeof(*flow)); - - fe->in_use = true; - fe->flow_hash_val = flow_hash_val; - fe->state = GK_REQUEST; - fe->u.request.last_packet_seen_at = rte_rdtsc(); - fe->u.request.last_priority = START_PRIORITY; - fe->u.request.allowance = START_ALLOWANCE - 1; - fe->grantor_fib = grantor_fib; -} - -static inline void -reinitialize_flow_entry(struct flow_entry *fe, uint64_t now) -{ - fe->state = GK_REQUEST; - fe->u.request.last_packet_seen_at = now; - fe->u.request.last_priority = START_PRIORITY; - fe->u.request.allowance = START_ALLOWANCE - 1; -} - +/* + * TODO A copy of this typedef is available in gk/co.c, + * so drop it when possible. + */ typedef int (*packet_drop_cb_func)(struct rte_mbuf *pkt, struct gk_instance *instance); +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. + */ static int drop_packet_front(struct rte_mbuf *pkt, struct gk_instance *instance) { @@ -257,247 +245,6 @@ pkt_copy_cached_eth_header(struct rte_mbuf *pkt, struct ether_cache *eth_cache, return stale; } -/* - * When a flow entry is at request state, all the GK block processing - * that entry does is to: - * (1) compute the priority of the packet. - * (2) encapsulate the packet as a request. - * (3) put this encapsulated packet in the request queue. - * - * Returns a negative integer on error, or EINPROGRESS to indicate - * that the request is being processed by another lcore, and should - * not be forwarded or dropped on returning from this function. - */ -static int -gk_process_request(struct flow_entry *fe, struct ipacket *packet, - struct rte_mbuf **req_bufs, uint16_t *num_reqs, - struct sol_config *sol_conf) -{ - int ret; - uint64_t now = rte_rdtsc(); - uint8_t priority = priority_from_delta_time(now, - fe->u.request.last_packet_seen_at); - struct gk_fib *fib = fe->grantor_fib; - struct ether_cache *eth_cache; - - fe->u.request.last_packet_seen_at = now; - - /* - * The reason for using "<" instead of "<=" is that the equal case - * means that the source has waited enough time to have the same - * last priority, so it should be awarded with the allowance. - */ - if (priority < fe->u.request.last_priority && - fe->u.request.allowance > 0) { - fe->u.request.allowance--; - priority = fe->u.request.last_priority; - } else { - fe->u.request.last_priority = priority; - fe->u.request.allowance = START_ALLOWANCE - 1; - } - - /* - * Adjust @priority for the DSCP field. - * DSCP 0 for legacy packets; 1 for granted packets; - * 2 for capability renew; 3-63 for requests. - */ - priority += PRIORITY_REQ_MIN; - if (unlikely(priority > PRIORITY_MAX)) - priority = PRIORITY_MAX; - - /* The assigned priority is @priority. */ - - /* Encapsulate the packet as a request. 
*/ - ret = encapsulate(packet->pkt, priority, - &sol_conf->net->back, &fib->u.grantor.gt_addr); - if (ret < 0) - return ret; - - eth_cache = fib->u.grantor.eth_cache; - RTE_VERIFY(eth_cache != NULL); - /* If needed, packet header space was adjusted by encapsulate(). */ - if (pkt_copy_cached_eth_header(packet->pkt, eth_cache, - sol_conf->net->back.l2_len_out)) - return -1; - - req_bufs[*num_reqs] = packet->pkt; - req_bufs[*num_reqs]->udata64 = priority; - (*num_reqs)++; - - return EINPROGRESS; -} - -/* - * Returns: - * * zero on success; the granted packet can be enqueued and forwarded - * * a negative number on error or when the packet needs to be - * otherwise dropped because it has exceeded its budget - * * EINPROGRESS to indicate that the packet is now a request that - * is being processed by another lcore, and should not - * be forwarded or dropped on returning from this function. - */ -static int -gk_process_granted(struct flow_entry *fe, struct ipacket *packet, - struct rte_mbuf **req_bufs, uint16_t *num_reqs, - struct sol_config *sol_conf, struct gk_measurement_metrics *stats) -{ - int ret; - bool renew_cap; - uint8_t priority = PRIORITY_GRANTED; - uint64_t now = rte_rdtsc(); - struct rte_mbuf *pkt = packet->pkt; - struct gk_fib *fib = fe->grantor_fib; - struct ether_cache *eth_cache; - uint32_t pkt_len; - - if (now >= fe->u.granted.cap_expire_at) { - reinitialize_flow_entry(fe, now); - return gk_process_request(fe, packet, req_bufs, - num_reqs, sol_conf); - } - - if (now >= fe->u.granted.budget_renew_at) { - fe->u.granted.budget_renew_at = now + cycles_per_sec; - fe->u.granted.budget_byte = - (uint64_t)fe->u.granted.tx_rate_kib_cycle * 1024; - } - - pkt_len = rte_pktmbuf_pkt_len(pkt); - if (pkt_len > fe->u.granted.budget_byte) { - stats->pkts_num_declined++; - stats->pkts_size_declined += pkt_len; - return -1; - } - - fe->u.granted.budget_byte -= pkt_len; - renew_cap = now >= fe->u.granted.send_next_renewal_at; - if (renew_cap) { - fe->u.granted.send_next_renewal_at = now + - fe->u.granted.renewal_step_cycle; - priority = PRIORITY_RENEW_CAP; - } - - /* - * Encapsulate packet as a granted packet, - * mark it as a capability renewal request if @renew_cap is true, - * enter destination according to @fe->grantor_fib. - */ - ret = encapsulate(packet->pkt, priority, - &sol_conf->net->back, &fib->u.grantor.gt_addr); - if (ret < 0) - return ret; - - eth_cache = fib->u.grantor.eth_cache; - RTE_VERIFY(eth_cache != NULL); - /* If needed, packet header space was adjusted by encapsulate(). */ - if (pkt_copy_cached_eth_header(packet->pkt, eth_cache, - sol_conf->net->back.l2_len_out)) - return -1; - - stats->pkts_num_granted++; - stats->pkts_size_granted += pkt_len; - return 0; -} - -/* - * Returns: - * * a negative number on error or when the packet needs to be - * otherwise dropped because it is declined - * * EINPROGRESS to indicate that the packet is now a request that - * is being processed by another lcore, and should not - * be forwarded or dropped on returning from this function. 
- */ -static int -gk_process_declined(struct flow_entry *fe, struct ipacket *packet, - struct rte_mbuf **req_bufs, uint16_t *num_reqs, - struct sol_config *sol_conf, struct gk_measurement_metrics *stats) -{ - uint64_t now = rte_rdtsc(); - - if (unlikely(now >= fe->u.declined.expire_at)) { - reinitialize_flow_entry(fe, now); - return gk_process_request(fe, packet, req_bufs, - num_reqs, sol_conf); - } - - stats->pkts_num_declined++; - stats->pkts_size_declined += rte_pktmbuf_pkt_len(packet->pkt); - - return -1; -} - -/* - * Returns: - * * zero on success; the packet can be enqueued and forwarded - * * a negative number on error or when the packet needs to be - * otherwise dropped because it has exceeded a limit - * * EINPROGRESS to indicate that the packet is now a request that - * is being processed by another lcore, and should not - * be forwarded or dropped on returning from this function. - */ -static int -gk_process_bpf(struct flow_entry *fe, struct ipacket *packet, - struct rte_mbuf **req_bufs, uint16_t *num_reqs, - struct gk_config *gk_conf, struct gk_measurement_metrics *stats) -{ - uint64_t bpf_ret; - int program_index, rc; - uint64_t now = rte_rdtsc(); - - if (unlikely(now >= fe->u.bpf.expire_at)) - goto expired; - - program_index = fe->program_index; - rc = gk_bpf_decide_pkt(gk_conf, program_index, fe, packet, now, - &bpf_ret); - if (unlikely(rc != 0)) { - GK_LOG(WARNING, - "The BPF program at index %u failed to run its function pkt\n", - program_index); - goto expired; - } - - switch (bpf_ret) { - case GK_BPF_PKT_RET_FORWARD: { - struct ether_cache *eth_cache = - fe->grantor_fib->u.grantor.eth_cache; - RTE_VERIFY(eth_cache != NULL); - /* - * If needed, encapsulate() already adjusted - * packet header space. - */ - if (pkt_copy_cached_eth_header(packet->pkt, eth_cache, - gk_conf->net->back.l2_len_out)) - return -1; - - stats->pkts_num_granted++; - stats->pkts_size_granted += rte_pktmbuf_pkt_len(packet->pkt); - return 0; - } - case GK_BPF_PKT_RET_DECLINE: - stats->pkts_num_declined++; - stats->pkts_size_declined += rte_pktmbuf_pkt_len(packet->pkt); - return -1; - case GK_BPF_PKT_RET_ERROR: - GK_LOG(WARNING, - "The function pkt of the BPF program at index %u returned GK_BPF_PKT_RET_ERROR\n", - program_index); - return -1; - default: - GK_LOG(WARNING, - "The function pkt of the BPF program at index %u returned an invalid return: %" PRIu64 "\n", - program_index, bpf_ret); - return -1; - } - - rte_panic("Unexpected condition at %s()", __func__); - -expired: - reinitialize_flow_entry(fe, now); - return gk_process_request(fe, packet, req_bufs, num_reqs, - gk_conf->sol_conf); -} - static int get_block_idx(struct gk_config *gk_conf, unsigned int lcore_id) { @@ -510,6 +257,10 @@ get_block_idx(struct gk_config *gk_conf, unsigned int lcore_id) return 0; } +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. 
+ */ static bool is_flow_expired(struct flow_entry *fe, uint64_t now) { @@ -545,12 +296,17 @@ is_flow_expired(struct flow_entry *fe, uint64_t now) } static int -gk_del_flow_entry_from_hash(struct rte_hash *h, struct flow_entry *fe) +gk_del_flow_entry_from_hash(struct gk_instance *instance, struct flow_entry *fe) { - int ret = rte_hash_del_key_with_hash(h, &fe->flow, fe->flow_hash_val); - if (likely(ret >= 0)) + + int ret = rte_hash_del_key_with_hash(instance->ip_flow_hash_table, + &fe->flow, fe->flow_hash_val); + if (likely(ret >= 0)) { memset(fe, 0, sizeof(*fe)); - else { + + if (instance->num_scan_del > 0) + instance->num_scan_del--; + } else { GK_LOG(ERR, "The GK block failed to delete a key from hash table at %s: %s\n", __func__, strerror(-ret)); @@ -559,6 +315,56 @@ gk_del_flow_entry_from_hash(struct rte_hash *h, struct flow_entry *fe) return ret; } +static void +free_cos(struct gk_co *cos, unsigned int num) +{ + unsigned int i; + + if (cos == NULL) + return; + + for (i = 0; i < num; i++) { + struct gk_co *co = &cos[i]; + + if (co->stack.sptr == NULL) + continue; + + /* Free @co. */ + coro_destroy(&co->coro); + coro_stack_free(&co->stack); + } + + rte_free(cos); +} + +static struct gk_co * +alloc_cos(unsigned int num, unsigned int stack_size_byte) +{ + unsigned int stack_size_ptr = stack_size_byte / sizeof(void *); + unsigned int i; + + struct gk_co *cos = rte_calloc(__func__, num, sizeof(*cos), 0); + if (cos == NULL) + return NULL; + + for (i = 0; i < num; i++) { + struct gk_co *co = &cos[i]; + + if (unlikely(!coro_stack_alloc(&co->stack, stack_size_ptr))) { + free_cos(cos, num); + return NULL; + } + + coro_create(&co->coro, gk_co_main, co, + co->stack.sptr, co->stack.ssze); + INIT_LIST_HEAD_WITH_POISON(&co->co_list); + INIT_LIST_HEAD(&co->task_queue); + co->work = NULL; + } + + return cos; +} + static int setup_gk_instance(unsigned int lcore_id, struct gk_config *gk_conf) { @@ -586,7 +392,6 @@ setup_gk_instance(unsigned int lcore_id, struct gk_config *gk_conf) GK_LOG(ERR, "The GK block cannot create hash table at lcore %u\n", lcore_id); - ret = -1; goto out; } @@ -600,7 +405,6 @@ setup_gk_instance(unsigned int lcore_id, struct gk_config *gk_conf) GK_LOG(ERR, "The GK block can't create flow entry table at lcore %u\n", lcore_id); - ret = -1; goto flow_hash; } @@ -611,6 +415,19 @@ setup_gk_instance(unsigned int lcore_id, struct gk_config *gk_conf) if (ret < 0) goto flow_entry; + coro_create(&instance->coro_root, NULL, NULL, NULL, 0); + + /* Allocate coroutines. 
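+	 * The configured gk_conf->co_stack_size_kb is multiplied by 1024
+	 * to obtain bytes; alloc_cos() then divides by sizeof(void *)
+	 * because coro_stack_alloc() takes the stack size in
+	 * pointer-sized units. For example, with the hypothetical values
+	 * co_max_num = 8 and co_stack_size_kb = 64, eight coroutines
+	 * with 64 KiB of stack each would be allocated.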
*/ + instance->cos = alloc_cos(gk_conf->co_max_num, + gk_conf->co_stack_size_kb * 1024); + if (instance->cos == NULL) { + GK_LOG(ERR, + "The GK block can't allocate coroutines at lcore %u\n", + lcore_id); + ret = -1; + goto coro_root; + } + tb_ratelimit_state_init(&instance->front_icmp_rs, gk_conf->front_icmp_msgs_per_sec, gk_conf->front_icmp_msgs_burst); @@ -621,6 +438,10 @@ setup_gk_instance(unsigned int lcore_id, struct gk_config *gk_conf) ret = 0; goto out; +coro_root: + coro_destroy(&instance->coro_root); +/*mailbox:*/ + destroy_mailbox(&instance->mb); flow_entry: rte_free(instance->ip_flow_entry_table); instance->ip_flow_entry_table = NULL; @@ -730,8 +551,7 @@ flush_flow_table(struct ip_prefix *src, } if (matched) { - gk_del_flow_entry_from_hash( - instance->ip_flow_hash_table, fe); + gk_del_flow_entry_from_hash(instance, fe); num_flushed_flows++; } @@ -872,10 +692,8 @@ gk_synchronize(struct gk_fib *fib, struct gk_instance *instance) while (index >= 0) { struct flow_entry *fe = &instance->ip_flow_entry_table[index]; - if (fe->grantor_fib == fib) { - gk_del_flow_entry_from_hash( - instance->ip_flow_hash_table, fe); - } + if (fe->grantor_fib == fib) + gk_del_flow_entry_from_hash(instance, fe); index = rte_hash_iterate(instance->ip_flow_hash_table, (void *)&key, &data, &next); @@ -990,6 +808,10 @@ gk_setup_rss(struct gk_config *gk_conf) return ret; } +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. + */ static void xmit_icmp(struct gatekeeper_if *iface, struct ipacket *packet, uint16_t *num_pkts, struct rte_mbuf **icmp_bufs, @@ -1065,6 +887,10 @@ xmit_icmp(struct gatekeeper_if *iface, struct ipacket *packet, (*num_pkts)++; } +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. + */ static void xmit_icmpv6(struct gatekeeper_if *iface, struct ipacket *packet, uint16_t *num_pkts, struct rte_mbuf **icmp_bufs, @@ -1136,6 +962,10 @@ xmit_icmpv6(struct gatekeeper_if *iface, struct ipacket *packet, (*num_pkts)++; } +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. + */ /* * For IPv4, according to the RFC 1812 section 5.3.1 Time to Live (TTL), * if the TTL is reduced to zero (or less), the packet MUST be @@ -1190,26 +1020,6 @@ update_ip_hop_count(struct gatekeeper_if *iface, struct ipacket *packet, return 0; } -/* - * This function is only to be called on flows that - * are not backed by a flow entry. 
- */ -static void -send_request_to_grantor(struct ipacket *packet, uint32_t flow_hash_val, - struct gk_fib *fib, struct rte_mbuf **req_bufs, - uint16_t *num_reqs, struct gk_instance *instance, - struct gk_config *gk_conf) { - int ret; - struct flow_entry temp_fe; - - initialize_flow_entry(&temp_fe, &packet->flow, flow_hash_val, fib); - - ret = gk_process_request(&temp_fe, packet, req_bufs, - num_reqs, gk_conf->sol_conf); - if (ret < 0) - drop_packet_front(packet->pkt, instance); -} - static void lookup_fib_bulk(struct gk_lpm *ltbl, struct ip_flow **flows, int num_flows, struct gk_fib *fibs[]) @@ -1289,111 +1099,70 @@ lookup_fib6_bulk(struct gk_lpm *ltbl, struct ip_flow **flows, } } -static struct flow_entry * -lookup_fe_from_lpm(struct ipacket *packet, uint32_t ip_flow_hash_val, - struct gk_fib *fib, uint16_t *num_tx, struct rte_mbuf **tx_bufs, +static void +process_fib(struct ipacket *packet, struct gk_fib *fib, + uint16_t *num_tx, struct rte_mbuf **tx_bufs, struct acl_search *acl4, struct acl_search *acl6, uint16_t *num_pkts, struct rte_mbuf **icmp_bufs, - struct rte_mbuf **req_bufs, uint16_t *num_reqs, struct gatekeeper_if *front, struct gatekeeper_if *back, - struct gk_instance *instance, struct gk_config *gk_conf) { + struct gk_instance *instance) { struct rte_mbuf *pkt = packet->pkt; struct ether_cache *eth_cache; - struct gk_measurement_metrics *stats = &instance->traffic_stats; - - if (fib == NULL || fib->action == GK_FWD_NEIGHBOR_FRONT_NET) { - if (packet->flow.proto == RTE_ETHER_TYPE_IPV4) { - stats->tot_pkts_num_distributed++; - stats->tot_pkts_size_distributed += - rte_pktmbuf_pkt_len(pkt); + if (fib == NULL || fib->action == GK_FWD_NEIGHBOR_BACK_NET) { + if (packet->flow.proto == RTE_ETHER_TYPE_IPV4) add_pkt_acl(acl4, pkt); - } else if (likely(packet->flow.proto == - RTE_ETHER_TYPE_IPV6)) { - stats->tot_pkts_num_distributed++; - stats->tot_pkts_size_distributed += - rte_pktmbuf_pkt_len(pkt); - + else if (likely(packet->flow.proto == + RTE_ETHER_TYPE_IPV6)) add_pkt_acl(acl6, pkt); - } else { + else { print_flow_err_msg(&packet->flow, - "gk: failed to get the fib entry"); - drop_packet_front(pkt, instance); + "gk: failed to get the fib entry or it is not an IP packet"); + drop_packet(pkt); } - return NULL; + return; } switch (fib->action) { - case GK_FWD_GRANTOR: { - struct flow_entry *fe; - int ret = gk_hash_add_flow_entry( - instance, &packet->flow, - ip_flow_hash_val, gk_conf); - if (ret == -ENOSPC) { - /* - * There is no room for a new - * flow entry, but give this - * flow a chance sending a - * request to the grantor - * server. - */ - send_request_to_grantor(packet, ip_flow_hash_val, - fib, req_bufs, num_reqs, instance, gk_conf); - return NULL; - } - if (ret < 0) { - drop_packet_front(pkt, instance); - return NULL; - } - - fe = &instance->ip_flow_entry_table[ret]; - initialize_flow_entry(fe, - &packet->flow, ip_flow_hash_val, fib); - return fe; - } - - case GK_FWD_GATEWAY_BACK_NET: { + case GK_FWD_GATEWAY_FRONT_NET: { /* * The entry instructs to forward * its packets to the gateway in - * the back network, forward accordingly. + * the front network, forward accordingly. * - * BP block bypasses from the front to the - * back interface are expected to bypass - * ranges of IP addresses that should not - * go through Gatekeeper. + * BP bypasses from the back to the front interface + * are expected to bypass the outgoing traffic + * from the AS to its peers. * * Notice that one needs to update * the Ethernet header. 
*/ - eth_cache = fib->u.gateway.eth_cache; RTE_VERIFY(eth_cache != NULL); - if (adjust_pkt_len(pkt, back, 0) == NULL || + if (adjust_pkt_len(pkt, front, 0) == NULL || pkt_copy_cached_eth_header(pkt, eth_cache, - back->l2_len_out)) { - drop_packet_front(pkt, instance); - return NULL; + front->l2_len_out)) { + drop_packet(pkt); + return; } - if (update_ip_hop_count(front, packet, + if (update_ip_hop_count(back, packet, num_pkts, icmp_bufs, - &instance->front_icmp_rs, - instance, - drop_packet_front) < 0) - return NULL; + &instance->back_icmp_rs, + instance, drop_packet_back) < 0) + return; tx_bufs[(*num_tx)++] = pkt; - return NULL; + break; } - case GK_FWD_NEIGHBOR_BACK_NET: { + case GK_FWD_NEIGHBOR_FRONT_NET: { /* * The entry instructs to forward * its packets to the neighbor in - * the back network, forward accordingly. + * the front network, forward accordingly. */ if (packet->flow.proto == RTE_ETHER_TYPE_IPV4) { eth_cache = lookup_ether_cache( @@ -1407,455 +1176,45 @@ lookup_fe_from_lpm(struct ipacket *packet, uint32_t ip_flow_hash_val, RTE_VERIFY(eth_cache != NULL); - if (adjust_pkt_len(pkt, back, 0) == NULL || + if (adjust_pkt_len(pkt, front, 0) == NULL || pkt_copy_cached_eth_header(pkt, eth_cache, - back->l2_len_out)) { - drop_packet_front(pkt, instance); - return NULL; + front->l2_len_out)) { + drop_packet(pkt); + return; } - if (update_ip_hop_count(front, packet, + if (update_ip_hop_count(back, packet, num_pkts, icmp_bufs, - &instance->front_icmp_rs, - instance, - drop_packet_front) < 0) - return NULL; + &instance->back_icmp_rs, + instance, drop_packet_back) < 0) + return; tx_bufs[(*num_tx)++] = pkt; - return NULL; + break; } case GK_DROP: - /* FALLTHROUGH */ + drop_packet(pkt); + break; + default: - drop_packet_front(pkt, instance); - return NULL; + /* All other actions should log a warning. */ + GK_LOG(WARNING, + "The fib entry has an unexpected action %u at %s\n", + fib->action, __func__); + drop_packet(pkt); + break; } - - return NULL; } -static int -process_flow_entry(struct flow_entry *fe, struct ipacket *packet, - struct rte_mbuf **req_bufs, uint16_t *num_reqs, - struct gk_config *gk_conf, struct gk_measurement_metrics *stats) -{ - int ret; - - /* - * Some notes regarding flow rates and units: - * - * Flows in the GK_REQUEST state are bandwidth limited - * to an overall rate relative to the link. Therefore, - * the Ethernet frame overhead is counted toward the - * credits used by requests. The request channel rate - * is measured in megabits (base 10) per second to - * match the units used by hardware specifications. - * - * Granted flows (in state GK_GRANTED or sometimes - * GK_BPF) are allocated budgets that are intended - * to reflect the max throughput of the flow, and - * therefore do not include the Ethernet frame overhead. - * The budgets of granted flows are measured in - * kibibytes (base 2). 
- */ - switch (fe->state) { - case GK_REQUEST: - ret = gk_process_request(fe, packet, - req_bufs, num_reqs, gk_conf->sol_conf); - break; - - case GK_GRANTED: - ret = gk_process_granted(fe, packet, - req_bufs, num_reqs, gk_conf->sol_conf, stats); - break; - - case GK_DECLINED: - ret = gk_process_declined(fe, packet, - req_bufs, num_reqs, gk_conf->sol_conf, stats); - break; - - case GK_BPF: - ret = gk_process_bpf(fe, packet, - req_bufs, num_reqs, gk_conf, stats); - break; - - default: - ret = -1; - GK_LOG(ERR, "Unknown flow state: %d\n", fe->state); - break; - } - - return ret; -} - -static inline void -prefetch_flow_entry(struct flow_entry *fe) -{ -#if RTE_CACHE_LINE_SIZE == 64 - RTE_BUILD_BUG_ON(sizeof(*fe) <= RTE_CACHE_LINE_SIZE); - RTE_BUILD_BUG_ON(sizeof(*fe) > 2 * RTE_CACHE_LINE_SIZE); - rte_prefetch0(fe); - rte_prefetch0(((char *)fe) + RTE_CACHE_LINE_SIZE); -#elif RTE_CACHE_LINE_SIZE == 128 - RTE_BUILD_BUG_ON(sizeof(*fe) > RTE_CACHE_LINE_SIZE); - rte_prefetch0(fe); -#else -#error "Unsupported cache line size" -#endif -} - -static void -parse_packet(struct ipacket *packet, struct rte_mbuf *pkt, - struct rte_mbuf **arp_bufs, uint16_t *num_arp, - bool ipv4_configured_front, bool ipv6_configured_front, - struct ip_flow **flow_arr, uint32_t *flow_hash_val_arr, - int *num_ip_flows, struct gatekeeper_if *front, - struct gk_instance *instance) -{ - int ret; - struct gk_measurement_metrics *stats = &instance->traffic_stats; - - stats->tot_pkts_size += rte_pktmbuf_pkt_len(pkt); - - ret = extract_packet_info(pkt, packet); - if (ret < 0) { - if (likely(packet->flow.proto == RTE_ETHER_TYPE_ARP)) { - stats->tot_pkts_num_distributed++; - stats->tot_pkts_size_distributed += - rte_pktmbuf_pkt_len(pkt); - - arp_bufs[(*num_arp)++] = pkt; - return; - } - - /* Drop non-IP and non-ARP packets. */ - drop_packet_front(pkt, instance); - return; - } - - if (unlikely((packet->flow.proto == RTE_ETHER_TYPE_IPV4 && - !ipv4_configured_front) || - (packet->flow.proto == RTE_ETHER_TYPE_IPV6 && - !ipv6_configured_front))) { - drop_packet_front(pkt, instance); - return; - } - - flow_arr[*num_ip_flows] = &packet->flow; - flow_hash_val_arr[*num_ip_flows] = likely(front->rss) ? - pkt->hash.rss : rss_ip_flow_hf(&packet->flow, 0, 0); - (*num_ip_flows)++; -} - -#define PREFETCH_OFFSET (4) - -/* Process the packets on the front interface. 
*/ -static void -process_pkts_front(uint16_t port_front, uint16_t rx_queue_front, - unsigned int lcore, - uint16_t *tx_front_num_pkts, struct rte_mbuf **tx_front_pkts, - uint16_t *tx_back_num_pkts, struct rte_mbuf **tx_back_pkts, - struct gk_instance *instance, struct gk_config *gk_conf) -{ - int i; - int done_lookups; - int ret; - uint16_t num_rx; - uint16_t num_arp = 0; - uint16_t num_reqs = 0; - uint16_t front_max_pkt_burst = gk_conf->front_max_pkt_burst; - struct rte_mbuf *rx_bufs[front_max_pkt_burst]; - struct rte_mbuf *arp_bufs[front_max_pkt_burst]; - struct rte_mbuf *req_bufs[front_max_pkt_burst]; - DEFINE_ACL_SEARCH(acl4, front_max_pkt_burst); - DEFINE_ACL_SEARCH(acl6, front_max_pkt_burst); - struct gatekeeper_if *front = &gk_conf->net->front; - struct gatekeeper_if *back = &gk_conf->net->back; - struct gk_measurement_metrics *stats = &instance->traffic_stats; - bool ipv4_configured_front = ipv4_if_configured(&gk_conf->net->front); - bool ipv6_configured_front = ipv6_if_configured(&gk_conf->net->front); - int num_ip_flows = 0; - struct ipacket pkt_arr[front_max_pkt_burst]; - struct ip_flow *flow_arr[front_max_pkt_burst]; - uint32_t flow_hash_val_arr[front_max_pkt_burst]; - int num_lpm_lookups = 0; - int num_lpm6_lookups = 0; - struct ip_flow *flows[front_max_pkt_burst]; - struct ip_flow *flows6[front_max_pkt_burst]; - int32_t lpm_lookup_pos[front_max_pkt_burst]; - int32_t lpm6_lookup_pos[front_max_pkt_burst]; - int32_t pos_arr[front_max_pkt_burst]; - struct gk_fib *fibs[front_max_pkt_burst]; - struct gk_fib *fibs6[front_max_pkt_burst]; - struct flow_entry *fe_arr[front_max_pkt_burst]; - - /* Load a set of packets from the front NIC. */ - num_rx = rte_eth_rx_burst(port_front, rx_queue_front, rx_bufs, - front_max_pkt_burst); - - if (unlikely(num_rx == 0)) - return; - - stats->tot_pkts_num += num_rx; - - /* - * This prefetch is enough to load Ethernet header (14 bytes), - * optional Ethernet VLAN header (8 bytes), and either - * an IPv4 header without options (20 bytes), or - * an IPv6 header without options (40 bytes). - * IPv4: 14 + 8 + 20 = 42 - * IPv6: 14 + 8 + 40 = 62 - */ - for (i = 0; i < PREFETCH_OFFSET && i < num_rx; i++) - rte_prefetch0(rte_pktmbuf_mtod_offset(rx_bufs[i], void *, 0)); - - /* Extract packet and flow information. */ - for (i = 0; i < (num_rx - PREFETCH_OFFSET); i++) { - rte_prefetch0(rte_pktmbuf_mtod_offset( - rx_bufs[i + PREFETCH_OFFSET], void *, 0)); - - parse_packet(&pkt_arr[num_ip_flows], rx_bufs[i], arp_bufs, - &num_arp, ipv4_configured_front, ipv6_configured_front, - flow_arr, flow_hash_val_arr, &num_ip_flows, front, - instance); - } - - /* Extract the rest packet and flow information. 
*/ - for (; i < num_rx; i++) { - parse_packet(&pkt_arr[num_ip_flows], rx_bufs[i], arp_bufs, - &num_arp, ipv4_configured_front, ipv6_configured_front, - flow_arr, flow_hash_val_arr, &num_ip_flows, front, - instance); - } - - done_lookups = 0; - while (done_lookups < num_ip_flows) { - uint32_t num_keys = num_ip_flows - done_lookups; - if (num_keys > RTE_HASH_LOOKUP_BULK_MAX) - num_keys = RTE_HASH_LOOKUP_BULK_MAX; - - ret = rte_hash_lookup_bulk_with_hash( - instance->ip_flow_hash_table, - (const void **)&flow_arr[done_lookups], - (hash_sig_t *)&flow_hash_val_arr[done_lookups], - num_keys, &pos_arr[done_lookups]); - if (ret != 0) { - GK_LOG(NOTICE, - "failed to find multiple keys in the hash table at lcore %u\n", - rte_lcore_id()); - } - - done_lookups += num_keys; - } - - for (i = 0; i < num_ip_flows; i++) { - if (pos_arr[i] >= 0) { - fe_arr[i] = &instance->ip_flow_entry_table[pos_arr[i]]; - - prefetch_flow_entry(fe_arr[i]); - } else { - fe_arr[i] = NULL; - if (flow_arr[i]->proto == RTE_ETHER_TYPE_IPV4) { - lpm_lookup_pos[num_lpm_lookups] = i; - flows[num_lpm_lookups] = flow_arr[i]; - num_lpm_lookups++; - } else { - lpm6_lookup_pos[num_lpm6_lookups] = i; - flows6[num_lpm6_lookups] = flow_arr[i]; - num_lpm6_lookups++; - } - } - } - - /* The remaining flows need LPM lookups. */ - lookup_fib_bulk(&gk_conf->lpm_tbl, flows, num_lpm_lookups, fibs); - lookup_fib6_bulk(&gk_conf->lpm_tbl, flows6, num_lpm6_lookups, fibs6); - - for (i = 0; i < num_lpm_lookups; i++) { - int fidx = lpm_lookup_pos[i]; - - fe_arr[fidx] = lookup_fe_from_lpm(&pkt_arr[fidx], - flow_hash_val_arr[fidx], fibs[i], - tx_back_num_pkts, tx_back_pkts, &acl4, &acl6, - tx_front_num_pkts, tx_front_pkts, req_bufs, - &num_reqs, front, back, instance, gk_conf); - } - - for (i = 0; i < num_lpm6_lookups; i++) { - int fidx = lpm6_lookup_pos[i]; - - fe_arr[fidx] = lookup_fe_from_lpm(&pkt_arr[fidx], - flow_hash_val_arr[fidx], fibs6[i], - tx_back_num_pkts, tx_back_pkts, &acl4, &acl6, - tx_front_num_pkts, tx_front_pkts, req_bufs, - &num_reqs, front, back, instance, gk_conf); - } - - for (i = 0; i < num_ip_flows; i++) { - if (fe_arr[i] == NULL) - continue; - - ret = process_flow_entry(fe_arr[i], &pkt_arr[i], req_bufs, - &num_reqs, gk_conf, stats); - if (ret < 0) - drop_packet_front(pkt_arr[i].pkt, instance); - else if (ret == EINPROGRESS) { - /* Request will be serviced by another lcore. 
*/ - continue; - } else if (likely(ret == 0)) - tx_back_pkts[(*tx_back_num_pkts)++] = pkt_arr[i].pkt; - else - rte_panic("Invalid return value (%d) from processing a packet in a flow with state %d", - ret, fe_arr[i]->state); - } - - if (num_reqs > 0) { - uint64_t acc_size_request[num_reqs + 1]; - - acc_size_request[0] = 0; - for (i = 1; i <= num_reqs; i++) { - acc_size_request[i] = acc_size_request[i - 1] + - rte_pktmbuf_pkt_len(req_bufs[i - 1]); - } - - ret = RTE_MAX(gk_solicitor_enqueue_bulk(gk_conf->sol_conf, - req_bufs, num_reqs), 0); - if (ret < num_reqs) { - for (i = ret; i < num_reqs; i++) - drop_packet_front(req_bufs[i], instance); - } - - stats->pkts_num_request += ret; - stats->pkts_size_request += acc_size_request[ret]; - } - - if (num_arp > 0) - submit_arp(arp_bufs, num_arp, &gk_conf->net->front); - - process_pkts_acl(&gk_conf->net->front, - lcore, &acl4, RTE_ETHER_TYPE_IPV4); - process_pkts_acl(&gk_conf->net->front, - lcore, &acl6, RTE_ETHER_TYPE_IPV6); -} - -static void -process_fib(struct ipacket *packet, struct gk_fib *fib, - uint16_t *num_tx, struct rte_mbuf **tx_bufs, - struct acl_search *acl4, struct acl_search *acl6, - uint16_t *num_pkts, struct rte_mbuf **icmp_bufs, - struct gatekeeper_if *front, struct gatekeeper_if *back, - struct gk_instance *instance) { - struct rte_mbuf *pkt = packet->pkt; - struct ether_cache *eth_cache; - - if (fib == NULL || fib->action == GK_FWD_NEIGHBOR_BACK_NET) { - if (packet->flow.proto == RTE_ETHER_TYPE_IPV4) - add_pkt_acl(acl4, pkt); - else if (likely(packet->flow.proto == - RTE_ETHER_TYPE_IPV6)) - add_pkt_acl(acl6, pkt); - else { - print_flow_err_msg(&packet->flow, - "gk: failed to get the fib entry or it is not an IP packet"); - drop_packet(pkt); - } - return; - } - - switch (fib->action) { - case GK_FWD_GATEWAY_FRONT_NET: { - /* - * The entry instructs to forward - * its packets to the gateway in - * the front network, forward accordingly. - * - * BP bypasses from the back to the front interface - * are expected to bypass the outgoing traffic - * from the AS to its peers. - * - * Notice that one needs to update - * the Ethernet header. - */ - eth_cache = fib->u.gateway.eth_cache; - RTE_VERIFY(eth_cache != NULL); - - if (adjust_pkt_len(pkt, front, 0) == NULL || - pkt_copy_cached_eth_header(pkt, - eth_cache, - front->l2_len_out)) { - drop_packet(pkt); - return; - } - - if (update_ip_hop_count(back, packet, - num_pkts, icmp_bufs, - &instance->back_icmp_rs, - instance, drop_packet_back) < 0) - return; - - tx_bufs[(*num_tx)++] = pkt; - break; - } - - case GK_FWD_NEIGHBOR_FRONT_NET: { - /* - * The entry instructs to forward - * its packets to the neighbor in - * the front network, forward accordingly. - */ - if (packet->flow.proto == RTE_ETHER_TYPE_IPV4) { - eth_cache = lookup_ether_cache( - &fib->u.neigh, - &packet->flow.f.v4.dst); - } else { - eth_cache = lookup_ether_cache( - &fib->u.neigh6, - &packet->flow.f.v6.dst); - } - - RTE_VERIFY(eth_cache != NULL); - - if (adjust_pkt_len(pkt, front, 0) == NULL || - pkt_copy_cached_eth_header(pkt, - eth_cache, - front->l2_len_out)) { - drop_packet(pkt); - return; - } - - if (update_ip_hop_count(back, packet, - num_pkts, icmp_bufs, - &instance->back_icmp_rs, - instance, drop_packet_back) < 0) - return; - - tx_bufs[(*num_tx)++] = pkt; - break; - } - - case GK_DROP: - drop_packet(pkt); - break; - - default: - /* All other actions should log a warning. 
*/ - GK_LOG(WARNING, - "The fib entry has an unexpected action %u at %s\n", - fib->action, __func__); - drop_packet(pkt); - break; - } -} - -/* Process the packets on the back interface. */ -static void -process_pkts_back(uint16_t port_back, uint16_t rx_queue_back, - unsigned int lcore, - uint16_t *tx_front_num_pkts, struct rte_mbuf **tx_front_pkts, - uint16_t *tx_back_num_pkts, struct rte_mbuf **tx_back_pkts, - struct gk_instance *instance, struct gk_config *gk_conf) +/* Process the packets on the back interface. */ +static void +process_pkts_back(uint16_t port_back, uint16_t rx_queue_back, + unsigned int lcore, + uint16_t *tx_front_num_pkts, struct rte_mbuf **tx_front_pkts, + uint16_t *tx_back_num_pkts, struct rte_mbuf **tx_back_pkts, + struct gk_instance *instance, struct gk_config *gk_conf) { int i; int ret; @@ -2153,6 +1512,340 @@ process_cmds_from_mailbox( mb_free_entry_bulk(&instance->mb, (void * const *)gk_cmds, num_cmd); } +static void +populate_front_tasks(struct gk_co_work *work, + uint16_t port_front, uint16_t rx_queue_front) +{ + uint16_t front_max_pkt_burst = work->gk_conf->front_max_pkt_burst; + struct rte_mbuf *rx_bufs[front_max_pkt_burst]; + /* Load a set of packets from the front NIC. */ + uint16_t num_rx = rte_eth_rx_burst(port_front, rx_queue_front, rx_bufs, + front_max_pkt_burst); + struct gk_measurement_metrics *stats; + bool has_rss; + int i; + + if (unlikely(num_rx == 0)) + return; + + stats = &work->instance->traffic_stats; + stats->tot_pkts_num += num_rx; + + has_rss = work->gk_conf->net->front.rss; + for (i = 0; i < num_rx; i++) { + struct gk_co_task *task = &work->all_tasks[work->task_num++]; + struct rte_mbuf *pkt = rx_bufs[i]; + + stats->tot_pkts_size += rte_pktmbuf_pkt_len(pkt); + + if (likely(has_rss)) { + task->task_hash = pkt->hash.rss; + task->task_arg = pkt; + task->task_func = gk_co_process_front_pkt; + schedule_task(work, task); + } else { + struct ipacket *packet = &work->packets[i]; + /* + * There is a chance that packets on the same flow + * are brought out of order. For example, consider that + * (1) three packets arrive on the following order: + * pkt1, pkt2, pkt3; + * (2) there are only two coroutines doing the work; + * (3) The packets are mapped to + * the coroutines as follow: + * * pkt1 and pkt2 goes coroutine 1, + * * pkt3 goes to coroutine 2; + * (4) Packets pkt2 and pkt3 belong to the same flow. + * + * Packet pkt1 and ptk3 are processed in parallel, + * receive their correct hashes, and are rescheduled. + * Once pk2 is recheduled, it is going to be placed + * after pk3 in the task queue of + * the assigned coroutine, that is, pk3 is going to + * be sent out before pkt2 (inverted order). + */ + task->task_hash = 0; /* Dummy hash. */ + /* + * Passing @packet instead of just @pkt so @packet + * can be carried over once the task is rescheduled. 
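+			 * gk_co_process_front_pkt_software_rss() is then
+			 * expected to compute the flow hash in software
+			 * (e.g. with rss_ip_flow_hf()) and to reschedule
+			 * the task under that hash, with @packet carrying
+			 * the packet state across that rescheduling.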
+ */ + packet->pkt = pkt; + task->task_arg = packet; + task->task_func = gk_co_process_front_pkt_software_rss; + schedule_task_to_any_co(work, task); + } + } +} + +static void +add_cos_to_work(struct gk_co_work *work, struct gk_config *gk_conf, + struct gk_instance *instance) +{ + unsigned int i; + + work->gk_conf = gk_conf; + work->instance = instance; + work->cos = instance->cos; + work->co_max_num = gk_conf->co_max_num; + work->co_num = RTE_MIN(2, work->co_max_num); + work->front_ipv4_configured = ipv4_if_configured(&gk_conf->net->front); + work->front_ipv6_configured = ipv6_if_configured(&gk_conf->net->front); + + RTE_VERIFY(work->co_num > 0); + + for (i = 0; i < work->co_max_num; i++) + work->cos[i].work = work; +} + +static void +update_cos(struct gk_co_work *work) +{ + /* + * The local variable @co_num is needed here to enable one to go + * above @work->co_max_num and below zero if needed. + */ + int32_t co_num = work->co_num; + + if (work->co_delta_num > 0) { + /* @work->co_num is going up. */ + + if (unlikely(co_num >= work->co_max_num)) { + /* + * @work->co_num is at its maximum; + * Reverse direction. + */ + RTE_VERIFY(co_num == work->co_max_num); + work->co_delta_num = - work->co_delta_num; + work->co_num = RTE_MAX(1, co_num + work->co_delta_num); + return; + } + + work->co_num = RTE_MIN(work->co_max_num, + co_num + work->co_delta_num); + return; + } + + /* @work->co_num is going down. */ + RTE_VERIFY(work->co_delta_num < 0); + + if (unlikely(co_num <= 1)) { + /* @work->co_num is at its minimum; reverse direction. */ + RTE_VERIFY(co_num == 1); + work->co_delta_num = - work->co_delta_num; + work->co_num = RTE_MIN(work->co_max_num, + co_num + work->co_delta_num); + return; + } + + work->co_num = RTE_MAX(1, co_num + work->co_delta_num); +} + +static void +do_work(struct gk_co_work *work) +{ + uint16_t i, real_co_num = 0; + uint64_t cycles; + double avg_cycles_per_task; + + /* Add coroutines with tasks to @work->working_cos. */ + for (i = 0; i < work->co_num; i++) { + struct gk_co *co = &work->cos[i]; + if (!list_empty(&co->task_queue)) { + list_add_tail(&co->co_list, &work->working_cos); + real_co_num++; + } + } + + /* Is there any work to do? */ + if (unlikely(list_empty(&work->working_cos))) { + RTE_VERIFY(real_co_num == 0); + RTE_VERIFY(work->task_num == 0); + return; + } + RTE_VERIFY(real_co_num > 0); + RTE_VERIFY(work->task_num > 0); + + /* Do work. */ + cycles = rte_rdtsc(); + coro_transfer(&work->instance->coro_root, + &list_first_entry(&work->working_cos, struct gk_co, co_list)-> + coro); + cycles = rte_rdtsc() - cycles; + avg_cycles_per_task = (double)cycles / work->task_num; + + if (work->co_num != real_co_num) { + /* Workload changed; adjust quickly. */ + RTE_VERIFY(work->co_num > real_co_num); + work->co_prv_num = real_co_num; + work->avg_cycles_per_task = avg_cycles_per_task; + work->co_num = real_co_num; + return update_cos(work); + } + + if (work->co_prv_num == 0) { + /* Initialize the performance tracking fields. */ + work->co_prv_num = real_co_num; + work->avg_cycles_per_task = avg_cycles_per_task; + return update_cos(work); + } + + if (avg_cycles_per_task >= work->avg_cycles_per_task) { + /* The last change did not bring an improvement; go back. */ + work->co_num = work->co_prv_num; + /* Reset measurement. */ + work->co_prv_num = 0; + /* Change adjustment direction. */ + work->co_delta_num = - work->co_delta_num; + return; + } + + /* @real_co_num is an improvement. 
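+	 * Keep this measurement as the new baseline and let update_cos()
+	 * keep moving the number of coroutines in the same direction;
+	 * update_cos() reverses @co_delta_num whenever the count reaches
+	 * 1 or @work->co_max_num, so the search effectively oscillates
+	 * around the best-performing value.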
*/ + work->co_prv_num = real_co_num; + work->avg_cycles_per_task = avg_cycles_per_task; + update_cos(work); +} + +static void +flush_work(struct gk_co_work *work, + uint16_t port_front, uint16_t tx_queue_front, + uint16_t port_back, uint16_t tx_queue_back, + unsigned int lcore) +{ + struct gk_instance *instance = work->instance; + + uint16_t front_max_pkt_burst = work->gk_conf->front_max_pkt_burst; + uint16_t back_max_pkt_burst = work->gk_conf->back_max_pkt_burst; + uint32_t max_pkt_burst = front_max_pkt_burst + back_max_pkt_burst; + struct gatekeeper_if *front = &work->gk_conf->net->front; + + /* + * Flush packets. + */ + + send_pkts(port_front, tx_queue_front, + work->tx_front_num_pkts, work->tx_front_pkts); + RTE_VERIFY(work->tx_front_num_pkts <= max_pkt_burst); + work->tx_front_num_pkts = 0; + + send_pkts(port_back, tx_queue_back, + work->tx_back_num_pkts, work->tx_back_pkts); + RTE_VERIFY(work->tx_back_num_pkts <= max_pkt_burst); + work->tx_back_num_pkts = 0; + + /* + * Flush front. + */ + + if (work->front_num_req > 0) { + uint16_t num_req = work->front_num_req; + uint64_t acc_size_request[num_req + 1]; + struct gk_measurement_metrics *stats = &instance->traffic_stats; + int i, ret; + + /* + * The byte length of the packets must be computed before + * calling gk_solicitor_enqueue_bulk() because after it + * the GK block no longer owns the packets. + */ + acc_size_request[0] = 0; + for (i = 1; i <= num_req; i++) { + acc_size_request[i] = acc_size_request[i - 1] + + rte_pktmbuf_pkt_len( + work->front_req_bufs[i - 1] + ); + } + + ret = RTE_MAX( + gk_solicitor_enqueue_bulk(work->gk_conf->sol_conf, + work->front_req_bufs, num_req), + 0); + + stats->pkts_num_request += ret; + stats->pkts_size_request += acc_size_request[ret]; + + for (i = ret; i < num_req; i++) + drop_packet_front(work->front_req_bufs[i], instance); + + RTE_VERIFY(num_req <= front_max_pkt_burst); + work->front_num_req = 0; + } + + if (work->front_num_arp > 0) { + submit_arp(work->front_arp_bufs, work->front_num_arp, front); + RTE_VERIFY(work->front_num_arp <= front_max_pkt_burst); + work->front_num_arp = 0; + } + + RTE_VERIFY(work->front_acl4.num <= front_max_pkt_burst); + RTE_VERIFY(work->front_acl6.num <= front_max_pkt_burst); + process_pkts_acl(front, lcore, &work->front_acl4, RTE_ETHER_TYPE_IPV4); + process_pkts_acl(front, lcore, &work->front_acl6, RTE_ETHER_TYPE_IPV6); + + /* + * TODO Flush back. + */ + + /* + * Update flow table. + */ + + if (work->del_fe != NULL) { + RTE_VERIFY(work->del_fe->in_use); + /* + * Test that the flow entry is expired once more because + * it may have been update since it tested as expired and + * arriving here. + */ + if (likely(is_flow_expired(work->del_fe, rte_rdtsc()))) + gk_del_flow_entry_from_hash(instance, work->del_fe); + work->del_fe = NULL; + } + + /* + * Adding new entries to the flow table should be among the last steps + * to do because when the flow table is full, + * rte_hash_cuckoo_make_space_mw() is going to be called. And + * this function disrupts the cache of the running core. + * rte_hash_cuckoo_make_space_mw() may access up to 1000 buckets and, + * on 64-bit platforms, consumes about 32KB of execution stack. + */ + if (work->temp_fes_num > 0) { + unsigned int i; + for (i = 0; i < work->temp_fes_num; i++) { + struct flow_entry *temp_fe = &work->temp_fes[i]; + struct flow_entry *fe; + int ret = gk_hash_add_flow_entry(instance, + &temp_fe->flow, temp_fe->flow_hash_val, + work->gk_conf); + if (ret == -ENOSPC) { + /* Flow table is full. 
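+				 * Give up on the remaining temporary
+				 * entries for this round; they would
+				 * most likely fail with -ENOSPC too.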
*/ + break; + } + if (unlikely(ret < 0)) { + GK_LOG(ERR, + "Failed to add a flow entry ret=%i\n", + ret); + continue; + } + fe = &instance->ip_flow_entry_table[ret]; + rte_memcpy(fe, temp_fe, sizeof(*fe)); + } + RTE_VERIFY(work->temp_fes_num <= (front_max_pkt_burst + + work->gk_conf->mailbox_burst_size)); + work->temp_fes_num = 0; + } + + /* + * Reset fields of @work. + */ + + RTE_VERIFY(work->task_num <= work->task_total); + work->task_num = 0; + work->any_co_index = 0; + memset(work->leftover, 0, + sizeof(*work->leftover) * (work->leftover_mask + 1)); +} + static int gk_proc(void *arg) { @@ -2168,13 +1861,6 @@ gk_proc(void *arg) uint16_t rx_queue_back = instance->rx_queue_back; uint16_t tx_queue_back = instance->tx_queue_back; - uint16_t tx_front_num_pkts; - uint16_t tx_back_num_pkts; - uint16_t tx_max_num_pkts = gk_conf->front_max_pkt_burst + - gk_conf->back_max_pkt_burst; - struct rte_mbuf *tx_front_pkts[tx_max_num_pkts]; - struct rte_mbuf *tx_back_pkts[tx_max_num_pkts]; - uint32_t entry_idx = 0; uint64_t last_measure_tsc = rte_rdtsc(); uint64_t basic_measurement_logging_cycles = @@ -2183,64 +1869,58 @@ uint32_t scan_iter = gk_conf->flow_table_scan_iter; uint32_t iter_count = 0; + DEFINE_GK_CO_WORK(work, gk_conf->front_max_pkt_burst, + gk_conf->back_max_pkt_burst, gk_conf->mailbox_burst_size, + /* + * The 4* is intended to minimize collisions, whereas the -1 is + * intended to avoid doubling the size when + * the expression already is a power of 2. + */ + rte_combine32ms1b(4 * (gk_conf->front_max_pkt_burst + + gk_conf->mailbox_burst_size) - 1), + 1 /* One extra task for the full scanning of the flow table. */ + ); + GK_LOG(NOTICE, "The GK block is running at lcore = %u\n", lcore); gk_conf_hold(gk_conf); + add_cos_to_work(&work, gk_conf, instance); while (likely(!exiting)) { - struct flow_entry *fe = NULL; - tx_front_num_pkts = 0; - tx_back_num_pkts = 0; + populate_front_tasks(&work, port_front, rx_queue_front); + + /* + * Have the expiration test after all flow-related work to + * give entries one more chance not to expire. + */ if (iter_count >= scan_iter) { + struct gk_co_task *task = + &work.all_tasks[work.task_num++]; entry_idx = (entry_idx + 1) % gk_conf->flow_ht_size; - fe = &instance->ip_flow_entry_table[entry_idx]; - /* - * Only one prefetch is needed here because we only - * need the beginning of a struct flow_entry to - * check if it's expired. - */ - rte_prefetch_non_temporal(fe); + + task->task_hash = 0; /* Dummy hash.
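+			 * Like the software-RSS path in
+			 * populate_front_tasks(), this task is handed to
+			 * schedule_task_to_any_co(), so the hash value is
+			 * only a placeholder here.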
*/ + task->task_arg = + &instance->ip_flow_entry_table[entry_idx]; + task->task_func = gk_co_scan_flow_table; + schedule_task_to_any_co(&work, task); iter_count = 0; } else iter_count++; - process_pkts_front(port_front, rx_queue_front, lcore, - &tx_front_num_pkts, tx_front_pkts, - &tx_back_num_pkts, tx_back_pkts, - instance, gk_conf); + do_work(&work); process_pkts_back(port_back, rx_queue_back, lcore, - &tx_front_num_pkts, tx_front_pkts, - &tx_back_num_pkts, tx_back_pkts, + &work.tx_front_num_pkts, work.tx_front_pkts, + &work.tx_back_num_pkts, work.tx_back_pkts, instance, gk_conf); - if (fe != NULL && fe->in_use && - is_flow_expired(fe, rte_rdtsc())) { - rte_hash_prefetch_buckets_non_temporal( - instance->ip_flow_hash_table, - fe->flow_hash_val); - } else - fe = NULL; - - send_pkts(port_front, tx_queue_front, - tx_front_num_pkts, tx_front_pkts); - - send_pkts(port_back, tx_queue_back, - tx_back_num_pkts, tx_back_pkts); + flush_work(&work, port_front, tx_queue_front, + port_back, tx_queue_back, lcore); process_cmds_from_mailbox(instance, gk_conf); - if (fe != NULL) { - gk_del_flow_entry_from_hash( - instance->ip_flow_hash_table, fe); - - if (instance->num_scan_del > 0) - instance->num_scan_del--; - } - if (rte_rdtsc() - last_measure_tsc >= basic_measurement_logging_cycles) { struct gk_measurement_metrics *stats = @@ -2310,6 +1990,8 @@ cleanup_gk(struct gk_config *gk_conf) } destroy_mailbox(&gk_conf->instances[i].mb); + free_cos(gk_conf->instances[i].cos, gk_conf->co_max_num); + coro_destroy(&gk_conf->instances[i].coro_root); } if (gk_conf->lpm_tbl.fib_tbl != NULL) { @@ -2518,6 +2200,12 @@ run_gk(struct net_config *net_conf, struct gk_config *gk_conf, goto out; } + if (gk_conf->co_max_num == 0) { + GK_LOG(ERR, "There must be at least one coroutine\n"); + ret = -1; + goto out; + } + front_inc = gk_conf->front_max_pkt_burst * gk_conf->num_lcores; net_conf->front.total_pkt_burst += front_inc; back_inc = gk_conf->back_max_pkt_burst * gk_conf->num_lcores; diff --git a/include/coro.h b/include/coro.h new file mode 100644 index 000000000..7645d5029 --- /dev/null +++ b/include/coro.h @@ -0,0 +1,440 @@ +/* + * Copyright (c) 2001-2012,2015 Marc Alexander Lehmann + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. + * + * This library is modelled strictly after Ralf S. Engelschalls article at + * http://www.gnu.org/software/pth/rse-pmt.ps. So most of the credit must + * go to Ralf S. Engelschall . + * + * This coroutine library is very much stripped down. You should either + * build your own process abstraction using it or - better - just use GNU + * Portable Threads, http://www.gnu.org/software/pth/. + * + */ + +/* + * 2006-10-26 Include stddef.h on OS X to work around one of its bugs. + * Reported by Michael_G_Schwern. + * 2006-11-26 Use _setjmp instead of setjmp on GNU/Linux. + * 2007-04-27 Set unwind frame info if gcc 3+ and ELF is detected. + * Use _setjmp instead of setjmp on _XOPEN_SOURCE >= 600. + * 2007-05-02 Add assembly versions for x86 and amd64 (to avoid reliance + * on SIGUSR2 and sigaltstack in Crossfire). + * 2008-01-21 Disable CFI usage on anything but GNU/Linux. + * 2008-03-02 Switched to 2-clause BSD license with GPL exception. + * 2008-04-04 New (but highly unrecommended) pthreads backend. + * 2008-04-24 Reinstate CORO_LOSER (had wrong stack adjustments). + * 2008-10-30 Support assembly method on x86 with and without frame pointer. + * 2008-11-03 Use a global asm statement for CORO_ASM, idea by pippijn. + * 2008-11-05 Hopefully fix misaligned stacks with CORO_ASM/SETJMP. + * 2008-11-07 rbp wasn't saved in CORO_ASM on x86_64. + * introduce coro_destroy, which is a nop except for pthreads. + * speed up CORO_PTHREAD. Do no longer leak threads either. + * coro_create now allows one to create source coro_contexts. + * do not rely on makecontext passing a void * correctly. + * try harder to get _setjmp/_longjmp. + * major code cleanup/restructuring. + * 2008-11-10 the .cfi hacks are no longer needed. + * 2008-11-16 work around a freebsd pthread bug. + * 2008-11-19 define coro_*jmp symbols for easier porting. + * 2009-06-23 tentative win32-backend support for mingw32 (Yasuhiro Matsumoto). + * 2010-12-03 tentative support for uclibc (which lacks all sorts of things). + * 2011-05-30 set initial callee-saved-registers to zero with CORO_ASM. + * use .cfi_undefined rip on linux-amd64 for better backtraces. + * 2011-06-08 maybe properly implement weird windows amd64 calling conventions. + * 2011-07-03 rely on __GCC_HAVE_DWARF2_CFI_ASM for cfi detection. + * 2011-08-08 cygwin trashes stacks, use pthreads with double stack on cygwin. + * 2012-12-04 reduce misprediction penalty for x86/amd64 assembly switcher. + * 2012-12-05 experimental fiber backend (allocates stack twice). + * 2012-12-07 API version 3 - add coro_stack_alloc/coro_stack_free. + * 2012-12-21 valgrind stack registering was broken. + * 2015-12-05 experimental asm be for arm7, based on a patch by Nick Zavaritsky. + * use __name__ for predefined symbols, as in libecb. + * enable guard pages on arm, aarch64 and mips. 
+ * 2016-08-27 try to disable _FORTIFY_SOURCE with CORO_SJLJ, as it + * breaks setjmp/longjmp. Also disable CORO_ASM for asm by default, + * as it was reported to crash. + * 2016-11-18 disable cfi_undefined again - backtraces might be worse, but + * compile compatibility is improved. + * 2018-08-14 use a completely different pthread strategy that should allow + * sharing of coroutines among different threads. this would + * undefined behaviour before as mutexes would be unlocked on + * a different thread. overall, this might be slower than + * using a pipe for synchronisation, but pipes eat fd's... + */ + +#ifndef CORO_H +#define CORO_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This library consists of only three files + * coro.h, coro.c and LICENSE (and optionally README) + * + * It implements what is known as coroutines, in a hopefully + * portable way. + * + * All compiletime symbols must be defined both when including coro.h + * (using libcoro) as well as when compiling coro.c (the implementation). + * + * You can manually specify which flavour you want. If you don't define + * any of these, libcoro tries to choose a safe and fast default: + * + * -DCORO_UCONTEXT + * + * This flavour uses SUSv2's get/set/swap/makecontext functions that + * unfortunately only some unices support, and is quite slow. + * + * -DCORO_SJLJ + * + * This flavour uses SUSv2's setjmp/longjmp and sigaltstack functions to + * do it's job. Coroutine creation is much slower than UCONTEXT, but + * context switching is a bit cheaper. It should work on almost all unices. + * + * -DCORO_LINUX + * + * CORO_SJLJ variant. + * Old GNU/Linux systems (<= glibc-2.1) only work with this implementation + * (it is very fast and therefore recommended over other methods, but + * doesn't work with anything newer). + * + * -DCORO_LOSER + * + * CORO_SJLJ variant. + * Microsoft's highly proprietary platform doesn't support sigaltstack, and + * this selects a suitable workaround for this platform. It might not work + * with your compiler though - it has only been tested with MSVC 6. + * + * -DCORO_FIBER + * + * Slower, but probably more portable variant for the Microsoft operating + * system, using fibers. Ignores the passed stack and allocates it internally. + * Also, due to bugs in cygwin, this does not work with cygwin. + * + * -DCORO_IRIX + * + * CORO_SJLJ variant. + * For SGI's version of Microsoft's NT ;) + * + * -DCORO_ASM + * + * Hand coded assembly, known to work only on a few architectures/ABI: + * GCC + arm7/x86/IA32/amd64/x86_64 + GNU/Linux and a few BSDs. Fastest + * choice, if it works. + * + * -DCORO_PTHREAD + * + * Use the pthread API. You have to provide and -lpthread. + * This is likely the slowest backend, and it also does not support fork(), + * so avoid it at all costs. + * + * If you define neither of these symbols, coro.h will try to autodetect + * the best/safest model. To help with the autodetection, you should check + * (e.g. using autoconf) and define the following symbols: HAVE_UCONTEXT_H + * / HAVE_SETJMP_H / HAVE_SIGALTSTACK. + */ + +/* + * Changes when the API changes incompatibly. + * This is ONLY the API version - there is no ABI compatibility between releases. + * + * Changes in API version 2: + * replaced bogus -DCORO_LOOSE with grammatically more correct -DCORO_LOSER + * Changes in API version 3: + * introduced stack management (CORO_STACKALLOC) + */ +#define CORO_VERSION 3 + +#include + +/* + * This is the type for the initialization function of a new coroutine. 
+ */ +typedef void (*coro_func)(void *); + +/* + * A coroutine state is saved in the following structure. Treat it as an + * opaque type. errno and sigmask might be saved, but don't rely on it, + * implement your own switching primitive if you need that. + */ +typedef struct coro_context coro_context; + +/* + * This function creates a new coroutine. Apart from a pointer to an + * uninitialised coro_context, it expects a pointer to the entry function + * and the single pointer value that is given to it as argument. + * + * Allocating/deallocating the stack is your own responsibility. + * + * As a special case, if coro, arg, sptr and ssze are all zero, + * then an "empty" coro_context will be created that is suitable + * as an initial source for coro_transfer. + * + * This function is not reentrant, but putting a mutex around it + * will work. + */ +void coro_create (coro_context *ctx, /* an uninitialised coro_context */ + coro_func coro, /* the coroutine code to be executed */ + void *arg, /* a single pointer passed to the coro */ + void *sptr, /* start of stack area */ + size_t ssze); /* size of stack area in bytes */ + +/* + * The following prototype defines the coroutine switching function. It is + * sometimes implemented as a macro, so watch out. + * + * This function is thread-safe and reentrant. + */ +#if 0 +void coro_transfer (coro_context *prev, coro_context *next); +#endif + +/* + * The following prototype defines the coroutine destroy function. It + * is sometimes implemented as a macro, so watch out. It also serves no + * purpose unless you want to use the CORO_PTHREAD backend, where it is + * used to clean up the thread. You are responsible for freeing the stack + * and the context itself. + * + * This function is thread-safe and reentrant. + */ +#if 0 +void coro_destroy (coro_context *ctx); +#endif + +/*****************************************************************************/ +/* optional stack management */ +/*****************************************************************************/ +/* + * You can disable all of the stack management functions by + * defining CORO_STACKALLOC to 0. Otherwise, they are enabled by default. + * + * If stack management is enabled, you can influence the implementation via these + * symbols: + * + * -DCORO_USE_VALGRIND + * + * If defined, then libcoro will include valgrind/valgrind.h and register + * and unregister stacks with valgrind. + * + * -DCORO_GUARDPAGES=n + * + * libcoro will try to use the specified number of guard pages to protect against + * stack overflow. If n is 0, then the feature will be disabled. If it isn't + * defined, then libcoro will choose a suitable default. If guardpages are not + * supported on the platform, then the feature will be silently disabled. + */ +#ifndef CORO_STACKALLOC +# define CORO_STACKALLOC 1 +#endif + +#if CORO_STACKALLOC + +/* + * The only allowed operations on these struct members is to read the + * "sptr" and "ssze" members to pass it to coro_create, to read the "sptr" + * member to see if it is false, in which case the stack isn't allocated, + * and to set the "sptr" member to 0, to indicate to coro_stack_free to + * not actually do anything. + */ + +struct coro_stack +{ + void *sptr; + size_t ssze; +#ifdef CORO_USE_VALGRIND + int valgrind_id; +#endif +}; + +/* + * Try to allocate a stack of at least the given size and return true if + * successful, or false otherwise. + * + * The size is *NOT* specified in bytes, but in units of sizeof (void *), + * i.e. 
the stack is typically 4(8) times larger on 32 bit(64 bit) platforms + * then the size passed in. + * + * If size is 0, then a "suitable" stack size is chosen (usually 1-2MB). + */ +int coro_stack_alloc (struct coro_stack *stack, unsigned int size); + +/* + * Free the stack allocated by coro_stack_alloc again. It is safe to + * call this function on the coro_stack structure even if coro_stack_alloc + * failed. + */ +void coro_stack_free (struct coro_stack *stack); + +#endif + +/* + * That was it. No other user-serviceable parts below here. + */ + +/*****************************************************************************/ + +#if !defined CORO_LOSER && !defined CORO_UCONTEXT \ + && !defined CORO_SJLJ && !defined CORO_LINUX \ + && !defined CORO_IRIX && !defined CORO_ASM \ + && !defined CORO_PTHREAD && !defined CORO_FIBER +# if defined WINDOWS && (defined __i386__ || (__x86_64__ || defined _M_IX86 || defined _M_AMD64) +# define CORO_ASM 1 +# elif defined WINDOWS || defined _WIN32 +# define CORO_LOSER 1 /* you don't win with windoze */ +# elif __linux && (__i386__ || (__x86_64__ && !__ILP32__) /*|| (__arm__ && __ARM_ARCH == 7)), not working */ +# define CORO_ASM 1 +# elif defined HAVE_UCONTEXT_H +# define CORO_UCONTEXT 1 +# elif defined HAVE_SETJMP_H && defined HAVE_SIGALTSTACK +# define CORO_SJLJ 1 +# else +error unknown or unsupported architecture +# endif +#endif + +/*****************************************************************************/ + +#ifdef CORO_UCONTEXT + +# include + +struct coro_context +{ + ucontext_t uc; +}; + +# define coro_transfer(p,n) swapcontext (&((p)->uc), &((n)->uc)) +# define coro_destroy(ctx) (void *)(ctx) + +#elif defined (CORO_SJLJ) || defined (CORO_LOSER) || defined (CORO_LINUX) || defined (CORO_IRIX) + +# if defined(CORO_LINUX) && !defined(_GNU_SOURCE) +# define _GNU_SOURCE /* for glibc */ +# endif + +/* try to disable well-meant but buggy checks in some libcs */ +# ifdef _FORTIFY_SOURCE +# undef _FORTIFY_SOURCE +# undef __USE_FORTIFY_LEVEL /* helps some more when too much has been included already */ +# endif + +# if !CORO_LOSER +# include +# endif + +/* solaris is hopelessly borked, it expands _XOPEN_UNIX to nothing */ +# if __sun +# undef _XOPEN_UNIX +# define _XOPEN_UNIX 1 +# endif + +# include + +# if _XOPEN_UNIX > 0 || defined (_setjmp) +# define coro_jmp_buf jmp_buf +# define coro_setjmp(env) _setjmp (env) +# define coro_longjmp(env) _longjmp ((env), 1) +# elif CORO_LOSER +# define coro_jmp_buf jmp_buf +# define coro_setjmp(env) setjmp (env) +# define coro_longjmp(env) longjmp ((env), 1) +# else +# define coro_jmp_buf sigjmp_buf +# define coro_setjmp(env) sigsetjmp (env, 0) +# define coro_longjmp(env) siglongjmp ((env), 1) +# endif + +struct coro_context +{ + coro_jmp_buf env; +}; + +# define coro_transfer(p,n) do { if (!coro_setjmp ((p)->env)) coro_longjmp ((n)->env); } while (0) +# define coro_destroy(ctx) (void *)(ctx) + +#elif CORO_ASM + +struct coro_context +{ + void **sp; /* must be at offset 0 */ +}; + +#if defined (__i386__) || defined (__x86_64__) +void __attribute__ ((__noinline__, __regparm__(2))) +#else +void __attribute__ ((__noinline__)) +#endif +coro_transfer (coro_context *prev, coro_context *next); + +# define coro_destroy(ctx) (void)(ctx) + +#elif CORO_PTHREAD + +# include + +extern pthread_mutex_t coro_mutex; + +struct coro_context +{ + int flags; + pthread_cond_t cv; +}; + +void coro_transfer (coro_context *prev, coro_context *next); +void coro_destroy (coro_context *ctx); + +#elif CORO_FIBER + +struct coro_context +{ + void 
*fiber; + /* only used for initialisation */ + coro_func coro; + void *arg; +}; + +void coro_transfer (coro_context *prev, coro_context *next); +void coro_destroy (coro_context *ctx); + +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/gatekeeper_acl.h b/include/gatekeeper_acl.h index 59c4bbf42..61aa0d11e 100644 --- a/include/gatekeeper_acl.h +++ b/include/gatekeeper_acl.h @@ -32,16 +32,26 @@ struct acl_search { struct rte_mbuf **mbufs; }; -/* Declare and initialize a struct acl_search. */ -#define DEFINE_ACL_SEARCH(name, num_pkts) \ +#define DECLARE_ACL_SEARCH_VARIABLE_PART(name, num_pkts) \ const uint8_t *name##_data_array[(num_pkts)]; \ - struct rte_mbuf *name##_mbufs_array[(num_pkts)]; \ - struct acl_search name = { \ - .num = 0, \ - .data = name##_data_array, \ - .mbufs = name##_mbufs_array, \ + struct rte_mbuf *name##_mbufs_array[(num_pkts)] + +/* + * This macro can only be used if the macro DECLARE_ACL_SEARCH_VARIABLE_PART() + * has been placed before it. + */ +#define ACL_SEARCH_INIT(name) \ + { \ + .num = 0, \ + .data = name##_data_array, \ + .mbufs = name##_mbufs_array, \ } +/* Declare and initialize a struct acl_search. */ +#define DEFINE_ACL_SEARCH(name, num_pkts) \ + DECLARE_ACL_SEARCH_VARIABLE_PART(name, num_pkts); \ + struct acl_search name = ACL_SEARCH_INIT(name) + /* Classify batches of packets in @acl and invoke callback functions. */ int process_acl(struct gatekeeper_if *iface, unsigned int lcore_id, struct acl_search *acl, struct acl_state *astate, diff --git a/include/gatekeeper_gk.h b/include/gatekeeper_gk.h index 95264d984..d732621ee 100644 --- a/include/gatekeeper_gk.h +++ b/include/gatekeeper_gk.h @@ -19,6 +19,8 @@ #ifndef _GATEKEEPER_GK_H_ #define _GATEKEEPER_GK_H_ +#include + #include #include @@ -98,6 +100,14 @@ struct gk_measurement_metrics { struct gk_instance { struct rte_hash *ip_flow_hash_table; struct flow_entry *ip_flow_entry_table; + /* + * Coroutines. + * + * These structs must be here and not in struct gk_co_work because + * initialization functions (e.g. coro_create()) are not reentrant. + */ + struct coro_context coro_root; + struct gk_co *cos; /* RX queue on the front interface. */ uint16_t rx_queue_front; /* TX queue on the front interface. */ @@ -201,6 +211,11 @@ struct gk_config { /* Time for logging the basic measurements in ms. */ unsigned int basic_measurement_logging_ms; + /* Maximum number of coroutines running in parallel per GK instance. */ + uint16_t co_max_num; + /* Size of the stack of each coroutine in KB. */ + uint16_t co_stack_size_kb; + /* * The fields below are for internal use. * Configuration files should not refer to them. diff --git a/include/gatekeeper_main.h b/include/gatekeeper_main.h index 50aafa1fe..b9de610e4 100644 --- a/include/gatekeeper_main.h +++ b/include/gatekeeper_main.h @@ -20,6 +20,10 @@ #define _GATEKEEPER_MAIN_H_ #include +#include + +#include +#include #ifdef RTE_MACHINE_CPUFLAG_SSE4_2 #include @@ -52,4 +56,49 @@ extern FILE *log_file; char *rte_strdup(const char *type, const char *s); int gatekeeper_log_init(void); +/* XXX #52 This should be part of DPDK. */ +/** + * Prefetch the first part of the mbuf + * + * The first 64 bytes of the mbuf corresponds to fields that are used early + * in the receive path. If the cache line of the architecture is higher than + * 64B, the second part will also be prefetched. + * + * @param m + * The pointer to the mbuf. 
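+ *
+ * An illustrative use (hypothetical names, not taken from this patch)
+ * is to prefetch a received burst before parsing it:
+ *
+ *	for (i = 0; i < num_rx; i++)
+ *		rte_mbuf_prefetch_part1_non_temporal(rx_bufs[i]);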
+ */ +static inline void +rte_mbuf_prefetch_part1_non_temporal(struct rte_mbuf *m) +{ + rte_prefetch_non_temporal(&m->cacheline0); +} + +/* XXX #52 This should be part of DPDK. */ +/** + * Prefetch the second part of the mbuf + * + * The next 64 bytes of the mbuf corresponds to fields that are used in the + * transmit path. If the cache line of the architecture is higher than 64B, + * this function does nothing as it is expected that the full mbuf is + * already in cache. + * + * @param m + * The pointer to the mbuf. + */ +static inline bool +rte_mbuf_prefetch_part2_non_temporal(struct rte_mbuf *m) +{ +#if RTE_CACHE_LINE_SIZE == 64 + /* TODO Do we need this prefetch? + rte_prefetch_non_temporal(&m->cacheline1); + return true; + */ + RTE_SET_USED(m); + return false; +#else + RTE_SET_USED(m); + return false; +#endif +} + #endif /* _GATEKEEPER_MAIN_H_ */ diff --git a/include/list.h b/include/list.h index e7fd442fa..c5adf7c51 100644 --- a/include/list.h +++ b/include/list.h @@ -34,6 +34,11 @@ struct list_head { #define LIST_HEAD_INIT(name) { &(name), &(name) } +#define LIST_POISON1 ((void *) 0x00100100) +#define LIST_POISON2 ((void *) 0x00200200) + +#define LIST_HEAD_INIT_WITH_POISON(name) { LIST_POISON1, LIST_POISON2 } + static inline void INIT_LIST_HEAD(struct list_head *list) { @@ -41,6 +46,13 @@ INIT_LIST_HEAD(struct list_head *list) list->prev = list; } +static inline void +INIT_LIST_HEAD_WITH_POISON(struct list_head *list) +{ + list->next = LIST_POISON1; + list->prev = LIST_POISON2; +} + /** * list_entry - get the struct for this entry * @ptr: the &struct list_head pointer. @@ -133,6 +145,16 @@ list_is_singular(const struct list_head *head) return !list_empty(head) && (head->next == head->prev); } +/** + * list_poison - tests whether @entry has been poisoned. + * @entry: the entry to test. + */ +static inline int +list_poison(const struct list_head *entry) +{ + return entry->next == LIST_POISON1 && entry->prev == LIST_POISON2; +} + /* * Insert a new entry between two known consecutive entries. * @@ -191,8 +213,6 @@ __list_del(struct list_head *prev, struct list_head *next) prev->next = next; } -#define LIST_POISON1 ((void *) 0x00100100) -#define LIST_POISON2 ((void *) 0x00200200) /** * list_del - deletes entry from list. * @entry: the element to delete from the list. diff --git a/lib/coro.c b/lib/coro.c new file mode 100644 index 000000000..7817aab22 --- /dev/null +++ b/lib/coro.c @@ -0,0 +1,806 @@ +/* + * Copyright (c) 2001-2011 Marc Alexander Lehmann + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. + * + * This library is modelled strictly after Ralf S. Engelschalls article at + * http://www.gnu.org/software/pth/rse-pmt.ps. So most of the credit must + * go to Ralf S. Engelschall . + */ + +#include "coro.h" + +#include +#include + +/*****************************************************************************/ +/* ucontext/setjmp/asm backends */ +/*****************************************************************************/ +#if defined (CORO_UCONTEXT) || defined (CORO_SJLJ) || defined (CORO_LOSER) || defined (CORO_LINUX) || defined (CORO_IRIX) || defined (CORO_ASM) + +# ifdef CORO_UCONTEXT +# include +# endif + +# if !defined(STACK_ADJUST_PTR) +# ifdef __sgi +/* IRIX is decidedly NON-unix */ +# define STACK_ADJUST_PTR(sp,ss) ((char *)(sp) + (ss) - 8) +# define STACK_ADJUST_SIZE(sp,ss) ((ss) - 8) +# elif (defined (__i386__) && defined (CORO_LINUX)) || (defined (_M_IX86) && defined (CORO_LOSER)) +# define STACK_ADJUST_PTR(sp,ss) ((char *)(sp) + (ss)) +# define STACK_ADJUST_SIZE(sp,ss) (ss) +# elif (defined (__amd64__) && defined (CORO_LINUX)) || ((defined (_M_AMD64) || defined (_M_IA64)) && defined (CORO_LOSER)) +# define STACK_ADJUST_PTR(sp,ss) ((char *)(sp) + (ss) - 8) +# define STACK_ADJUST_SIZE(sp,ss) (ss) +# else +# define STACK_ADJUST_PTR(sp,ss) (sp) +# define STACK_ADJUST_SIZE(sp,ss) (ss) +# endif +# endif + +# include + +# ifdef CORO_SJLJ +# include +# include +# include +# endif + +static coro_func coro_init_func; +static void *coro_init_arg; +static coro_context *new_coro, *create_coro; + +static void +coro_init (void) +{ + volatile coro_func func = coro_init_func; + volatile void *arg = coro_init_arg; + + coro_transfer (new_coro, create_coro); + +#if defined (__GCC_HAVE_DWARF2_CFI_ASM) && defined (__amd64) + /*asm (".cfi_startproc");*/ + /*asm (".cfi_undefined rip");*/ +#endif + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcast-qual" + func ((void *)arg); +#pragma GCC diagnostic pop + +#if __GCC_HAVE_DWARF2_CFI_ASM && __amd64 + /*asm (".cfi_endproc");*/ +#endif + + /* the new coro returned. bad. 
just abort() for now */ + abort (); +} + +# ifdef CORO_SJLJ + +static volatile int trampoline_done; + +/* trampoline signal handler */ +static void +trampoline (int sig) +{ + if (coro_setjmp (new_coro->env)) + coro_init (); /* start it */ + else + trampoline_done = 1; +} + +# endif + +# if CORO_ASM + + #if (defined __arm__) && \ + (defined __ARM_ARCH_7__ || defined __ARM_ARCH_7A__ \ + || defined __ARM_ARCH_7R__ || defined __ARM_ARCH_7M__ \ + || __ARM_ARCH == 7) + #define CORO_ARM 1 + #endif + + #if defined (_WIN32) || defined (__CYGWIN__) + #define CORO_WIN_TIB 1 + #endif + + asm ( + "\t.text\n" + #if defined (_WIN32) || defined (__CYGWIN__) + "\t.globl _coro_transfer\n" + "_coro_transfer:\n" + #else + "\t.globl coro_transfer\n" + "coro_transfer:\n" + #endif + /* windows, of course, gives a shit on the amd64 ABI and uses different registers */ + /* http://blogs.msdn.com/freik/archive/2005/03/17/398200.aspx */ + #ifdef __amd64 + + #if defined (_WIN32) || defined (__CYGWIN__) + #define NUM_SAVED 29 + "\tsubq $168, %rsp\t" /* one dummy qword to improve alignment */ + "\tmovaps %xmm6, (%rsp)\n" + "\tmovaps %xmm7, 16(%rsp)\n" + "\tmovaps %xmm8, 32(%rsp)\n" + "\tmovaps %xmm9, 48(%rsp)\n" + "\tmovaps %xmm10, 64(%rsp)\n" + "\tmovaps %xmm11, 80(%rsp)\n" + "\tmovaps %xmm12, 96(%rsp)\n" + "\tmovaps %xmm13, 112(%rsp)\n" + "\tmovaps %xmm14, 128(%rsp)\n" + "\tmovaps %xmm15, 144(%rsp)\n" + "\tpushq %rsi\n" + "\tpushq %rdi\n" + "\tpushq %rbp\n" + "\tpushq %rbx\n" + "\tpushq %r12\n" + "\tpushq %r13\n" + "\tpushq %r14\n" + "\tpushq %r15\n" + #if CORO_WIN_TIB + "\tpushq %fs:0x0\n" + "\tpushq %fs:0x8\n" + "\tpushq %fs:0xc\n" + #endif + "\tmovq %rsp, (%rcx)\n" + "\tmovq (%rdx), %rsp\n" + #if CORO_WIN_TIB + "\tpopq %fs:0xc\n" + "\tpopq %fs:0x8\n" + "\tpopq %fs:0x0\n" + #endif + "\tpopq %r15\n" + "\tpopq %r14\n" + "\tpopq %r13\n" + "\tpopq %r12\n" + "\tpopq %rbx\n" + "\tpopq %rbp\n" + "\tpopq %rdi\n" + "\tpopq %rsi\n" + "\tmovaps (%rsp), %xmm6\n" + "\tmovaps 16(%rsp), %xmm7\n" + "\tmovaps 32(%rsp), %xmm8\n" + "\tmovaps 48(%rsp), %xmm9\n" + "\tmovaps 64(%rsp), %xmm10\n" + "\tmovaps 80(%rsp), %xmm11\n" + "\tmovaps 96(%rsp), %xmm12\n" + "\tmovaps 112(%rsp), %xmm13\n" + "\tmovaps 128(%rsp), %xmm14\n" + "\tmovaps 144(%rsp), %xmm15\n" + "\taddq $168, %rsp\n" + #else + #define NUM_SAVED 6 + "\tpushq %rbp\n" + "\tpushq %rbx\n" + "\tpushq %r12\n" + "\tpushq %r13\n" + "\tpushq %r14\n" + "\tpushq %r15\n" + "\tmovq %rsp, (%rdi)\n" + "\tmovq (%rsi), %rsp\n" + "\tpopq %r15\n" + "\tpopq %r14\n" + "\tpopq %r13\n" + "\tpopq %r12\n" + "\tpopq %rbx\n" + "\tpopq %rbp\n" + #endif + "\tpopq %rcx\n" + "\tjmpq *%rcx\n" + + #elif __i386__ + + #define NUM_SAVED 4 + "\tpushl %ebp\n" + "\tpushl %ebx\n" + "\tpushl %esi\n" + "\tpushl %edi\n" + #if CORO_WIN_TIB + #undef NUM_SAVED + #define NUM_SAVED 7 + "\tpushl %fs:0\n" + "\tpushl %fs:4\n" + "\tpushl %fs:8\n" + #endif + "\tmovl %esp, (%eax)\n" + "\tmovl (%edx), %esp\n" + #if CORO_WIN_TIB + "\tpopl %fs:8\n" + "\tpopl %fs:4\n" + "\tpopl %fs:0\n" + #endif + "\tpopl %edi\n" + "\tpopl %esi\n" + "\tpopl %ebx\n" + "\tpopl %ebp\n" + "\tpopl %ecx\n" + "\tjmpl *%ecx\n" + + #elif CORO_ARM /* untested, what about thumb, neon, iwmmxt? 
*/ + + #if __ARM_PCS_VFP + "\tvpush {d8-d15}\n" + #define NUM_SAVED (9 + 8 * 2) + #else + #define NUM_SAVED 9 + #endif + "\tpush {r4-r11,lr}\n" + "\tstr sp, [r0]\n" + "\tldr sp, [r1]\n" + "\tpop {r4-r11,lr}\n" + #if __ARM_PCS_VFP + "\tvpop {d8-d15}\n" + #endif + "\tmov r15, lr\n" + + #elif __mips__ && 0 /* untested, 32 bit only */ + + #define NUM_SAVED (12 + 8 * 2) + /* TODO: n64/o64, lw=>ld */ + + "\t.set nomips16\n" + "\t.frame $sp,112,$31\n" + #if __mips_soft_float + "\taddiu $sp,$sp,-44\n" + #else + "\taddiu $sp,$sp,-112\n" + "\ts.d $f30,88($sp)\n" + "\ts.d $f28,80($sp)\n" + "\ts.d $f26,72($sp)\n" + "\ts.d $f24,64($sp)\n" + "\ts.d $f22,56($sp)\n" + "\ts.d $f20,48($sp)\n" + #endif + "\tsw $28,40($sp)\n" + "\tsw $31,36($sp)\n" + "\tsw $fp,32($sp)\n" + "\tsw $23,28($sp)\n" + "\tsw $22,24($sp)\n" + "\tsw $21,20($sp)\n" + "\tsw $20,16($sp)\n" + "\tsw $19,12($sp)\n" + "\tsw $18,8($sp)\n" + "\tsw $17,4($sp)\n" + "\tsw $16,0($sp)\n" + "\tsw $sp,0($4)\n" + "\tlw $sp,0($5)\n" + #if !__mips_soft_float + "\tl.d $f30,88($sp)\n" + "\tl.d $f28,80($sp)\n" + "\tl.d $f26,72($sp)\n" + "\tl.d $f24,64($sp)\n" + "\tl.d $f22,56($sp)\n" + "\tl.d $f20,48($sp)\n" + #endif + "\tlw $28,40($sp)\n" + "\tlw $31,36($sp)\n" + "\tlw $fp,32($sp)\n" + "\tlw $23,28($sp)\n" + "\tlw $22,24($sp)\n" + "\tlw $21,20($sp)\n" + "\tlw $20,16($sp)\n" + "\tlw $19,12($sp)\n" + "\tlw $18,8($sp)\n" + "\tlw $17,4($sp)\n" + "\tlw $16,0($sp)\n" + "\tj $31\n" + #if __mips_soft_float + "\taddiu $sp,$sp,44\n" + #else + "\taddiu $sp,$sp,112\n" + #endif + + #else + #error unsupported architecture + #endif + ); + +# endif + +void +coro_create (coro_context *ctx, coro_func coro, void *arg, void *sptr, size_t ssize) +{ + coro_context nctx; +# ifdef CORO_SJLJ + stack_t ostk, nstk; + struct sigaction osa, nsa; + sigset_t nsig, osig; +# endif + + if (!coro) + return; + + coro_init_func = coro; + coro_init_arg = arg; + + new_coro = ctx; + create_coro = &nctx; + +# ifdef CORO_SJLJ + /* we use SIGUSR2. first block it, then fiddle with it. */ + + sigemptyset (&nsig); + sigaddset (&nsig, SIGUSR2); + sigprocmask (SIG_BLOCK, &nsig, &osig); + + nsa.sa_handler = trampoline; + sigemptyset (&nsa.sa_mask); + nsa.sa_flags = SA_ONSTACK; + + if (sigaction (SIGUSR2, &nsa, &osa)) + { + perror ("sigaction"); + abort (); + } + + /* set the new stack */ + nstk.ss_sp = STACK_ADJUST_PTR (sptr, ssize); /* yes, some platforms (IRIX) get this wrong. 
*/ + nstk.ss_size = STACK_ADJUST_SIZE (sptr, ssize); + nstk.ss_flags = 0; + + if (sigaltstack (&nstk, &ostk) < 0) + { + perror ("sigaltstack"); + abort (); + } + + trampoline_done = 0; + kill (getpid (), SIGUSR2); + sigfillset (&nsig); sigdelset (&nsig, SIGUSR2); + + while (!trampoline_done) + sigsuspend (&nsig); + + sigaltstack (0, &nstk); + nstk.ss_flags = SS_DISABLE; + if (sigaltstack (&nstk, 0) < 0) + perror ("sigaltstack"); + + sigaltstack (0, &nstk); + if (~nstk.ss_flags & SS_DISABLE) + abort (); + + if (~ostk.ss_flags & SS_DISABLE) + sigaltstack (&ostk, 0); + + sigaction (SIGUSR2, &osa, 0); + sigprocmask (SIG_SETMASK, &osig, 0); + +# elif defined (CORO_LOSER) + + coro_setjmp (ctx->env); + #if __CYGWIN__ && __i386__ + ctx->env[8] = (long) coro_init; + ctx->env[7] = (long) ((char *)sptr + ssize) - sizeof (long); + #elif __CYGWIN__ && __x86_64__ + ctx->env[7] = (long) coro_init; + ctx->env[6] = (long) ((char *)sptr + ssize) - sizeof (long); + #elif defined __MINGW32__ + ctx->env[5] = (long) coro_init; + ctx->env[4] = (long) ((char *)sptr + ssize) - sizeof (long); + #elif defined _M_IX86 + ((_JUMP_BUFFER *)&ctx->env)->Eip = (long) coro_init; + ((_JUMP_BUFFER *)&ctx->env)->Esp = (long) STACK_ADJUST_PTR (sptr, ssize) - sizeof (long); + #elif defined _M_AMD64 + ((_JUMP_BUFFER *)&ctx->env)->Rip = (__int64) coro_init; + ((_JUMP_BUFFER *)&ctx->env)->Rsp = (__int64) STACK_ADJUST_PTR (sptr, ssize) - sizeof (__int64); + #elif defined _M_IA64 + ((_JUMP_BUFFER *)&ctx->env)->StIIP = (__int64) coro_init; + ((_JUMP_BUFFER *)&ctx->env)->IntSp = (__int64) STACK_ADJUST_PTR (sptr, ssize) - sizeof (__int64); + #else + #error "microsoft libc or architecture not supported" + #endif + +# elif defined (CORO_LINUX) + + coro_setjmp (ctx->env); + #if __GLIBC__ >= 2 && __GLIBC_MINOR__ >= 0 && defined (JB_PC) && defined (JB_SP) + ctx->env[0].__jmpbuf[JB_PC] = (long) coro_init; + ctx->env[0].__jmpbuf[JB_SP] = (long) STACK_ADJUST_PTR (sptr, ssize) - sizeof (long); + #elif __GLIBC__ >= 2 && __GLIBC_MINOR__ >= 0 && defined (__mc68000__) + ctx->env[0].__jmpbuf[0].__aregs[0] = (long int)coro_init; + ctx->env[0].__jmpbuf[0].__sp = (int *) ((char *)sptr + ssize) - sizeof (long); + #elif defined (__GNU_LIBRARY__) && defined (__i386__) + ctx->env[0].__jmpbuf[0].__pc = (char *) coro_init; + ctx->env[0].__jmpbuf[0].__sp = (void *) ((char *)sptr + ssize) - sizeof (long); + #elif defined (__GNU_LIBRARY__) && defined (__x86_64__) + ctx->env[0].__jmpbuf[JB_PC] = (long) coro_init; + ctx->env[0].__jmpbuf[0].__sp = (void *) ((char *)sptr + ssize) - sizeof (long); + #else + #error "linux libc or architecture not supported" + #endif + +# elif defined (CORO_IRIX) + + coro_setjmp (ctx->env, 0); + ctx->env[JB_PC] = (__uint64_t)coro_init; + ctx->env[JB_SP] = (__uint64_t)STACK_ADJUST_PTR (sptr, ssize) - sizeof (long); + +# elif CORO_ASM + + #if defined (__i386__) || defined (__x86_64__) + ctx->sp = (void **)(ssize + (char *)sptr); +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcast-qual" + *--ctx->sp = (void *)abort; /* needed for alignment only */ +#pragma GCC diagnostic pop + *--ctx->sp = (void *)coro_init; + #ifdef CORO_WIN_TIB + *--ctx->sp = 0; /* ExceptionList */ + *--ctx->sp = (char *)sptr + ssize; /* StackBase */ + *--ctx->sp = sptr; /* StackLimit */ + #endif + #elif CORO_ARM + /* return address stored in lr register, don't push anything */ + #else + #error unsupported architecture + #endif + + ctx->sp -= NUM_SAVED; + memset (ctx->sp, 0, sizeof (*ctx->sp) * NUM_SAVED); + + #if defined (__i386__) || defined 
(__x86_64__) + /* done already */ + #elif defined (CORO_ARM) + ctx->sp[0] = coro; /* r4 */ + ctx->sp[1] = arg; /* r5 */ + ctx->sp[8] = (char *)coro_init; /* lr */ + #else + #error unsupported architecture + #endif + +# elif CORO_UCONTEXT + + getcontext (&(ctx->uc)); + + ctx->uc.uc_link = 0; + ctx->uc.uc_stack.ss_sp = sptr; + ctx->uc.uc_stack.ss_size = (size_t)ssize; + ctx->uc.uc_stack.ss_flags = 0; + + makecontext (&(ctx->uc), (void (*)())coro_init, 0); + +# endif + + coro_transfer (create_coro, new_coro); +} + +/*****************************************************************************/ +/* pthread backend */ +/*****************************************************************************/ +#elif CORO_PTHREAD + +/* this mutex will be locked by the running coroutine */ +pthread_mutex_t coro_mutex = PTHREAD_MUTEX_INITIALIZER; + +struct coro_init_args +{ + coro_func func; + void *arg; + coro_context *self, *main; +}; + +static void * +coro_init (void *args_) +{ + struct coro_init_args *args = (struct coro_init_args *)args_; + coro_func func = args->func; + void *arg = args->arg; + + coro_transfer (args->self, args->main); + func (arg); + + return 0; +} + +void +coro_transfer (coro_context *prev, coro_context *next) +{ + pthread_mutex_lock (&coro_mutex); + + next->flags = 1; + pthread_cond_signal (&next->cv); + + prev->flags = 0; + + while (!prev->flags) + pthread_cond_wait (&prev->cv, &coro_mutex); + + if (prev->flags == 2) + { + pthread_mutex_unlock (&coro_mutex); + pthread_cond_destroy (&prev->cv); + pthread_detach (pthread_self ()); + pthread_exit (0); + } + + pthread_mutex_unlock (&coro_mutex); +} + +void +coro_create (coro_context *ctx, coro_func coro, void *arg, void *sptr, size_t ssize) +{ + static coro_context nctx; + static int once; + + if (!once) + { + once = 1; + + pthread_cond_init (&nctx.cv, 0); + } + + pthread_cond_init (&ctx->cv, 0); + + if (coro) + { + pthread_attr_t attr; + struct coro_init_args args; + pthread_t id; + + args.func = coro; + args.arg = arg; + args.self = ctx; + args.main = &nctx; + + pthread_attr_init (&attr); +#if __UCLIBC__ + /* exists, but is borked */ + /*pthread_attr_setstacksize (&attr, (size_t)ssize);*/ +#elif __CYGWIN__ + /* POSIX, not here */ + pthread_attr_setstacksize (&attr, (size_t)ssize); +#else + pthread_attr_setstack (&attr, sptr, (size_t)ssize); +#endif + pthread_attr_setscope (&attr, PTHREAD_SCOPE_PROCESS); + pthread_create (&id, &attr, coro_init, &args); + + coro_transfer (args.main, args.self); + } +} + +void +coro_destroy (coro_context *ctx) +{ + pthread_mutex_lock (&coro_mutex); + ctx->flags = 2; + pthread_cond_signal (&ctx->cv); + pthread_mutex_unlock (&coro_mutex); +} + +/*****************************************************************************/ +/* fiber backend */ +/*****************************************************************************/ +#elif CORO_FIBER + +#define WIN32_LEAN_AND_MEAN +#if _WIN32_WINNT < 0x0400 + #undef _WIN32_WINNT + #define _WIN32_WINNT 0x0400 +#endif +#include + +VOID CALLBACK +coro_init (PVOID arg) +{ + coro_context *ctx = (coro_context *)arg; + + ctx->coro (ctx->arg); +} + +void +coro_transfer (coro_context *prev, coro_context *next) +{ + if (!prev->fiber) + { + prev->fiber = GetCurrentFiber (); + + if (prev->fiber == 0 || prev->fiber == (void *)0x1e00) + prev->fiber = ConvertThreadToFiber (0); + } + + SwitchToFiber (next->fiber); +} + +void +coro_create (coro_context *ctx, coro_func coro, void *arg, void *sptr, size_t ssize) +{ + ctx->fiber = 0; + ctx->coro = coro; + ctx->arg = arg; + + if (!coro) 
+ return; + + ctx->fiber = CreateFiber (ssize, coro_init, ctx); +} + +void +coro_destroy (coro_context *ctx) +{ + DeleteFiber (ctx->fiber); +} + +#else + #error unsupported backend +#endif + +/*****************************************************************************/ +/* stack management */ +/*****************************************************************************/ +#if CORO_STACKALLOC + +#include + +#ifndef _WIN32 +# include +#endif + +#ifdef CORO_USE_VALGRIND +# include +#endif + +#ifdef _POSIX_MAPPED_FILES +# include +# define CORO_MMAP 1 +# ifndef MAP_ANONYMOUS +# ifdef MAP_ANON +# define MAP_ANONYMOUS MAP_ANON +# else +# undef CORO_MMAP +# endif +# endif +# include +#else +# undef CORO_MMAP +#endif + +#if _POSIX_MEMORY_PROTECTION +# ifndef CORO_GUARDPAGES +# define CORO_GUARDPAGES 4 +# endif +#else +# undef CORO_GUARDPAGES +#endif + +#if !CORO_MMAP +# undef CORO_GUARDPAGES +#endif + +#if !defined (__i386__) && !defined (__x86_64__) && !defined (__powerpc__) && !defined (__arm__) && !defined (__aarch64__) && !defined (__m68k__) && !defined (__alpha__) && !defined (__mips__) && !defined (__sparc64__) +# undef CORO_GUARDPAGES +#endif + +#ifndef CORO_GUARDPAGES +# define CORO_GUARDPAGES 0 +#endif + +#ifndef PAGESIZE + #if !CORO_MMAP + #define PAGESIZE 4096 + #else + static size_t + coro_pagesize (void) + { + static size_t pagesize; + + if (!pagesize) + pagesize = sysconf (_SC_PAGESIZE); + + return pagesize; + } + + #define PAGESIZE coro_pagesize () + #endif +#endif + +int +coro_stack_alloc (struct coro_stack *stack, unsigned int size) +{ + if (!size) + size = 256 * 1024; + + stack->sptr = 0; + stack->ssze = ((size_t)size * sizeof (void *) + PAGESIZE - 1) / PAGESIZE * PAGESIZE; + +#ifdef CORO_FIBER + + stack->sptr = (void *)stack; + return 1; + +#else + + size_t ssze = stack->ssze + CORO_GUARDPAGES * PAGESIZE; + void *base; + + #if CORO_MMAP + /* mmap supposedly does allocate-on-write for us */ + base = mmap (0, ssze, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (base == (void *)-1) + { + /* some systems don't let us have executable heap */ + /* we assume they won't need executable stack in that case */ + base = mmap (0, ssze, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (base == (void *)-1) + return 0; + } + + #if CORO_GUARDPAGES + mprotect (base, CORO_GUARDPAGES * PAGESIZE, PROT_NONE); + #endif + + base = (void*)((char *)base + CORO_GUARDPAGES * PAGESIZE); + #else + base = malloc (ssze); + if (!base) + return 0; + #endif + + #ifdef CORO_USE_VALGRIND + stack->valgrind_id = VALGRIND_STACK_REGISTER ((char *)base, ((char *)base) + ssze - CORO_GUARDPAGES * PAGESIZE); + #endif + + stack->sptr = base; + return 1; + +#endif +} + +void +coro_stack_free (struct coro_stack *stack) +{ +#ifdef CORO_FIBER + /* nop */ +#else + #ifdef CORO_USE_VALGRIND + VALGRIND_STACK_DEREGISTER (stack->valgrind_id); + #endif + + #if CORO_MMAP + if (stack->sptr) + munmap ((void*)((char *)stack->sptr - CORO_GUARDPAGES * PAGESIZE), + stack->ssze + CORO_GUARDPAGES * PAGESIZE); + #else + free (stack->sptr); + #endif +#endif +} + +#endif + diff --git a/lib/mailbox.c b/lib/mailbox.c index 33bb242df..a78c53c0e 100644 --- a/lib/mailbox.c +++ b/lib/mailbox.c @@ -111,9 +111,13 @@ void destroy_mailbox(struct mailbox *mb) { if (mb) { - if (mb->ring) + if (mb->ring) { rte_ring_free(mb->ring); - if (mb->pool) + mb->ring = NULL; + } + if (mb->pool) { rte_mempool_free(mb->pool); + mb->pool = NULL; + } } } diff --git a/lua/gatekeeper/staticlib.lua 
b/lua/gatekeeper/staticlib.lua index c3c1435dc..cae9c7b93 100644 --- a/lua/gatekeeper/staticlib.lua +++ b/lua/gatekeeper/staticlib.lua @@ -211,6 +211,8 @@ struct gk_config { uint32_t log_ratelimit_interval_ms; uint32_t log_ratelimit_burst; unsigned int basic_measurement_logging_ms; + uint16_t co_max_num; + uint16_t co_stack_size_kb; /* This struct has hidden fields. */ }; diff --git a/lua/gk.lua b/lua/gk.lua index 057b98644..be3e452c4 100644 --- a/lua/gk.lua +++ b/lua/gk.lua @@ -42,8 +42,12 @@ return function (net_conf, lls_conf, sol_conf, gk_lcores) local back_icmp_msgs_per_sec = 1000 local back_icmp_msgs_burst = 50 + local co_max_num = 16 + -- These variables are unlikely to need to be changed. local bpf_enable_jit = true + -- CAUTION: stacks too small will crash the GK blocks. + local co_stack_size_kb = 16 -- -- End configuration of GK block. @@ -100,6 +104,9 @@ return function (net_conf, lls_conf, sol_conf, gk_lcores) gk_conf.back_max_pkt_burst = staticlib.get_back_burst_config(max_pkt_burst_back, net_conf) + gk_conf.co_max_num = co_max_num + gk_conf.co_stack_size_kb = co_stack_size_kb + -- The maximum number of ARP or ND packets in LLS submitted by -- GK or GT. The code below makes sure that the parameter should -- be at least the same with the maximum configured value of GK.
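The patch splits DEFINE_ACL_SEARCH() into DECLARE_ACL_SEARCH_VARIABLE_PART() and ACL_SEARCH_INIT() so the backing arrays can be declared separately from the struct acl_search initializer (for example, when the struct is embedded in a larger structure while the arrays stay outside it). Below is a minimal sketch of both forms; it is not part of the patch, and the function name and burst size are illustrative only.

#include "gatekeeper_acl.h"

static void
acl_macro_example(void)
{
	/* One-shot form: declares the arrays and the struct together. */
	DEFINE_ACL_SEARCH(acl4, 32);

	/* Split form: arrays first, initializer wherever it is needed. */
	DECLARE_ACL_SEARCH_VARIABLE_PART(acl6, 32);
	struct acl_search acl6 = ACL_SEARCH_INIT(acl6);

	(void)acl4;
	(void)acl6;
}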
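The new poison helpers in include/list.h make "this entry is on no list" an explicitly testable state: an entry starts out (or is left by list_del()) holding LIST_POISON1/LIST_POISON2, and list_poison() checks for exactly those values. A short illustration follows, assuming the usual list_add_tail()/list_del() helpers from this header; the task struct and function are hypothetical and not part of the patch.

#include "list.h"

/* Hypothetical container used only for illustration. */
struct example_task {
	struct list_head list;
};

static void
list_poison_example(struct example_task *task, struct list_head *queue)
{
	/* Freshly created entries are marked as being on no list. */
	INIT_LIST_HEAD_WITH_POISON(&task->list);

	/* Queue the entry only if it is not already queued somewhere. */
	if (list_poison(&task->list))
		list_add_tail(&task->list, queue);

	/*
	 * list_del() is expected to write the poison values back,
	 * so the list_poison() test works again afterwards.
	 */
	list_del(&task->list);
}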
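For readers unfamiliar with the vendored libcoro API (coro_create()/coro_transfer() above, built with the CORO_ASM backend selected by -DCORO_ASM in the Makefile), here is a self-contained sketch of the create/transfer lifecycle. It is not part of the patch; error handling is omitted, and the pthread/fiber backends would additionally require coro_destroy().

#include <stdio.h>
#include <stdlib.h>
#include "coro.h"

static coro_context main_ctx, worker_ctx;
static char msg[] = "hello";

static void
worker(void *arg)
{
	printf("worker sees \"%s\"\n", (const char *)arg);
	/* Yield back to the caller; falling off the end would abort(). */
	coro_transfer(&worker_ctx, &main_ctx);
}

int
main(void)
{
	size_t stack_size = 64 * 1024;
	void *stack = malloc(stack_size);

	/* An empty context for the caller, filled on the first transfer. */
	coro_create(&main_ctx, NULL, NULL, NULL, 0);
	/* A coroutine that runs worker(msg) on its own stack. */
	coro_create(&worker_ctx, worker, msg, stack, stack_size);

	/* Run worker() until it transfers control back. */
	coro_transfer(&main_ctx, &worker_ctx);

	free(stack);
	return 0;
}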
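The new gk_config knobs (co_max_num, co_stack_size_kb, also exposed through lua/gk.lua above) together with the gk_instance members coro_root and cos suggest a setup pattern like the one below. The actual initialization lives in gk/main.c and gk/co.c, which this section does not show, so treat this strictly as an assumption: co_main is a hypothetical entry point, and the real code would more likely use rte_malloc() or coro_stack_alloc() rather than plain malloc(), and would initialize the remaining gk_co fields as well.

#include <stdlib.h>

#include "gatekeeper_gk.h"
#include "co.h"

static int
example_gk_co_setup(struct gk_instance *instance,
	const struct gk_config *gk_conf, coro_func co_main)
{
	uint16_t i;
	size_t stack_size = (size_t)gk_conf->co_stack_size_kb * 1024;

	instance->cos = calloc(gk_conf->co_max_num, sizeof(*instance->cos));
	if (instance->cos == NULL)
		return -1;

	/* Empty context standing in for the GK lcore itself. */
	coro_create(&instance->coro_root, NULL, NULL, NULL, 0);

	for (i = 0; i < gk_conf->co_max_num; i++) {
		void *stack = malloc(stack_size);
		if (stack == NULL)
			return -1;
		/* Each coroutine runs co_main() with its own gk_co as argument. */
		coro_create(&instance->cos[i].coro, co_main,
			&instance->cos[i], stack, stack_size);
	}
	return 0;
}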