From 57b0f19687f1ce811b493b0ef32e7b0714ebdaf8 Mon Sep 17 00:00:00 2001 From: Michel Machado Date: Sat, 9 Nov 2019 16:02:00 +0000 Subject: [PATCH 1/4] gatekeeper: add libcoro Library CORO implements coroutines. --- Makefile | 4 +- include/coro.h | 440 +++++++++++++++++++++++++++ lib/coro.c | 806 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1248 insertions(+), 2 deletions(-) create mode 100644 include/coro.h create mode 100644 lib/coro.c diff --git a/Makefile b/Makefile index efc5b8950..d7cc789af 100644 --- a/Makefile +++ b/Makefile @@ -44,12 +44,12 @@ SRCS-y += sol/main.c # Libraries. SRCS-y += lib/mailbox.c lib/net.c lib/flow.c lib/ipip.c \ lib/luajit-ffi-cdata.c lib/launch.c lib/lpm.c lib/acl.c lib/varip.c \ - lib/l2.c lib/ratelimit.c lib/memblock.c lib/log_ratelimit.c + lib/l2.c lib/ratelimit.c lib/memblock.c lib/log_ratelimit.c lib/coro.c LDLIBS += $(LDIR) -Bstatic -lluajit-5.1 -Bdynamic -lm -lmnl -lkmod CFLAGS += $(WERROR_FLAGS) -I${GATEKEEPER}/include -I/usr/local/include/luajit-2.0/ EXTRA_CFLAGS += -O3 -g -Wfatal-errors -DALLOW_EXPERIMENTAL_API \ - -Wno-deprecated-declarations + -Wno-deprecated-declarations -DCORO_ASM include $(RTE_SDK)/mk/rte.extapp.mk diff --git a/include/coro.h b/include/coro.h new file mode 100644 index 000000000..7645d5029 --- /dev/null +++ b/include/coro.h @@ -0,0 +1,440 @@ +/* + * Copyright (c) 2001-2012,2015 Marc Alexander Lehmann + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. + * + * This library is modelled strictly after Ralf S. Engelschalls article at + * http://www.gnu.org/software/pth/rse-pmt.ps. So most of the credit must + * go to Ralf S. Engelschall . 
+ * + * This coroutine library is very much stripped down. You should either + * build your own process abstraction using it or - better - just use GNU + * Portable Threads, http://www.gnu.org/software/pth/. + * + */ + +/* + * 2006-10-26 Include stddef.h on OS X to work around one of its bugs. + * Reported by Michael_G_Schwern. + * 2006-11-26 Use _setjmp instead of setjmp on GNU/Linux. + * 2007-04-27 Set unwind frame info if gcc 3+ and ELF is detected. + * Use _setjmp instead of setjmp on _XOPEN_SOURCE >= 600. + * 2007-05-02 Add assembly versions for x86 and amd64 (to avoid reliance + * on SIGUSR2 and sigaltstack in Crossfire). + * 2008-01-21 Disable CFI usage on anything but GNU/Linux. + * 2008-03-02 Switched to 2-clause BSD license with GPL exception. + * 2008-04-04 New (but highly unrecommended) pthreads backend. + * 2008-04-24 Reinstate CORO_LOSER (had wrong stack adjustments). + * 2008-10-30 Support assembly method on x86 with and without frame pointer. + * 2008-11-03 Use a global asm statement for CORO_ASM, idea by pippijn. + * 2008-11-05 Hopefully fix misaligned stacks with CORO_ASM/SETJMP. + * 2008-11-07 rbp wasn't saved in CORO_ASM on x86_64. + * introduce coro_destroy, which is a nop except for pthreads. + * speed up CORO_PTHREAD. Do no longer leak threads either. + * coro_create now allows one to create source coro_contexts. + * do not rely on makecontext passing a void * correctly. + * try harder to get _setjmp/_longjmp. + * major code cleanup/restructuring. + * 2008-11-10 the .cfi hacks are no longer needed. + * 2008-11-16 work around a freebsd pthread bug. + * 2008-11-19 define coro_*jmp symbols for easier porting. + * 2009-06-23 tentative win32-backend support for mingw32 (Yasuhiro Matsumoto). + * 2010-12-03 tentative support for uclibc (which lacks all sorts of things). + * 2011-05-30 set initial callee-saved-registers to zero with CORO_ASM. + * use .cfi_undefined rip on linux-amd64 for better backtraces. + * 2011-06-08 maybe properly implement weird windows amd64 calling conventions. + * 2011-07-03 rely on __GCC_HAVE_DWARF2_CFI_ASM for cfi detection. + * 2011-08-08 cygwin trashes stacks, use pthreads with double stack on cygwin. + * 2012-12-04 reduce misprediction penalty for x86/amd64 assembly switcher. + * 2012-12-05 experimental fiber backend (allocates stack twice). + * 2012-12-07 API version 3 - add coro_stack_alloc/coro_stack_free. + * 2012-12-21 valgrind stack registering was broken. + * 2015-12-05 experimental asm be for arm7, based on a patch by Nick Zavaritsky. + * use __name__ for predefined symbols, as in libecb. + * enable guard pages on arm, aarch64 and mips. + * 2016-08-27 try to disable _FORTIFY_SOURCE with CORO_SJLJ, as it + * breaks setjmp/longjmp. Also disable CORO_ASM for asm by default, + * as it was reported to crash. + * 2016-11-18 disable cfi_undefined again - backtraces might be worse, but + * compile compatibility is improved. + * 2018-08-14 use a completely different pthread strategy that should allow + * sharing of coroutines among different threads. this would + * undefined behaviour before as mutexes would be unlocked on + * a different thread. overall, this might be slower than + * using a pipe for synchronisation, but pipes eat fd's... + */ + +#ifndef CORO_H +#define CORO_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This library consists of only three files + * coro.h, coro.c and LICENSE (and optionally README) + * + * It implements what is known as coroutines, in a hopefully + * portable way. 
+ * + * All compiletime symbols must be defined both when including coro.h + * (using libcoro) as well as when compiling coro.c (the implementation). + * + * You can manually specify which flavour you want. If you don't define + * any of these, libcoro tries to choose a safe and fast default: + * + * -DCORO_UCONTEXT + * + * This flavour uses SUSv2's get/set/swap/makecontext functions that + * unfortunately only some unices support, and is quite slow. + * + * -DCORO_SJLJ + * + * This flavour uses SUSv2's setjmp/longjmp and sigaltstack functions to + * do it's job. Coroutine creation is much slower than UCONTEXT, but + * context switching is a bit cheaper. It should work on almost all unices. + * + * -DCORO_LINUX + * + * CORO_SJLJ variant. + * Old GNU/Linux systems (<= glibc-2.1) only work with this implementation + * (it is very fast and therefore recommended over other methods, but + * doesn't work with anything newer). + * + * -DCORO_LOSER + * + * CORO_SJLJ variant. + * Microsoft's highly proprietary platform doesn't support sigaltstack, and + * this selects a suitable workaround for this platform. It might not work + * with your compiler though - it has only been tested with MSVC 6. + * + * -DCORO_FIBER + * + * Slower, but probably more portable variant for the Microsoft operating + * system, using fibers. Ignores the passed stack and allocates it internally. + * Also, due to bugs in cygwin, this does not work with cygwin. + * + * -DCORO_IRIX + * + * CORO_SJLJ variant. + * For SGI's version of Microsoft's NT ;) + * + * -DCORO_ASM + * + * Hand coded assembly, known to work only on a few architectures/ABI: + * GCC + arm7/x86/IA32/amd64/x86_64 + GNU/Linux and a few BSDs. Fastest + * choice, if it works. + * + * -DCORO_PTHREAD + * + * Use the pthread API. You have to provide and -lpthread. + * This is likely the slowest backend, and it also does not support fork(), + * so avoid it at all costs. + * + * If you define neither of these symbols, coro.h will try to autodetect + * the best/safest model. To help with the autodetection, you should check + * (e.g. using autoconf) and define the following symbols: HAVE_UCONTEXT_H + * / HAVE_SETJMP_H / HAVE_SIGALTSTACK. + */ + +/* + * Changes when the API changes incompatibly. + * This is ONLY the API version - there is no ABI compatibility between releases. + * + * Changes in API version 2: + * replaced bogus -DCORO_LOOSE with grammatically more correct -DCORO_LOSER + * Changes in API version 3: + * introduced stack management (CORO_STACKALLOC) + */ +#define CORO_VERSION 3 + +#include + +/* + * This is the type for the initialization function of a new coroutine. + */ +typedef void (*coro_func)(void *); + +/* + * A coroutine state is saved in the following structure. Treat it as an + * opaque type. errno and sigmask might be saved, but don't rely on it, + * implement your own switching primitive if you need that. + */ +typedef struct coro_context coro_context; + +/* + * This function creates a new coroutine. Apart from a pointer to an + * uninitialised coro_context, it expects a pointer to the entry function + * and the single pointer value that is given to it as argument. + * + * Allocating/deallocating the stack is your own responsibility. + * + * As a special case, if coro, arg, sptr and ssze are all zero, + * then an "empty" coro_context will be created that is suitable + * as an initial source for coro_transfer. + * + * This function is not reentrant, but putting a mutex around it + * will work. 
+ */ +void coro_create (coro_context *ctx, /* an uninitialised coro_context */ + coro_func coro, /* the coroutine code to be executed */ + void *arg, /* a single pointer passed to the coro */ + void *sptr, /* start of stack area */ + size_t ssze); /* size of stack area in bytes */ + +/* + * The following prototype defines the coroutine switching function. It is + * sometimes implemented as a macro, so watch out. + * + * This function is thread-safe and reentrant. + */ +#if 0 +void coro_transfer (coro_context *prev, coro_context *next); +#endif + +/* + * The following prototype defines the coroutine destroy function. It + * is sometimes implemented as a macro, so watch out. It also serves no + * purpose unless you want to use the CORO_PTHREAD backend, where it is + * used to clean up the thread. You are responsible for freeing the stack + * and the context itself. + * + * This function is thread-safe and reentrant. + */ +#if 0 +void coro_destroy (coro_context *ctx); +#endif + +/*****************************************************************************/ +/* optional stack management */ +/*****************************************************************************/ +/* + * You can disable all of the stack management functions by + * defining CORO_STACKALLOC to 0. Otherwise, they are enabled by default. + * + * If stack management is enabled, you can influence the implementation via these + * symbols: + * + * -DCORO_USE_VALGRIND + * + * If defined, then libcoro will include valgrind/valgrind.h and register + * and unregister stacks with valgrind. + * + * -DCORO_GUARDPAGES=n + * + * libcoro will try to use the specified number of guard pages to protect against + * stack overflow. If n is 0, then the feature will be disabled. If it isn't + * defined, then libcoro will choose a suitable default. If guardpages are not + * supported on the platform, then the feature will be silently disabled. + */ +#ifndef CORO_STACKALLOC +# define CORO_STACKALLOC 1 +#endif + +#if CORO_STACKALLOC + +/* + * The only allowed operations on these struct members is to read the + * "sptr" and "ssze" members to pass it to coro_create, to read the "sptr" + * member to see if it is false, in which case the stack isn't allocated, + * and to set the "sptr" member to 0, to indicate to coro_stack_free to + * not actually do anything. + */ + +struct coro_stack +{ + void *sptr; + size_t ssze; +#ifdef CORO_USE_VALGRIND + int valgrind_id; +#endif +}; + +/* + * Try to allocate a stack of at least the given size and return true if + * successful, or false otherwise. + * + * The size is *NOT* specified in bytes, but in units of sizeof (void *), + * i.e. the stack is typically 4(8) times larger on 32 bit(64 bit) platforms + * then the size passed in. + * + * If size is 0, then a "suitable" stack size is chosen (usually 1-2MB). + */ +int coro_stack_alloc (struct coro_stack *stack, unsigned int size); + +/* + * Free the stack allocated by coro_stack_alloc again. It is safe to + * call this function on the coro_stack structure even if coro_stack_alloc + * failed. + */ +void coro_stack_free (struct coro_stack *stack); + +#endif + +/* + * That was it. No other user-serviceable parts below here. 
+ */ + +/*****************************************************************************/ + +#if !defined CORO_LOSER && !defined CORO_UCONTEXT \ + && !defined CORO_SJLJ && !defined CORO_LINUX \ + && !defined CORO_IRIX && !defined CORO_ASM \ + && !defined CORO_PTHREAD && !defined CORO_FIBER +# if defined WINDOWS && (defined __i386__ || (__x86_64__ || defined _M_IX86 || defined _M_AMD64) +# define CORO_ASM 1 +# elif defined WINDOWS || defined _WIN32 +# define CORO_LOSER 1 /* you don't win with windoze */ +# elif __linux && (__i386__ || (__x86_64__ && !__ILP32__) /*|| (__arm__ && __ARM_ARCH == 7)), not working */ +# define CORO_ASM 1 +# elif defined HAVE_UCONTEXT_H +# define CORO_UCONTEXT 1 +# elif defined HAVE_SETJMP_H && defined HAVE_SIGALTSTACK +# define CORO_SJLJ 1 +# else +error unknown or unsupported architecture +# endif +#endif + +/*****************************************************************************/ + +#ifdef CORO_UCONTEXT + +# include + +struct coro_context +{ + ucontext_t uc; +}; + +# define coro_transfer(p,n) swapcontext (&((p)->uc), &((n)->uc)) +# define coro_destroy(ctx) (void *)(ctx) + +#elif defined (CORO_SJLJ) || defined (CORO_LOSER) || defined (CORO_LINUX) || defined (CORO_IRIX) + +# if defined(CORO_LINUX) && !defined(_GNU_SOURCE) +# define _GNU_SOURCE /* for glibc */ +# endif + +/* try to disable well-meant but buggy checks in some libcs */ +# ifdef _FORTIFY_SOURCE +# undef _FORTIFY_SOURCE +# undef __USE_FORTIFY_LEVEL /* helps some more when too much has been included already */ +# endif + +# if !CORO_LOSER +# include +# endif + +/* solaris is hopelessly borked, it expands _XOPEN_UNIX to nothing */ +# if __sun +# undef _XOPEN_UNIX +# define _XOPEN_UNIX 1 +# endif + +# include + +# if _XOPEN_UNIX > 0 || defined (_setjmp) +# define coro_jmp_buf jmp_buf +# define coro_setjmp(env) _setjmp (env) +# define coro_longjmp(env) _longjmp ((env), 1) +# elif CORO_LOSER +# define coro_jmp_buf jmp_buf +# define coro_setjmp(env) setjmp (env) +# define coro_longjmp(env) longjmp ((env), 1) +# else +# define coro_jmp_buf sigjmp_buf +# define coro_setjmp(env) sigsetjmp (env, 0) +# define coro_longjmp(env) siglongjmp ((env), 1) +# endif + +struct coro_context +{ + coro_jmp_buf env; +}; + +# define coro_transfer(p,n) do { if (!coro_setjmp ((p)->env)) coro_longjmp ((n)->env); } while (0) +# define coro_destroy(ctx) (void *)(ctx) + +#elif CORO_ASM + +struct coro_context +{ + void **sp; /* must be at offset 0 */ +}; + +#if defined (__i386__) || defined (__x86_64__) +void __attribute__ ((__noinline__, __regparm__(2))) +#else +void __attribute__ ((__noinline__)) +#endif +coro_transfer (coro_context *prev, coro_context *next); + +# define coro_destroy(ctx) (void)(ctx) + +#elif CORO_PTHREAD + +# include + +extern pthread_mutex_t coro_mutex; + +struct coro_context +{ + int flags; + pthread_cond_t cv; +}; + +void coro_transfer (coro_context *prev, coro_context *next); +void coro_destroy (coro_context *ctx); + +#elif CORO_FIBER + +struct coro_context +{ + void *fiber; + /* only used for initialisation */ + coro_func coro; + void *arg; +}; + +void coro_transfer (coro_context *prev, coro_context *next); +void coro_destroy (coro_context *ctx); + +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/lib/coro.c b/lib/coro.c new file mode 100644 index 000000000..7817aab22 --- /dev/null +++ b/lib/coro.c @@ -0,0 +1,806 @@ +/* + * Copyright (c) 2001-2011 Marc Alexander Lehmann + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that 
the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. + * + * This library is modelled strictly after Ralf S. Engelschalls article at + * http://www.gnu.org/software/pth/rse-pmt.ps. So most of the credit must + * go to Ralf S. Engelschall . 
+ */ + +#include "coro.h" + +#include +#include + +/*****************************************************************************/ +/* ucontext/setjmp/asm backends */ +/*****************************************************************************/ +#if defined (CORO_UCONTEXT) || defined (CORO_SJLJ) || defined (CORO_LOSER) || defined (CORO_LINUX) || defined (CORO_IRIX) || defined (CORO_ASM) + +# ifdef CORO_UCONTEXT +# include +# endif + +# if !defined(STACK_ADJUST_PTR) +# ifdef __sgi +/* IRIX is decidedly NON-unix */ +# define STACK_ADJUST_PTR(sp,ss) ((char *)(sp) + (ss) - 8) +# define STACK_ADJUST_SIZE(sp,ss) ((ss) - 8) +# elif (defined (__i386__) && defined (CORO_LINUX)) || (defined (_M_IX86) && defined (CORO_LOSER)) +# define STACK_ADJUST_PTR(sp,ss) ((char *)(sp) + (ss)) +# define STACK_ADJUST_SIZE(sp,ss) (ss) +# elif (defined (__amd64__) && defined (CORO_LINUX)) || ((defined (_M_AMD64) || defined (_M_IA64)) && defined (CORO_LOSER)) +# define STACK_ADJUST_PTR(sp,ss) ((char *)(sp) + (ss) - 8) +# define STACK_ADJUST_SIZE(sp,ss) (ss) +# else +# define STACK_ADJUST_PTR(sp,ss) (sp) +# define STACK_ADJUST_SIZE(sp,ss) (ss) +# endif +# endif + +# include + +# ifdef CORO_SJLJ +# include +# include +# include +# endif + +static coro_func coro_init_func; +static void *coro_init_arg; +static coro_context *new_coro, *create_coro; + +static void +coro_init (void) +{ + volatile coro_func func = coro_init_func; + volatile void *arg = coro_init_arg; + + coro_transfer (new_coro, create_coro); + +#if defined (__GCC_HAVE_DWARF2_CFI_ASM) && defined (__amd64) + /*asm (".cfi_startproc");*/ + /*asm (".cfi_undefined rip");*/ +#endif + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcast-qual" + func ((void *)arg); +#pragma GCC diagnostic pop + +#if __GCC_HAVE_DWARF2_CFI_ASM && __amd64 + /*asm (".cfi_endproc");*/ +#endif + + /* the new coro returned. bad. 
just abort() for now */ + abort (); +} + +# ifdef CORO_SJLJ + +static volatile int trampoline_done; + +/* trampoline signal handler */ +static void +trampoline (int sig) +{ + if (coro_setjmp (new_coro->env)) + coro_init (); /* start it */ + else + trampoline_done = 1; +} + +# endif + +# if CORO_ASM + + #if (defined __arm__) && \ + (defined __ARM_ARCH_7__ || defined __ARM_ARCH_7A__ \ + || defined __ARM_ARCH_7R__ || defined __ARM_ARCH_7M__ \ + || __ARM_ARCH == 7) + #define CORO_ARM 1 + #endif + + #if defined (_WIN32) || defined (__CYGWIN__) + #define CORO_WIN_TIB 1 + #endif + + asm ( + "\t.text\n" + #if defined (_WIN32) || defined (__CYGWIN__) + "\t.globl _coro_transfer\n" + "_coro_transfer:\n" + #else + "\t.globl coro_transfer\n" + "coro_transfer:\n" + #endif + /* windows, of course, gives a shit on the amd64 ABI and uses different registers */ + /* http://blogs.msdn.com/freik/archive/2005/03/17/398200.aspx */ + #ifdef __amd64 + + #if defined (_WIN32) || defined (__CYGWIN__) + #define NUM_SAVED 29 + "\tsubq $168, %rsp\t" /* one dummy qword to improve alignment */ + "\tmovaps %xmm6, (%rsp)\n" + "\tmovaps %xmm7, 16(%rsp)\n" + "\tmovaps %xmm8, 32(%rsp)\n" + "\tmovaps %xmm9, 48(%rsp)\n" + "\tmovaps %xmm10, 64(%rsp)\n" + "\tmovaps %xmm11, 80(%rsp)\n" + "\tmovaps %xmm12, 96(%rsp)\n" + "\tmovaps %xmm13, 112(%rsp)\n" + "\tmovaps %xmm14, 128(%rsp)\n" + "\tmovaps %xmm15, 144(%rsp)\n" + "\tpushq %rsi\n" + "\tpushq %rdi\n" + "\tpushq %rbp\n" + "\tpushq %rbx\n" + "\tpushq %r12\n" + "\tpushq %r13\n" + "\tpushq %r14\n" + "\tpushq %r15\n" + #if CORO_WIN_TIB + "\tpushq %fs:0x0\n" + "\tpushq %fs:0x8\n" + "\tpushq %fs:0xc\n" + #endif + "\tmovq %rsp, (%rcx)\n" + "\tmovq (%rdx), %rsp\n" + #if CORO_WIN_TIB + "\tpopq %fs:0xc\n" + "\tpopq %fs:0x8\n" + "\tpopq %fs:0x0\n" + #endif + "\tpopq %r15\n" + "\tpopq %r14\n" + "\tpopq %r13\n" + "\tpopq %r12\n" + "\tpopq %rbx\n" + "\tpopq %rbp\n" + "\tpopq %rdi\n" + "\tpopq %rsi\n" + "\tmovaps (%rsp), %xmm6\n" + "\tmovaps 16(%rsp), %xmm7\n" + "\tmovaps 32(%rsp), %xmm8\n" + "\tmovaps 48(%rsp), %xmm9\n" + "\tmovaps 64(%rsp), %xmm10\n" + "\tmovaps 80(%rsp), %xmm11\n" + "\tmovaps 96(%rsp), %xmm12\n" + "\tmovaps 112(%rsp), %xmm13\n" + "\tmovaps 128(%rsp), %xmm14\n" + "\tmovaps 144(%rsp), %xmm15\n" + "\taddq $168, %rsp\n" + #else + #define NUM_SAVED 6 + "\tpushq %rbp\n" + "\tpushq %rbx\n" + "\tpushq %r12\n" + "\tpushq %r13\n" + "\tpushq %r14\n" + "\tpushq %r15\n" + "\tmovq %rsp, (%rdi)\n" + "\tmovq (%rsi), %rsp\n" + "\tpopq %r15\n" + "\tpopq %r14\n" + "\tpopq %r13\n" + "\tpopq %r12\n" + "\tpopq %rbx\n" + "\tpopq %rbp\n" + #endif + "\tpopq %rcx\n" + "\tjmpq *%rcx\n" + + #elif __i386__ + + #define NUM_SAVED 4 + "\tpushl %ebp\n" + "\tpushl %ebx\n" + "\tpushl %esi\n" + "\tpushl %edi\n" + #if CORO_WIN_TIB + #undef NUM_SAVED + #define NUM_SAVED 7 + "\tpushl %fs:0\n" + "\tpushl %fs:4\n" + "\tpushl %fs:8\n" + #endif + "\tmovl %esp, (%eax)\n" + "\tmovl (%edx), %esp\n" + #if CORO_WIN_TIB + "\tpopl %fs:8\n" + "\tpopl %fs:4\n" + "\tpopl %fs:0\n" + #endif + "\tpopl %edi\n" + "\tpopl %esi\n" + "\tpopl %ebx\n" + "\tpopl %ebp\n" + "\tpopl %ecx\n" + "\tjmpl *%ecx\n" + + #elif CORO_ARM /* untested, what about thumb, neon, iwmmxt? 
*/ + + #if __ARM_PCS_VFP + "\tvpush {d8-d15}\n" + #define NUM_SAVED (9 + 8 * 2) + #else + #define NUM_SAVED 9 + #endif + "\tpush {r4-r11,lr}\n" + "\tstr sp, [r0]\n" + "\tldr sp, [r1]\n" + "\tpop {r4-r11,lr}\n" + #if __ARM_PCS_VFP + "\tvpop {d8-d15}\n" + #endif + "\tmov r15, lr\n" + + #elif __mips__ && 0 /* untested, 32 bit only */ + + #define NUM_SAVED (12 + 8 * 2) + /* TODO: n64/o64, lw=>ld */ + + "\t.set nomips16\n" + "\t.frame $sp,112,$31\n" + #if __mips_soft_float + "\taddiu $sp,$sp,-44\n" + #else + "\taddiu $sp,$sp,-112\n" + "\ts.d $f30,88($sp)\n" + "\ts.d $f28,80($sp)\n" + "\ts.d $f26,72($sp)\n" + "\ts.d $f24,64($sp)\n" + "\ts.d $f22,56($sp)\n" + "\ts.d $f20,48($sp)\n" + #endif + "\tsw $28,40($sp)\n" + "\tsw $31,36($sp)\n" + "\tsw $fp,32($sp)\n" + "\tsw $23,28($sp)\n" + "\tsw $22,24($sp)\n" + "\tsw $21,20($sp)\n" + "\tsw $20,16($sp)\n" + "\tsw $19,12($sp)\n" + "\tsw $18,8($sp)\n" + "\tsw $17,4($sp)\n" + "\tsw $16,0($sp)\n" + "\tsw $sp,0($4)\n" + "\tlw $sp,0($5)\n" + #if !__mips_soft_float + "\tl.d $f30,88($sp)\n" + "\tl.d $f28,80($sp)\n" + "\tl.d $f26,72($sp)\n" + "\tl.d $f24,64($sp)\n" + "\tl.d $f22,56($sp)\n" + "\tl.d $f20,48($sp)\n" + #endif + "\tlw $28,40($sp)\n" + "\tlw $31,36($sp)\n" + "\tlw $fp,32($sp)\n" + "\tlw $23,28($sp)\n" + "\tlw $22,24($sp)\n" + "\tlw $21,20($sp)\n" + "\tlw $20,16($sp)\n" + "\tlw $19,12($sp)\n" + "\tlw $18,8($sp)\n" + "\tlw $17,4($sp)\n" + "\tlw $16,0($sp)\n" + "\tj $31\n" + #if __mips_soft_float + "\taddiu $sp,$sp,44\n" + #else + "\taddiu $sp,$sp,112\n" + #endif + + #else + #error unsupported architecture + #endif + ); + +# endif + +void +coro_create (coro_context *ctx, coro_func coro, void *arg, void *sptr, size_t ssize) +{ + coro_context nctx; +# ifdef CORO_SJLJ + stack_t ostk, nstk; + struct sigaction osa, nsa; + sigset_t nsig, osig; +# endif + + if (!coro) + return; + + coro_init_func = coro; + coro_init_arg = arg; + + new_coro = ctx; + create_coro = &nctx; + +# ifdef CORO_SJLJ + /* we use SIGUSR2. first block it, then fiddle with it. */ + + sigemptyset (&nsig); + sigaddset (&nsig, SIGUSR2); + sigprocmask (SIG_BLOCK, &nsig, &osig); + + nsa.sa_handler = trampoline; + sigemptyset (&nsa.sa_mask); + nsa.sa_flags = SA_ONSTACK; + + if (sigaction (SIGUSR2, &nsa, &osa)) + { + perror ("sigaction"); + abort (); + } + + /* set the new stack */ + nstk.ss_sp = STACK_ADJUST_PTR (sptr, ssize); /* yes, some platforms (IRIX) get this wrong. 
*/ + nstk.ss_size = STACK_ADJUST_SIZE (sptr, ssize); + nstk.ss_flags = 0; + + if (sigaltstack (&nstk, &ostk) < 0) + { + perror ("sigaltstack"); + abort (); + } + + trampoline_done = 0; + kill (getpid (), SIGUSR2); + sigfillset (&nsig); sigdelset (&nsig, SIGUSR2); + + while (!trampoline_done) + sigsuspend (&nsig); + + sigaltstack (0, &nstk); + nstk.ss_flags = SS_DISABLE; + if (sigaltstack (&nstk, 0) < 0) + perror ("sigaltstack"); + + sigaltstack (0, &nstk); + if (~nstk.ss_flags & SS_DISABLE) + abort (); + + if (~ostk.ss_flags & SS_DISABLE) + sigaltstack (&ostk, 0); + + sigaction (SIGUSR2, &osa, 0); + sigprocmask (SIG_SETMASK, &osig, 0); + +# elif defined (CORO_LOSER) + + coro_setjmp (ctx->env); + #if __CYGWIN__ && __i386__ + ctx->env[8] = (long) coro_init; + ctx->env[7] = (long) ((char *)sptr + ssize) - sizeof (long); + #elif __CYGWIN__ && __x86_64__ + ctx->env[7] = (long) coro_init; + ctx->env[6] = (long) ((char *)sptr + ssize) - sizeof (long); + #elif defined __MINGW32__ + ctx->env[5] = (long) coro_init; + ctx->env[4] = (long) ((char *)sptr + ssize) - sizeof (long); + #elif defined _M_IX86 + ((_JUMP_BUFFER *)&ctx->env)->Eip = (long) coro_init; + ((_JUMP_BUFFER *)&ctx->env)->Esp = (long) STACK_ADJUST_PTR (sptr, ssize) - sizeof (long); + #elif defined _M_AMD64 + ((_JUMP_BUFFER *)&ctx->env)->Rip = (__int64) coro_init; + ((_JUMP_BUFFER *)&ctx->env)->Rsp = (__int64) STACK_ADJUST_PTR (sptr, ssize) - sizeof (__int64); + #elif defined _M_IA64 + ((_JUMP_BUFFER *)&ctx->env)->StIIP = (__int64) coro_init; + ((_JUMP_BUFFER *)&ctx->env)->IntSp = (__int64) STACK_ADJUST_PTR (sptr, ssize) - sizeof (__int64); + #else + #error "microsoft libc or architecture not supported" + #endif + +# elif defined (CORO_LINUX) + + coro_setjmp (ctx->env); + #if __GLIBC__ >= 2 && __GLIBC_MINOR__ >= 0 && defined (JB_PC) && defined (JB_SP) + ctx->env[0].__jmpbuf[JB_PC] = (long) coro_init; + ctx->env[0].__jmpbuf[JB_SP] = (long) STACK_ADJUST_PTR (sptr, ssize) - sizeof (long); + #elif __GLIBC__ >= 2 && __GLIBC_MINOR__ >= 0 && defined (__mc68000__) + ctx->env[0].__jmpbuf[0].__aregs[0] = (long int)coro_init; + ctx->env[0].__jmpbuf[0].__sp = (int *) ((char *)sptr + ssize) - sizeof (long); + #elif defined (__GNU_LIBRARY__) && defined (__i386__) + ctx->env[0].__jmpbuf[0].__pc = (char *) coro_init; + ctx->env[0].__jmpbuf[0].__sp = (void *) ((char *)sptr + ssize) - sizeof (long); + #elif defined (__GNU_LIBRARY__) && defined (__x86_64__) + ctx->env[0].__jmpbuf[JB_PC] = (long) coro_init; + ctx->env[0].__jmpbuf[0].__sp = (void *) ((char *)sptr + ssize) - sizeof (long); + #else + #error "linux libc or architecture not supported" + #endif + +# elif defined (CORO_IRIX) + + coro_setjmp (ctx->env, 0); + ctx->env[JB_PC] = (__uint64_t)coro_init; + ctx->env[JB_SP] = (__uint64_t)STACK_ADJUST_PTR (sptr, ssize) - sizeof (long); + +# elif CORO_ASM + + #if defined (__i386__) || defined (__x86_64__) + ctx->sp = (void **)(ssize + (char *)sptr); +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcast-qual" + *--ctx->sp = (void *)abort; /* needed for alignment only */ +#pragma GCC diagnostic pop + *--ctx->sp = (void *)coro_init; + #ifdef CORO_WIN_TIB + *--ctx->sp = 0; /* ExceptionList */ + *--ctx->sp = (char *)sptr + ssize; /* StackBase */ + *--ctx->sp = sptr; /* StackLimit */ + #endif + #elif CORO_ARM + /* return address stored in lr register, don't push anything */ + #else + #error unsupported architecture + #endif + + ctx->sp -= NUM_SAVED; + memset (ctx->sp, 0, sizeof (*ctx->sp) * NUM_SAVED); + + #if defined (__i386__) || defined 
(__x86_64__) + /* done already */ + #elif defined (CORO_ARM) + ctx->sp[0] = coro; /* r4 */ + ctx->sp[1] = arg; /* r5 */ + ctx->sp[8] = (char *)coro_init; /* lr */ + #else + #error unsupported architecture + #endif + +# elif CORO_UCONTEXT + + getcontext (&(ctx->uc)); + + ctx->uc.uc_link = 0; + ctx->uc.uc_stack.ss_sp = sptr; + ctx->uc.uc_stack.ss_size = (size_t)ssize; + ctx->uc.uc_stack.ss_flags = 0; + + makecontext (&(ctx->uc), (void (*)())coro_init, 0); + +# endif + + coro_transfer (create_coro, new_coro); +} + +/*****************************************************************************/ +/* pthread backend */ +/*****************************************************************************/ +#elif CORO_PTHREAD + +/* this mutex will be locked by the running coroutine */ +pthread_mutex_t coro_mutex = PTHREAD_MUTEX_INITIALIZER; + +struct coro_init_args +{ + coro_func func; + void *arg; + coro_context *self, *main; +}; + +static void * +coro_init (void *args_) +{ + struct coro_init_args *args = (struct coro_init_args *)args_; + coro_func func = args->func; + void *arg = args->arg; + + coro_transfer (args->self, args->main); + func (arg); + + return 0; +} + +void +coro_transfer (coro_context *prev, coro_context *next) +{ + pthread_mutex_lock (&coro_mutex); + + next->flags = 1; + pthread_cond_signal (&next->cv); + + prev->flags = 0; + + while (!prev->flags) + pthread_cond_wait (&prev->cv, &coro_mutex); + + if (prev->flags == 2) + { + pthread_mutex_unlock (&coro_mutex); + pthread_cond_destroy (&prev->cv); + pthread_detach (pthread_self ()); + pthread_exit (0); + } + + pthread_mutex_unlock (&coro_mutex); +} + +void +coro_create (coro_context *ctx, coro_func coro, void *arg, void *sptr, size_t ssize) +{ + static coro_context nctx; + static int once; + + if (!once) + { + once = 1; + + pthread_cond_init (&nctx.cv, 0); + } + + pthread_cond_init (&ctx->cv, 0); + + if (coro) + { + pthread_attr_t attr; + struct coro_init_args args; + pthread_t id; + + args.func = coro; + args.arg = arg; + args.self = ctx; + args.main = &nctx; + + pthread_attr_init (&attr); +#if __UCLIBC__ + /* exists, but is borked */ + /*pthread_attr_setstacksize (&attr, (size_t)ssize);*/ +#elif __CYGWIN__ + /* POSIX, not here */ + pthread_attr_setstacksize (&attr, (size_t)ssize); +#else + pthread_attr_setstack (&attr, sptr, (size_t)ssize); +#endif + pthread_attr_setscope (&attr, PTHREAD_SCOPE_PROCESS); + pthread_create (&id, &attr, coro_init, &args); + + coro_transfer (args.main, args.self); + } +} + +void +coro_destroy (coro_context *ctx) +{ + pthread_mutex_lock (&coro_mutex); + ctx->flags = 2; + pthread_cond_signal (&ctx->cv); + pthread_mutex_unlock (&coro_mutex); +} + +/*****************************************************************************/ +/* fiber backend */ +/*****************************************************************************/ +#elif CORO_FIBER + +#define WIN32_LEAN_AND_MEAN +#if _WIN32_WINNT < 0x0400 + #undef _WIN32_WINNT + #define _WIN32_WINNT 0x0400 +#endif +#include + +VOID CALLBACK +coro_init (PVOID arg) +{ + coro_context *ctx = (coro_context *)arg; + + ctx->coro (ctx->arg); +} + +void +coro_transfer (coro_context *prev, coro_context *next) +{ + if (!prev->fiber) + { + prev->fiber = GetCurrentFiber (); + + if (prev->fiber == 0 || prev->fiber == (void *)0x1e00) + prev->fiber = ConvertThreadToFiber (0); + } + + SwitchToFiber (next->fiber); +} + +void +coro_create (coro_context *ctx, coro_func coro, void *arg, void *sptr, size_t ssize) +{ + ctx->fiber = 0; + ctx->coro = coro; + ctx->arg = arg; + + if (!coro) 
+ return; + + ctx->fiber = CreateFiber (ssize, coro_init, ctx); +} + +void +coro_destroy (coro_context *ctx) +{ + DeleteFiber (ctx->fiber); +} + +#else + #error unsupported backend +#endif + +/*****************************************************************************/ +/* stack management */ +/*****************************************************************************/ +#if CORO_STACKALLOC + +#include + +#ifndef _WIN32 +# include +#endif + +#ifdef CORO_USE_VALGRIND +# include +#endif + +#ifdef _POSIX_MAPPED_FILES +# include +# define CORO_MMAP 1 +# ifndef MAP_ANONYMOUS +# ifdef MAP_ANON +# define MAP_ANONYMOUS MAP_ANON +# else +# undef CORO_MMAP +# endif +# endif +# include +#else +# undef CORO_MMAP +#endif + +#if _POSIX_MEMORY_PROTECTION +# ifndef CORO_GUARDPAGES +# define CORO_GUARDPAGES 4 +# endif +#else +# undef CORO_GUARDPAGES +#endif + +#if !CORO_MMAP +# undef CORO_GUARDPAGES +#endif + +#if !defined (__i386__) && !defined (__x86_64__) && !defined (__powerpc__) && !defined (__arm__) && !defined (__aarch64__) && !defined (__m68k__) && !defined (__alpha__) && !defined (__mips__) && !defined (__sparc64__) +# undef CORO_GUARDPAGES +#endif + +#ifndef CORO_GUARDPAGES +# define CORO_GUARDPAGES 0 +#endif + +#ifndef PAGESIZE + #if !CORO_MMAP + #define PAGESIZE 4096 + #else + static size_t + coro_pagesize (void) + { + static size_t pagesize; + + if (!pagesize) + pagesize = sysconf (_SC_PAGESIZE); + + return pagesize; + } + + #define PAGESIZE coro_pagesize () + #endif +#endif + +int +coro_stack_alloc (struct coro_stack *stack, unsigned int size) +{ + if (!size) + size = 256 * 1024; + + stack->sptr = 0; + stack->ssze = ((size_t)size * sizeof (void *) + PAGESIZE - 1) / PAGESIZE * PAGESIZE; + +#ifdef CORO_FIBER + + stack->sptr = (void *)stack; + return 1; + +#else + + size_t ssze = stack->ssze + CORO_GUARDPAGES * PAGESIZE; + void *base; + + #if CORO_MMAP + /* mmap supposedly does allocate-on-write for us */ + base = mmap (0, ssze, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (base == (void *)-1) + { + /* some systems don't let us have executable heap */ + /* we assume they won't need executable stack in that case */ + base = mmap (0, ssze, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (base == (void *)-1) + return 0; + } + + #if CORO_GUARDPAGES + mprotect (base, CORO_GUARDPAGES * PAGESIZE, PROT_NONE); + #endif + + base = (void*)((char *)base + CORO_GUARDPAGES * PAGESIZE); + #else + base = malloc (ssze); + if (!base) + return 0; + #endif + + #ifdef CORO_USE_VALGRIND + stack->valgrind_id = VALGRIND_STACK_REGISTER ((char *)base, ((char *)base) + ssze - CORO_GUARDPAGES * PAGESIZE); + #endif + + stack->sptr = base; + return 1; + +#endif +} + +void +coro_stack_free (struct coro_stack *stack) +{ +#ifdef CORO_FIBER + /* nop */ +#else + #ifdef CORO_USE_VALGRIND + VALGRIND_STACK_DEREGISTER (stack->valgrind_id); + #endif + + #if CORO_MMAP + if (stack->sptr) + munmap ((void*)((char *)stack->sptr - CORO_GUARDPAGES * PAGESIZE), + stack->ssze + CORO_GUARDPAGES * PAGESIZE); + #else + free (stack->sptr); + #endif +#endif +} + +#endif + From 19fb3d94115051358d774800705aa6cfcd5cc52f Mon Sep 17 00:00:00 2001 From: Michel Machado Date: Mon, 25 Nov 2019 18:37:39 +0000 Subject: [PATCH 2/4] gk: add support to coroutines This patch gets everything set up to do the work of gk_proc() inside coroutines, but no work is actually moved into coroutines. The following patches gradually move the work of gk_proc() into coroutines. 
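For reviewers unfamiliar with libcoro, here is a minimal stand-alone sketch of
how the primitives added by the previous patch (coro_create(),
coro_stack_alloc() and coro_transfer()) fit together. It mirrors the
root-context/worker pattern that this patch sets up in gk_proc(), but it is
illustrative only: the file name, strings and build hint are hypothetical and
nothing below is part of the tree.

    /* Hypothetical example; compile together with lib/coro.c
     * (e.g. with -DCORO_ASM, as in the Makefile). */
    #include <stdio.h>
    #include "coro.h"

    static coro_context root;       /* plays the role of coro_root */
    static coro_context worker_ctx;

    static void
    worker(void *arg)
    {
        printf("working on %s\n", (const char *)arg);
        /* Done: yield back to the root context. Never return. */
        coro_transfer(&worker_ctx, &root);
    }

    int
    main(void)
    {
        struct coro_stack stack;

        /* All-zero arguments create an "empty" context that is only
         * suitable as the source of a transfer. */
        coro_create(&root, NULL, NULL, NULL, 0);

        if (!coro_stack_alloc(&stack, 0))   /* 0 selects a default size */
            return 1;
        coro_create(&worker_ctx, worker, "one batch of tasks",
            stack.sptr, stack.ssze);

        coro_transfer(&root, &worker_ctx);  /* run worker() */
        printf("back in the root context\n");

        coro_destroy(&worker_ctx);
        coro_stack_free(&stack);
        return 0;
    }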
--- Makefile | 2 +- gk/co.c | 98 ++++++++++++ gk/co.h | 243 +++++++++++++++++++++++++++++ gk/main.c | 289 ++++++++++++++++++++++++++++++++--- include/gatekeeper_acl.h | 24 ++- include/gatekeeper_gk.h | 15 ++ include/list.h | 24 ++- lib/mailbox.c | 8 +- lua/gatekeeper/staticlib.lua | 2 + lua/gk.lua | 7 + 10 files changed, 679 insertions(+), 33 deletions(-) create mode 100644 gk/co.c create mode 100644 gk/co.h diff --git a/Makefile b/Makefile index d7cc789af..eb33b01dc 100644 --- a/Makefile +++ b/Makefile @@ -36,7 +36,7 @@ SRCS-y := main/main.c SRCS-y += config/static.c config/dynamic.c SRCS-y += cps/main.c cps/kni.c cps/elf.c SRCS-y += ggu/main.c -SRCS-y += gk/main.c gk/fib.c gk/bpf.c +SRCS-y += gk/main.c gk/fib.c gk/bpf.c gk/co.c SRCS-y += gt/main.c gt/lua_lpm.c SRCS-y += lls/main.c lls/cache.c lls/arp.c lls/nd.c SRCS-y += sol/main.c diff --git a/gk/co.c b/gk/co.c new file mode 100644 index 000000000..590acfb6c --- /dev/null +++ b/gk/co.c @@ -0,0 +1,98 @@ +/* + * Gatekeeper - DoS protection system. + * Copyright (C) 2016 Digirati LTDA. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include "co.h" + +static struct gk_co * +get_next_co(struct gk_co *this_co) +{ + /* + * It is unlikely because as long as there is more than + * one working coroutine, there is at least 50% chance that + * @this_co is not the last working coroutine. + */ + if (unlikely(this_co->co_list.next == &this_co->work->working_cos)) { + /* @this_co is the last working co. */ + return list_first_entry(&this_co->work->working_cos, + struct gk_co, co_list); + } + return list_next_entry(this_co, co_list); +} + +static struct gk_co_task * +next_task(struct gk_co *this_co) +{ + while (true) { + struct gk_co *next_co; + + /* + * This test is likely because if @this_co has at least + * one task, there's at least 50% that it will be true because + * this function is called twice. + */ + if (likely(!list_empty(&this_co->task_queue))) { + /* + * @this_co has assigned tasks. + * Return the first assigned task. + */ + struct gk_co_task *task = list_first_entry( + &this_co->task_queue, struct gk_co_task, + task_list); + list_del(&task->task_list); + return task; + } + + /* There is no more tasks assigned to @this_co. */ + + next_co = get_next_co(this_co); + + /* Make @this_co idle. */ + list_del(&this_co->co_list); + + /* Transfer control to another coroutine. */ + if (likely(this_co != next_co)) { + /* + * @this_co is NOT the last working coroutine. + * Yield to the next coroutine. + */ + coro_transfer(&this_co->coro, &next_co->coro); + } else { + /* + * No more work and no more working coroutines; + * @this_co is the last working coroutine. + * Return to the main coroutine. 
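+			 *
+			 * When @this_co is handed tasks again and receives
+			 * control back (from the main coroutine or from
+			 * another working coroutine), execution resumes right
+			 * after the coro_transfer() below and the while (true)
+			 * loop re-checks @this_co->task_queue.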
+ */ + coro_transfer(&this_co->coro, + &this_co->work->instance->coro_root); + } + } +} + +void +gk_co_main(void *arg) +{ + struct gk_co *this_co = arg; + struct gk_co_task *task = next_task(this_co); + + while (likely(task != NULL)) { + task->task_func(this_co, task); + task = next_task(this_co); + } + + rte_panic("%s() terminated\n", __func__); +} diff --git a/gk/co.h b/gk/co.h new file mode 100644 index 000000000..d6828f65b --- /dev/null +++ b/gk/co.h @@ -0,0 +1,243 @@ +/* + * Gatekeeper - DoS protection system. + * Copyright (C) 2016 Digirati LTDA. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef _GATEKEEPER_GK_CO_H_ +#define _GATEKEEPER_GK_CO_H_ + +#include +#include +#include +#include + +#include "gatekeeper_gk.h" +#include "gatekeeper_acl.h" + +struct gk_co { + /* + * Attach this coroutine to work->working_cos while + * this coroutine is working. + */ + struct list_head co_list; + /* structs from libcoro. */ + struct coro_stack stack; + struct coro_context coro; + /* Task assigned to this coroutine. */ + struct list_head task_queue; + struct gk_co_work *work; +}; + +struct gk_co_task *task; + +typedef void (*gk_co_task_func_t)(struct gk_co *this_co, + struct gk_co_task *task); + +struct gk_co_task { + /* + * Once the task is assigned to a coroutine, + * attach this task to co->task_queue. + */ + struct list_head task_list; + /* + * @task_hash is used to assign the task to a coroutine. + * + * This is important to avoid race conditions between coroutines. + * For example, assume that two tasks that are going to work on + * the same flow entry are assigned to two different coroutines, and + * that the corresponding flow entry is not available in + * the flow table, both coroutines may try to add the same flow entry. + * If these two tasks share the same task hash, both tasks are going to + * be assigned to the same coroutine. + */ + uint32_t task_hash; + void *task_arg; + gk_co_task_func_t task_func; +}; + +struct gk_co_work { + /* The coroutines working on the tasks. */ + struct list_head working_cos; + /* Coroutines available to do the work. */ + struct gk_co *cos; + /* Number of coroutines available for the next batch of work. */ + uint16_t co_num; + /* Total number of coroutines available at field @cos. */ + uint16_t co_max_num; + /* Index of the next coroutine to use when a task has no task hash. */ + uint16_t any_co_index; + /* How field @co_num will change for the next batch of work. */ + int16_t co_delta_num; + /* + * Previous value of field @co_num. + * When the value of this field is zero, an invalid value for @co_num, + * the value of field @avg_cycles_per_task is not meaningful. + */ + uint16_t co_prv_num; + /* + * Average number of cycles per task when @co_num was equal to + * @co_prv_num. + */ + double avg_cycles_per_task; + + struct gk_config *gk_conf; + struct gk_instance *instance; + + /* All preallocated tasks available to do work. 
*/ + struct gk_co_task *all_tasks; + /* The total number of taks available at field @all_tasks. */ + const uint32_t task_total; + /* Current number of tasks used at field @all_tasks. */ + uint32_t task_num; + + /* Fields for front packets and mailbox messages. */ + /* + * This is a single-entry-per-bucket hash table. + * This flow entries are reused between tasks assigned to + * the same coroutine. + */ + struct flow_entry ** const leftover; + /* + * Flow entries that has not been inserted in the flow table, but + * they may be present in @leftover. + */ + struct flow_entry * const temp_fes; + /* Number of entries in used in @temp_fes. */ + uint16_t temp_fes_num; + /* + * Mask for the hash table @leftover. + * It must be of the form (2^n - 1) for any n >= 0. + */ + const uint32_t leftover_mask; + + /* Fields for front and back packets. */ + uint16_t tx_front_num_pkts; + uint16_t tx_back_num_pkts; + struct rte_mbuf ** const tx_front_pkts; + struct rte_mbuf ** const tx_back_pkts; + /* + * The following field is only needed when the RSS hash is not + * available. + */ + struct ipacket * const packets; + + /* Fields for the front packets only. */ + uint16_t front_num_req; + uint16_t front_num_arp; + struct rte_mbuf ** const front_req_bufs; + struct rte_mbuf ** const front_arp_bufs; + struct acl_search front_acl4; + struct acl_search front_acl6; + bool front_ipv4_configured; + bool front_ipv6_configured; + + /* Fields for the front packets only. */ + uint16_t back_num_arp; + struct rte_mbuf ** const back_arp_bufs; + struct acl_search back_acl4; + struct acl_search back_acl6; +}; + +/* Declare and initialize a struct gk_co_work. */ +#define DEFINE_GK_CO_WORK(name, max_front_pkts, max_back_pkts, \ + max_mailbox, lo_mask, task_extra) \ + struct gk_co_task name##_all_tasks_array[(max_front_pkts) + \ + (max_back_pkts) + (max_mailbox) + (task_extra)]; \ + struct flow_entry *name##_leftover_array[(lo_mask) + 1]; \ + struct flow_entry name##_temp_fes_array[ \ + (max_front_pkts) + (max_mailbox)]; \ + struct rte_mbuf *name##_tx_front_pkts_array[ \ + (max_front_pkts) + (max_back_pkts)]; \ + struct rte_mbuf *name##_tx_back_pkts_array[ \ + (max_front_pkts) + (max_back_pkts)]; \ + struct ipacket name##_packets_array[ \ + (max_front_pkts) + (max_back_pkts)]; \ + struct rte_mbuf *name##_front_req_bufs_array[(max_front_pkts)]; \ + struct rte_mbuf *name##_front_arp_bufs_array[(max_front_pkts)]; \ + DECLARE_ACL_SEARCH_VARIABLE_PART(front_acl4, (max_front_pkts)); \ + DECLARE_ACL_SEARCH_VARIABLE_PART(front_acl6, (max_front_pkts)); \ + struct rte_mbuf *name##_back_arp_bufs_array[(max_back_pkts)]; \ + DECLARE_ACL_SEARCH_VARIABLE_PART(back_acl4, (max_back_pkts)); \ + DECLARE_ACL_SEARCH_VARIABLE_PART(back_acl6, (max_back_pkts)); \ + struct gk_co_work name = { \ + .working_cos = LIST_HEAD_INIT(name.working_cos), \ + .cos = NULL, \ + .co_num = 0, \ + .co_max_num = 0, \ + .any_co_index = 0, \ + .co_delta_num = 1, \ + .co_prv_num = 0, \ + .avg_cycles_per_task = 0, \ + .gk_conf = NULL, \ + .instance = NULL, \ + .all_tasks = name##_all_tasks_array, \ + .task_total = (max_front_pkts) + (max_back_pkts) + \ + (max_mailbox) + (task_extra), \ + .task_num = 0, \ + .leftover = memset(name##_leftover_array, 0, \ + sizeof(name##_leftover_array)), \ + .temp_fes = name##_temp_fes_array, \ + .temp_fes_num = 0, \ + .leftover_mask = (lo_mask), \ + .tx_front_num_pkts = 0, \ + .tx_back_num_pkts = 0, \ + .tx_front_pkts = name##_tx_front_pkts_array, \ + .tx_back_pkts = name##_tx_back_pkts_array, \ + .packets = name##_packets_array, \ + 
.front_num_req = 0, \ + .front_num_arp = 0, \ + .front_req_bufs = name##_front_req_bufs_array, \ + .front_arp_bufs = name##_front_arp_bufs_array, \ + .front_acl4 = ACL_SEARCH_INIT(front_acl4), \ + .front_acl6 = ACL_SEARCH_INIT(front_acl6), \ + .front_ipv4_configured = false, \ + .front_ipv6_configured = false, \ + .back_num_arp = 0, \ + .back_arp_bufs = name##_back_arp_bufs_array, \ + .back_acl4 = ACL_SEARCH_INIT(back_acl4), \ + .back_acl6 = ACL_SEARCH_INIT(back_acl6), \ + } + +static inline struct gk_co * +get_task_owner_co(struct gk_co_work *work, struct gk_co_task *task) +{ + return &work->cos[task->task_hash % work->co_num]; +} + +static inline void +__schedule_task(struct gk_co *task_owner_co, struct gk_co_task *task) +{ + list_add_tail(&task->task_list, &task_owner_co->task_queue); +} + +static inline void +schedule_task(struct gk_co_work *work, struct gk_co_task *task) +{ + __schedule_task(get_task_owner_co(work, task), task); +} + +/* Uniformly distribuite tasks with no task hash among coroutines. */ +static inline void +schedule_task_to_any_co(struct gk_co_work *work, struct gk_co_task *task) +{ + __schedule_task(&work->cos[work->any_co_index], task); + work->any_co_index = (work->any_co_index + 1) % work->co_num; +} + +void +gk_co_main(void *arg); + +#endif /* _GATEKEEPER_GK_CO_H_ */ diff --git a/gk/main.c b/gk/main.c index bce203456..f19f1272e 100644 --- a/gk/main.c +++ b/gk/main.c @@ -45,6 +45,7 @@ #include "gatekeeper_flow_bpf.h" #include "bpf.h" +#include "co.h" #define START_PRIORITY (38) /* Set @START_ALLOWANCE as the double size of a large DNS reply. */ @@ -559,6 +560,56 @@ gk_del_flow_entry_from_hash(struct rte_hash *h, struct flow_entry *fe) return ret; } +static void +free_cos(struct gk_co *cos, unsigned int num) +{ + unsigned int i; + + if (cos == NULL) + return; + + for (i = 0; i < num; i++) { + struct gk_co *co = &cos[i]; + + if (co->stack.sptr == NULL) + continue; + + /* Free @co. */ + coro_destroy(&co->coro); + coro_stack_free(&co->stack); + } + + rte_free(cos); +} + +static struct gk_co * +alloc_cos(unsigned int num, unsigned int stack_size_byte) +{ + unsigned int stack_size_ptr = stack_size_byte / sizeof(void *); + unsigned int i; + + struct gk_co *cos = rte_calloc(__func__, num, sizeof(*cos), 0); + if (cos == NULL) + return NULL; + + for (i = 0; i < num; i++) { + struct gk_co *co = &cos[i]; + + if (unlikely(!coro_stack_alloc(&co->stack, stack_size_ptr))) { + free_cos(cos, num); + return NULL; + } + + coro_create(&co->coro, gk_co_main, co, + co->stack.sptr, co->stack.ssze); + INIT_LIST_HEAD_WITH_POISON(&co->co_list); + INIT_LIST_HEAD(&co->task_queue); + co->work = NULL; + } + + return cos; +} + static int setup_gk_instance(unsigned int lcore_id, struct gk_config *gk_conf) { @@ -586,7 +637,6 @@ setup_gk_instance(unsigned int lcore_id, struct gk_config *gk_conf) GK_LOG(ERR, "The GK block cannot create hash table at lcore %u\n", lcore_id); - ret = -1; goto out; } @@ -600,7 +650,6 @@ setup_gk_instance(unsigned int lcore_id, struct gk_config *gk_conf) GK_LOG(ERR, "The GK block can't create flow entry table at lcore %u\n", lcore_id); - ret = -1; goto flow_hash; } @@ -611,6 +660,19 @@ setup_gk_instance(unsigned int lcore_id, struct gk_config *gk_conf) if (ret < 0) goto flow_entry; + coro_create(&instance->coro_root, NULL, NULL, NULL, 0); + + /* Allocate coroutines. 
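+	 * Note: coro_stack_alloc() takes the stack size in units of
+	 * sizeof(void *), not bytes, so alloc_cos() converts the byte
+	 * count passed here.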
*/ + instance->cos = alloc_cos(gk_conf->co_max_num, + gk_conf->co_stack_size_kb * 1024); + if (instance->cos == NULL) { + GK_LOG(ERR, + "The GK block can't allocate coroutines at lcore %u\n", + lcore_id); + ret = -1; + goto coro_root; + } + tb_ratelimit_state_init(&instance->front_icmp_rs, gk_conf->front_icmp_msgs_per_sec, gk_conf->front_icmp_msgs_burst); @@ -621,6 +683,10 @@ setup_gk_instance(unsigned int lcore_id, struct gk_config *gk_conf) ret = 0; goto out; +coro_root: + coro_destroy(&instance->coro_root); +/*mailbox:*/ + destroy_mailbox(&instance->mb); flow_entry: rte_free(instance->ip_flow_entry_table); instance->ip_flow_entry_table = NULL; @@ -2153,6 +2219,177 @@ process_cmds_from_mailbox( mb_free_entry_bulk(&instance->mb, (void * const *)gk_cmds, num_cmd); } +static void +add_cos_to_work(struct gk_co_work *work, struct gk_config *gk_conf, + struct gk_instance *instance) +{ + unsigned int i; + + work->gk_conf = gk_conf; + work->instance = instance; + work->cos = instance->cos; + work->co_max_num = gk_conf->co_max_num; + work->co_num = RTE_MIN(2, work->co_max_num); + + RTE_VERIFY(work->co_num > 0); + + for (i = 0; i < work->co_max_num; i++) + work->cos[i].work = work; +} + +static void +update_cos(struct gk_co_work *work) +{ + /* + * The local variable @co_num is needed here to enable one to go + * above @work->co_max_num and below zero if needed. + */ + int32_t co_num = work->co_num; + + if (work->co_delta_num > 0) { + /* @work->co_num is going up. */ + + if (unlikely(co_num >= work->co_max_num)) { + /* + * @work->co_num is at its maximum; + * Reverse direction. + */ + RTE_VERIFY(co_num == work->co_max_num); + work->co_delta_num = - work->co_delta_num; + work->co_num = RTE_MAX(1, co_num + work->co_delta_num); + return; + } + + work->co_num = RTE_MIN(work->co_max_num, + co_num + work->co_delta_num); + return; + } + + /* @work->co_num is going down. */ + RTE_VERIFY(work->co_delta_num < 0); + + if (unlikely(co_num <= 1)) { + /* @work->co_num is at its minimum; reverse direction. */ + RTE_VERIFY(co_num == 1); + work->co_delta_num = - work->co_delta_num; + work->co_num = RTE_MIN(work->co_max_num, + co_num + work->co_delta_num); + return; + } + + work->co_num = RTE_MAX(1, co_num + work->co_delta_num); +} + +static void +do_work(struct gk_co_work *work) +{ + uint16_t i, real_co_num = 0; + uint64_t cycles; + double avg_cycles_per_task; + + /* Add coroutines with tasks to @work->working_cos. */ + for (i = 0; i < work->co_num; i++) { + struct gk_co *co = &work->cos[i]; + if (!list_empty(&co->task_queue)) { + list_add_tail(&co->co_list, &work->working_cos); + real_co_num++; + } + } + + /* Is there any work to do? */ + if (unlikely(list_empty(&work->working_cos))) { + RTE_VERIFY(real_co_num == 0); + RTE_VERIFY(work->task_num == 0); + return; + } + RTE_VERIFY(real_co_num > 0); + RTE_VERIFY(work->task_num > 0); + + /* Do work. */ + cycles = rte_rdtsc(); + coro_transfer(&work->instance->coro_root, + &list_first_entry(&work->working_cos, struct gk_co, co_list)-> + coro); + cycles = rte_rdtsc() - cycles; + avg_cycles_per_task = (double)cycles / work->task_num; + + if (work->co_num != real_co_num) { + /* Workload changed; adjust quickly. */ + RTE_VERIFY(work->co_num > real_co_num); + work->co_prv_num = real_co_num; + work->avg_cycles_per_task = avg_cycles_per_task; + work->co_num = real_co_num; + return update_cos(work); + } + + if (work->co_prv_num == 0) { + /* Initialize the performance tracking fields. 
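+		 * There is no previous sample to compare against, so record
+		 * this run and let update_cos() keep probing in the current
+		 * direction.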
*/ + work->co_prv_num = real_co_num; + work->avg_cycles_per_task = avg_cycles_per_task; + return update_cos(work); + } + + if (avg_cycles_per_task >= work->avg_cycles_per_task) { + /* The last change did not bring an improvement; go back. */ + work->co_num = work->co_prv_num; + /* Reset measurement. */ + work->co_prv_num = 0; + /* Change adjustment direction. */ + work->co_delta_num = - work->co_delta_num; + return; + } + + /* @real_co_num is an improvement. */ + work->co_prv_num = real_co_num; + work->avg_cycles_per_task = avg_cycles_per_task; + update_cos(work); +} + +static void +flush_work(struct gk_co_work *work, + uint16_t port_front, uint16_t tx_queue_front, + uint16_t port_back, uint16_t tx_queue_back) +{ + uint16_t front_max_pkt_burst = work->gk_conf->front_max_pkt_burst; + uint16_t back_max_pkt_burst = work->gk_conf->back_max_pkt_burst; + uint32_t max_pkt_burst = front_max_pkt_burst + back_max_pkt_burst; + + /* + * Flush packets. + */ + + send_pkts(port_front, tx_queue_front, + work->tx_front_num_pkts, work->tx_front_pkts); + RTE_VERIFY(work->tx_front_num_pkts <= max_pkt_burst); + work->tx_front_num_pkts = 0; + + send_pkts(port_back, tx_queue_back, + work->tx_back_num_pkts, work->tx_back_pkts); + RTE_VERIFY(work->tx_back_num_pkts <= max_pkt_burst); + work->tx_back_num_pkts = 0; + + /* + * TODO Flush front. + */ + + /* + * TODO Flush back. + */ + + /* + * Reset fields of @work. + */ + + RTE_VERIFY(work->task_num <= work->task_total); + work->task_num = 0; + work->any_co_index = 0; + RTE_VERIFY(work->temp_fes_num <= + (front_max_pkt_burst + work->gk_conf->mailbox_burst_size)); + work->temp_fes_num = 0; + memset(work->leftover, 0, + sizeof(*work->leftover) * (work->leftover_mask + 1)); +} + static int gk_proc(void *arg) { @@ -2168,13 +2405,6 @@ gk_proc(void *arg) uint16_t rx_queue_back = instance->rx_queue_back; uint16_t tx_queue_back = instance->tx_queue_back; - uint16_t tx_front_num_pkts; - uint16_t tx_back_num_pkts; - uint16_t tx_max_num_pkts = gk_conf->front_max_pkt_burst + - gk_conf->back_max_pkt_burst; - struct rte_mbuf *tx_front_pkts[tx_max_num_pkts]; - struct rte_mbuf *tx_back_pkts[tx_max_num_pkts]; - uint32_t entry_idx = 0; uint64_t last_measure_tsc = rte_rdtsc(); uint64_t basic_measurement_logging_cycles = @@ -2183,16 +2413,26 @@ gk_proc(void *arg) uint32_t scan_iter = gk_conf->flow_table_scan_iter; uint32_t iter_count = 0; + DEFINE_GK_CO_WORK(work, gk_conf->front_max_pkt_burst, + gk_conf->back_max_pkt_burst, gk_conf->mailbox_burst_size, + /* + * The 4* is intended to minimize collisions, whereas the -1 is + * intended to avoid doubling the size when + * the expression already is a power of 2. + */ + rte_combine32ms1b(4 * (gk_conf->front_max_pkt_burst + + gk_conf->mailbox_burst_size) - 1), + 1 /* One extra tast for the full scanning of the flow table. 
*/ + ); + GK_LOG(NOTICE, "The GK block is running at lcore = %u\n", lcore); gk_conf_hold(gk_conf); + add_cos_to_work(&work, gk_conf, instance); while (likely(!exiting)) { struct flow_entry *fe = NULL; - tx_front_num_pkts = 0; - tx_back_num_pkts = 0; - if (iter_count >= scan_iter) { entry_idx = (entry_idx + 1) % gk_conf->flow_ht_size; fe = &instance->ip_flow_entry_table[entry_idx]; @@ -2207,14 +2447,16 @@ gk_proc(void *arg) } else iter_count++; + do_work(&work); + process_pkts_front(port_front, rx_queue_front, lcore, - &tx_front_num_pkts, tx_front_pkts, - &tx_back_num_pkts, tx_back_pkts, + &work.tx_front_num_pkts, work.tx_front_pkts, + &work.tx_back_num_pkts, work.tx_back_pkts, instance, gk_conf); process_pkts_back(port_back, rx_queue_back, lcore, - &tx_front_num_pkts, tx_front_pkts, - &tx_back_num_pkts, tx_back_pkts, + &work.tx_front_num_pkts, work.tx_front_pkts, + &work.tx_back_num_pkts, work.tx_back_pkts, instance, gk_conf); if (fe != NULL && fe->in_use && @@ -2225,11 +2467,8 @@ gk_proc(void *arg) } else fe = NULL; - send_pkts(port_front, tx_queue_front, - tx_front_num_pkts, tx_front_pkts); - - send_pkts(port_back, tx_queue_back, - tx_back_num_pkts, tx_back_pkts); + flush_work(&work, port_front, tx_queue_front, + port_back, tx_queue_back); process_cmds_from_mailbox(instance, gk_conf); @@ -2310,6 +2549,8 @@ cleanup_gk(struct gk_config *gk_conf) } destroy_mailbox(&gk_conf->instances[i].mb); + free_cos(gk_conf->instances[i].cos, gk_conf->co_max_num); + coro_destroy(&gk_conf->instances[i].coro_root); } if (gk_conf->lpm_tbl.fib_tbl != NULL) { @@ -2518,6 +2759,12 @@ run_gk(struct net_config *net_conf, struct gk_config *gk_conf, goto out; } + if (gk_conf->co_max_num == 0) { + GK_LOG(ERR, "There must be at least one coroutine\n"); + ret = -1; + goto out; + } + front_inc = gk_conf->front_max_pkt_burst * gk_conf->num_lcores; net_conf->front.total_pkt_burst += front_inc; back_inc = gk_conf->back_max_pkt_burst * gk_conf->num_lcores; diff --git a/include/gatekeeper_acl.h b/include/gatekeeper_acl.h index 59c4bbf42..61aa0d11e 100644 --- a/include/gatekeeper_acl.h +++ b/include/gatekeeper_acl.h @@ -32,16 +32,26 @@ struct acl_search { struct rte_mbuf **mbufs; }; -/* Declare and initialize a struct acl_search. */ -#define DEFINE_ACL_SEARCH(name, num_pkts) \ +#define DECLARE_ACL_SEARCH_VARIABLE_PART(name, num_pkts) \ const uint8_t *name##_data_array[(num_pkts)]; \ - struct rte_mbuf *name##_mbufs_array[(num_pkts)]; \ - struct acl_search name = { \ - .num = 0, \ - .data = name##_data_array, \ - .mbufs = name##_mbufs_array, \ + struct rte_mbuf *name##_mbufs_array[(num_pkts)] + +/* + * This macro can only be used if the macro DECLARE_ACL_SEARCH_VARIABLE_PART() + * has been placed before it. + */ +#define ACL_SEARCH_INIT(name) \ + { \ + .num = 0, \ + .data = name##_data_array, \ + .mbufs = name##_mbufs_array, \ } +/* Declare and initialize a struct acl_search. */ +#define DEFINE_ACL_SEARCH(name, num_pkts) \ + DECLARE_ACL_SEARCH_VARIABLE_PART(name, num_pkts); \ + struct acl_search name = ACL_SEARCH_INIT(name) + /* Classify batches of packets in @acl and invoke callback functions. 
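The point of splitting DEFINE_ACL_SEARCH() is that the backing arrays and the struct initializer can now be emitted in two different places, for example with the arrays bundled into a larger per-lcore declaration and the struct initialized elsewhere. The usage sketch below only shows that the two halves compose back into the old macro; the function and burst size are hypothetical, and the exact motivation for the split in this series is an inference.

#include "gatekeeper_acl.h"

#define MY_BURST 32 /* Hypothetical per-burst size. */

static void
classify_burst_example(void)
{
        /* Backing arrays; these may be declared far from the struct,
         * e.g. inside another declaration macro. */
        DECLARE_ACL_SEARCH_VARIABLE_PART(acl4, MY_BURST);
        /* The struct that points into them, initialized separately. */
        struct acl_search acl4 = ACL_SEARCH_INIT(acl4);

        /*
         * The two lines above expand to the same declarations as
         *      DEFINE_ACL_SEARCH(acl4, MY_BURST);
         * Packets are then queued with add_pkt_acl(&acl4, pkt) and
         * classified with process_acl().
         */
        (void)acl4;
}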
*/ int process_acl(struct gatekeeper_if *iface, unsigned int lcore_id, struct acl_search *acl, struct acl_state *astate, diff --git a/include/gatekeeper_gk.h b/include/gatekeeper_gk.h index 95264d984..d732621ee 100644 --- a/include/gatekeeper_gk.h +++ b/include/gatekeeper_gk.h @@ -19,6 +19,8 @@ #ifndef _GATEKEEPER_GK_H_ #define _GATEKEEPER_GK_H_ +#include + #include #include @@ -98,6 +100,14 @@ struct gk_measurement_metrics { struct gk_instance { struct rte_hash *ip_flow_hash_table; struct flow_entry *ip_flow_entry_table; + /* + * Coroutines. + * + * These structs must be here and not in struct gk_co_work because + * initialization functions (e.g. coro_create()) are not reentrant. + */ + struct coro_context coro_root; + struct gk_co *cos; /* RX queue on the front interface. */ uint16_t rx_queue_front; /* TX queue on the front interface. */ @@ -201,6 +211,11 @@ struct gk_config { /* Time for logging the basic measurements in ms. */ unsigned int basic_measurement_logging_ms; + /* Maximum number of coroutines running in parallel per GK instance. */ + uint16_t co_max_num; + /* Size of the stack of each coroutine in KB. */ + uint16_t co_stack_size_kb; + /* * The fields below are for internal use. * Configuration files should not refer to them. diff --git a/include/list.h b/include/list.h index e7fd442fa..c5adf7c51 100644 --- a/include/list.h +++ b/include/list.h @@ -34,6 +34,11 @@ struct list_head { #define LIST_HEAD_INIT(name) { &(name), &(name) } +#define LIST_POISON1 ((void *) 0x00100100) +#define LIST_POISON2 ((void *) 0x00200200) + +#define LIST_HEAD_INIT_WITH_POISON(name) { LIST_POISON1, LIST_POISON2 } + static inline void INIT_LIST_HEAD(struct list_head *list) { @@ -41,6 +46,13 @@ INIT_LIST_HEAD(struct list_head *list) list->prev = list; } +static inline void +INIT_LIST_HEAD_WITH_POISON(struct list_head *list) +{ + list->next = LIST_POISON1; + list->prev = LIST_POISON2; +} + /** * list_entry - get the struct for this entry * @ptr: the &struct list_head pointer. @@ -133,6 +145,16 @@ list_is_singular(const struct list_head *head) return !list_empty(head) && (head->next == head->prev); } +/** + * list_poison - tests whether @entry has been poisoned. + * @entry: the entry to test. + */ +static inline int +list_poison(const struct list_head *entry) +{ + return entry->next == LIST_POISON1 && entry->prev == LIST_POISON2; +} + /* * Insert a new entry between two known consecutive entries. * @@ -191,8 +213,6 @@ __list_del(struct list_head *prev, struct list_head *next) prev->next = next; } -#define LIST_POISON1 ((void *) 0x00100100) -#define LIST_POISON2 ((void *) 0x00200200) /** * list_del - deletes entry from list. * @entry: the element to delete from the list. diff --git a/lib/mailbox.c b/lib/mailbox.c index 33bb242df..a78c53c0e 100644 --- a/lib/mailbox.c +++ b/lib/mailbox.c @@ -111,9 +111,13 @@ void destroy_mailbox(struct mailbox *mb) { if (mb) { - if (mb->ring) + if (mb->ring) { rte_ring_free(mb->ring); - if (mb->pool) + mb->ring = NULL; + } + if (mb->pool) { rte_mempool_free(mb->pool); + mb->pool = NULL; + } } } diff --git a/lua/gatekeeper/staticlib.lua b/lua/gatekeeper/staticlib.lua index c3c1435dc..cae9c7b93 100644 --- a/lua/gatekeeper/staticlib.lua +++ b/lua/gatekeeper/staticlib.lua @@ -211,6 +211,8 @@ struct gk_config { uint32_t log_ratelimit_interval_ms; uint32_t log_ratelimit_burst; unsigned int basic_measurement_logging_ms; + uint16_t co_max_num; + uint16_t co_stack_size_kb; /* This struct has hidden fields. 
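The poison helpers added to include/list.h give a constant-time answer to "is this node currently linked anywhere?": a node initialized with INIT_LIST_HEAD_WITH_POISON() or LIST_HEAD_INIT_WITH_POISON(), or poisoned again by deletion, fails the usual list invariants and list_poison() reports it. That is the test reschedule_task() in gk/co.c uses before putting a coroutine on @work->working_cos. A small sketch against this list.h; the struct is made up.

#include <stdio.h>
#include "list.h"

struct worker {
        struct list_head node;
        int id;
};

int
main(void)
{
        struct list_head active = LIST_HEAD_INIT(active);
        struct worker w = { .id = 1 };

        /* Mark the node as "not on any list". */
        INIT_LIST_HEAD_WITH_POISON(&w.node);
        printf("linked? %s\n", list_poison(&w.node) ? "no" : "yes");

        /* Link it only if it is not linked yet. */
        if (list_poison(&w.node))
                list_add_tail(&w.node, &active);
        printf("linked? %s\n", list_poison(&w.node) ? "no" : "yes");

        return 0;
}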
*/ }; diff --git a/lua/gk.lua b/lua/gk.lua index 057b98644..be3e452c4 100644 --- a/lua/gk.lua +++ b/lua/gk.lua @@ -42,8 +42,12 @@ return function (net_conf, lls_conf, sol_conf, gk_lcores) local back_icmp_msgs_per_sec = 1000 local back_icmp_msgs_burst = 50 + local co_max_num = 16 + -- These variables are unlikely to need to be changed. local bpf_enable_jit = true + -- CAUTION: stacks too small will crash the GK blocks. + local co_stack_size_kb = 16 -- -- End configuration of GK block. @@ -100,6 +104,9 @@ return function (net_conf, lls_conf, sol_conf, gk_lcores) gk_conf.back_max_pkt_burst = staticlib.get_back_burst_config(max_pkt_burst_back, net_conf) + gk_conf.co_max_num = co_max_num + gk_conf.co_stack_size_kb = co_stack_size_kb + -- The maximum number of ARP or ND packets in LLS submitted by -- GK or GT. The code below makes sure that the parameter should -- be at least the same with the maximum configured value of GK. From 1a12bf5e2d6ae6a6a7dec8458ee0bb59587b37d3 Mon Sep 17 00:00:00 2001 From: Michel Machado Date: Mon, 25 Nov 2019 18:58:56 +0000 Subject: [PATCH 3/4] gk: move some work to coroutines This patch moves the following work to coroutines: 1. scanning of the flow table; 2. processing of front packets. Besides moving the processing of front packets to coroutines this patch streamlines the code to better fit the new model. For example, this patch simplifies the parameters of process_flow_entry() and its subordinate functions: gk_process_request(), gk_process_granted(), gk_process_declined(), and gk_process_bpf(). --- dependencies/dpdk | 2 +- gk/co.c | 1016 ++++++++++++++++++++++++++++++++++++ gk/co.h | 44 ++ gk/main.c | 1041 +++++++++---------------------------- include/gatekeeper_main.h | 20 + 5 files changed, 1322 insertions(+), 801 deletions(-) diff --git a/dependencies/dpdk b/dependencies/dpdk index bcc1e4fce..c637f7cd4 160000 --- a/dependencies/dpdk +++ b/dependencies/dpdk @@ -1 +1 @@ -Subproject commit bcc1e4fce82336ca39108ed4d54fb501af4a1b5a +Subproject commit c637f7cd452d750d6eb51bb2abf9de92a111fe60 diff --git a/gk/co.c b/gk/co.c index 590acfb6c..1bba8b8ec 100644 --- a/gk/co.c +++ b/gk/co.c @@ -16,6 +16,12 @@ * along with this program. If not, see . */ +#include +#include + +#include "gatekeeper_lls.h" + +#include "bpf.h" #include "co.h" static struct gk_co * @@ -34,6 +40,1016 @@ get_next_co(struct gk_co *this_co) return list_next_entry(this_co, co_list); } +static void +yield_next(struct gk_co *this_co) +{ + struct gk_co *next_co = get_next_co(this_co); + if (unlikely(this_co == next_co)) + return; + coro_transfer(&this_co->coro, &next_co->coro); +} + +/* + * If @task is added to @this_co->task_queue without a proper @task->task_hash, + * @task must be rescheduled once the proper @task->task_hash becomes known + * in order to avoid race conditions related to the proper @task->task_hash. + * + * NOTICE: while a task is running without a proper @task->task_hash, + * the task must not use the leftover available because the task is likely + * running under a task hash that is different of its proper @task->task_hash. 
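reschedule_task() hands a task to the coroutine that owns its @task->task_hash, so every packet of a given flow is handled by the same coroutine within a batch and the coroutines of one lcore never race on a flow entry. get_task_owner_co() is not shown in this hunk; the sketch below assumes a plain modulo mapping, which is only a stand-in. The one property the mapping needs is that equal hashes always pick the same coroutine.

#include <stdio.h>
#include <stdint.h>

/* Simplified stand-ins for the structs declared in gk/co.h. */
struct gk_co { int id; };
struct gk_co_work {
        struct gk_co *cos;
        uint16_t co_num; /* Coroutines currently in use. */
};

/* ASSUMPTION: the real get_task_owner_co() may map hashes differently. */
static struct gk_co *
task_owner_co(struct gk_co_work *work, uint32_t task_hash)
{
        return &work->cos[task_hash % work->co_num];
}

int
main(void)
{
        struct gk_co cos[4] = { {0}, {1}, {2}, {3} };
        struct gk_co_work work = { .cos = cos, .co_num = 4 };
        uint32_t hashes[3] = { 0xdeadbeef, 0xdeadbeef, 0x12345678 };
        int i;

        /* The two identical hashes land on the same coroutine. */
        for (i = 0; i < 3; i++)
                printf("hash %#x -> coroutine %d\n",
                        (unsigned)hashes[i],
                        task_owner_co(&work, hashes[i])->id);
        return 0;
}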
+ */ +static void +reschedule_task(struct gk_co *this_co, struct gk_co_task *task) +{ + struct gk_co_work *work = this_co->work; + struct gk_co *task_owner_co = get_task_owner_co(work, task); + + __schedule_task(task_owner_co, task); + + if (list_poison(&task_owner_co->co_list)) + list_add_tail(&task_owner_co->co_list, &work->working_cos); +} + +static int +extract_packet_info(struct rte_mbuf *pkt, struct ipacket *packet) +{ + int ret = 0; + uint16_t ether_type; + size_t ether_len; + struct rte_ether_hdr *eth_hdr; + struct rte_ipv4_hdr *ip4_hdr; + struct rte_ipv6_hdr *ip6_hdr; + uint16_t pkt_len = rte_pktmbuf_data_len(pkt); + + eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *); + ether_type = rte_be_to_cpu_16(pkt_in_skip_l2(pkt, eth_hdr, + &packet->l3_hdr)); + ether_len = pkt_in_l2_hdr_len(pkt); + + switch (ether_type) { + case RTE_ETHER_TYPE_IPV4: + if (pkt_len < ether_len + sizeof(*ip4_hdr)) { + packet->flow.proto = 0; + GK_LOG(NOTICE, + "Packet is too short to be IPv4 (%" PRIu16 ")\n", + pkt_len); + ret = -1; + goto out; + } + + ip4_hdr = packet->l3_hdr; + packet->flow.proto = RTE_ETHER_TYPE_IPV4; + packet->flow.f.v4.src.s_addr = ip4_hdr->src_addr; + packet->flow.f.v4.dst.s_addr = ip4_hdr->dst_addr; + break; + + case RTE_ETHER_TYPE_IPV6: + if (pkt_len < ether_len + sizeof(*ip6_hdr)) { + packet->flow.proto = 0; + GK_LOG(NOTICE, + "Packet is too short to be IPv6 (%" PRIu16 ")\n", + pkt_len); + ret = -1; + goto out; + } + + ip6_hdr = packet->l3_hdr; + packet->flow.proto = RTE_ETHER_TYPE_IPV6; + rte_memcpy(packet->flow.f.v6.src.s6_addr, ip6_hdr->src_addr, + sizeof(packet->flow.f.v6.src.s6_addr)); + rte_memcpy(packet->flow.f.v6.dst.s6_addr, ip6_hdr->dst_addr, + sizeof(packet->flow.f.v6.dst.s6_addr)); + break; + + case RTE_ETHER_TYPE_ARP: + packet->flow.proto = RTE_ETHER_TYPE_ARP; + ret = -1; + break; + + default: + packet->flow.proto = 0; + log_unknown_l2("gk", ether_type); + ret = -1; + break; + } +out: + packet->pkt = pkt; + return ret; +} + +static int +drop_packet_front(struct rte_mbuf *pkt, struct gk_instance *instance) +{ + instance->traffic_stats.tot_pkts_num_dropped++; + instance->traffic_stats.tot_pkts_size_dropped += + rte_pktmbuf_pkt_len(pkt); + + return drop_packet(pkt); +} + +static int +parse_front_pkt(struct gk_co *this_co, + struct ipacket *packet, struct rte_mbuf *pkt) +{ + struct gk_co_work *work = this_co->work; + int ret; + + /* TODO Does this prefetch improve performance? + rte_mbuf_prefetch_part1_non_temporal(pkt); + yield_next(this_co); + */ + /* + * This prefetch is enough to load Ethernet header (14 bytes), + * optional Ethernet VLAN header (8 bytes), and either + * an IPv4 header without options (20 bytes), or + * an IPv6 header without options (40 bytes). + * IPv4: 14 + 8 + 20 = 42 + * IPv6: 14 + 8 + 40 = 62 + rte_prefetch_non_temporal(rte_pktmbuf_mtod_offset(pkt, void *, 0)); + yield_next(this_co); + */ + + ret = extract_packet_info(pkt, packet); + if (ret < 0) { + if (likely(packet->flow.proto == RTE_ETHER_TYPE_ARP)) { + struct gk_measurement_metrics *stats = + &work->instance->traffic_stats; + + stats->tot_pkts_num_distributed++; + stats->tot_pkts_size_distributed += + rte_pktmbuf_pkt_len(pkt); + + work->front_arp_bufs[work->front_num_arp++] = pkt; + return -1; + } + + /* Drop non-IP and non-ARP packets. 
*/ + drop_packet_front(pkt, work->instance); + return -1; + } + + if (unlikely((packet->flow.proto == RTE_ETHER_TYPE_IPV4 && + !work->front_ipv4_configured) || + (packet->flow.proto == RTE_ETHER_TYPE_IPV6 && + !work->front_ipv6_configured))) { + drop_packet_front(pkt, work->instance); + return -1; + } + + return 0; +} + +#define START_PRIORITY (38) +/* Set @START_ALLOWANCE as the double size of a large DNS reply. */ +#define START_ALLOWANCE (8) + +static void +initialize_flow_entry(struct flow_entry *fe, struct ip_flow *flow, + uint32_t flow_hash_val, struct gk_fib *grantor_fib) +{ + /* + * The flow table is a critical data structure, so, + * whenever the size of entries grow too much, + * one must look for alternatives before increasing + * the limit below. + */ + RTE_BUILD_BUG_ON(sizeof(*fe) > 128); + + rte_memcpy(&fe->flow, flow, sizeof(*flow)); + + fe->in_use = true; + fe->flow_hash_val = flow_hash_val; + fe->state = GK_REQUEST; + fe->u.request.last_packet_seen_at = rte_rdtsc(); + fe->u.request.last_priority = START_PRIORITY; + fe->u.request.allowance = START_ALLOWANCE - 1; + fe->grantor_fib = grantor_fib; +} + +static inline void +reinitialize_flow_entry(struct flow_entry *fe, uint64_t now) +{ + fe->state = GK_REQUEST; + fe->u.request.last_packet_seen_at = now; + fe->u.request.last_priority = START_PRIORITY; + fe->u.request.allowance = START_ALLOWANCE - 1; +} + +static inline void +prefetch_flow_entry(struct flow_entry *fe) +{ +#if RTE_CACHE_LINE_SIZE == 64 + RTE_BUILD_BUG_ON(sizeof(*fe) <= RTE_CACHE_LINE_SIZE); + RTE_BUILD_BUG_ON(sizeof(*fe) > 2 * RTE_CACHE_LINE_SIZE); + rte_prefetch0(fe); + rte_prefetch0(((char *)fe) + RTE_CACHE_LINE_SIZE); +#elif RTE_CACHE_LINE_SIZE == 128 + RTE_BUILD_BUG_ON(sizeof(*fe) > RTE_CACHE_LINE_SIZE); + rte_prefetch0(fe); +#else +#error "Unsupported cache line size" +#endif +} + +/* We should avoid calling integer_log_base_2() with zero. */ +static inline uint8_t +integer_log_base_2(uint64_t delta_time) +{ +#if __WORDSIZE == 64 + return (8 * sizeof(uint64_t) - 1) - __builtin_clzl(delta_time); +#else + return (8 * sizeof(uint64_t) - 1) - __builtin_clzll(delta_time); +#endif +} + +/* + * It converts the difference of time between the current packet and + * the last seen packet into a given priority. + */ +static uint8_t +priority_from_delta_time(uint64_t present, uint64_t past) +{ + uint64_t delta_time; + + if (unlikely(present < past)) { + /* + * This should never happen, but we handle it gracefully here + * in order to keep going. + */ + GK_LOG(ERR, "The present time smaller than the past time\n"); + return 0; + } + + delta_time = (present - past) * picosec_per_cycle; + if (unlikely(delta_time < 1)) + return 0; + + return integer_log_base_2(delta_time); +} + +/* + * When a flow entry is at request state, all the GK block processing + * that entry does is to: + * (1) compute the priority of the packet. + * (2) encapsulate the packet as a request. + * (3) put this encapsulated packet in the request queue. 
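priority_from_delta_time() and integer_log_base_2() above turn the gap since a flow's previous packet into a priority: the gap is converted from TSC cycles to picoseconds and its integer log2 is taken, so each doubling of the waiting time earns one extra priority level. The standalone example below runs the same computation for a few gaps; the 333 ps per cycle figure (roughly a 3 GHz TSC) is made up, whereas the real @picosec_per_cycle comes from Gatekeeper's clock calibration.

#include <stdio.h>
#include <stdint.h>

/* ASSUMPTION: ~3 GHz TSC, i.e. about 333 picoseconds per cycle. */
static const uint64_t picosec_per_cycle = 333;

static inline uint8_t
integer_log_base_2(uint64_t x)
{
        return (8 * sizeof(uint64_t) - 1) - __builtin_clzll(x);
}

static uint8_t
priority_from_delta_cycles(uint64_t delta_cycles)
{
        uint64_t delta_ps = delta_cycles * picosec_per_cycle;

        if (delta_ps < 1)
                return 0;
        return integer_log_base_2(delta_ps);
}

int
main(void)
{
        /* Gaps of roughly 1us, 1ms and 1s at 3 GHz. */
        uint64_t gaps[3] = { 3000, 3000000, 3000000000ULL };
        int i;

        for (i = 0; i < 3; i++)
                printf("gap of %llu cycles -> priority %u\n",
                        (unsigned long long)gaps[i],
                        (unsigned)priority_from_delta_cycles(gaps[i]));
        return 0;
}

With these numbers the printed priorities are 19, 29 and 39, i.e. microsecond, millisecond and second gaps.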
+ */ +static void +gk_process_request(struct gk_co *this_co, struct flow_entry *fe, + struct ipacket *packet) +{ + int ret; + uint64_t now = rte_rdtsc(); + uint8_t priority = priority_from_delta_time(now, + fe->u.request.last_packet_seen_at); + struct rte_mbuf *pkt = packet->pkt; + struct gk_co_work *work = this_co->work; + struct gatekeeper_if *back = &work->gk_conf->net->back; + struct gk_fib *fib = fe->grantor_fib; + struct ether_cache *eth_cache; + + fe->u.request.last_packet_seen_at = now; + + /* + * The reason for using "<" instead of "<=" is that the equal case + * means that the source has waited enough time to have the same + * last priority, so it should be awarded with the allowance. + */ + if (priority < fe->u.request.last_priority && + fe->u.request.allowance > 0) { + fe->u.request.allowance--; + priority = fe->u.request.last_priority; + } else { + fe->u.request.last_priority = priority; + fe->u.request.allowance = START_ALLOWANCE - 1; + } + + /* + * Adjust @priority for the DSCP field. + * DSCP 0 for legacy packets; 1 for granted packets; + * 2 for capability renew; 3-63 for requests. + */ + priority += PRIORITY_REQ_MIN; + if (unlikely(priority > PRIORITY_MAX)) + priority = PRIORITY_MAX; + + /* The assigned priority is @priority. */ + + /* Encapsulate the packet as a request. */ + ret = encapsulate(pkt, priority, back, &fib->u.grantor.gt_addr); + if (ret < 0) + goto drop_pkt; + + eth_cache = fib->u.grantor.eth_cache; + RTE_VERIFY(eth_cache != NULL); + /* If needed, packet header space was adjusted by encapsulate(). */ + if (pkt_copy_cached_eth_header(pkt, eth_cache, back->l2_len_out)) + goto drop_pkt; + + pkt->udata64 = priority; + work->front_req_bufs[work->front_num_req++] = pkt; + return; + +drop_pkt: + drop_packet_front(pkt, work->instance); +} + +static void +gk_process_granted(struct gk_co *this_co, struct flow_entry *fe, + struct ipacket *packet) +{ + int ret; + bool renew_cap; + uint8_t priority = PRIORITY_GRANTED; + uint64_t now = rte_rdtsc(); + struct rte_mbuf *pkt = packet->pkt; + struct gk_fib *fib = fe->grantor_fib; + struct gk_co_work *work = this_co->work; + struct gatekeeper_if *back = &work->gk_conf->net->back; + struct gk_measurement_metrics *stats; + struct ether_cache *eth_cache; + uint32_t pkt_len; + + if (now >= fe->u.granted.cap_expire_at) { + reinitialize_flow_entry(fe, now); + return gk_process_request(this_co, fe, packet); + } + + if (now >= fe->u.granted.budget_renew_at) { + fe->u.granted.budget_renew_at = now + cycles_per_sec; + fe->u.granted.budget_byte = + (uint64_t)fe->u.granted.tx_rate_kib_cycle * 1024; + } + + stats = &work->instance->traffic_stats; + + pkt_len = rte_pktmbuf_pkt_len(pkt); + if (pkt_len > fe->u.granted.budget_byte) { + stats->pkts_num_declined++; + stats->pkts_size_declined += pkt_len; + goto drop_pkt; + } + + fe->u.granted.budget_byte -= pkt_len; + renew_cap = now >= fe->u.granted.send_next_renewal_at; + if (renew_cap) { + fe->u.granted.send_next_renewal_at = now + + fe->u.granted.renewal_step_cycle; + priority = PRIORITY_RENEW_CAP; + } + + /* + * Encapsulate packet as a granted packet, + * mark it as a capability renewal request if @renew_cap is true, + * enter destination according to @fe->grantor_fib. + */ + ret = encapsulate(pkt, priority, back, &fib->u.grantor.gt_addr); + if (ret < 0) + goto drop_pkt; + + eth_cache = fib->u.grantor.eth_cache; + RTE_VERIFY(eth_cache != NULL); + /* If needed, packet header space was adjusted by encapsulate(). 
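The budget fields used by gk_process_granted() above implement a per-flow byte allowance refilled once per second: at each refill the flow gets @tx_rate_kib_cycle KiB to spend, and a packet that does not fit in what is left of the current second is declined. A compact sketch of the same refill-and-spend logic; the clock rate and the 2 KiB/s figure are made up.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* ASSUMPTION: a 1 GHz "TSC" to keep the numbers readable. */
#define CYCLES_PER_SEC 1000000000ULL

struct granted_budget {
        uint64_t renew_at;     /* Next refill time, in cycles. */
        uint64_t budget_byte;  /* Bytes left until the next refill. */
        uint32_t rate_kib_sec; /* Configured rate in KiB per second. */
};

/* Return true if a packet of @pkt_len bytes fits in the budget at @now. */
static bool
budget_spend(struct granted_budget *b, uint64_t now, uint32_t pkt_len)
{
        if (now >= b->renew_at) {
                b->renew_at = now + CYCLES_PER_SEC;
                b->budget_byte = (uint64_t)b->rate_kib_sec * 1024;
        }

        if (pkt_len > b->budget_byte)
                return false; /* Over budget for this second: decline. */

        b->budget_byte -= pkt_len;
        return true;
}

int
main(void)
{
        struct granted_budget b = { .rate_kib_sec = 2 }; /* 2 KiB/s. */
        uint64_t now = 0;
        unsigned int i, sent = 0, declined = 0;

        /* Four 1 KiB packets within the same second. */
        for (i = 0; i < 4; i++, now += CYCLES_PER_SEC / 8) {
                if (budget_spend(&b, now, 1024))
                        sent++;
                else
                        declined++;
        }
        printf("sent %u, declined %u\n", sent, declined); /* 2 and 2. */
        return 0;
}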
*/ + if (pkt_copy_cached_eth_header(pkt, eth_cache, back->l2_len_out)) + goto drop_pkt; + + stats->pkts_num_granted++; + stats->pkts_size_granted += pkt_len; + work->tx_back_pkts[work->tx_back_num_pkts++] = pkt; + return; + +drop_pkt: + drop_packet_front(pkt, work->instance); +} + +static void +gk_process_declined(struct gk_co *this_co, struct flow_entry *fe, + struct ipacket *packet) +{ + uint64_t now = rte_rdtsc(); + struct gk_co_work *work = this_co->work; + struct gk_measurement_metrics *stats; + + if (unlikely(now >= fe->u.declined.expire_at)) { + reinitialize_flow_entry(fe, now); + return gk_process_request(this_co, fe, packet); + } + + stats = &work->instance->traffic_stats; + stats->pkts_num_declined++; + stats->pkts_size_declined += rte_pktmbuf_pkt_len(packet->pkt); + drop_packet_front(packet->pkt, work->instance); +} + +static void +gk_process_bpf(struct gk_co *this_co, struct flow_entry *fe, + struct ipacket *packet) +{ + struct rte_mbuf *pkt = packet->pkt; + struct gk_co_work *work = this_co->work; + struct gk_config *gk_conf = work->gk_conf; + struct gk_measurement_metrics *stats; + uint64_t bpf_ret; + int program_index, rc; + uint64_t now = rte_rdtsc(); + + if (unlikely(now >= fe->u.bpf.expire_at)) + goto expired; + + program_index = fe->program_index; + rc = gk_bpf_decide_pkt(gk_conf, program_index, fe, packet, now, + &bpf_ret); + if (unlikely(rc != 0)) { + GK_LOG(WARNING, + "The BPF program at index %u failed to run its function pkt\n", + program_index); + goto expired; + } + + stats = &work->instance->traffic_stats; + switch (bpf_ret) { + case GK_BPF_PKT_RET_FORWARD: { + struct ether_cache *eth_cache = + fe->grantor_fib->u.grantor.eth_cache; + RTE_VERIFY(eth_cache != NULL); + /* + * If needed, encapsulate() already adjusted + * packet header space. + */ + if (pkt_copy_cached_eth_header(pkt, eth_cache, + gk_conf->net->back.l2_len_out)) + goto drop_pkt; + + stats->pkts_num_granted++; + stats->pkts_size_granted += rte_pktmbuf_pkt_len(pkt); + work->tx_back_pkts[work->tx_back_num_pkts++] = pkt; + return; + } + case GK_BPF_PKT_RET_DECLINE: + stats->pkts_num_declined++; + stats->pkts_size_declined += rte_pktmbuf_pkt_len(pkt); + goto drop_pkt; + case GK_BPF_PKT_RET_ERROR: + GK_LOG(WARNING, + "The function pkt of the BPF program at index %u returned GK_BPF_PKT_RET_ERROR\n", + program_index); + goto drop_pkt; + default: + GK_LOG(WARNING, + "The function pkt of the BPF program at index %u returned an invalid return: %" PRIu64 "\n", + program_index, bpf_ret); + goto drop_pkt; + } + + rte_panic("Unexpected condition at %s()", __func__); + +expired: + reinitialize_flow_entry(fe, now); + return gk_process_request(this_co, fe, packet); + +drop_pkt: + drop_packet_front(pkt, work->instance); +} + +static void +process_flow_entry(struct gk_co *this_co, struct flow_entry *fe, + struct ipacket *packet) +{ + /* + * Some notes regarding flow rates and units: + * + * Flows in the GK_REQUEST state are bandwidth limited + * to an overall rate relative to the link. Therefore, + * the Ethernet frame overhead is counted toward the + * credits used by requests. The request channel rate + * is measured in megabits (base 10) per second to + * match the units used by hardware specifications. + * + * Granted flows (in state GK_GRANTED or sometimes + * GK_BPF) are allocated budgets that are intended + * to reflect the max throughput of the flow, and + * therefore do not include the Ethernet frame overhead. + * The budgets of granted flows are measured in + * kibibytes (base 2). 
+ */ + switch (fe->state) { + case GK_REQUEST: + return gk_process_request(this_co, fe, packet); + + case GK_GRANTED: + return gk_process_granted(this_co, fe, packet); + + case GK_DECLINED: + return gk_process_declined(this_co, fe, packet); + + case GK_BPF: + return gk_process_bpf(this_co, fe, packet); + + default: + GK_LOG(ERR, "Unknown flow state: %d\n", fe->state); + drop_packet_front(packet->pkt, this_co->work->instance); + return; + } + + rte_panic("Unexpected condition at %s()\n", __func__); +} + +typedef int (*packet_drop_cb_func)(struct rte_mbuf *pkt, + struct gk_instance *instance); + +static void +xmit_icmp(struct gatekeeper_if *iface, struct ipacket *packet, + uint16_t *num_pkts, struct rte_mbuf **icmp_bufs, + struct gk_instance *instance, packet_drop_cb_func cb_f) +{ + struct rte_ether_addr eth_addr_tmp; + struct rte_ether_hdr *icmp_eth; + struct rte_ipv4_hdr *icmp_ipv4; + struct rte_icmp_hdr *icmph; + struct rte_mbuf *pkt = packet->pkt; + int icmp_pkt_len = iface->l2_len_out + sizeof(struct rte_ipv4_hdr) + + sizeof(struct rte_icmp_hdr); + if (pkt->data_len >= icmp_pkt_len) { + int ret = rte_pktmbuf_trim(pkt, pkt->data_len - icmp_pkt_len); + if (ret < 0) { + GK_LOG(ERR, + "Failed to remove %d bytes of data at the end of the mbuf at %s", + pkt->data_len - icmp_pkt_len, __func__); + cb_f(pkt, instance); + return; + } + + icmp_eth = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *); + } else { + icmp_eth = (struct rte_ether_hdr *)rte_pktmbuf_append(pkt, + icmp_pkt_len - pkt->data_len); + if (icmp_eth == NULL) { + GK_LOG(ERR, + "Failed to append %d bytes of new data: not enough headroom space in the first segment at %s\n", + icmp_pkt_len - pkt->data_len, __func__); + cb_f(pkt, instance); + return; + } + } + + rte_ether_addr_copy(&icmp_eth->s_addr, ð_addr_tmp); + rte_ether_addr_copy(&icmp_eth->d_addr, &icmp_eth->s_addr); + rte_ether_addr_copy(ð_addr_tmp, &icmp_eth->d_addr); + if (iface->vlan_insert) { + fill_vlan_hdr(icmp_eth, iface->vlan_tag_be, + RTE_ETHER_TYPE_IPV4); + } + + icmp_ipv4 = (struct rte_ipv4_hdr *)pkt_out_skip_l2(iface, icmp_eth); + icmp_ipv4->version_ihl = IP_VHL_DEF; + icmp_ipv4->type_of_service = 0; + icmp_ipv4->packet_id = 0; + icmp_ipv4->fragment_offset = IP_DN_FRAGMENT_FLAG; + icmp_ipv4->time_to_live = IP_DEFTTL; + icmp_ipv4->next_proto_id = IPPROTO_ICMP; + icmp_ipv4->src_addr = packet->flow.f.v4.dst.s_addr; + icmp_ipv4->dst_addr = packet->flow.f.v4.src.s_addr; + icmp_ipv4->total_length = rte_cpu_to_be_16(pkt->data_len - + iface->l2_len_out); + /* + * The IP header checksum filed must be set to 0 + * in order to offload the checksum calculation. 
+ */ + icmp_ipv4->hdr_checksum = 0; + pkt->l2_len = iface->l2_len_out; + pkt->l3_len = sizeof(struct rte_ipv4_hdr); + pkt->ol_flags |= PKT_TX_IPV4 | PKT_TX_IP_CKSUM; + + icmph = (struct rte_icmp_hdr *)&icmp_ipv4[1]; + icmph->icmp_type = ICMP_TIME_EXCEEDED; + icmph->icmp_code = ICMP_EXC_TTL; + icmph->icmp_cksum = 0; + icmph->icmp_ident = 0; + icmph->icmp_seq_nb = 0; + icmph->icmp_cksum = icmp_cksum(icmph, sizeof(*icmph)); + + icmp_bufs[*num_pkts] = pkt; + (*num_pkts)++; +} + +static void +xmit_icmpv6(struct gatekeeper_if *iface, struct ipacket *packet, + uint16_t *num_pkts, struct rte_mbuf **icmp_bufs, + struct gk_instance *instance, packet_drop_cb_func cb_f) +{ + struct rte_ether_addr eth_addr_tmp; + struct rte_ether_hdr *icmp_eth; + struct rte_ipv6_hdr *icmp_ipv6; + struct icmpv6_hdr *icmpv6_hdr; + struct rte_mbuf *pkt = packet->pkt; + int icmpv6_pkt_len = iface->l2_len_out + sizeof(struct rte_ipv6_hdr) + + sizeof(struct icmpv6_hdr); + if (pkt->data_len >= icmpv6_pkt_len) { + int ret = rte_pktmbuf_trim(pkt, + pkt->data_len - icmpv6_pkt_len); + if (ret < 0) { + GK_LOG(ERR, + "Failed to remove %d bytes of data at the end of the mbuf at %s", + pkt->data_len - icmpv6_pkt_len, __func__); + cb_f(pkt, instance); + return; + } + + icmp_eth = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *); + } else { + icmp_eth = (struct rte_ether_hdr *)rte_pktmbuf_append(pkt, + icmpv6_pkt_len - pkt->data_len); + if (icmp_eth == NULL) { + GK_LOG(ERR, + "Failed to append %d bytes of new data: not enough headroom space in the first segment at %s\n", + icmpv6_pkt_len - pkt->data_len, __func__); + cb_f(pkt, instance); + return; + } + } + + rte_ether_addr_copy(&icmp_eth->s_addr, ð_addr_tmp); + rte_ether_addr_copy(&icmp_eth->d_addr, &icmp_eth->s_addr); + rte_ether_addr_copy(ð_addr_tmp, &icmp_eth->d_addr); + if (iface->vlan_insert) { + fill_vlan_hdr(icmp_eth, iface->vlan_tag_be, + RTE_ETHER_TYPE_IPV6); + } + + /* Set-up IPv6 header. */ + icmp_ipv6 = (struct rte_ipv6_hdr *)pkt_out_skip_l2(iface, icmp_eth); + icmp_ipv6->vtc_flow = rte_cpu_to_be_32(IPv6_DEFAULT_VTC_FLOW); + icmp_ipv6->payload_len = rte_cpu_to_be_16(sizeof(*icmpv6_hdr)); + icmp_ipv6->proto = IPPROTO_ICMPV6; + /* + * The IP Hop Limit field must be 255 as required by + * RFC 4861, sections 7.1.1 and 7.1.2. + */ + icmp_ipv6->hop_limits = 255; + rte_memcpy(icmp_ipv6->src_addr, packet->flow.f.v6.dst.s6_addr, + sizeof(icmp_ipv6->src_addr)); + rte_memcpy(icmp_ipv6->dst_addr, packet->flow.f.v6.src.s6_addr, + sizeof(icmp_ipv6->dst_addr)); + + /* Set-up ICMPv6 header. */ + icmpv6_hdr = (struct icmpv6_hdr *)&icmp_ipv6[1]; + icmpv6_hdr->type = ICMPV6_TIME_EXCEED; + icmpv6_hdr->code = ICMPV6_EXC_HOPLIMIT; + icmpv6_hdr->cksum = 0; /* Calculated below. */ + + icmpv6_hdr->cksum = rte_ipv6_icmpv6_cksum(icmp_ipv6, icmpv6_hdr); + + icmp_bufs[*num_pkts] = pkt; + (*num_pkts)++; +} + +/* + * For IPv4, according to the RFC 1812 section 5.3.1 Time to Live (TTL), + * if the TTL is reduced to zero (or less), the packet MUST be + * discarded, and if the destination is not a multicast address the + * router MUST send an ICMP Time Exceeded message, Code 0 (TTL Exceeded + * in Transit) message to the source. + * + * For IPv6, according to the RFC 1883 section 4.4, + * if the IPv6 Hop Limit is less than or equal to 1, then the router needs to + * send an ICMP Time Exceeded -- Hop Limit Exceeded in Transit message to + * the Source Address and discard the packet. 
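icmp_cksum() above (and rte_ipv6_icmpv6_cksum() in the IPv6 case) computes the standard Internet checksum of RFC 1071: the ones'-complement of the ones'-complement sum of the message taken as 16-bit words. The reference implementation below is generic and only meant to show the algorithm; it is not the Gatekeeper helper.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* RFC 1071 Internet checksum over @len bytes at @data (network order). */
static uint16_t
inet_cksum(const void *data, size_t len)
{
        const uint8_t *p = data;
        uint64_t sum = 0;

        while (len > 1) {
                sum += (uint32_t)((p[0] << 8) | p[1]);
                p += 2;
                len -= 2;
        }
        if (len == 1)
                sum += (uint32_t)(p[0] << 8); /* Pad the odd last byte. */

        /* Fold the carries back into 16 bits. */
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);

        return (uint16_t)~sum; /* Store high byte first (network order). */
}

int
main(void)
{
        /* An ICMP echo request header with a zeroed checksum field. */
        uint8_t icmp[8] = { 8, 0, 0x00, 0x00, 0x12, 0x34, 0x00, 0x01 };
        uint16_t ck = inet_cksum(icmp, sizeof(icmp));

        printf("checksum: 0x%04x\n", (unsigned)ck);

        /* With the checksum in place, the same sum must come out as 0. */
        icmp[2] = ck >> 8;
        icmp[3] = ck & 0xff;
        printf("verify (expect 0x0000): 0x%04x\n",
                (unsigned)inet_cksum(icmp, sizeof(icmp)));
        return 0;
}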
+ */ +static int +update_ip_hop_count(struct gatekeeper_if *iface, struct ipacket *packet, + uint16_t *num_pkts, struct rte_mbuf **icmp_bufs, + struct token_bucket_ratelimit_state *rs, struct gk_instance *instance, + packet_drop_cb_func cb_f) +{ + if (packet->flow.proto == RTE_ETHER_TYPE_IPV4) { + struct rte_ipv4_hdr *ipv4_hdr = packet->l3_hdr; + if (ipv4_hdr->time_to_live <= 1) { + if (tb_ratelimit_allow(rs)) { + xmit_icmp(iface, packet, num_pkts, + icmp_bufs, instance, cb_f); + } else + cb_f(packet->pkt, instance); + return -ETIMEDOUT; + } + + --(ipv4_hdr->time_to_live); + ++(ipv4_hdr->hdr_checksum); + } else if (likely(packet->flow.proto == RTE_ETHER_TYPE_IPV6)) { + struct rte_ipv6_hdr *ipv6_hdr = packet->l3_hdr; + if (ipv6_hdr->hop_limits <= 1) { + if (tb_ratelimit_allow(rs)) { + xmit_icmpv6(iface, packet, num_pkts, + icmp_bufs, instance, cb_f); + } else + cb_f(packet->pkt, instance); + return -ETIMEDOUT; + } + + --(ipv6_hdr->hop_limits); + } else { + GK_LOG(WARNING, + "Unexpected condition at %s: unknown flow type %hu\n", + __func__, packet->flow.proto); + cb_f(packet->pkt, instance); + return -EINVAL; + } + + return 0; +} + +static void +forward_pkt_to_back(struct ipacket *packet, struct ether_cache *eth_cache, + struct gk_co_work *work) +{ + struct rte_mbuf *pkt = packet->pkt; + struct gatekeeper_if *front = &work->gk_conf->net->front; + struct gatekeeper_if *back = &work->gk_conf->net->back; + + if (adjust_pkt_len(pkt, back, 0) == NULL || + pkt_copy_cached_eth_header(pkt, eth_cache, + back->l2_len_out)) { + drop_packet_front(pkt, work->instance); + return; + } + + if (update_ip_hop_count(front, packet, + &work->tx_front_num_pkts, work->tx_front_pkts, + &work->instance->front_icmp_rs, work->instance, + drop_packet_front) < 0) + return; + + work->tx_back_pkts[work->tx_back_num_pkts++] = pkt; +} + +static struct gk_fib * +look_up_fib(struct gk_lpm *ltbl, struct ip_flow *flow) +{ + int fib_id; + + if (flow->proto == RTE_ETHER_TYPE_IPV4) { + fib_id = lpm_lookup_ipv4(ltbl->lpm, flow->f.v4.dst.s_addr); + if (fib_id < 0) + return NULL; + return <bl->fib_tbl[fib_id]; + } + + if (likely(flow->proto == RTE_ETHER_TYPE_IPV6)) { + fib_id = lpm_lookup_ipv6(ltbl->lpm6, &flow->f.v6.dst); + if (fib_id < 0) + return NULL; + return <bl->fib_tbl6[fib_id]; + } + + rte_panic("Unexpected condition at %s: unknown flow type %hu\n", + __func__, flow->proto); + + return NULL; /* Unreachable. */ +} + +static struct flow_entry * +lookup_fe_from_lpm(struct ipacket *packet, uint32_t ip_flow_hash_val, + struct gk_co_work *work) +{ + struct rte_mbuf *pkt = packet->pkt; + + /* + * A prefetch is not needed here because current deployments of + * Gatekeeper servers have only a couple of FIB entries forwarding + * traffic from front to back interfaces. 
+ */ + struct gk_fib *fib = look_up_fib(&work->gk_conf->lpm_tbl, + &packet->flow); + + if (fib == NULL || fib->action == GK_FWD_NEIGHBOR_FRONT_NET) { + struct gk_measurement_metrics *stats = + &work->instance->traffic_stats; + if (packet->flow.proto == RTE_ETHER_TYPE_IPV4) { + stats->tot_pkts_num_distributed++; + stats->tot_pkts_size_distributed += + rte_pktmbuf_pkt_len(pkt); + add_pkt_acl(&work->front_acl4, pkt); + } else if (likely(packet->flow.proto == + RTE_ETHER_TYPE_IPV6)) { + stats->tot_pkts_num_distributed++; + stats->tot_pkts_size_distributed += + rte_pktmbuf_pkt_len(pkt); + add_pkt_acl(&work->front_acl6, pkt); + } else { + print_flow_err_msg(&packet->flow, + "gk: failed to get the fib entry"); + drop_packet_front(pkt, work->instance); + } + return NULL; + } + + switch (fib->action) { + case GK_FWD_GRANTOR: { + struct flow_entry *fe = &work->temp_fes[work->temp_fes_num++]; + initialize_flow_entry(fe, &packet->flow, ip_flow_hash_val, fib); + return fe; + } + + case GK_FWD_GATEWAY_BACK_NET: { + /* + * The entry instructs to forward its packets to + * the gateway in the back network. + */ + struct ether_cache *eth_cache = fib->u.gateway.eth_cache; + RTE_VERIFY(eth_cache != NULL); + forward_pkt_to_back(packet, eth_cache, work); + return NULL; + } + + case GK_FWD_NEIGHBOR_BACK_NET: { + /* + * The entry instructs to forward its packets to + * the neighbor in the back network. + */ + struct ether_cache *eth_cache = + (packet->flow.proto == RTE_ETHER_TYPE_IPV4) + ? lookup_ether_cache(&fib->u.neigh, + &packet->flow.f.v4.dst) + : lookup_ether_cache(&fib->u.neigh6, + &packet->flow.f.v6.dst); + RTE_VERIFY(eth_cache != NULL); + forward_pkt_to_back(packet, eth_cache, work); + return NULL; + } + + case GK_DROP: + /* FALLTHROUGH */ + default: + drop_packet_front(pkt, work->instance); + return NULL; + } + + return NULL; +} + +static void +prefetch_and_yield(void *addr, void *this_co) +{ + rte_prefetch_non_temporal(addr); + yield_next(this_co); +} + +static void +gk_co_process_front_pkt_final(struct gk_co *this_co, struct gk_co_task *task) +{ + struct ipacket *packet = task->task_arg; + struct gk_co_work *work = this_co->work; + uint32_t ip_flow_hash_val = task->task_hash; + struct flow_entry *fe_leftover = + get_fe_leftover(work, ip_flow_hash_val); + struct flow_entry *fe; + int ret; + + /* Is leftover useful? */ + if (fe_leftover != NULL && + fe_leftover->flow_hash_val == ip_flow_hash_val && + ip_flow_cmp_eq(&fe_leftover->flow, + &packet->flow, 0) == 0) { + /* Jackpot! Deal with @pkt right away. */ + process_flow_entry(this_co, fe_leftover, packet); + return; + } + + /* Look up flow entry. */ + ret = rte_hash_lookup_and_yield_with_hash( + work->instance->ip_flow_hash_table, &packet->flow, + ip_flow_hash_val, prefetch_and_yield, this_co); + if (ret >= 0) { + fe = &work->instance->ip_flow_entry_table[ret]; + /* TODO Break this prefetch into part1 and part2. 
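prefetch_and_yield() plus rte_hash_lookup_and_yield_with_hash() (the latter from the patched DPDK submodule referenced at the top of this patch) apply a classic latency-hiding trick: issue a prefetch for the memory the lookup is about to touch, run another coroutine while the cache line is in flight, and resume once the data is likely resident. The stripped-down sketch below shows just that pattern with libcoro; the two-coroutine lockstep, the table and all names are invented, and the per-coroutine task queues of the real block are omitted.

#include <stdio.h>
#include "coro.h"

#define N_CO 2
#define N_ITEMS 1024
#define STACK_BYTES (64 * 1024)

static coro_context root;
static coro_context ctx[N_CO];

struct job {
        int self;
        const long *table;
        long sum;
};

/* Prefetch @addr and let the other coroutine run while the line loads. */
static void
prefetch_and_yield(const void *addr, int self)
{
        __builtin_prefetch(addr, 0 /* read */, 0 /* non-temporal */);
        coro_transfer(&ctx[self], &ctx[(self + 1) % N_CO]);
}

static void
co_main(void *arg)
{
        struct job *job = arg;
        int i;

        /*
         * Both coroutines run the same number of iterations, so the
         * alternating yields always have a live peer to switch to.
         */
        for (i = 0; i < N_ITEMS; i++) {
                prefetch_and_yield(&job->table[i], job->self);
                job->sum += job->table[i]; /* Likely a cache hit by now. */
        }
        coro_transfer(&ctx[job->self], &root); /* Done: back to the root. */
}

int
main(void)
{
        static long table[N_ITEMS];
        struct coro_stack stacks[N_CO];
        struct job jobs[N_CO];
        int i;

        for (i = 0; i < N_ITEMS; i++)
                table[i] = i;

        coro_create(&root, NULL, NULL, NULL, 0);
        for (i = 0; i < N_CO; i++) {
                jobs[i] = (struct job){ .self = i, .table = table };
                /* ASSUMPTION: size in units of sizeof(void *). */
                if (!coro_stack_alloc(&stacks[i],
                                STACK_BYTES / sizeof(void *)))
                        return 1;
                coro_create(&ctx[i], co_main, &jobs[i],
                        stacks[i].sptr, stacks[i].ssze);
        }

        coro_transfer(&root, &ctx[0]); /* Runs until coroutine 0 finishes. */
        coro_transfer(&root, &ctx[1]); /* Lets coroutine 1 finish too. */
        printf("sums: %ld %ld\n", jobs[0].sum, jobs[1].sum);

        for (i = 0; i < N_CO; i++) {
                coro_destroy(&ctx[i]);
                coro_stack_free(&stacks[i]);
        }
        coro_destroy(&root);
        return 0;
}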
*/ + prefetch_flow_entry(fe); + yield_next(this_co); + process_flow_entry(this_co, fe, packet); + save_fe_leftover(work, fe); + return; + } + if (unlikely(ret != -ENOENT)) { + char err_msg[1024]; + + ret = snprintf(err_msg, sizeof(err_msg), + "gk: failed to look up flow state at %s with lcore %u: %i\n", + __func__, rte_lcore_id(), ret); + + RTE_VERIFY(ret > 0 && ret < (int)sizeof(err_msg)); + print_flow_err_msg(&packet->flow, err_msg); + return; + } + + fe = lookup_fe_from_lpm(packet, ip_flow_hash_val, work); + if (fe == NULL) + return; + process_flow_entry(this_co, fe, packet); + save_fe_leftover(work, fe); +} + +void +gk_co_process_front_pkt_software_rss(struct gk_co *this_co, + struct gk_co_task *task) +{ + struct ipacket *packet = task->task_arg; + + if (parse_front_pkt(this_co, packet, packet->pkt) != 0) + return; + + /* Finish up the work with the correct hash value. */ + task->task_hash = rss_ip_flow_hf(&packet->flow, 0, 0); + task->task_func = gk_co_process_front_pkt_final; + reschedule_task(this_co, task); +} + +void +gk_co_process_front_pkt(struct gk_co *this_co, struct gk_co_task *task) +{ + struct ipacket packet; + + if (parse_front_pkt(this_co, &packet, task->task_arg) != 0) + return; + task->task_arg = &packet; + gk_co_process_front_pkt_final(this_co, task); +} + +static void +gk_co_scan_flow_table_final(struct gk_co *this_co, struct gk_co_task *task) +{ + struct gk_co_work *work = this_co->work; + struct flow_entry *fe = task->task_arg; + struct flow_entry **leftover_bucket = get_fe_leftover_bucket(work, fe); + + RTE_VERIFY(work->del_fe == NULL); + work->del_fe = fe; + + /* Deal with the leftover. */ + if (unlikely(*leftover_bucket == fe)) { + /* One does not need to look up again. */ + return; + } + *leftover_bucket = fe; + + /* Prefetch buckets to remove the flow entry later. */ + rte_hash_lookup_and_yield_with_hash(work->instance->ip_flow_hash_table, + &fe->flow, fe->flow_hash_val, prefetch_and_yield, this_co); +} + +static bool +is_flow_expired(struct flow_entry *fe, uint64_t now) +{ + switch(fe->state) { + case GK_REQUEST: + if (fe->u.request.last_packet_seen_at > now) { + char err_msg[128]; + int ret = snprintf(err_msg, sizeof(err_msg), + "gk: buggy condition at %s: wrong timestamp", + __func__); + RTE_VERIFY(ret > 0 && ret < (int)sizeof(err_msg)); + print_flow_err_msg(&fe->flow, err_msg); + return true; + } + + /* + * A request entry is considered expired if it is not + * doubling its waiting time. We use +2 instead of +1 in + * the test below to account for random delays in the network. + */ + return priority_from_delta_time(now, + fe->u.request.last_packet_seen_at) > + fe->u.request.last_priority + 2; + case GK_GRANTED: + return now >= fe->u.granted.cap_expire_at; + case GK_DECLINED: + return now >= fe->u.declined.expire_at; + case GK_BPF: + return now >= fe->u.bpf.expire_at; + default: + return true; + } +} + +void +gk_co_scan_flow_table(struct gk_co *this_co, struct gk_co_task *task) +{ + struct flow_entry *fe = task->task_arg; + + /* + * Only one prefetch is needed here because one only needs + * the beginning of a struct flow_entry to + * check if it's expired. + */ + rte_prefetch_non_temporal(fe); + yield_next(this_co); + + if (!fe->in_use || !is_flow_expired(fe, rte_rdtsc())) + return; + + /* Finish up the work with the correct hash value. 
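Both the packet path (gk_co_process_front_pkt_software_rss()) and the flow-table scan above follow the same two-phase task pattern: phase one runs under a provisional hash, computes the proper @task->task_hash, swaps @task->task_func to the phase-two handler, and reschedules the task so phase two runs on the coroutine that owns that hash. The sketch below shows just the handler swap; the scheduler is reduced to an immediate call and all names are simplified stand-ins for gk/co.h.

#include <stdio.h>
#include <stdint.h>

struct task;
typedef void (*task_func_t)(struct task *task);

struct task {
        uint32_t task_hash;
        task_func_t task_func;
        void *task_arg;
};

/* Stand-in for reschedule_task(): here we simply run the task again. */
static void
reschedule(struct task *task)
{
        task->task_func(task);
}

static void
phase_two(struct task *task)
{
        printf("phase two: arg \"%s\" under hash %#x\n",
                (const char *)task->task_arg, (unsigned)task->task_hash);
}

static void
phase_one(struct task *task)
{
        /* Compute the proper hash (a real block would hash the flow). */
        task->task_hash = 0xabcd1234;
        /* Swap the handler and hand the task back to the scheduler. */
        task->task_func = phase_two;
        reschedule(task);
}

int
main(void)
{
        struct task t = {
                .task_hash = 0,
                .task_func = phase_one,
                .task_arg = (void *)"packet",
        };

        t.task_func(&t);
        return 0;
}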
*/ + task->task_hash = fe->flow_hash_val; + task->task_func = gk_co_scan_flow_table_final; + reschedule_task(this_co, task); +} + static struct gk_co_task * next_task(struct gk_co *this_co) { diff --git a/gk/co.h b/gk/co.h index d6828f65b..e290ea26e 100644 --- a/gk/co.h +++ b/gk/co.h @@ -122,6 +122,12 @@ struct gk_co_work { * It must be of the form (2^n - 1) for any n >= 0. */ const uint32_t leftover_mask; + /* + * The following fields release the coroutines of acquiring + * a writer lock on the flow table. + */ + /* If different of NULL, free this entry in flush_work(). */ + struct flow_entry *del_fe; /* Fields for front and back packets. */ uint16_t tx_front_num_pkts; @@ -192,6 +198,7 @@ struct gk_co_work { .temp_fes = name##_temp_fes_array, \ .temp_fes_num = 0, \ .leftover_mask = (lo_mask), \ + .del_fe = NULL, \ .tx_front_num_pkts = 0, \ .tx_back_num_pkts = 0, \ .tx_front_pkts = name##_tx_front_pkts_array, \ @@ -237,7 +244,44 @@ schedule_task_to_any_co(struct gk_co_work *work, struct gk_co_task *task) work->any_co_index = (work->any_co_index + 1) % work->co_num; } +static inline struct flow_entry ** +__get_fe_leftover_bucket(struct gk_co_work *work, uint32_t hash) +{ + return &work->leftover[hash & work->leftover_mask]; +} + +static inline struct flow_entry ** +get_fe_leftover_bucket(struct gk_co_work *work, struct flow_entry *fe) +{ + return __get_fe_leftover_bucket(work, fe->flow_hash_val); +} + +static inline struct flow_entry * +get_fe_leftover(struct gk_co_work *work, uint32_t hash) +{ + return *__get_fe_leftover_bucket(work, hash); +} + +/* + * Notice that if the bucket is not empty, that reference will be lost. + * That is, the code favors the newer entry over the older entry. + */ +static inline void +save_fe_leftover(struct gk_co_work *work, struct flow_entry *fe) +{ + *get_fe_leftover_bucket(work, fe) = fe; +} + void gk_co_main(void *arg); +void +gk_co_scan_flow_table(struct gk_co *this_co, struct gk_co_task *task); + +void +gk_co_process_front_pkt(struct gk_co *this_co, struct gk_co_task *task); +void +gk_co_process_front_pkt_software_rss(struct gk_co *this_co, + struct gk_co_task *task); + #endif /* _GATEKEEPER_GK_CO_H_ */ diff --git a/gk/main.c b/gk/main.c index f19f1272e..bb38be214 100644 --- a/gk/main.c +++ b/gk/main.c @@ -44,15 +44,14 @@ #include "gatekeeper_sol.h" #include "gatekeeper_flow_bpf.h" -#include "bpf.h" #include "co.h" -#define START_PRIORITY (38) -/* Set @START_ALLOWANCE as the double size of a large DNS reply. */ -#define START_ALLOWANCE (8) - int gk_logtype; +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. + */ /* We should avoid calling integer_log_base_2() with zero. */ static inline uint8_t integer_log_base_2(uint64_t delta_time) @@ -64,18 +63,22 @@ integer_log_base_2(uint64_t delta_time) #endif } -/* - * It converts the difference of time between the current packet and - * the last seen packet into a given priority. +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. */ -static uint8_t +/* + * It converts the difference of time between the current packet and + * the last seen packet into a given priority. + */ +static uint8_t priority_from_delta_time(uint64_t present, uint64_t past) { uint64_t delta_time; if (unlikely(present < past)) { /* - * This should never happen, but we handle it gracefully here + * This should never happen, but we handle it gracefully here * in order to keep going. 
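The leftover array declared in gk/co.h is a small direct-mapped cache of flow entries indexed by (hash & leftover_mask): back-to-back packets of the same flow within one batch skip the hash-table lookup, and on a bucket collision the newer entry simply wins, as the comment above save_fe_leftover() notes. The mask must have the form 2^n - 1, which is what rte_combine32ms1b(4 * burst - 1) in DEFINE_GK_CO_WORK produces. The sketch below reimplements both ideas with a portable bit trick and a stripped-down entry type; it is not the block's code.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct flow_entry {
        uint32_t flow_hash_val;
        /* ... the real entry carries the flow and its state ... */
};

/*
 * Smallest mask of the form 2^n - 1 that is >= @x; for x > 0 this gives
 * the same result as DPDK's rte_combine32ms1b(x) (propagate the most
 * significant set bit into all lower bits).
 */
static uint32_t
mask_from(uint32_t x)
{
        x |= x >> 1;
        x |= x >> 2;
        x |= x >> 4;
        x |= x >> 8;
        x |= x >> 16;
        return x;
}

int
main(void)
{
        uint32_t burst = 32 + 16; /* E.g. packet burst + mailbox burst. */
        uint32_t mask = mask_from(4 * burst - 1);
        struct flow_entry *leftover[mask + 1];
        struct flow_entry fe = { .flow_hash_val = 0x1234abcd };
        struct flow_entry *hit;

        memset(leftover, 0, sizeof(leftover));
        printf("leftover has %u buckets\n", (unsigned)(mask + 1));

        /* save_fe_leftover(): the newest entry takes the bucket. */
        leftover[fe.flow_hash_val & mask] = &fe;

        /*
         * get_fe_leftover(): a hit is only a hint; the caller still
         * confirms the full hash and the flow itself, as the GK block does.
         */
        hit = leftover[fe.flow_hash_val & mask];
        if (hit != NULL && hit->flow_hash_val == fe.flow_hash_val)
                printf("leftover hit for hash %#x\n",
                        (unsigned)hit->flow_hash_val);
        return 0;
}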
*/ GK_LOG(ERR, "The present time smaller than the past time\n"); @@ -85,10 +88,14 @@ priority_from_delta_time(uint64_t present, uint64_t past) delta_time = (present - past) * picosec_per_cycle; if (unlikely(delta_time < 1)) return 0; - + return integer_log_base_2(delta_time); } +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. + */ static struct gk_fib * look_up_fib(struct gk_lpm *ltbl, struct ip_flow *flow) { @@ -114,6 +121,10 @@ look_up_fib(struct gk_lpm *ltbl, struct ip_flow *flow) return NULL; /* Unreachable. */ } +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. + */ static int extract_packet_info(struct rte_mbuf *pkt, struct ipacket *packet) { @@ -181,41 +192,17 @@ extract_packet_info(struct rte_mbuf *pkt, struct ipacket *packet) return ret; } -static inline void -initialize_flow_entry(struct flow_entry *fe, struct ip_flow *flow, - uint32_t flow_hash_val, struct gk_fib *grantor_fib) -{ - /* - * The flow table is a critical data structure, so, - * whenever the size of entries grow too much, - * one must look for alternatives before increasing - * the limit below. - */ - RTE_BUILD_BUG_ON(sizeof(*fe) > 128); - - rte_memcpy(&fe->flow, flow, sizeof(*flow)); - - fe->in_use = true; - fe->flow_hash_val = flow_hash_val; - fe->state = GK_REQUEST; - fe->u.request.last_packet_seen_at = rte_rdtsc(); - fe->u.request.last_priority = START_PRIORITY; - fe->u.request.allowance = START_ALLOWANCE - 1; - fe->grantor_fib = grantor_fib; -} - -static inline void -reinitialize_flow_entry(struct flow_entry *fe, uint64_t now) -{ - fe->state = GK_REQUEST; - fe->u.request.last_packet_seen_at = now; - fe->u.request.last_priority = START_PRIORITY; - fe->u.request.allowance = START_ALLOWANCE - 1; -} - +/* + * TODO A copy of this typedef is available in gk/co.c, + * so drop it when possible. + */ typedef int (*packet_drop_cb_func)(struct rte_mbuf *pkt, struct gk_instance *instance); +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. + */ static int drop_packet_front(struct rte_mbuf *pkt, struct gk_instance *instance) { @@ -258,247 +245,6 @@ pkt_copy_cached_eth_header(struct rte_mbuf *pkt, struct ether_cache *eth_cache, return stale; } -/* - * When a flow entry is at request state, all the GK block processing - * that entry does is to: - * (1) compute the priority of the packet. - * (2) encapsulate the packet as a request. - * (3) put this encapsulated packet in the request queue. - * - * Returns a negative integer on error, or EINPROGRESS to indicate - * that the request is being processed by another lcore, and should - * not be forwarded or dropped on returning from this function. - */ -static int -gk_process_request(struct flow_entry *fe, struct ipacket *packet, - struct rte_mbuf **req_bufs, uint16_t *num_reqs, - struct sol_config *sol_conf) -{ - int ret; - uint64_t now = rte_rdtsc(); - uint8_t priority = priority_from_delta_time(now, - fe->u.request.last_packet_seen_at); - struct gk_fib *fib = fe->grantor_fib; - struct ether_cache *eth_cache; - - fe->u.request.last_packet_seen_at = now; - - /* - * The reason for using "<" instead of "<=" is that the equal case - * means that the source has waited enough time to have the same - * last priority, so it should be awarded with the allowance. 
- */ - if (priority < fe->u.request.last_priority && - fe->u.request.allowance > 0) { - fe->u.request.allowance--; - priority = fe->u.request.last_priority; - } else { - fe->u.request.last_priority = priority; - fe->u.request.allowance = START_ALLOWANCE - 1; - } - - /* - * Adjust @priority for the DSCP field. - * DSCP 0 for legacy packets; 1 for granted packets; - * 2 for capability renew; 3-63 for requests. - */ - priority += PRIORITY_REQ_MIN; - if (unlikely(priority > PRIORITY_MAX)) - priority = PRIORITY_MAX; - - /* The assigned priority is @priority. */ - - /* Encapsulate the packet as a request. */ - ret = encapsulate(packet->pkt, priority, - &sol_conf->net->back, &fib->u.grantor.gt_addr); - if (ret < 0) - return ret; - - eth_cache = fib->u.grantor.eth_cache; - RTE_VERIFY(eth_cache != NULL); - /* If needed, packet header space was adjusted by encapsulate(). */ - if (pkt_copy_cached_eth_header(packet->pkt, eth_cache, - sol_conf->net->back.l2_len_out)) - return -1; - - req_bufs[*num_reqs] = packet->pkt; - req_bufs[*num_reqs]->udata64 = priority; - (*num_reqs)++; - - return EINPROGRESS; -} - -/* - * Returns: - * * zero on success; the granted packet can be enqueued and forwarded - * * a negative number on error or when the packet needs to be - * otherwise dropped because it has exceeded its budget - * * EINPROGRESS to indicate that the packet is now a request that - * is being processed by another lcore, and should not - * be forwarded or dropped on returning from this function. - */ -static int -gk_process_granted(struct flow_entry *fe, struct ipacket *packet, - struct rte_mbuf **req_bufs, uint16_t *num_reqs, - struct sol_config *sol_conf, struct gk_measurement_metrics *stats) -{ - int ret; - bool renew_cap; - uint8_t priority = PRIORITY_GRANTED; - uint64_t now = rte_rdtsc(); - struct rte_mbuf *pkt = packet->pkt; - struct gk_fib *fib = fe->grantor_fib; - struct ether_cache *eth_cache; - uint32_t pkt_len; - - if (now >= fe->u.granted.cap_expire_at) { - reinitialize_flow_entry(fe, now); - return gk_process_request(fe, packet, req_bufs, - num_reqs, sol_conf); - } - - if (now >= fe->u.granted.budget_renew_at) { - fe->u.granted.budget_renew_at = now + cycles_per_sec; - fe->u.granted.budget_byte = - (uint64_t)fe->u.granted.tx_rate_kib_cycle * 1024; - } - - pkt_len = rte_pktmbuf_pkt_len(pkt); - if (pkt_len > fe->u.granted.budget_byte) { - stats->pkts_num_declined++; - stats->pkts_size_declined += pkt_len; - return -1; - } - - fe->u.granted.budget_byte -= pkt_len; - renew_cap = now >= fe->u.granted.send_next_renewal_at; - if (renew_cap) { - fe->u.granted.send_next_renewal_at = now + - fe->u.granted.renewal_step_cycle; - priority = PRIORITY_RENEW_CAP; - } - - /* - * Encapsulate packet as a granted packet, - * mark it as a capability renewal request if @renew_cap is true, - * enter destination according to @fe->grantor_fib. - */ - ret = encapsulate(packet->pkt, priority, - &sol_conf->net->back, &fib->u.grantor.gt_addr); - if (ret < 0) - return ret; - - eth_cache = fib->u.grantor.eth_cache; - RTE_VERIFY(eth_cache != NULL); - /* If needed, packet header space was adjusted by encapsulate(). 
*/ - if (pkt_copy_cached_eth_header(packet->pkt, eth_cache, - sol_conf->net->back.l2_len_out)) - return -1; - - stats->pkts_num_granted++; - stats->pkts_size_granted += pkt_len; - return 0; -} - -/* - * Returns: - * * a negative number on error or when the packet needs to be - * otherwise dropped because it is declined - * * EINPROGRESS to indicate that the packet is now a request that - * is being processed by another lcore, and should not - * be forwarded or dropped on returning from this function. - */ -static int -gk_process_declined(struct flow_entry *fe, struct ipacket *packet, - struct rte_mbuf **req_bufs, uint16_t *num_reqs, - struct sol_config *sol_conf, struct gk_measurement_metrics *stats) -{ - uint64_t now = rte_rdtsc(); - - if (unlikely(now >= fe->u.declined.expire_at)) { - reinitialize_flow_entry(fe, now); - return gk_process_request(fe, packet, req_bufs, - num_reqs, sol_conf); - } - - stats->pkts_num_declined++; - stats->pkts_size_declined += rte_pktmbuf_pkt_len(packet->pkt); - - return -1; -} - -/* - * Returns: - * * zero on success; the packet can be enqueued and forwarded - * * a negative number on error or when the packet needs to be - * otherwise dropped because it has exceeded a limit - * * EINPROGRESS to indicate that the packet is now a request that - * is being processed by another lcore, and should not - * be forwarded or dropped on returning from this function. - */ -static int -gk_process_bpf(struct flow_entry *fe, struct ipacket *packet, - struct rte_mbuf **req_bufs, uint16_t *num_reqs, - struct gk_config *gk_conf, struct gk_measurement_metrics *stats) -{ - uint64_t bpf_ret; - int program_index, rc; - uint64_t now = rte_rdtsc(); - - if (unlikely(now >= fe->u.bpf.expire_at)) - goto expired; - - program_index = fe->program_index; - rc = gk_bpf_decide_pkt(gk_conf, program_index, fe, packet, now, - &bpf_ret); - if (unlikely(rc != 0)) { - GK_LOG(WARNING, - "The BPF program at index %u failed to run its function pkt\n", - program_index); - goto expired; - } - - switch (bpf_ret) { - case GK_BPF_PKT_RET_FORWARD: { - struct ether_cache *eth_cache = - fe->grantor_fib->u.grantor.eth_cache; - RTE_VERIFY(eth_cache != NULL); - /* - * If needed, encapsulate() already adjusted - * packet header space. - */ - if (pkt_copy_cached_eth_header(packet->pkt, eth_cache, - gk_conf->net->back.l2_len_out)) - return -1; - - stats->pkts_num_granted++; - stats->pkts_size_granted += rte_pktmbuf_pkt_len(packet->pkt); - return 0; - } - case GK_BPF_PKT_RET_DECLINE: - stats->pkts_num_declined++; - stats->pkts_size_declined += rte_pktmbuf_pkt_len(packet->pkt); - return -1; - case GK_BPF_PKT_RET_ERROR: - GK_LOG(WARNING, - "The function pkt of the BPF program at index %u returned GK_BPF_PKT_RET_ERROR\n", - program_index); - return -1; - default: - GK_LOG(WARNING, - "The function pkt of the BPF program at index %u returned an invalid return: %" PRIu64 "\n", - program_index, bpf_ret); - return -1; - } - - rte_panic("Unexpected condition at %s()", __func__); - -expired: - reinitialize_flow_entry(fe, now); - return gk_process_request(fe, packet, req_bufs, num_reqs, - gk_conf->sol_conf); -} - static int get_block_idx(struct gk_config *gk_conf, unsigned int lcore_id) { @@ -511,6 +257,10 @@ get_block_idx(struct gk_config *gk_conf, unsigned int lcore_id) return 0; } +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. 
+ */ static bool is_flow_expired(struct flow_entry *fe, uint64_t now) { @@ -546,12 +296,17 @@ is_flow_expired(struct flow_entry *fe, uint64_t now) } static int -gk_del_flow_entry_from_hash(struct rte_hash *h, struct flow_entry *fe) +gk_del_flow_entry_from_hash(struct gk_instance *instance, struct flow_entry *fe) { - int ret = rte_hash_del_key_with_hash(h, &fe->flow, fe->flow_hash_val); - if (likely(ret >= 0)) + + int ret = rte_hash_del_key_with_hash(instance->ip_flow_hash_table, + &fe->flow, fe->flow_hash_val); + if (likely(ret >= 0)) { memset(fe, 0, sizeof(*fe)); - else { + + if (instance->num_scan_del > 0) + instance->num_scan_del--; + } else { GK_LOG(ERR, "The GK block failed to delete a key from hash table at %s: %s\n", __func__, strerror(-ret)); @@ -796,8 +551,7 @@ flush_flow_table(struct ip_prefix *src, } if (matched) { - gk_del_flow_entry_from_hash( - instance->ip_flow_hash_table, fe); + gk_del_flow_entry_from_hash(instance, fe); num_flushed_flows++; } @@ -938,10 +692,8 @@ gk_synchronize(struct gk_fib *fib, struct gk_instance *instance) while (index >= 0) { struct flow_entry *fe = &instance->ip_flow_entry_table[index]; - if (fe->grantor_fib == fib) { - gk_del_flow_entry_from_hash( - instance->ip_flow_hash_table, fe); - } + if (fe->grantor_fib == fib) + gk_del_flow_entry_from_hash(instance, fe); index = rte_hash_iterate(instance->ip_flow_hash_table, (void *)&key, &data, &next); @@ -1056,6 +808,10 @@ gk_setup_rss(struct gk_config *gk_conf) return ret; } +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. + */ static void xmit_icmp(struct gatekeeper_if *iface, struct ipacket *packet, uint16_t *num_pkts, struct rte_mbuf **icmp_bufs, @@ -1131,6 +887,10 @@ xmit_icmp(struct gatekeeper_if *iface, struct ipacket *packet, (*num_pkts)++; } +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. + */ static void xmit_icmpv6(struct gatekeeper_if *iface, struct ipacket *packet, uint16_t *num_pkts, struct rte_mbuf **icmp_bufs, @@ -1202,6 +962,10 @@ xmit_icmpv6(struct gatekeeper_if *iface, struct ipacket *packet, (*num_pkts)++; } +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. + */ /* * For IPv4, according to the RFC 1812 section 5.3.1 Time to Live (TTL), * if the TTL is reduced to zero (or less), the packet MUST be @@ -1256,26 +1020,6 @@ update_ip_hop_count(struct gatekeeper_if *iface, struct ipacket *packet, return 0; } -/* - * This function is only to be called on flows that - * are not backed by a flow entry. 
- */ -static void -send_request_to_grantor(struct ipacket *packet, uint32_t flow_hash_val, - struct gk_fib *fib, struct rte_mbuf **req_bufs, - uint16_t *num_reqs, struct gk_instance *instance, - struct gk_config *gk_conf) { - int ret; - struct flow_entry temp_fe; - - initialize_flow_entry(&temp_fe, &packet->flow, flow_hash_val, fib); - - ret = gk_process_request(&temp_fe, packet, req_bufs, - num_reqs, gk_conf->sol_conf); - if (ret < 0) - drop_packet_front(packet->pkt, instance); -} - static void lookup_fib_bulk(struct gk_lpm *ltbl, struct ip_flow **flows, int num_flows, struct gk_fib *fibs[]) @@ -1355,457 +1099,6 @@ lookup_fib6_bulk(struct gk_lpm *ltbl, struct ip_flow **flows, } } -static struct flow_entry * -lookup_fe_from_lpm(struct ipacket *packet, uint32_t ip_flow_hash_val, - struct gk_fib *fib, uint16_t *num_tx, struct rte_mbuf **tx_bufs, - struct acl_search *acl4, struct acl_search *acl6, - uint16_t *num_pkts, struct rte_mbuf **icmp_bufs, - struct rte_mbuf **req_bufs, uint16_t *num_reqs, - struct gatekeeper_if *front, struct gatekeeper_if *back, - struct gk_instance *instance, struct gk_config *gk_conf) { - struct rte_mbuf *pkt = packet->pkt; - struct ether_cache *eth_cache; - struct gk_measurement_metrics *stats = &instance->traffic_stats; - - if (fib == NULL || fib->action == GK_FWD_NEIGHBOR_FRONT_NET) { - if (packet->flow.proto == RTE_ETHER_TYPE_IPV4) { - stats->tot_pkts_num_distributed++; - stats->tot_pkts_size_distributed += - rte_pktmbuf_pkt_len(pkt); - - add_pkt_acl(acl4, pkt); - } else if (likely(packet->flow.proto == - RTE_ETHER_TYPE_IPV6)) { - stats->tot_pkts_num_distributed++; - stats->tot_pkts_size_distributed += - rte_pktmbuf_pkt_len(pkt); - - add_pkt_acl(acl6, pkt); - } else { - print_flow_err_msg(&packet->flow, - "gk: failed to get the fib entry"); - drop_packet_front(pkt, instance); - } - return NULL; - } - - switch (fib->action) { - case GK_FWD_GRANTOR: { - struct flow_entry *fe; - int ret = gk_hash_add_flow_entry( - instance, &packet->flow, - ip_flow_hash_val, gk_conf); - if (ret == -ENOSPC) { - /* - * There is no room for a new - * flow entry, but give this - * flow a chance sending a - * request to the grantor - * server. - */ - send_request_to_grantor(packet, ip_flow_hash_val, - fib, req_bufs, num_reqs, instance, gk_conf); - return NULL; - } - if (ret < 0) { - drop_packet_front(pkt, instance); - return NULL; - } - - fe = &instance->ip_flow_entry_table[ret]; - initialize_flow_entry(fe, - &packet->flow, ip_flow_hash_val, fib); - return fe; - } - - case GK_FWD_GATEWAY_BACK_NET: { - /* - * The entry instructs to forward - * its packets to the gateway in - * the back network, forward accordingly. - * - * BP block bypasses from the front to the - * back interface are expected to bypass - * ranges of IP addresses that should not - * go through Gatekeeper. - * - * Notice that one needs to update - * the Ethernet header. - */ - - eth_cache = fib->u.gateway.eth_cache; - RTE_VERIFY(eth_cache != NULL); - - if (adjust_pkt_len(pkt, back, 0) == NULL || - pkt_copy_cached_eth_header(pkt, - eth_cache, - back->l2_len_out)) { - drop_packet_front(pkt, instance); - return NULL; - } - - if (update_ip_hop_count(front, packet, - num_pkts, icmp_bufs, - &instance->front_icmp_rs, - instance, - drop_packet_front) < 0) - return NULL; - - tx_bufs[(*num_tx)++] = pkt; - return NULL; - } - - case GK_FWD_NEIGHBOR_BACK_NET: { - /* - * The entry instructs to forward - * its packets to the neighbor in - * the back network, forward accordingly. 
- */ - if (packet->flow.proto == RTE_ETHER_TYPE_IPV4) { - eth_cache = lookup_ether_cache( - &fib->u.neigh, - &packet->flow.f.v4.dst); - } else { - eth_cache = lookup_ether_cache( - &fib->u.neigh6, - &packet->flow.f.v6.dst); - } - - RTE_VERIFY(eth_cache != NULL); - - if (adjust_pkt_len(pkt, back, 0) == NULL || - pkt_copy_cached_eth_header(pkt, - eth_cache, - back->l2_len_out)) { - drop_packet_front(pkt, instance); - return NULL; - } - - if (update_ip_hop_count(front, packet, - num_pkts, icmp_bufs, - &instance->front_icmp_rs, - instance, - drop_packet_front) < 0) - return NULL; - - tx_bufs[(*num_tx)++] = pkt; - return NULL; - } - - case GK_DROP: - /* FALLTHROUGH */ - default: - drop_packet_front(pkt, instance); - return NULL; - } - - return NULL; -} - -static int -process_flow_entry(struct flow_entry *fe, struct ipacket *packet, - struct rte_mbuf **req_bufs, uint16_t *num_reqs, - struct gk_config *gk_conf, struct gk_measurement_metrics *stats) -{ - int ret; - - /* - * Some notes regarding flow rates and units: - * - * Flows in the GK_REQUEST state are bandwidth limited - * to an overall rate relative to the link. Therefore, - * the Ethernet frame overhead is counted toward the - * credits used by requests. The request channel rate - * is measured in megabits (base 10) per second to - * match the units used by hardware specifications. - * - * Granted flows (in state GK_GRANTED or sometimes - * GK_BPF) are allocated budgets that are intended - * to reflect the max throughput of the flow, and - * therefore do not include the Ethernet frame overhead. - * The budgets of granted flows are measured in - * kibibytes (base 2). - */ - switch (fe->state) { - case GK_REQUEST: - ret = gk_process_request(fe, packet, - req_bufs, num_reqs, gk_conf->sol_conf); - break; - - case GK_GRANTED: - ret = gk_process_granted(fe, packet, - req_bufs, num_reqs, gk_conf->sol_conf, stats); - break; - - case GK_DECLINED: - ret = gk_process_declined(fe, packet, - req_bufs, num_reqs, gk_conf->sol_conf, stats); - break; - - case GK_BPF: - ret = gk_process_bpf(fe, packet, - req_bufs, num_reqs, gk_conf, stats); - break; - - default: - ret = -1; - GK_LOG(ERR, "Unknown flow state: %d\n", fe->state); - break; - } - - return ret; -} - -static inline void -prefetch_flow_entry(struct flow_entry *fe) -{ -#if RTE_CACHE_LINE_SIZE == 64 - RTE_BUILD_BUG_ON(sizeof(*fe) <= RTE_CACHE_LINE_SIZE); - RTE_BUILD_BUG_ON(sizeof(*fe) > 2 * RTE_CACHE_LINE_SIZE); - rte_prefetch0(fe); - rte_prefetch0(((char *)fe) + RTE_CACHE_LINE_SIZE); -#elif RTE_CACHE_LINE_SIZE == 128 - RTE_BUILD_BUG_ON(sizeof(*fe) > RTE_CACHE_LINE_SIZE); - rte_prefetch0(fe); -#else -#error "Unsupported cache line size" -#endif -} - -static void -parse_packet(struct ipacket *packet, struct rte_mbuf *pkt, - struct rte_mbuf **arp_bufs, uint16_t *num_arp, - bool ipv4_configured_front, bool ipv6_configured_front, - struct ip_flow **flow_arr, uint32_t *flow_hash_val_arr, - int *num_ip_flows, struct gatekeeper_if *front, - struct gk_instance *instance) -{ - int ret; - struct gk_measurement_metrics *stats = &instance->traffic_stats; - - stats->tot_pkts_size += rte_pktmbuf_pkt_len(pkt); - - ret = extract_packet_info(pkt, packet); - if (ret < 0) { - if (likely(packet->flow.proto == RTE_ETHER_TYPE_ARP)) { - stats->tot_pkts_num_distributed++; - stats->tot_pkts_size_distributed += - rte_pktmbuf_pkt_len(pkt); - - arp_bufs[(*num_arp)++] = pkt; - return; - } - - /* Drop non-IP and non-ARP packets. 
*/ - drop_packet_front(pkt, instance); - return; - } - - if (unlikely((packet->flow.proto == RTE_ETHER_TYPE_IPV4 && - !ipv4_configured_front) || - (packet->flow.proto == RTE_ETHER_TYPE_IPV6 && - !ipv6_configured_front))) { - drop_packet_front(pkt, instance); - return; - } - - flow_arr[*num_ip_flows] = &packet->flow; - flow_hash_val_arr[*num_ip_flows] = likely(front->rss) ? - pkt->hash.rss : rss_ip_flow_hf(&packet->flow, 0, 0); - (*num_ip_flows)++; -} - -#define PREFETCH_OFFSET (4) - -/* Process the packets on the front interface. */ -static void -process_pkts_front(uint16_t port_front, uint16_t rx_queue_front, - unsigned int lcore, - uint16_t *tx_front_num_pkts, struct rte_mbuf **tx_front_pkts, - uint16_t *tx_back_num_pkts, struct rte_mbuf **tx_back_pkts, - struct gk_instance *instance, struct gk_config *gk_conf) -{ - int i; - int done_lookups; - int ret; - uint16_t num_rx; - uint16_t num_arp = 0; - uint16_t num_reqs = 0; - uint16_t front_max_pkt_burst = gk_conf->front_max_pkt_burst; - struct rte_mbuf *rx_bufs[front_max_pkt_burst]; - struct rte_mbuf *arp_bufs[front_max_pkt_burst]; - struct rte_mbuf *req_bufs[front_max_pkt_burst]; - DEFINE_ACL_SEARCH(acl4, front_max_pkt_burst); - DEFINE_ACL_SEARCH(acl6, front_max_pkt_burst); - struct gatekeeper_if *front = &gk_conf->net->front; - struct gatekeeper_if *back = &gk_conf->net->back; - struct gk_measurement_metrics *stats = &instance->traffic_stats; - bool ipv4_configured_front = ipv4_if_configured(&gk_conf->net->front); - bool ipv6_configured_front = ipv6_if_configured(&gk_conf->net->front); - int num_ip_flows = 0; - struct ipacket pkt_arr[front_max_pkt_burst]; - struct ip_flow *flow_arr[front_max_pkt_burst]; - uint32_t flow_hash_val_arr[front_max_pkt_burst]; - int num_lpm_lookups = 0; - int num_lpm6_lookups = 0; - struct ip_flow *flows[front_max_pkt_burst]; - struct ip_flow *flows6[front_max_pkt_burst]; - int32_t lpm_lookup_pos[front_max_pkt_burst]; - int32_t lpm6_lookup_pos[front_max_pkt_burst]; - int32_t pos_arr[front_max_pkt_burst]; - struct gk_fib *fibs[front_max_pkt_burst]; - struct gk_fib *fibs6[front_max_pkt_burst]; - struct flow_entry *fe_arr[front_max_pkt_burst]; - - /* Load a set of packets from the front NIC. */ - num_rx = rte_eth_rx_burst(port_front, rx_queue_front, rx_bufs, - front_max_pkt_burst); - - if (unlikely(num_rx == 0)) - return; - - stats->tot_pkts_num += num_rx; - - /* - * This prefetch is enough to load Ethernet header (14 bytes), - * optional Ethernet VLAN header (8 bytes), and either - * an IPv4 header without options (20 bytes), or - * an IPv6 header without options (40 bytes). - * IPv4: 14 + 8 + 20 = 42 - * IPv6: 14 + 8 + 40 = 62 - */ - for (i = 0; i < PREFETCH_OFFSET && i < num_rx; i++) - rte_prefetch0(rte_pktmbuf_mtod_offset(rx_bufs[i], void *, 0)); - - /* Extract packet and flow information. */ - for (i = 0; i < (num_rx - PREFETCH_OFFSET); i++) { - rte_prefetch0(rte_pktmbuf_mtod_offset( - rx_bufs[i + PREFETCH_OFFSET], void *, 0)); - - parse_packet(&pkt_arr[num_ip_flows], rx_bufs[i], arp_bufs, - &num_arp, ipv4_configured_front, ipv6_configured_front, - flow_arr, flow_hash_val_arr, &num_ip_flows, front, - instance); - } - - /* Extract the rest packet and flow information. 
*/ - for (; i < num_rx; i++) { - parse_packet(&pkt_arr[num_ip_flows], rx_bufs[i], arp_bufs, - &num_arp, ipv4_configured_front, ipv6_configured_front, - flow_arr, flow_hash_val_arr, &num_ip_flows, front, - instance); - } - - done_lookups = 0; - while (done_lookups < num_ip_flows) { - uint32_t num_keys = num_ip_flows - done_lookups; - if (num_keys > RTE_HASH_LOOKUP_BULK_MAX) - num_keys = RTE_HASH_LOOKUP_BULK_MAX; - - ret = rte_hash_lookup_bulk_with_hash( - instance->ip_flow_hash_table, - (const void **)&flow_arr[done_lookups], - (hash_sig_t *)&flow_hash_val_arr[done_lookups], - num_keys, &pos_arr[done_lookups]); - if (ret != 0) { - GK_LOG(NOTICE, - "failed to find multiple keys in the hash table at lcore %u\n", - rte_lcore_id()); - } - - done_lookups += num_keys; - } - - for (i = 0; i < num_ip_flows; i++) { - if (pos_arr[i] >= 0) { - fe_arr[i] = &instance->ip_flow_entry_table[pos_arr[i]]; - - prefetch_flow_entry(fe_arr[i]); - } else { - fe_arr[i] = NULL; - if (flow_arr[i]->proto == RTE_ETHER_TYPE_IPV4) { - lpm_lookup_pos[num_lpm_lookups] = i; - flows[num_lpm_lookups] = flow_arr[i]; - num_lpm_lookups++; - } else { - lpm6_lookup_pos[num_lpm6_lookups] = i; - flows6[num_lpm6_lookups] = flow_arr[i]; - num_lpm6_lookups++; - } - } - } - - /* The remaining flows need LPM lookups. */ - lookup_fib_bulk(&gk_conf->lpm_tbl, flows, num_lpm_lookups, fibs); - lookup_fib6_bulk(&gk_conf->lpm_tbl, flows6, num_lpm6_lookups, fibs6); - - for (i = 0; i < num_lpm_lookups; i++) { - int fidx = lpm_lookup_pos[i]; - - fe_arr[fidx] = lookup_fe_from_lpm(&pkt_arr[fidx], - flow_hash_val_arr[fidx], fibs[i], - tx_back_num_pkts, tx_back_pkts, &acl4, &acl6, - tx_front_num_pkts, tx_front_pkts, req_bufs, - &num_reqs, front, back, instance, gk_conf); - } - - for (i = 0; i < num_lpm6_lookups; i++) { - int fidx = lpm6_lookup_pos[i]; - - fe_arr[fidx] = lookup_fe_from_lpm(&pkt_arr[fidx], - flow_hash_val_arr[fidx], fibs6[i], - tx_back_num_pkts, tx_back_pkts, &acl4, &acl6, - tx_front_num_pkts, tx_front_pkts, req_bufs, - &num_reqs, front, back, instance, gk_conf); - } - - for (i = 0; i < num_ip_flows; i++) { - if (fe_arr[i] == NULL) - continue; - - ret = process_flow_entry(fe_arr[i], &pkt_arr[i], req_bufs, - &num_reqs, gk_conf, stats); - if (ret < 0) - drop_packet_front(pkt_arr[i].pkt, instance); - else if (ret == EINPROGRESS) { - /* Request will be serviced by another lcore. 
 */
-			continue;
-		} else if (likely(ret == 0))
-			tx_back_pkts[(*tx_back_num_pkts)++] = pkt_arr[i].pkt;
-		else
-			rte_panic("Invalid return value (%d) from processing a packet in a flow with state %d",
-				ret, fe_arr[i]->state);
-	}
-
-	if (num_reqs > 0) {
-		uint64_t acc_size_request[num_reqs + 1];
-
-		acc_size_request[0] = 0;
-		for (i = 1; i <= num_reqs; i++) {
-			acc_size_request[i] = acc_size_request[i - 1] +
-				rte_pktmbuf_pkt_len(req_bufs[i - 1]);
-		}
-
-		ret = RTE_MAX(gk_solicitor_enqueue_bulk(gk_conf->sol_conf,
-			req_bufs, num_reqs), 0);
-		if (ret < num_reqs) {
-			for (i = ret; i < num_reqs; i++)
-				drop_packet_front(req_bufs[i], instance);
-		}
-
-		stats->pkts_num_request += ret;
-		stats->pkts_size_request += acc_size_request[ret];
-	}
-
-	if (num_arp > 0)
-		submit_arp(arp_bufs, num_arp, &gk_conf->net->front);
-
-	process_pkts_acl(&gk_conf->net->front,
-		lcore, &acl4, RTE_ETHER_TYPE_IPV4);
-	process_pkts_acl(&gk_conf->net->front,
-		lcore, &acl6, RTE_ETHER_TYPE_IPV6);
-}
-
 static void
 process_fib(struct ipacket *packet, struct gk_fib *fib,
 	uint16_t *num_tx, struct rte_mbuf **tx_bufs,
@@ -2219,6 +1512,71 @@ process_cmds_from_mailbox(
 	mb_free_entry_bulk(&instance->mb, (void * const *)gk_cmds, num_cmd);
 }
 
+static void
+populate_front_tasks(struct gk_co_work *work,
+	uint16_t port_front, uint16_t rx_queue_front)
+{
+	uint16_t front_max_pkt_burst = work->gk_conf->front_max_pkt_burst;
+	struct rte_mbuf *rx_bufs[front_max_pkt_burst];
+	/* Load a set of packets from the front NIC. */
+	uint16_t num_rx = rte_eth_rx_burst(port_front, rx_queue_front, rx_bufs,
+		front_max_pkt_burst);
+	struct gk_measurement_metrics *stats;
+	bool has_rss;
+	int i;
+
+	if (unlikely(num_rx == 0))
+		return;
+
+	stats = &work->instance->traffic_stats;
+	stats->tot_pkts_num += num_rx;
+
+	has_rss = work->gk_conf->net->front.rss;
+	for (i = 0; i < num_rx; i++) {
+		struct gk_co_task *task = &work->all_tasks[work->task_num++];
+		struct rte_mbuf *pkt = rx_bufs[i];
+
+		stats->tot_pkts_size += rte_pktmbuf_pkt_len(pkt);
+
+		if (likely(has_rss)) {
+			task->task_hash = pkt->hash.rss;
+			task->task_arg = pkt;
+			task->task_func = gk_co_process_front_pkt;
+			schedule_task(work, task);
+		} else {
+			struct ipacket *packet = &work->packets[i];
+			/*
+			 * There is a chance that packets of the same flow
+			 * are transmitted out of order. For example, consider:
+			 * (1) three packets arrive in the following order:
+			 *	pkt1, pkt2, pkt3;
+			 * (2) there are only two coroutines doing the work;
+			 * (3) the packets are mapped to
+			 *	the coroutines as follows:
+			 *	* pkt1 and pkt2 go to coroutine 1,
+			 *	* pkt3 goes to coroutine 2;
+			 * (4) packets pkt2 and pkt3 belong to the same flow.
+			 *
+			 * Packets pkt1 and pkt3 are processed in parallel,
+			 * receive their correct hashes, and are rescheduled.
+			 * Once pkt2 is rescheduled, it is going to be placed
+			 * after pkt3 in the task queue of
+			 * the assigned coroutine, that is, pkt3 is going to
+			 * be sent out before pkt2 (inverted order).
+			 */
+			task->task_hash = 0; /* Dummy hash. */
+			/*
+			 * Passing @packet instead of just @pkt so @packet
+			 * can be carried over once the task is rescheduled.
+			 */
+			packet->pkt = pkt;
+			task->task_arg = packet;
+			task->task_func = gk_co_process_front_pkt_software_rss;
+			schedule_task_to_any_co(work, task);
+		}
+	}
+}
+
 static void
 add_cos_to_work(struct gk_co_work *work, struct gk_config *gk_conf,
 	struct gk_instance *instance)
@@ -2230,6 +1588,8 @@ add_cos_to_work(struct gk_co_work *work, struct gk_config *gk_conf,
 	work->cos = instance->cos;
 	work->co_max_num = gk_conf->co_max_num;
 	work->co_num = RTE_MIN(2, work->co_max_num);
+	work->front_ipv4_configured = ipv4_if_configured(&gk_conf->net->front);
+	work->front_ipv6_configured = ipv6_if_configured(&gk_conf->net->front);
 
 	RTE_VERIFY(work->co_num > 0);
 
@@ -2348,11 +1708,15 @@ do_work(struct gk_co_work *work)
 static void
 flush_work(struct gk_co_work *work,
 	uint16_t port_front, uint16_t tx_queue_front,
-	uint16_t port_back, uint16_t tx_queue_back)
+	uint16_t port_back, uint16_t tx_queue_back,
+	unsigned int lcore)
 {
+	struct gk_instance *instance = work->instance;
+	uint16_t front_max_pkt_burst = work->gk_conf->front_max_pkt_burst;
 	uint16_t back_max_pkt_burst = work->gk_conf->back_max_pkt_burst;
 	uint32_t max_pkt_burst = front_max_pkt_burst + back_max_pkt_burst;
+	struct gatekeeper_if *front = &work->gk_conf->net->front;
 
 	/*
 	 * Flush packets.
@@ -2369,13 +1733,108 @@ flush_work(struct gk_co_work *work,
 	work->tx_back_num_pkts = 0;
 
 	/*
-	 * TODO Flush front.
+	 * Flush front.
 	 */
+	if (work->front_num_req > 0) {
+		uint16_t num_req = work->front_num_req;
+		uint64_t acc_size_request[num_req + 1];
+		struct gk_measurement_metrics *stats = &instance->traffic_stats;
+		int i, ret;
+
+		/*
+		 * The byte length of the packets must be computed before
+		 * calling gk_solicitor_enqueue_bulk() because after it
+		 * the GK block no longer owns the packets.
+		 */
+		acc_size_request[0] = 0;
+		for (i = 1; i <= num_req; i++) {
+			acc_size_request[i] = acc_size_request[i - 1] +
+				rte_pktmbuf_pkt_len(
					work->front_req_bufs[i - 1]
				);
+		}
+
+		ret = RTE_MAX(
+			gk_solicitor_enqueue_bulk(work->gk_conf->sol_conf,
+				work->front_req_bufs, num_req),
+			0);
+
+		stats->pkts_num_request += ret;
+		stats->pkts_size_request += acc_size_request[ret];
+
+		for (i = ret; i < num_req; i++)
+			drop_packet_front(work->front_req_bufs[i], instance);
+
+		RTE_VERIFY(num_req <= front_max_pkt_burst);
+		work->front_num_req = 0;
+	}
+
+	if (work->front_num_arp > 0) {
+		submit_arp(work->front_arp_bufs, work->front_num_arp, front);
+		RTE_VERIFY(work->front_num_arp <= front_max_pkt_burst);
+		work->front_num_arp = 0;
+	}
+
+	RTE_VERIFY(work->front_acl4.num <= front_max_pkt_burst);
+	RTE_VERIFY(work->front_acl6.num <= front_max_pkt_burst);
+	process_pkts_acl(front, lcore, &work->front_acl4, RTE_ETHER_TYPE_IPV4);
+	process_pkts_acl(front, lcore, &work->front_acl6, RTE_ETHER_TYPE_IPV6);
+
 	/*
 	 * TODO Flush back.
 	 */
 
+	/*
+	 * Update flow table.
+	 */
+
+	if (work->del_fe != NULL) {
+		RTE_VERIFY(work->del_fe->in_use);
+		/*
+		 * Test that the flow entry is expired once more because
+		 * it may have been updated between the time it was found
+		 * expired and this point.
+		 */
+		if (likely(is_flow_expired(work->del_fe, rte_rdtsc())))
+			gk_del_flow_entry_from_hash(instance, work->del_fe);
+		work->del_fe = NULL;
+	}
+
+	/*
+	 * Adding new entries to the flow table should be among the last steps
+	 * because, when the flow table is full,
+	 * rte_hash_cuckoo_make_space_mw() is going to be called, and
+	 * this function disrupts the cache of the running core.
+	 * rte_hash_cuckoo_make_space_mw() may access up to 1000 buckets and,
+	 * on 64-bit platforms, consume about 32KB of execution stack.
+	 */
+	if (work->temp_fes_num > 0) {
+		unsigned int i;
+		for (i = 0; i < work->temp_fes_num; i++) {
+			struct flow_entry *temp_fe = &work->temp_fes[i];
+			struct flow_entry *fe;
+			int ret = gk_hash_add_flow_entry(instance,
+				&temp_fe->flow, temp_fe->flow_hash_val,
+				work->gk_conf);
+			if (ret == -ENOSPC) {
+				/* Flow table is full. */
+				break;
+			}
+			if (unlikely(ret < 0)) {
+				GK_LOG(ERR,
+					"Failed to add a flow entry, ret=%i\n",
+					ret);
+				continue;
+			}
+			fe = &instance->ip_flow_entry_table[ret];
+			rte_memcpy(fe, temp_fe, sizeof(*fe));
+		}
+		RTE_VERIFY(work->temp_fes_num <= (front_max_pkt_burst +
+			work->gk_conf->mailbox_burst_size));
+		work->temp_fes_num = 0;
+	}
+
 	/*
 	 * Reset fields of @work.
 	 */
@@ -2383,9 +1842,6 @@ flush_work(struct gk_co_work *work,
 	RTE_VERIFY(work->task_num <= work->task_total);
 	work->task_num = 0;
 	work->any_co_index = 0;
-	RTE_VERIFY(work->temp_fes_num <=
-		(front_max_pkt_burst + work->gk_conf->mailbox_burst_size));
-	work->temp_fes_num = 0;
 	memset(work->leftover, 0,
 		sizeof(*work->leftover) * (work->leftover_mask + 1));
 }
@@ -2431,17 +1887,23 @@ gk_proc(void *arg)
 	add_cos_to_work(&work, gk_conf, instance);
 
 	while (likely(!exiting)) {
-		struct flow_entry *fe = NULL;
+		populate_front_tasks(&work, port_front, rx_queue_front);
+
+		/*
+		 * Run the expiration test after all flow-related work to
+		 * give entries one more chance not to expire.
+		 */
 
 		if (iter_count >= scan_iter) {
+			struct gk_co_task *task =
+				&work.all_tasks[work.task_num++];
 			entry_idx = (entry_idx + 1) % gk_conf->flow_ht_size;
-			fe = &instance->ip_flow_entry_table[entry_idx];
-			/*
-			 * Only one prefetch is needed here because we only
-			 * need the beginning of a struct flow_entry to
-			 * check if it's expired.
-			 */
-			rte_prefetch_non_temporal(fe);
+
+			task->task_hash = 0; /* Dummy hash. */
+			task->task_arg =
+				&instance->ip_flow_entry_table[entry_idx];
+			task->task_func = gk_co_scan_flow_table;
+			schedule_task_to_any_co(&work, task);
 
 			iter_count = 0;
 		} else
@@ -2449,37 +1911,16 @@ gk_proc(void *arg)
 
 		do_work(&work);
 
-		process_pkts_front(port_front, rx_queue_front, lcore,
-			&work.tx_front_num_pkts, work.tx_front_pkts,
-			&work.tx_back_num_pkts, work.tx_back_pkts,
-			instance, gk_conf);
-
 		process_pkts_back(port_back, rx_queue_back, lcore,
 			&work.tx_front_num_pkts, work.tx_front_pkts,
 			&work.tx_back_num_pkts, work.tx_back_pkts,
 			instance, gk_conf);
 
-		if (fe != NULL && fe->in_use &&
-				is_flow_expired(fe, rte_rdtsc())) {
-			rte_hash_prefetch_buckets_non_temporal(
-				instance->ip_flow_hash_table,
-				fe->flow_hash_val);
-		} else
-			fe = NULL;
-
 		flush_work(&work, port_front, tx_queue_front,
-			port_back, tx_queue_back);
+			port_back, tx_queue_back, lcore);
 
 		process_cmds_from_mailbox(instance, gk_conf);
 
-		if (fe != NULL) {
-			gk_del_flow_entry_from_hash(
-				instance->ip_flow_hash_table, fe);
-
-			if (instance->num_scan_del > 0)
-				instance->num_scan_del--;
-		}
-
 		if (rte_rdtsc() - last_measure_tsc >=
 				basic_measurement_logging_cycles) {
 			struct gk_measurement_metrics *stats =
diff --git a/include/gatekeeper_main.h b/include/gatekeeper_main.h
index 50aafa1fe..37f1f0b9b 100644
--- a/include/gatekeeper_main.h
+++ b/include/gatekeeper_main.h
@@ -21,6 +21,9 @@
 
 #include
 
+#include
+#include
+
 #ifdef RTE_MACHINE_CPUFLAG_SSE4_2
 #include
 #define DEFAULT_HASH_FUNC rte_hash_crc
@@ -52,4 +55,21 @@ extern FILE *log_file;
 
 char *rte_strdup(const char *type, const char *s);
 int gatekeeper_log_init(void);
 
+/* XXX #52 This should be part of DPDK.
*/ +/** + * Prefetch the first part of the mbuf + * + * The first 64 bytes of the mbuf corresponds to fields that are used early + * in the receive path. If the cache line of the architecture is higher than + * 64B, the second part will also be prefetched. + * + * @param m + * The pointer to the mbuf. + */ +static inline void +rte_mbuf_prefetch_part1_non_temporal(struct rte_mbuf *m) +{ + rte_prefetch_non_temporal(&m->cacheline0); +} + #endif /* _GATEKEEPER_MAIN_H_ */ From a3a395c687ebcb002a29113ca0178386e20991b3 Mon Sep 17 00:00:00 2001 From: Michel Machado Date: Sat, 9 Nov 2019 21:58:47 +0000 Subject: [PATCH 4/4] gk: prefetch the transmission fields of packets This patch prefetches the transmission fields of a packet when it is ready to be prepared for transmission. --- gk/bpf.c | 36 +++++++++++++++++++++++------------- gk/bpf.h | 3 ++- gk/co.c | 27 +++++++++++++++++---------- gk/co.h | 3 +++ include/gatekeeper_main.h | 29 +++++++++++++++++++++++++++++ 5 files changed, 74 insertions(+), 24 deletions(-) diff --git a/gk/bpf.c b/gk/bpf.c index 16b09963b..2ffcdd913 100644 --- a/gk/bpf.c +++ b/gk/bpf.c @@ -106,12 +106,13 @@ static const struct rte_bpf_xsym flow_handler_init_xsym[] = { }; struct gk_bpf_pkt_frame { - uint64_t password; - struct flow_entry *fe; - struct ipacket *packet; - struct gk_config *gk_conf; - bool ready_to_tx; - struct gk_bpf_pkt_ctx ctx; + uint64_t password; + struct flow_entry *fe; + struct ipacket *packet; + struct gk_co *this_co; + bool pkt_part2_prefetched; + bool ready_to_tx; + struct gk_bpf_pkt_ctx ctx; }; static const uint64_t pkt_password = 0xa2e329ba8b15af05; @@ -199,6 +200,7 @@ gk_bpf_prep_for_tx(struct gk_bpf_pkt_ctx *ctx, int priority, int direct_if_possible) { int ret; + struct gatekeeper_if *back; struct gk_bpf_pkt_frame *frame = pkt_ctx_to_frame(ctx); if (unlikely(frame == NULL)) return -EINVAL; @@ -208,11 +210,18 @@ gk_bpf_prep_for_tx(struct gk_bpf_pkt_ctx *ctx, int priority, if (unlikely(priority < 0 || priority > PRIORITY_MAX)) return -EINVAL; + /* Prepare packet for transmission if needed. */ + if (likely(!frame->pkt_part2_prefetched)) { + frame->pkt_part2_prefetched = true; + if (likely(rte_mbuf_prefetch_part2_non_temporal( + frame->packet->pkt))) + gk_yield_next(frame->this_co); + } + + back = &frame->this_co->work->gk_conf->net->back; ret = (direct_if_possible != 0 && priority == PRIORITY_GRANTED) - ? update_pkt_priority(frame->packet, priority, - &frame->gk_conf->net->back) - : encapsulate(frame->packet->pkt, priority, - &frame->gk_conf->net->back, + ? 
update_pkt_priority(frame->packet, priority, back) + : encapsulate(frame->packet->pkt, priority, back, &frame->fe->grantor_fib->u.grantor.gt_addr); frame->ready_to_tx = ret == 0; @@ -486,7 +495,7 @@ parse_packet_further(struct ipacket *packet, struct gk_bpf_pkt_ctx *ctx) } int -gk_bpf_decide_pkt(struct gk_config *gk_conf, uint8_t program_index, +gk_bpf_decide_pkt(struct gk_co *this_co, uint8_t program_index, struct flow_entry *fe, struct ipacket *packet, uint64_t now, uint64_t *p_bpf_ret) { @@ -494,7 +503,8 @@ gk_bpf_decide_pkt(struct gk_config *gk_conf, uint8_t program_index, .password = pkt_password, .fe = fe, .packet = packet, - .gk_conf = gk_conf, + .this_co = this_co, + .pkt_part2_prefetched = false, .ready_to_tx = false, .ctx = { .now = now, @@ -502,7 +512,7 @@ gk_bpf_decide_pkt(struct gk_config *gk_conf, uint8_t program_index, }, }; const struct gk_bpf_flow_handler *handler = - &gk_conf->flow_handlers[program_index]; + &this_co->work->gk_conf->flow_handlers[program_index]; if (unlikely(handler->f_pkt == NULL)) { GK_LOG(WARNING, diff --git a/gk/bpf.h b/gk/bpf.h index f5c93e9ec..05cfd7f6d 100644 --- a/gk/bpf.h +++ b/gk/bpf.h @@ -20,6 +20,7 @@ #define _GATEKEEPER_GK_BPF_H_ #include "gatekeeper_gk.h" +#include "co.h" /* * Load the BPF program that handles flows into @gk_conf at @@ -32,7 +33,7 @@ int gk_load_bpf_flow_handler(struct gk_config *gk_conf, unsigned int index, const char *filename, int jit); -int gk_bpf_decide_pkt(struct gk_config *gk_conf, uint8_t program_index, +int gk_bpf_decide_pkt(struct gk_co *this_co, uint8_t program_index, struct flow_entry *fe, struct ipacket *packet, uint64_t now, uint64_t *p_bpf_ret); diff --git a/gk/co.c b/gk/co.c index 1bba8b8ec..35ad7d941 100644 --- a/gk/co.c +++ b/gk/co.c @@ -40,8 +40,8 @@ get_next_co(struct gk_co *this_co) return list_next_entry(this_co, co_list); } -static void -yield_next(struct gk_co *this_co) +void +gk_yield_next(struct gk_co *this_co) { struct gk_co *next_co = get_next_co(this_co); if (unlikely(this_co == next_co)) @@ -156,7 +156,7 @@ parse_front_pkt(struct gk_co *this_co, /* TODO Does this prefetch improve performance? rte_mbuf_prefetch_part1_non_temporal(pkt); - yield_next(this_co); + gk_yield_next(this_co); */ /* * This prefetch is enough to load Ethernet header (14 bytes), @@ -166,7 +166,7 @@ parse_front_pkt(struct gk_co *this_co, * IPv4: 14 + 8 + 20 = 42 * IPv6: 14 + 8 + 40 = 62 rte_prefetch_non_temporal(rte_pktmbuf_mtod_offset(pkt, void *, 0)); - yield_next(this_co); + gk_yield_next(this_co); */ ret = extract_packet_info(pkt, packet); @@ -335,6 +335,10 @@ gk_process_request(struct gk_co *this_co, struct flow_entry *fe, /* The assigned priority is @priority. */ + /* Prepare packet for transmission. */ + if (likely(rte_mbuf_prefetch_part2_non_temporal(pkt))) + gk_yield_next(this_co); + /* Encapsulate the packet as a request. */ ret = encapsulate(pkt, priority, back, &fib->u.grantor.gt_addr); if (ret < 0) @@ -398,6 +402,10 @@ gk_process_granted(struct gk_co *this_co, struct flow_entry *fe, priority = PRIORITY_RENEW_CAP; } + /* Prepare packet for transmission. 
*/ + if (likely(rte_mbuf_prefetch_part2_non_temporal(pkt))) + gk_yield_next(this_co); + /* * Encapsulate packet as a granted packet, * mark it as a capability renewal request if @renew_cap is true, @@ -447,7 +455,6 @@ gk_process_bpf(struct gk_co *this_co, struct flow_entry *fe, { struct rte_mbuf *pkt = packet->pkt; struct gk_co_work *work = this_co->work; - struct gk_config *gk_conf = work->gk_conf; struct gk_measurement_metrics *stats; uint64_t bpf_ret; int program_index, rc; @@ -457,7 +464,7 @@ gk_process_bpf(struct gk_co *this_co, struct flow_entry *fe, goto expired; program_index = fe->program_index; - rc = gk_bpf_decide_pkt(gk_conf, program_index, fe, packet, now, + rc = gk_bpf_decide_pkt(this_co, program_index, fe, packet, now, &bpf_ret); if (unlikely(rc != 0)) { GK_LOG(WARNING, @@ -477,7 +484,7 @@ gk_process_bpf(struct gk_co *this_co, struct flow_entry *fe, * packet header space. */ if (pkt_copy_cached_eth_header(pkt, eth_cache, - gk_conf->net->back.l2_len_out)) + work->gk_conf->net->back.l2_len_out)) goto drop_pkt; stats->pkts_num_granted++; @@ -890,7 +897,7 @@ static void prefetch_and_yield(void *addr, void *this_co) { rte_prefetch_non_temporal(addr); - yield_next(this_co); + gk_yield_next(this_co); } static void @@ -922,7 +929,7 @@ gk_co_process_front_pkt_final(struct gk_co *this_co, struct gk_co_task *task) fe = &work->instance->ip_flow_entry_table[ret]; /* TODO Break this prefetch into part1 and part2. */ prefetch_flow_entry(fe); - yield_next(this_co); + gk_yield_next(this_co); process_flow_entry(this_co, fe, packet); save_fe_leftover(work, fe); return; @@ -1039,7 +1046,7 @@ gk_co_scan_flow_table(struct gk_co *this_co, struct gk_co_task *task) * check if it's expired. */ rte_prefetch_non_temporal(fe); - yield_next(this_co); + gk_yield_next(this_co); if (!fe->in_use || !is_flow_expired(fe, rte_rdtsc())) return; diff --git a/gk/co.h b/gk/co.h index e290ea26e..6ed27033a 100644 --- a/gk/co.h +++ b/gk/co.h @@ -284,4 +284,7 @@ void gk_co_process_front_pkt_software_rss(struct gk_co *this_co, struct gk_co_task *task); +void +gk_yield_next(struct gk_co *this_co); + #endif /* _GATEKEEPER_GK_CO_H_ */ diff --git a/include/gatekeeper_main.h b/include/gatekeeper_main.h index 37f1f0b9b..b9de610e4 100644 --- a/include/gatekeeper_main.h +++ b/include/gatekeeper_main.h @@ -20,6 +20,7 @@ #define _GATEKEEPER_MAIN_H_ #include +#include #include #include @@ -72,4 +73,32 @@ rte_mbuf_prefetch_part1_non_temporal(struct rte_mbuf *m) rte_prefetch_non_temporal(&m->cacheline0); } +/* XXX #52 This should be part of DPDK. */ +/** + * Prefetch the second part of the mbuf + * + * The next 64 bytes of the mbuf corresponds to fields that are used in the + * transmit path. If the cache line of the architecture is higher than 64B, + * this function does nothing as it is expected that the full mbuf is + * already in cache. + * + * @param m + * The pointer to the mbuf. + */ +static inline bool +rte_mbuf_prefetch_part2_non_temporal(struct rte_mbuf *m) +{ +#if RTE_CACHE_LINE_SIZE == 64 + /* TODO Do we need this prefetch? + rte_prefetch_non_temporal(&m->cacheline1); + return true; + */ + RTE_SET_USED(m); + return false; +#else + RTE_SET_USED(m); + return false; +#endif +} + #endif /* _GATEKEEPER_MAIN_H_ */
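
The reordering scenario documented in populate_front_tasks() comes down to one invariant: every packet of a flow must be handled by the same coroutine. Packets that carry an RSS hash are scheduled by that hash, while distributing hashless packets to arbitrary coroutines (before software RSS assigns their real hash) can invert the transmission order within a flow. The following minimal, standalone sketch only illustrates the by-hash mapping idea; the names worker_for_hash and NUM_WORKERS are hypothetical and do not appear in these patches.

#include <stdint.h>

#define NUM_WORKERS 4	/* Hypothetical worker/coroutine count. */

/*
 * Pick a worker from the flow hash alone. Since the mapping is a pure
 * function of the hash, all packets of one flow reach the same worker
 * queue and therefore leave in arrival order, while packets of
 * different flows may still be processed in parallel.
 */
static inline unsigned int
worker_for_hash(uint32_t flow_hash)
{
	return flow_hash % NUM_WORKERS;
}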
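
The prefetch helpers in gatekeeper_main.h and the gk_yield_next() calls follow one pattern: issue a non-temporal prefetch, switch to a sibling coroutine while the cache line is in flight, and resume once the data is likely resident. The sketch below shows the same prefetch-then-yield pattern using only the libcoro API (coro_create, coro_transfer, coro_stack_alloc); the names worker and worker_stack, the stack size, and the prefetched variable are illustrative and are not taken from the patches. It assumes coro.c is built with the same backend macro as in the Makefile.

#include <stdio.h>
#include "coro.h"

static coro_context main_ctx;	/* Source context to transfer from. */
static coro_context worker_ctx;
static struct coro_stack worker_stack;

static void
worker(void *arg)
{
	int *data = arg;

	/*
	 * Start the memory access early, then hand the CPU to another
	 * context; by the time this coroutine is resumed, the cache line
	 * is likely present. This mirrors the prefetch-then-yield idea,
	 * not Gatekeeper's actual code.
	 */
	__builtin_prefetch(data, 0, 0);
	coro_transfer(&worker_ctx, &main_ctx);

	printf("data = %d\n", *data);
	coro_transfer(&worker_ctx, &main_ctx);	/* Done; never return. */
}

int
main(void)
{
	static int data = 1234;

	/* coro_stack_alloc() sizes stacks in pointer-sized units. */
	if (!coro_stack_alloc(&worker_stack, 65536))
		return 1;

	coro_create(&main_ctx, NULL, NULL, NULL, 0);
	coro_create(&worker_ctx, worker, &data,
		worker_stack.sptr, worker_stack.ssze);

	coro_transfer(&main_ctx, &worker_ctx);	/* Runs up to the yield. */
	coro_transfer(&main_ctx, &worker_ctx);	/* Resumes after the prefetch. */

	coro_destroy(&worker_ctx);
	coro_stack_free(&worker_stack);
	return 0;
}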