diff --git a/Makefile b/Makefile
index efc5b8950..eb33b01dc 100644
--- a/Makefile
+++ b/Makefile
@@ -36,7 +36,7 @@ SRCS-y := main/main.c
SRCS-y += config/static.c config/dynamic.c
SRCS-y += cps/main.c cps/kni.c cps/elf.c
SRCS-y += ggu/main.c
-SRCS-y += gk/main.c gk/fib.c gk/bpf.c
+SRCS-y += gk/main.c gk/fib.c gk/bpf.c gk/co.c
SRCS-y += gt/main.c gt/lua_lpm.c
SRCS-y += lls/main.c lls/cache.c lls/arp.c lls/nd.c
SRCS-y += sol/main.c
@@ -44,12 +44,12 @@ SRCS-y += sol/main.c
# Libraries.
SRCS-y += lib/mailbox.c lib/net.c lib/flow.c lib/ipip.c \
lib/luajit-ffi-cdata.c lib/launch.c lib/lpm.c lib/acl.c lib/varip.c \
- lib/l2.c lib/ratelimit.c lib/memblock.c lib/log_ratelimit.c
+ lib/l2.c lib/ratelimit.c lib/memblock.c lib/log_ratelimit.c lib/coro.c
LDLIBS += $(LDIR) -Bstatic -lluajit-5.1 -Bdynamic -lm -lmnl -lkmod
CFLAGS += $(WERROR_FLAGS) -I${GATEKEEPER}/include -I/usr/local/include/luajit-2.0/
EXTRA_CFLAGS += -O3 -g -Wfatal-errors -DALLOW_EXPERIMENTAL_API \
- -Wno-deprecated-declarations
+ -Wno-deprecated-declarations -DCORO_ASM
include $(RTE_SDK)/mk/rte.extapp.mk
diff --git a/dependencies/dpdk b/dependencies/dpdk
index bcc1e4fce..c637f7cd4 160000
--- a/dependencies/dpdk
+++ b/dependencies/dpdk
@@ -1 +1 @@
-Subproject commit bcc1e4fce82336ca39108ed4d54fb501af4a1b5a
+Subproject commit c637f7cd452d750d6eb51bb2abf9de92a111fe60
diff --git a/gk/bpf.c b/gk/bpf.c
index 16b09963b..2ffcdd913 100644
--- a/gk/bpf.c
+++ b/gk/bpf.c
@@ -106,12 +106,13 @@ static const struct rte_bpf_xsym flow_handler_init_xsym[] = {
};
struct gk_bpf_pkt_frame {
- uint64_t password;
- struct flow_entry *fe;
- struct ipacket *packet;
- struct gk_config *gk_conf;
- bool ready_to_tx;
- struct gk_bpf_pkt_ctx ctx;
+ uint64_t password;
+ struct flow_entry *fe;
+ struct ipacket *packet;
+ struct gk_co *this_co;
+ bool pkt_part2_prefetched;
+ bool ready_to_tx;
+ struct gk_bpf_pkt_ctx ctx;
};
static const uint64_t pkt_password = 0xa2e329ba8b15af05;
@@ -199,6 +200,7 @@ gk_bpf_prep_for_tx(struct gk_bpf_pkt_ctx *ctx, int priority,
int direct_if_possible)
{
int ret;
+ struct gatekeeper_if *back;
struct gk_bpf_pkt_frame *frame = pkt_ctx_to_frame(ctx);
if (unlikely(frame == NULL))
return -EINVAL;
@@ -208,11 +210,18 @@ gk_bpf_prep_for_tx(struct gk_bpf_pkt_ctx *ctx, int priority,
if (unlikely(priority < 0 || priority > PRIORITY_MAX))
return -EINVAL;
+ /* Prepare packet for transmission if needed. */
+ if (likely(!frame->pkt_part2_prefetched)) {
+ frame->pkt_part2_prefetched = true;
+ if (likely(rte_mbuf_prefetch_part2_non_temporal(
+ frame->packet->pkt)))
+ gk_yield_next(frame->this_co);
+ }
+
+ back = &frame->this_co->work->gk_conf->net->back;
ret = (direct_if_possible != 0 && priority == PRIORITY_GRANTED)
- ? update_pkt_priority(frame->packet, priority,
- &frame->gk_conf->net->back)
- : encapsulate(frame->packet->pkt, priority,
- &frame->gk_conf->net->back,
+ ? update_pkt_priority(frame->packet, priority, back)
+ : encapsulate(frame->packet->pkt, priority, back,
&frame->fe->grantor_fib->u.grantor.gt_addr);
frame->ready_to_tx = ret == 0;
@@ -486,7 +495,7 @@ parse_packet_further(struct ipacket *packet, struct gk_bpf_pkt_ctx *ctx)
}
int
-gk_bpf_decide_pkt(struct gk_config *gk_conf, uint8_t program_index,
+gk_bpf_decide_pkt(struct gk_co *this_co, uint8_t program_index,
struct flow_entry *fe, struct ipacket *packet, uint64_t now,
uint64_t *p_bpf_ret)
{
@@ -494,7 +503,8 @@ gk_bpf_decide_pkt(struct gk_config *gk_conf, uint8_t program_index,
.password = pkt_password,
.fe = fe,
.packet = packet,
- .gk_conf = gk_conf,
+ .this_co = this_co,
+ .pkt_part2_prefetched = false,
.ready_to_tx = false,
.ctx = {
.now = now,
@@ -502,7 +512,7 @@ gk_bpf_decide_pkt(struct gk_config *gk_conf, uint8_t program_index,
},
};
const struct gk_bpf_flow_handler *handler =
- &gk_conf->flow_handlers[program_index];
+ &this_co->work->gk_conf->flow_handlers[program_index];
if (unlikely(handler->f_pkt == NULL)) {
GK_LOG(WARNING,
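
Aside (illustration, not part of the patch): the new `pkt_part2_prefetched` flag in `gk_bpf_prep_for_tx()` implements a lazy "prefetch, then yield" pattern: the first caller issues a non-temporal prefetch of the mbuf's second cache line and yields to another coroutine so the cache miss overlaps with useful work. The helper `rte_mbuf_prefetch_part2_non_temporal()` comes from the pinned DPDK submodule and also reports whether a prefetch was issued; the sketch below approximates it with plain `rte_prefetch_non_temporal()`, assumes an older DPDK `struct rte_mbuf` that exposes the `cacheline1` marker, and uses a hypothetical `my_yield()` in place of `gk_yield_next()`.

```c
#include <stdbool.h>

#include <rte_mbuf.h>
#include <rte_prefetch.h>

struct my_frame {
	struct rte_mbuf *pkt;
	bool pkt_part2_prefetched;
};

/* Stand-in for gk_yield_next(): switch to the next working coroutine. */
static void
my_yield(void)
{
}

static void
prep_mbuf_part2(struct my_frame *frame)
{
	if (!frame->pkt_part2_prefetched) {
		frame->pkt_part2_prefetched = true;
		/* Start loading the second cache line of the mbuf... */
		rte_prefetch_non_temporal(&frame->pkt->cacheline1);
		/* ...and overlap the cache miss with another coroutine. */
		my_yield();
	}
	/* On return, fields such as pkt_len are likely already in cache. */
}
```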
diff --git a/gk/bpf.h b/gk/bpf.h
index f5c93e9ec..05cfd7f6d 100644
--- a/gk/bpf.h
+++ b/gk/bpf.h
@@ -20,6 +20,7 @@
#define _GATEKEEPER_GK_BPF_H_
#include "gatekeeper_gk.h"
+#include "co.h"
/*
* Load the BPF program that handles flows into @gk_conf at
@@ -32,7 +33,7 @@
int gk_load_bpf_flow_handler(struct gk_config *gk_conf, unsigned int index,
const char *filename, int jit);
-int gk_bpf_decide_pkt(struct gk_config *gk_conf, uint8_t program_index,
+int gk_bpf_decide_pkt(struct gk_co *this_co, uint8_t program_index,
struct flow_entry *fe, struct ipacket *packet, uint64_t now,
uint64_t *p_bpf_ret);
diff --git a/gk/co.c b/gk/co.c
new file mode 100644
index 000000000..35ad7d941
--- /dev/null
+++ b/gk/co.c
@@ -0,0 +1,1121 @@
+/*
+ * Gatekeeper - DoS protection system.
+ * Copyright (C) 2016 Digirati LTDA.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include
+#include
+
+#include "gatekeeper_lls.h"
+
+#include "bpf.h"
+#include "co.h"
+
+static struct gk_co *
+get_next_co(struct gk_co *this_co)
+{
+ /*
+ * It is unlikely because as long as there is more than
+ * one working coroutine, there is at least a 50% chance that
+ * @this_co is not the last working coroutine.
+ */
+ if (unlikely(this_co->co_list.next == &this_co->work->working_cos)) {
+ /* @this_co is the last working co. */
+ return list_first_entry(&this_co->work->working_cos,
+ struct gk_co, co_list);
+ }
+ return list_next_entry(this_co, co_list);
+}
+
+void
+gk_yield_next(struct gk_co *this_co)
+{
+ struct gk_co *next_co = get_next_co(this_co);
+ if (unlikely(this_co == next_co))
+ return;
+ coro_transfer(&this_co->coro, &next_co->coro);
+}
+
+/*
+ * If @task was added to @this_co->task_queue without its proper
+ * @task->task_hash, @task must be rescheduled once the proper hash becomes
+ * known in order to avoid race conditions on the corresponding flow entry.
+ *
+ * NOTICE: while a task is running without its proper @task->task_hash,
+ * it must not use the leftover entries, because it is likely running
+ * under a task hash that differs from its proper @task->task_hash.
+ */
+static void
+reschedule_task(struct gk_co *this_co, struct gk_co_task *task)
+{
+ struct gk_co_work *work = this_co->work;
+ struct gk_co *task_owner_co = get_task_owner_co(work, task);
+
+ __schedule_task(task_owner_co, task);
+
+ if (list_poison(&task_owner_co->co_list))
+ list_add_tail(&task_owner_co->co_list, &work->working_cos);
+}
+
+static int
+extract_packet_info(struct rte_mbuf *pkt, struct ipacket *packet)
+{
+ int ret = 0;
+ uint16_t ether_type;
+ size_t ether_len;
+ struct rte_ether_hdr *eth_hdr;
+ struct rte_ipv4_hdr *ip4_hdr;
+ struct rte_ipv6_hdr *ip6_hdr;
+ uint16_t pkt_len = rte_pktmbuf_data_len(pkt);
+
+ eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
+ ether_type = rte_be_to_cpu_16(pkt_in_skip_l2(pkt, eth_hdr,
+ &packet->l3_hdr));
+ ether_len = pkt_in_l2_hdr_len(pkt);
+
+ switch (ether_type) {
+ case RTE_ETHER_TYPE_IPV4:
+ if (pkt_len < ether_len + sizeof(*ip4_hdr)) {
+ packet->flow.proto = 0;
+ GK_LOG(NOTICE,
+ "Packet is too short to be IPv4 (%" PRIu16 ")\n",
+ pkt_len);
+ ret = -1;
+ goto out;
+ }
+
+ ip4_hdr = packet->l3_hdr;
+ packet->flow.proto = RTE_ETHER_TYPE_IPV4;
+ packet->flow.f.v4.src.s_addr = ip4_hdr->src_addr;
+ packet->flow.f.v4.dst.s_addr = ip4_hdr->dst_addr;
+ break;
+
+ case RTE_ETHER_TYPE_IPV6:
+ if (pkt_len < ether_len + sizeof(*ip6_hdr)) {
+ packet->flow.proto = 0;
+ GK_LOG(NOTICE,
+ "Packet is too short to be IPv6 (%" PRIu16 ")\n",
+ pkt_len);
+ ret = -1;
+ goto out;
+ }
+
+ ip6_hdr = packet->l3_hdr;
+ packet->flow.proto = RTE_ETHER_TYPE_IPV6;
+ rte_memcpy(packet->flow.f.v6.src.s6_addr, ip6_hdr->src_addr,
+ sizeof(packet->flow.f.v6.src.s6_addr));
+ rte_memcpy(packet->flow.f.v6.dst.s6_addr, ip6_hdr->dst_addr,
+ sizeof(packet->flow.f.v6.dst.s6_addr));
+ break;
+
+ case RTE_ETHER_TYPE_ARP:
+ packet->flow.proto = RTE_ETHER_TYPE_ARP;
+ ret = -1;
+ break;
+
+ default:
+ packet->flow.proto = 0;
+ log_unknown_l2("gk", ether_type);
+ ret = -1;
+ break;
+ }
+out:
+ packet->pkt = pkt;
+ return ret;
+}
+
+static int
+drop_packet_front(struct rte_mbuf *pkt, struct gk_instance *instance)
+{
+ instance->traffic_stats.tot_pkts_num_dropped++;
+ instance->traffic_stats.tot_pkts_size_dropped +=
+ rte_pktmbuf_pkt_len(pkt);
+
+ return drop_packet(pkt);
+}
+
+static int
+parse_front_pkt(struct gk_co *this_co,
+ struct ipacket *packet, struct rte_mbuf *pkt)
+{
+ struct gk_co_work *work = this_co->work;
+ int ret;
+
+ /* TODO Does this prefetch improve performance?
+ rte_mbuf_prefetch_part1_non_temporal(pkt);
+ gk_yield_next(this_co);
+ */
+ /*
+ * This prefetch is enough to load Ethernet header (14 bytes),
+ * optional Ethernet VLAN header (8 bytes), and either
+ * an IPv4 header without options (20 bytes), or
+ * an IPv6 header without options (40 bytes).
+ * IPv4: 14 + 8 + 20 = 42
+ * IPv6: 14 + 8 + 40 = 62
+ rte_prefetch_non_temporal(rte_pktmbuf_mtod_offset(pkt, void *, 0));
+ gk_yield_next(this_co);
+ */
+
+ ret = extract_packet_info(pkt, packet);
+ if (ret < 0) {
+ if (likely(packet->flow.proto == RTE_ETHER_TYPE_ARP)) {
+ struct gk_measurement_metrics *stats =
+ &work->instance->traffic_stats;
+
+ stats->tot_pkts_num_distributed++;
+ stats->tot_pkts_size_distributed +=
+ rte_pktmbuf_pkt_len(pkt);
+
+ work->front_arp_bufs[work->front_num_arp++] = pkt;
+ return -1;
+ }
+
+ /* Drop non-IP and non-ARP packets. */
+ drop_packet_front(pkt, work->instance);
+ return -1;
+ }
+
+ if (unlikely((packet->flow.proto == RTE_ETHER_TYPE_IPV4 &&
+ !work->front_ipv4_configured) ||
+ (packet->flow.proto == RTE_ETHER_TYPE_IPV6 &&
+ !work->front_ipv6_configured))) {
+ drop_packet_front(pkt, work->instance);
+ return -1;
+ }
+
+ return 0;
+}
+
+#define START_PRIORITY (38)
+/* Set @START_ALLOWANCE to double the size of a large DNS reply. */
+#define START_ALLOWANCE (8)
+
+static void
+initialize_flow_entry(struct flow_entry *fe, struct ip_flow *flow,
+ uint32_t flow_hash_val, struct gk_fib *grantor_fib)
+{
+ /*
+ * The flow table is a critical data structure, so,
+ * whenever the size of entries grow too much,
+ * one must look for alternatives before increasing
+ * the limit below.
+ */
+ RTE_BUILD_BUG_ON(sizeof(*fe) > 128);
+
+ rte_memcpy(&fe->flow, flow, sizeof(*flow));
+
+ fe->in_use = true;
+ fe->flow_hash_val = flow_hash_val;
+ fe->state = GK_REQUEST;
+ fe->u.request.last_packet_seen_at = rte_rdtsc();
+ fe->u.request.last_priority = START_PRIORITY;
+ fe->u.request.allowance = START_ALLOWANCE - 1;
+ fe->grantor_fib = grantor_fib;
+}
+
+static inline void
+reinitialize_flow_entry(struct flow_entry *fe, uint64_t now)
+{
+ fe->state = GK_REQUEST;
+ fe->u.request.last_packet_seen_at = now;
+ fe->u.request.last_priority = START_PRIORITY;
+ fe->u.request.allowance = START_ALLOWANCE - 1;
+}
+
+static inline void
+prefetch_flow_entry(struct flow_entry *fe)
+{
+#if RTE_CACHE_LINE_SIZE == 64
+ RTE_BUILD_BUG_ON(sizeof(*fe) <= RTE_CACHE_LINE_SIZE);
+ RTE_BUILD_BUG_ON(sizeof(*fe) > 2 * RTE_CACHE_LINE_SIZE);
+ rte_prefetch0(fe);
+ rte_prefetch0(((char *)fe) + RTE_CACHE_LINE_SIZE);
+#elif RTE_CACHE_LINE_SIZE == 128
+ RTE_BUILD_BUG_ON(sizeof(*fe) > RTE_CACHE_LINE_SIZE);
+ rte_prefetch0(fe);
+#else
+#error "Unsupported cache line size"
+#endif
+}
+
+/* We should avoid calling integer_log_base_2() with zero. */
+static inline uint8_t
+integer_log_base_2(uint64_t delta_time)
+{
+#if __WORDSIZE == 64
+ return (8 * sizeof(uint64_t) - 1) - __builtin_clzl(delta_time);
+#else
+ return (8 * sizeof(uint64_t) - 1) - __builtin_clzll(delta_time);
+#endif
+}
+
+/*
+ * It converts the difference of time between the current packet and
+ * the last seen packet into a given priority.
+ */
+static uint8_t
+priority_from_delta_time(uint64_t present, uint64_t past)
+{
+ uint64_t delta_time;
+
+ if (unlikely(present < past)) {
+ /*
+ * This should never happen, but we handle it gracefully here
+ * in order to keep going.
+ */
+ GK_LOG(ERR, "The present time smaller than the past time\n");
+ return 0;
+ }
+
+ delta_time = (present - past) * picosec_per_cycle;
+ if (unlikely(delta_time < 1))
+ return 0;
+
+ return integer_log_base_2(delta_time);
+}
+
+/*
+ * When a flow entry is at request state, all the GK block processing
+ * that entry does is to:
+ * (1) compute the priority of the packet.
+ * (2) encapsulate the packet as a request.
+ * (3) put this encapsulated packet in the request queue.
+ */
+static void
+gk_process_request(struct gk_co *this_co, struct flow_entry *fe,
+ struct ipacket *packet)
+{
+ int ret;
+ uint64_t now = rte_rdtsc();
+ uint8_t priority = priority_from_delta_time(now,
+ fe->u.request.last_packet_seen_at);
+ struct rte_mbuf *pkt = packet->pkt;
+ struct gk_co_work *work = this_co->work;
+ struct gatekeeper_if *back = &work->gk_conf->net->back;
+ struct gk_fib *fib = fe->grantor_fib;
+ struct ether_cache *eth_cache;
+
+ fe->u.request.last_packet_seen_at = now;
+
+ /*
+ * The reason for using "<" instead of "<=" is that the equal case
+ * means that the source has waited enough time to have the same
+ * last priority, so it should be awarded with the allowance.
+ */
+ if (priority < fe->u.request.last_priority &&
+ fe->u.request.allowance > 0) {
+ fe->u.request.allowance--;
+ priority = fe->u.request.last_priority;
+ } else {
+ fe->u.request.last_priority = priority;
+ fe->u.request.allowance = START_ALLOWANCE - 1;
+ }
+
+ /*
+ * Adjust @priority for the DSCP field.
+ * DSCP 0 for legacy packets; 1 for granted packets;
+ * 2 for capability renew; 3-63 for requests.
+ */
+ priority += PRIORITY_REQ_MIN;
+ if (unlikely(priority > PRIORITY_MAX))
+ priority = PRIORITY_MAX;
+
+ /* The assigned priority is @priority. */
+
+ /* Prepare packet for transmission. */
+ if (likely(rte_mbuf_prefetch_part2_non_temporal(pkt)))
+ gk_yield_next(this_co);
+
+ /* Encapsulate the packet as a request. */
+ ret = encapsulate(pkt, priority, back, &fib->u.grantor.gt_addr);
+ if (ret < 0)
+ goto drop_pkt;
+
+ eth_cache = fib->u.grantor.eth_cache;
+ RTE_VERIFY(eth_cache != NULL);
+ /* If needed, packet header space was adjusted by encapsulate(). */
+ if (pkt_copy_cached_eth_header(pkt, eth_cache, back->l2_len_out))
+ goto drop_pkt;
+
+ pkt->udata64 = priority;
+ work->front_req_bufs[work->front_num_req++] = pkt;
+ return;
+
+drop_pkt:
+ drop_packet_front(pkt, work->instance);
+}
+
+static void
+gk_process_granted(struct gk_co *this_co, struct flow_entry *fe,
+ struct ipacket *packet)
+{
+ int ret;
+ bool renew_cap;
+ uint8_t priority = PRIORITY_GRANTED;
+ uint64_t now = rte_rdtsc();
+ struct rte_mbuf *pkt = packet->pkt;
+ struct gk_fib *fib = fe->grantor_fib;
+ struct gk_co_work *work = this_co->work;
+ struct gatekeeper_if *back = &work->gk_conf->net->back;
+ struct gk_measurement_metrics *stats;
+ struct ether_cache *eth_cache;
+ uint32_t pkt_len;
+
+ if (now >= fe->u.granted.cap_expire_at) {
+ reinitialize_flow_entry(fe, now);
+ return gk_process_request(this_co, fe, packet);
+ }
+
+ if (now >= fe->u.granted.budget_renew_at) {
+ fe->u.granted.budget_renew_at = now + cycles_per_sec;
+ fe->u.granted.budget_byte =
+ (uint64_t)fe->u.granted.tx_rate_kib_cycle * 1024;
+ }
+
+ stats = &work->instance->traffic_stats;
+
+ pkt_len = rte_pktmbuf_pkt_len(pkt);
+ if (pkt_len > fe->u.granted.budget_byte) {
+ stats->pkts_num_declined++;
+ stats->pkts_size_declined += pkt_len;
+ goto drop_pkt;
+ }
+
+ fe->u.granted.budget_byte -= pkt_len;
+ renew_cap = now >= fe->u.granted.send_next_renewal_at;
+ if (renew_cap) {
+ fe->u.granted.send_next_renewal_at = now +
+ fe->u.granted.renewal_step_cycle;
+ priority = PRIORITY_RENEW_CAP;
+ }
+
+ /* Prepare packet for transmission. */
+ if (likely(rte_mbuf_prefetch_part2_non_temporal(pkt)))
+ gk_yield_next(this_co);
+
+ /*
+ * Encapsulate packet as a granted packet,
+ * mark it as a capability renewal request if @renew_cap is true,
+ * enter destination according to @fe->grantor_fib.
+ */
+ ret = encapsulate(pkt, priority, back, &fib->u.grantor.gt_addr);
+ if (ret < 0)
+ goto drop_pkt;
+
+ eth_cache = fib->u.grantor.eth_cache;
+ RTE_VERIFY(eth_cache != NULL);
+ /* If needed, packet header space was adjusted by encapsulate(). */
+ if (pkt_copy_cached_eth_header(pkt, eth_cache, back->l2_len_out))
+ goto drop_pkt;
+
+ stats->pkts_num_granted++;
+ stats->pkts_size_granted += pkt_len;
+ work->tx_back_pkts[work->tx_back_num_pkts++] = pkt;
+ return;
+
+drop_pkt:
+ drop_packet_front(pkt, work->instance);
+}
+
+static void
+gk_process_declined(struct gk_co *this_co, struct flow_entry *fe,
+ struct ipacket *packet)
+{
+ uint64_t now = rte_rdtsc();
+ struct gk_co_work *work = this_co->work;
+ struct gk_measurement_metrics *stats;
+
+ if (unlikely(now >= fe->u.declined.expire_at)) {
+ reinitialize_flow_entry(fe, now);
+ return gk_process_request(this_co, fe, packet);
+ }
+
+ stats = &work->instance->traffic_stats;
+ stats->pkts_num_declined++;
+ stats->pkts_size_declined += rte_pktmbuf_pkt_len(packet->pkt);
+ drop_packet_front(packet->pkt, work->instance);
+}
+
+static void
+gk_process_bpf(struct gk_co *this_co, struct flow_entry *fe,
+ struct ipacket *packet)
+{
+ struct rte_mbuf *pkt = packet->pkt;
+ struct gk_co_work *work = this_co->work;
+ struct gk_measurement_metrics *stats;
+ uint64_t bpf_ret;
+ int program_index, rc;
+ uint64_t now = rte_rdtsc();
+
+ if (unlikely(now >= fe->u.bpf.expire_at))
+ goto expired;
+
+ program_index = fe->program_index;
+ rc = gk_bpf_decide_pkt(this_co, program_index, fe, packet, now,
+ &bpf_ret);
+ if (unlikely(rc != 0)) {
+ GK_LOG(WARNING,
+ "The BPF program at index %u failed to run its function pkt\n",
+ program_index);
+ goto expired;
+ }
+
+ stats = &work->instance->traffic_stats;
+ switch (bpf_ret) {
+ case GK_BPF_PKT_RET_FORWARD: {
+ struct ether_cache *eth_cache =
+ fe->grantor_fib->u.grantor.eth_cache;
+ RTE_VERIFY(eth_cache != NULL);
+ /*
+ * If needed, encapsulate() already adjusted
+ * packet header space.
+ */
+ if (pkt_copy_cached_eth_header(pkt, eth_cache,
+ work->gk_conf->net->back.l2_len_out))
+ goto drop_pkt;
+
+ stats->pkts_num_granted++;
+ stats->pkts_size_granted += rte_pktmbuf_pkt_len(pkt);
+ work->tx_back_pkts[work->tx_back_num_pkts++] = pkt;
+ return;
+ }
+ case GK_BPF_PKT_RET_DECLINE:
+ stats->pkts_num_declined++;
+ stats->pkts_size_declined += rte_pktmbuf_pkt_len(pkt);
+ goto drop_pkt;
+ case GK_BPF_PKT_RET_ERROR:
+ GK_LOG(WARNING,
+ "The function pkt of the BPF program at index %u returned GK_BPF_PKT_RET_ERROR\n",
+ program_index);
+ goto drop_pkt;
+ default:
+ GK_LOG(WARNING,
+ "The function pkt of the BPF program at index %u returned an invalid return: %" PRIu64 "\n",
+ program_index, bpf_ret);
+ goto drop_pkt;
+ }
+
+ rte_panic("Unexpected condition at %s()", __func__);
+
+expired:
+ reinitialize_flow_entry(fe, now);
+ return gk_process_request(this_co, fe, packet);
+
+drop_pkt:
+ drop_packet_front(pkt, work->instance);
+}
+
+static void
+process_flow_entry(struct gk_co *this_co, struct flow_entry *fe,
+ struct ipacket *packet)
+{
+ /*
+ * Some notes regarding flow rates and units:
+ *
+ * Flows in the GK_REQUEST state are bandwidth limited
+ * to an overall rate relative to the link. Therefore,
+ * the Ethernet frame overhead is counted toward the
+ * credits used by requests. The request channel rate
+ * is measured in megabits (base 10) per second to
+ * match the units used by hardware specifications.
+ *
+ * Granted flows (in state GK_GRANTED or sometimes
+ * GK_BPF) are allocated budgets that are intended
+ * to reflect the max throughput of the flow, and
+ * therefore do not include the Ethernet frame overhead.
+ * The budgets of granted flows are measured in
+ * kibibytes (base 2).
+ */
+ switch (fe->state) {
+ case GK_REQUEST:
+ return gk_process_request(this_co, fe, packet);
+
+ case GK_GRANTED:
+ return gk_process_granted(this_co, fe, packet);
+
+ case GK_DECLINED:
+ return gk_process_declined(this_co, fe, packet);
+
+ case GK_BPF:
+ return gk_process_bpf(this_co, fe, packet);
+
+ default:
+ GK_LOG(ERR, "Unknown flow state: %d\n", fe->state);
+ drop_packet_front(packet->pkt, this_co->work->instance);
+ return;
+ }
+
+ rte_panic("Unexpected condition at %s()\n", __func__);
+}
+
+typedef int (*packet_drop_cb_func)(struct rte_mbuf *pkt,
+ struct gk_instance *instance);
+
+static void
+xmit_icmp(struct gatekeeper_if *iface, struct ipacket *packet,
+ uint16_t *num_pkts, struct rte_mbuf **icmp_bufs,
+ struct gk_instance *instance, packet_drop_cb_func cb_f)
+{
+ struct rte_ether_addr eth_addr_tmp;
+ struct rte_ether_hdr *icmp_eth;
+ struct rte_ipv4_hdr *icmp_ipv4;
+ struct rte_icmp_hdr *icmph;
+ struct rte_mbuf *pkt = packet->pkt;
+ int icmp_pkt_len = iface->l2_len_out + sizeof(struct rte_ipv4_hdr) +
+ sizeof(struct rte_icmp_hdr);
+ if (pkt->data_len >= icmp_pkt_len) {
+ int ret = rte_pktmbuf_trim(pkt, pkt->data_len - icmp_pkt_len);
+ if (ret < 0) {
+ GK_LOG(ERR,
+ "Failed to remove %d bytes of data at the end of the mbuf at %s",
+ pkt->data_len - icmp_pkt_len, __func__);
+ cb_f(pkt, instance);
+ return;
+ }
+
+ icmp_eth = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
+ } else {
+ icmp_eth = (struct rte_ether_hdr *)rte_pktmbuf_append(pkt,
+ icmp_pkt_len - pkt->data_len);
+ if (icmp_eth == NULL) {
+ GK_LOG(ERR,
+ "Failed to append %d bytes of new data: not enough headroom space in the first segment at %s\n",
+ icmp_pkt_len - pkt->data_len, __func__);
+ cb_f(pkt, instance);
+ return;
+ }
+ }
+
+ rte_ether_addr_copy(&icmp_eth->s_addr, ð_addr_tmp);
+ rte_ether_addr_copy(&icmp_eth->d_addr, &icmp_eth->s_addr);
+ rte_ether_addr_copy(ð_addr_tmp, &icmp_eth->d_addr);
+ if (iface->vlan_insert) {
+ fill_vlan_hdr(icmp_eth, iface->vlan_tag_be,
+ RTE_ETHER_TYPE_IPV4);
+ }
+
+ icmp_ipv4 = (struct rte_ipv4_hdr *)pkt_out_skip_l2(iface, icmp_eth);
+ icmp_ipv4->version_ihl = IP_VHL_DEF;
+ icmp_ipv4->type_of_service = 0;
+ icmp_ipv4->packet_id = 0;
+ icmp_ipv4->fragment_offset = IP_DN_FRAGMENT_FLAG;
+ icmp_ipv4->time_to_live = IP_DEFTTL;
+ icmp_ipv4->next_proto_id = IPPROTO_ICMP;
+ icmp_ipv4->src_addr = packet->flow.f.v4.dst.s_addr;
+ icmp_ipv4->dst_addr = packet->flow.f.v4.src.s_addr;
+ icmp_ipv4->total_length = rte_cpu_to_be_16(pkt->data_len -
+ iface->l2_len_out);
+ /*
+ * The IP header checksum field must be set to 0
+ * in order to offload the checksum calculation.
+ */
+ icmp_ipv4->hdr_checksum = 0;
+ pkt->l2_len = iface->l2_len_out;
+ pkt->l3_len = sizeof(struct rte_ipv4_hdr);
+ pkt->ol_flags |= PKT_TX_IPV4 | PKT_TX_IP_CKSUM;
+
+ icmph = (struct rte_icmp_hdr *)&icmp_ipv4[1];
+ icmph->icmp_type = ICMP_TIME_EXCEEDED;
+ icmph->icmp_code = ICMP_EXC_TTL;
+ icmph->icmp_cksum = 0;
+ icmph->icmp_ident = 0;
+ icmph->icmp_seq_nb = 0;
+ icmph->icmp_cksum = icmp_cksum(icmph, sizeof(*icmph));
+
+ icmp_bufs[*num_pkts] = pkt;
+ (*num_pkts)++;
+}
+
+static void
+xmit_icmpv6(struct gatekeeper_if *iface, struct ipacket *packet,
+ uint16_t *num_pkts, struct rte_mbuf **icmp_bufs,
+ struct gk_instance *instance, packet_drop_cb_func cb_f)
+{
+ struct rte_ether_addr eth_addr_tmp;
+ struct rte_ether_hdr *icmp_eth;
+ struct rte_ipv6_hdr *icmp_ipv6;
+ struct icmpv6_hdr *icmpv6_hdr;
+ struct rte_mbuf *pkt = packet->pkt;
+ int icmpv6_pkt_len = iface->l2_len_out + sizeof(struct rte_ipv6_hdr) +
+ sizeof(struct icmpv6_hdr);
+ if (pkt->data_len >= icmpv6_pkt_len) {
+ int ret = rte_pktmbuf_trim(pkt,
+ pkt->data_len - icmpv6_pkt_len);
+ if (ret < 0) {
+ GK_LOG(ERR,
+ "Failed to remove %d bytes of data at the end of the mbuf at %s",
+ pkt->data_len - icmpv6_pkt_len, __func__);
+ cb_f(pkt, instance);
+ return;
+ }
+
+ icmp_eth = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
+ } else {
+ icmp_eth = (struct rte_ether_hdr *)rte_pktmbuf_append(pkt,
+ icmpv6_pkt_len - pkt->data_len);
+ if (icmp_eth == NULL) {
+ GK_LOG(ERR,
+ "Failed to append %d bytes of new data: not enough headroom space in the first segment at %s\n",
+ icmpv6_pkt_len - pkt->data_len, __func__);
+ cb_f(pkt, instance);
+ return;
+ }
+ }
+
+ rte_ether_addr_copy(&icmp_eth->s_addr, ð_addr_tmp);
+ rte_ether_addr_copy(&icmp_eth->d_addr, &icmp_eth->s_addr);
+ rte_ether_addr_copy(ð_addr_tmp, &icmp_eth->d_addr);
+ if (iface->vlan_insert) {
+ fill_vlan_hdr(icmp_eth, iface->vlan_tag_be,
+ RTE_ETHER_TYPE_IPV6);
+ }
+
+ /* Set-up IPv6 header. */
+ icmp_ipv6 = (struct rte_ipv6_hdr *)pkt_out_skip_l2(iface, icmp_eth);
+ icmp_ipv6->vtc_flow = rte_cpu_to_be_32(IPv6_DEFAULT_VTC_FLOW);
+ icmp_ipv6->payload_len = rte_cpu_to_be_16(sizeof(*icmpv6_hdr));
+ icmp_ipv6->proto = IPPROTO_ICMPV6;
+ /*
+ * The IP Hop Limit field must be 255 as required by
+ * RFC 4861, sections 7.1.1 and 7.1.2.
+ */
+ icmp_ipv6->hop_limits = 255;
+ rte_memcpy(icmp_ipv6->src_addr, packet->flow.f.v6.dst.s6_addr,
+ sizeof(icmp_ipv6->src_addr));
+ rte_memcpy(icmp_ipv6->dst_addr, packet->flow.f.v6.src.s6_addr,
+ sizeof(icmp_ipv6->dst_addr));
+
+ /* Set-up ICMPv6 header. */
+ icmpv6_hdr = (struct icmpv6_hdr *)&icmp_ipv6[1];
+ icmpv6_hdr->type = ICMPV6_TIME_EXCEED;
+ icmpv6_hdr->code = ICMPV6_EXC_HOPLIMIT;
+ icmpv6_hdr->cksum = 0; /* Calculated below. */
+
+ icmpv6_hdr->cksum = rte_ipv6_icmpv6_cksum(icmp_ipv6, icmpv6_hdr);
+
+ icmp_bufs[*num_pkts] = pkt;
+ (*num_pkts)++;
+}
+
+/*
+ * For IPv4, according to the RFC 1812 section 5.3.1 Time to Live (TTL),
+ * if the TTL is reduced to zero (or less), the packet MUST be
+ * discarded, and if the destination is not a multicast address the
+ * router MUST send an ICMP Time Exceeded message, Code 0 (TTL Exceeded
+ * in Transit) message to the source.
+ *
+ * For IPv6, according to the RFC 1883 section 4.4,
+ * if the IPv6 Hop Limit is less than or equal to 1, then the router needs to
+ * send an ICMP Time Exceeded -- Hop Limit Exceeded in Transit message to
+ * the Source Address and discard the packet.
+ */
+static int
+update_ip_hop_count(struct gatekeeper_if *iface, struct ipacket *packet,
+ uint16_t *num_pkts, struct rte_mbuf **icmp_bufs,
+ struct token_bucket_ratelimit_state *rs, struct gk_instance *instance,
+ packet_drop_cb_func cb_f)
+{
+ if (packet->flow.proto == RTE_ETHER_TYPE_IPV4) {
+ struct rte_ipv4_hdr *ipv4_hdr = packet->l3_hdr;
+ if (ipv4_hdr->time_to_live <= 1) {
+ if (tb_ratelimit_allow(rs)) {
+ xmit_icmp(iface, packet, num_pkts,
+ icmp_bufs, instance, cb_f);
+ } else
+ cb_f(packet->pkt, instance);
+ return -ETIMEDOUT;
+ }
+
+ --(ipv4_hdr->time_to_live);
+ ++(ipv4_hdr->hdr_checksum);
+ } else if (likely(packet->flow.proto == RTE_ETHER_TYPE_IPV6)) {
+ struct rte_ipv6_hdr *ipv6_hdr = packet->l3_hdr;
+ if (ipv6_hdr->hop_limits <= 1) {
+ if (tb_ratelimit_allow(rs)) {
+ xmit_icmpv6(iface, packet, num_pkts,
+ icmp_bufs, instance, cb_f);
+ } else
+ cb_f(packet->pkt, instance);
+ return -ETIMEDOUT;
+ }
+
+ --(ipv6_hdr->hop_limits);
+ } else {
+ GK_LOG(WARNING,
+ "Unexpected condition at %s: unknown flow type %hu\n",
+ __func__, packet->flow.proto);
+ cb_f(packet->pkt, instance);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void
+forward_pkt_to_back(struct ipacket *packet, struct ether_cache *eth_cache,
+ struct gk_co_work *work)
+{
+ struct rte_mbuf *pkt = packet->pkt;
+ struct gatekeeper_if *front = &work->gk_conf->net->front;
+ struct gatekeeper_if *back = &work->gk_conf->net->back;
+
+ if (adjust_pkt_len(pkt, back, 0) == NULL ||
+ pkt_copy_cached_eth_header(pkt, eth_cache,
+ back->l2_len_out)) {
+ drop_packet_front(pkt, work->instance);
+ return;
+ }
+
+ if (update_ip_hop_count(front, packet,
+ &work->tx_front_num_pkts, work->tx_front_pkts,
+ &work->instance->front_icmp_rs, work->instance,
+ drop_packet_front) < 0)
+ return;
+
+ work->tx_back_pkts[work->tx_back_num_pkts++] = pkt;
+}
+
+static struct gk_fib *
+look_up_fib(struct gk_lpm *ltbl, struct ip_flow *flow)
+{
+ int fib_id;
+
+ if (flow->proto == RTE_ETHER_TYPE_IPV4) {
+ fib_id = lpm_lookup_ipv4(ltbl->lpm, flow->f.v4.dst.s_addr);
+ if (fib_id < 0)
+ return NULL;
+ return <bl->fib_tbl[fib_id];
+ }
+
+ if (likely(flow->proto == RTE_ETHER_TYPE_IPV6)) {
+ fib_id = lpm_lookup_ipv6(ltbl->lpm6, &flow->f.v6.dst);
+ if (fib_id < 0)
+ return NULL;
+ return <bl->fib_tbl6[fib_id];
+ }
+
+ rte_panic("Unexpected condition at %s: unknown flow type %hu\n",
+ __func__, flow->proto);
+
+ return NULL; /* Unreachable. */
+}
+
+static struct flow_entry *
+lookup_fe_from_lpm(struct ipacket *packet, uint32_t ip_flow_hash_val,
+ struct gk_co_work *work)
+{
+ struct rte_mbuf *pkt = packet->pkt;
+
+ /*
+ * A prefetch is not needed here because current deployments of
+ * Gatekeeper servers have only a couple of FIB entries forwarding
+ * traffic from front to back interfaces.
+ */
+ struct gk_fib *fib = look_up_fib(&work->gk_conf->lpm_tbl,
+ &packet->flow);
+
+ if (fib == NULL || fib->action == GK_FWD_NEIGHBOR_FRONT_NET) {
+ struct gk_measurement_metrics *stats =
+ &work->instance->traffic_stats;
+ if (packet->flow.proto == RTE_ETHER_TYPE_IPV4) {
+ stats->tot_pkts_num_distributed++;
+ stats->tot_pkts_size_distributed +=
+ rte_pktmbuf_pkt_len(pkt);
+ add_pkt_acl(&work->front_acl4, pkt);
+ } else if (likely(packet->flow.proto ==
+ RTE_ETHER_TYPE_IPV6)) {
+ stats->tot_pkts_num_distributed++;
+ stats->tot_pkts_size_distributed +=
+ rte_pktmbuf_pkt_len(pkt);
+ add_pkt_acl(&work->front_acl6, pkt);
+ } else {
+ print_flow_err_msg(&packet->flow,
+ "gk: failed to get the fib entry");
+ drop_packet_front(pkt, work->instance);
+ }
+ return NULL;
+ }
+
+ switch (fib->action) {
+ case GK_FWD_GRANTOR: {
+ struct flow_entry *fe = &work->temp_fes[work->temp_fes_num++];
+ initialize_flow_entry(fe, &packet->flow, ip_flow_hash_val, fib);
+ return fe;
+ }
+
+ case GK_FWD_GATEWAY_BACK_NET: {
+ /*
+ * The entry instructs to forward its packets to
+ * the gateway in the back network.
+ */
+ struct ether_cache *eth_cache = fib->u.gateway.eth_cache;
+ RTE_VERIFY(eth_cache != NULL);
+ forward_pkt_to_back(packet, eth_cache, work);
+ return NULL;
+ }
+
+ case GK_FWD_NEIGHBOR_BACK_NET: {
+ /*
+ * The entry instructs to forward its packets to
+ * the neighbor in the back network.
+ */
+ struct ether_cache *eth_cache =
+ (packet->flow.proto == RTE_ETHER_TYPE_IPV4)
+ ? lookup_ether_cache(&fib->u.neigh,
+ &packet->flow.f.v4.dst)
+ : lookup_ether_cache(&fib->u.neigh6,
+ &packet->flow.f.v6.dst);
+ RTE_VERIFY(eth_cache != NULL);
+ forward_pkt_to_back(packet, eth_cache, work);
+ return NULL;
+ }
+
+ case GK_DROP:
+ /* FALLTHROUGH */
+ default:
+ drop_packet_front(pkt, work->instance);
+ return NULL;
+ }
+
+ return NULL;
+}
+
+static void
+prefetch_and_yield(void *addr, void *this_co)
+{
+ rte_prefetch_non_temporal(addr);
+ gk_yield_next(this_co);
+}
+
+static void
+gk_co_process_front_pkt_final(struct gk_co *this_co, struct gk_co_task *task)
+{
+ struct ipacket *packet = task->task_arg;
+ struct gk_co_work *work = this_co->work;
+ uint32_t ip_flow_hash_val = task->task_hash;
+ struct flow_entry *fe_leftover =
+ get_fe_leftover(work, ip_flow_hash_val);
+ struct flow_entry *fe;
+ int ret;
+
+ /* Is leftover useful? */
+ if (fe_leftover != NULL &&
+ fe_leftover->flow_hash_val == ip_flow_hash_val &&
+ ip_flow_cmp_eq(&fe_leftover->flow,
+ &packet->flow, 0) == 0) {
+ /* Jackpot! Deal with @pkt right away. */
+ process_flow_entry(this_co, fe_leftover, packet);
+ return;
+ }
+
+ /* Look up flow entry. */
+ ret = rte_hash_lookup_and_yield_with_hash(
+ work->instance->ip_flow_hash_table, &packet->flow,
+ ip_flow_hash_val, prefetch_and_yield, this_co);
+ if (ret >= 0) {
+ fe = &work->instance->ip_flow_entry_table[ret];
+ /* TODO Break this prefetch into part1 and part2. */
+ prefetch_flow_entry(fe);
+ gk_yield_next(this_co);
+ process_flow_entry(this_co, fe, packet);
+ save_fe_leftover(work, fe);
+ return;
+ }
+ if (unlikely(ret != -ENOENT)) {
+ char err_msg[1024];
+
+ ret = snprintf(err_msg, sizeof(err_msg),
+ "gk: failed to look up flow state at %s with lcore %u: %i\n",
+ __func__, rte_lcore_id(), ret);
+
+ RTE_VERIFY(ret > 0 && ret < (int)sizeof(err_msg));
+ print_flow_err_msg(&packet->flow, err_msg);
+ return;
+ }
+
+ fe = lookup_fe_from_lpm(packet, ip_flow_hash_val, work);
+ if (fe == NULL)
+ return;
+ process_flow_entry(this_co, fe, packet);
+ save_fe_leftover(work, fe);
+}
+
+void
+gk_co_process_front_pkt_software_rss(struct gk_co *this_co,
+ struct gk_co_task *task)
+{
+ struct ipacket *packet = task->task_arg;
+
+ if (parse_front_pkt(this_co, packet, packet->pkt) != 0)
+ return;
+
+ /* Finish up the work with the correct hash value. */
+ task->task_hash = rss_ip_flow_hf(&packet->flow, 0, 0);
+ task->task_func = gk_co_process_front_pkt_final;
+ reschedule_task(this_co, task);
+}
+
+void
+gk_co_process_front_pkt(struct gk_co *this_co, struct gk_co_task *task)
+{
+ struct ipacket packet;
+
+ if (parse_front_pkt(this_co, &packet, task->task_arg) != 0)
+ return;
+ task->task_arg = &packet;
+ gk_co_process_front_pkt_final(this_co, task);
+}
+
+static void
+gk_co_scan_flow_table_final(struct gk_co *this_co, struct gk_co_task *task)
+{
+ struct gk_co_work *work = this_co->work;
+ struct flow_entry *fe = task->task_arg;
+ struct flow_entry **leftover_bucket = get_fe_leftover_bucket(work, fe);
+
+ RTE_VERIFY(work->del_fe == NULL);
+ work->del_fe = fe;
+
+ /* Deal with the leftover. */
+ if (unlikely(*leftover_bucket == fe)) {
+ /* One does not need to look up again. */
+ return;
+ }
+ *leftover_bucket = fe;
+
+ /* Prefetch buckets to remove the flow entry later. */
+ rte_hash_lookup_and_yield_with_hash(work->instance->ip_flow_hash_table,
+ &fe->flow, fe->flow_hash_val, prefetch_and_yield, this_co);
+}
+
+static bool
+is_flow_expired(struct flow_entry *fe, uint64_t now)
+{
+ switch(fe->state) {
+ case GK_REQUEST:
+ if (fe->u.request.last_packet_seen_at > now) {
+ char err_msg[128];
+ int ret = snprintf(err_msg, sizeof(err_msg),
+ "gk: buggy condition at %s: wrong timestamp",
+ __func__);
+ RTE_VERIFY(ret > 0 && ret < (int)sizeof(err_msg));
+ print_flow_err_msg(&fe->flow, err_msg);
+ return true;
+ }
+
+ /*
+ * A request entry is considered expired if it is not
+ * doubling its waiting time. We use +2 instead of +1 in
+ * the test below to account for random delays in the network.
+ */
+ return priority_from_delta_time(now,
+ fe->u.request.last_packet_seen_at) >
+ fe->u.request.last_priority + 2;
+ case GK_GRANTED:
+ return now >= fe->u.granted.cap_expire_at;
+ case GK_DECLINED:
+ return now >= fe->u.declined.expire_at;
+ case GK_BPF:
+ return now >= fe->u.bpf.expire_at;
+ default:
+ return true;
+ }
+}
+
+void
+gk_co_scan_flow_table(struct gk_co *this_co, struct gk_co_task *task)
+{
+ struct flow_entry *fe = task->task_arg;
+
+ /*
+ * Only one prefetch is needed here because one only needs
+ * the beginning of a struct flow_entry to
+ * check if it's expired.
+ */
+ rte_prefetch_non_temporal(fe);
+ gk_yield_next(this_co);
+
+ if (!fe->in_use || !is_flow_expired(fe, rte_rdtsc()))
+ return;
+
+ /* Finish up the work with the correct hash value. */
+ task->task_hash = fe->flow_hash_val;
+ task->task_func = gk_co_scan_flow_table_final;
+ reschedule_task(this_co, task);
+}
+
+static struct gk_co_task *
+next_task(struct gk_co *this_co)
+{
+ while (true) {
+ struct gk_co *next_co;
+
+ /*
+ * This test is marked likely() because, if @this_co has at least
+ * one assigned task, the test is true on at least half of
+ * the calls to this function.
+ */
+ if (likely(!list_empty(&this_co->task_queue))) {
+ /*
+ * @this_co has assigned tasks.
+ * Return the first assigned task.
+ */
+ struct gk_co_task *task = list_first_entry(
+ &this_co->task_queue, struct gk_co_task,
+ task_list);
+ list_del(&task->task_list);
+ return task;
+ }
+
+ /* There are no more tasks assigned to @this_co. */
+
+ next_co = get_next_co(this_co);
+
+ /* Make @this_co idle. */
+ list_del(&this_co->co_list);
+
+ /* Transfer control to another coroutine. */
+ if (likely(this_co != next_co)) {
+ /*
+ * @this_co is NOT the last working coroutine.
+ * Yield to the next coroutine.
+ */
+ coro_transfer(&this_co->coro, &next_co->coro);
+ } else {
+ /*
+ * No more work and no more working coroutines;
+ * @this_co is the last working coroutine.
+ * Return to the main coroutine.
+ */
+ coro_transfer(&this_co->coro,
+ &this_co->work->instance->coro_root);
+ }
+ }
+}
+
+void
+gk_co_main(void *arg)
+{
+ struct gk_co *this_co = arg;
+ struct gk_co_task *task = next_task(this_co);
+
+ while (likely(task != NULL)) {
+ task->task_func(this_co, task);
+ task = next_task(this_co);
+ }
+
+ rte_panic("%s() terminated\n", __func__);
+}
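
Aside (illustration, not part of the patch): `gk_co_main()` and `next_task()` rely on libcoro's symmetric `coro_transfer()` to hop between the working coroutines and back to a root context. A minimal, self-contained sketch of that control flow follows, with two workers standing in for the per-lcore coroutines and a `root` context playing the role of `instance->coro_root`. Build it against libcoro (e.g. with `-DCORO_ASM`, as the Makefile change above does).

```c
#include <stdio.h>
#include <coro.h>

static coro_context root, worker[2];
static struct coro_stack stack[2];

static void
worker_main(void *arg)
{
	long id = (long)arg;

	printf("worker %ld: step 1\n", id);
	/* Yield to the sibling coroutine (round-robin of size 2). */
	coro_transfer(&worker[id], &worker[1 - id]);
	printf("worker %ld: step 2\n", id);
	/* No more work: hand control back to the root context. */
	coro_transfer(&worker[id], &root);
	/* Like gk_co_main(), a worker must never fall off its function. */
	for (;;)
		coro_transfer(&worker[id], &root);
}

int
main(void)
{
	long i;

	coro_create(&root, NULL, NULL, NULL, 0);	/* Empty context. */
	for (i = 0; i < 2; i++) {
		coro_stack_alloc(&stack[i], 0);		/* Default size. */
		coro_create(&worker[i], worker_main, (void *)i,
			stack[i].sptr, stack[i].ssze);
	}

	coro_transfer(&root, &worker[0]);	/* Run until worker 0 returns. */
	coro_transfer(&root, &worker[1]);	/* Let worker 1 finish step 2. */

	for (i = 0; i < 2; i++) {
		coro_destroy(&worker[i]);
		coro_stack_free(&stack[i]);
	}
	coro_destroy(&root);
	return 0;
}
```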
diff --git a/gk/co.h b/gk/co.h
new file mode 100644
index 000000000..6ed27033a
--- /dev/null
+++ b/gk/co.h
@@ -0,0 +1,290 @@
+/*
+ * Gatekeeper - DoS protection system.
+ * Copyright (C) 2016 Digirati LTDA.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _GATEKEEPER_GK_CO_H_
+#define _GATEKEEPER_GK_CO_H_
+
+#include
+#include
+#include
+#include
+
+#include "gatekeeper_gk.h"
+#include "gatekeeper_acl.h"
+
+struct gk_co {
+ /*
+ * Attach this coroutine to work->working_cos while
+ * this coroutine is working.
+ */
+ struct list_head co_list;
+ /* structs from libcoro. */
+ struct coro_stack stack;
+ struct coro_context coro;
+ /* Tasks assigned to this coroutine. */
+ struct list_head task_queue;
+ struct gk_co_work *work;
+};
+
+struct gk_co_task;
+
+typedef void (*gk_co_task_func_t)(struct gk_co *this_co,
+ struct gk_co_task *task);
+
+struct gk_co_task {
+ /*
+ * Once the task is assigned to a coroutine,
+ * attach this task to co->task_queue.
+ */
+ struct list_head task_list;
+ /*
+ * @task_hash is used to assign the task to a coroutine.
+ *
+ * This is important to avoid race conditions between coroutines.
+ * For example, assume that two tasks that are going to work on
+ * the same flow entry are assigned to two different coroutines, and
+ * that the corresponding flow entry is not yet in the flow table;
+ * both coroutines may then try to add the same flow entry.
+ * If these two tasks share the same task hash, they are instead
+ * assigned to the same coroutine, which serializes their execution.
+ */
+ uint32_t task_hash;
+ void *task_arg;
+ gk_co_task_func_t task_func;
+};
+
+struct gk_co_work {
+ /* The coroutines working on the tasks. */
+ struct list_head working_cos;
+ /* Coroutines available to do the work. */
+ struct gk_co *cos;
+ /* Number of coroutines available for the next batch of work. */
+ uint16_t co_num;
+ /* Total number of coroutines available at field @cos. */
+ uint16_t co_max_num;
+ /* Index of the next coroutine to use when a task has no task hash. */
+ uint16_t any_co_index;
+ /* How field @co_num will change for the next batch of work. */
+ int16_t co_delta_num;
+ /*
+ * Previous value of field @co_num.
+ * When the value of this field is zero, an invalid value for @co_num,
+ * the value of field @avg_cycles_per_task is not meaningful.
+ */
+ uint16_t co_prv_num;
+ /*
+ * Average number of cycles per task when @co_num was equal to
+ * @co_prv_num.
+ */
+ double avg_cycles_per_task;
+
+ struct gk_config *gk_conf;
+ struct gk_instance *instance;
+
+ /* All preallocated tasks available to do work. */
+ struct gk_co_task *all_tasks;
+ /* The total number of tasks available at field @all_tasks. */
+ const uint32_t task_total;
+ /* Current number of tasks used at field @all_tasks. */
+ uint32_t task_num;
+
+ /* Fields for front packets and mailbox messages. */
+ /*
+ * This is a single-entry-per-bucket hash table.
+ * Its flow entries are reused between tasks assigned to
+ * the same coroutine.
+ */
+ struct flow_entry ** const leftover;
+ /*
+ * Flow entries that have not been inserted into the flow table, but
+ * may be present in @leftover.
+ */
+ struct flow_entry * const temp_fes;
+ /* Number of entries in use in @temp_fes. */
+ uint16_t temp_fes_num;
+ /*
+ * Mask for the hash table @leftover.
+ * It must be of the form (2^n - 1) for any n >= 0.
+ */
+ const uint32_t leftover_mask;
+ /*
+ * The following fields spare the coroutines from acquiring
+ * a writer lock on the flow table.
+ */
+ /* If not NULL, free this entry in flush_work(). */
+ struct flow_entry *del_fe;
+
+ /* Fields for front and back packets. */
+ uint16_t tx_front_num_pkts;
+ uint16_t tx_back_num_pkts;
+ struct rte_mbuf ** const tx_front_pkts;
+ struct rte_mbuf ** const tx_back_pkts;
+ /*
+ * The following field is only needed when the RSS hash is not
+ * available.
+ */
+ struct ipacket * const packets;
+
+ /* Fields for the front packets only. */
+ uint16_t front_num_req;
+ uint16_t front_num_arp;
+ struct rte_mbuf ** const front_req_bufs;
+ struct rte_mbuf ** const front_arp_bufs;
+ struct acl_search front_acl4;
+ struct acl_search front_acl6;
+ bool front_ipv4_configured;
+ bool front_ipv6_configured;
+
+ /* Fields for the back packets only. */
+ uint16_t back_num_arp;
+ struct rte_mbuf ** const back_arp_bufs;
+ struct acl_search back_acl4;
+ struct acl_search back_acl6;
+};
+
+/* Declare and initialize a struct gk_co_work. */
+#define DEFINE_GK_CO_WORK(name, max_front_pkts, max_back_pkts, \
+ max_mailbox, lo_mask, task_extra) \
+ struct gk_co_task name##_all_tasks_array[(max_front_pkts) + \
+ (max_back_pkts) + (max_mailbox) + (task_extra)]; \
+ struct flow_entry *name##_leftover_array[(lo_mask) + 1]; \
+ struct flow_entry name##_temp_fes_array[ \
+ (max_front_pkts) + (max_mailbox)]; \
+ struct rte_mbuf *name##_tx_front_pkts_array[ \
+ (max_front_pkts) + (max_back_pkts)]; \
+ struct rte_mbuf *name##_tx_back_pkts_array[ \
+ (max_front_pkts) + (max_back_pkts)]; \
+ struct ipacket name##_packets_array[ \
+ (max_front_pkts) + (max_back_pkts)]; \
+ struct rte_mbuf *name##_front_req_bufs_array[(max_front_pkts)]; \
+ struct rte_mbuf *name##_front_arp_bufs_array[(max_front_pkts)]; \
+ DECLARE_ACL_SEARCH_VARIABLE_PART(front_acl4, (max_front_pkts)); \
+ DECLARE_ACL_SEARCH_VARIABLE_PART(front_acl6, (max_front_pkts)); \
+ struct rte_mbuf *name##_back_arp_bufs_array[(max_back_pkts)]; \
+ DECLARE_ACL_SEARCH_VARIABLE_PART(back_acl4, (max_back_pkts)); \
+ DECLARE_ACL_SEARCH_VARIABLE_PART(back_acl6, (max_back_pkts)); \
+ struct gk_co_work name = { \
+ .working_cos = LIST_HEAD_INIT(name.working_cos), \
+ .cos = NULL, \
+ .co_num = 0, \
+ .co_max_num = 0, \
+ .any_co_index = 0, \
+ .co_delta_num = 1, \
+ .co_prv_num = 0, \
+ .avg_cycles_per_task = 0, \
+ .gk_conf = NULL, \
+ .instance = NULL, \
+ .all_tasks = name##_all_tasks_array, \
+ .task_total = (max_front_pkts) + (max_back_pkts) + \
+ (max_mailbox) + (task_extra), \
+ .task_num = 0, \
+ .leftover = memset(name##_leftover_array, 0, \
+ sizeof(name##_leftover_array)), \
+ .temp_fes = name##_temp_fes_array, \
+ .temp_fes_num = 0, \
+ .leftover_mask = (lo_mask), \
+ .del_fe = NULL, \
+ .tx_front_num_pkts = 0, \
+ .tx_back_num_pkts = 0, \
+ .tx_front_pkts = name##_tx_front_pkts_array, \
+ .tx_back_pkts = name##_tx_back_pkts_array, \
+ .packets = name##_packets_array, \
+ .front_num_req = 0, \
+ .front_num_arp = 0, \
+ .front_req_bufs = name##_front_req_bufs_array, \
+ .front_arp_bufs = name##_front_arp_bufs_array, \
+ .front_acl4 = ACL_SEARCH_INIT(front_acl4), \
+ .front_acl6 = ACL_SEARCH_INIT(front_acl6), \
+ .front_ipv4_configured = false, \
+ .front_ipv6_configured = false, \
+ .back_num_arp = 0, \
+ .back_arp_bufs = name##_back_arp_bufs_array, \
+ .back_acl4 = ACL_SEARCH_INIT(back_acl4), \
+ .back_acl6 = ACL_SEARCH_INIT(back_acl6), \
+ }
+
+static inline struct gk_co *
+get_task_owner_co(struct gk_co_work *work, struct gk_co_task *task)
+{
+ return &work->cos[task->task_hash % work->co_num];
+}
+
+static inline void
+__schedule_task(struct gk_co *task_owner_co, struct gk_co_task *task)
+{
+ list_add_tail(&task->task_list, &task_owner_co->task_queue);
+}
+
+static inline void
+schedule_task(struct gk_co_work *work, struct gk_co_task *task)
+{
+ __schedule_task(get_task_owner_co(work, task), task);
+}
+
+/* Uniformly distribute tasks with no task hash among coroutines. */
+static inline void
+schedule_task_to_any_co(struct gk_co_work *work, struct gk_co_task *task)
+{
+ __schedule_task(&work->cos[work->any_co_index], task);
+ work->any_co_index = (work->any_co_index + 1) % work->co_num;
+}
+
+static inline struct flow_entry **
+__get_fe_leftover_bucket(struct gk_co_work *work, uint32_t hash)
+{
+ return &work->leftover[hash & work->leftover_mask];
+}
+
+static inline struct flow_entry **
+get_fe_leftover_bucket(struct gk_co_work *work, struct flow_entry *fe)
+{
+ return __get_fe_leftover_bucket(work, fe->flow_hash_val);
+}
+
+static inline struct flow_entry *
+get_fe_leftover(struct gk_co_work *work, uint32_t hash)
+{
+ return *__get_fe_leftover_bucket(work, hash);
+}
+
+/*
+ * Notice that if the bucket is not empty, that reference will be lost.
+ * That is, the code favors the newer entry over the older entry.
+ */
+static inline void
+save_fe_leftover(struct gk_co_work *work, struct flow_entry *fe)
+{
+ *get_fe_leftover_bucket(work, fe) = fe;
+}
+
+void
+gk_co_main(void *arg);
+
+void
+gk_co_scan_flow_table(struct gk_co *this_co, struct gk_co_task *task);
+
+void
+gk_co_process_front_pkt(struct gk_co *this_co, struct gk_co_task *task);
+void
+gk_co_process_front_pkt_software_rss(struct gk_co *this_co,
+ struct gk_co_task *task);
+
+void
+gk_yield_next(struct gk_co *this_co);
+
+#endif /* _GATEKEEPER_GK_CO_H_ */
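
Aside (illustration, not part of the patch): the scheduling helpers above boil down to a modulo mapping. Tasks that carry a proper @task_hash always land on the same coroutine, so two packets of one flow can never race on the flow entry from two coroutines, while hash-less tasks are spread round-robin via @any_co_index. A small stand-alone sketch of that assignment logic, with hypothetical names and `CO_NUM` standing in for @co_num:

```c
#include <stdint.h>
#include <stdio.h>

#define CO_NUM 4

static unsigned int any_co_index;

/* Tasks with a proper hash: deterministic owner, like get_task_owner_co(). */
static unsigned int
owner_co(uint32_t task_hash)
{
	return task_hash % CO_NUM;
}

/* Tasks without a hash yet (e.g. before software RSS): round-robin. */
static unsigned int
any_co(void)
{
	unsigned int idx = any_co_index;
	any_co_index = (any_co_index + 1) % CO_NUM;
	return idx;
}

int
main(void)
{
	uint32_t flow_hash = 0x9e3779b9;	/* Same flow, two packets. */

	printf("pkt 1 -> co %u\n", owner_co(flow_hash));
	printf("pkt 2 -> co %u\n", owner_co(flow_hash));	/* Same co. */
	printf("unhashed pkt -> co %u\n", any_co());
	return 0;
}
```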
diff --git a/gk/main.c b/gk/main.c
index bce203456..bb38be214 100644
--- a/gk/main.c
+++ b/gk/main.c
@@ -44,14 +44,14 @@
#include "gatekeeper_sol.h"
#include "gatekeeper_flow_bpf.h"
-#include "bpf.h"
-
-#define START_PRIORITY (38)
-/* Set @START_ALLOWANCE as the double size of a large DNS reply. */
-#define START_ALLOWANCE (8)
+#include "co.h"
int gk_logtype;
+/*
+ * TODO A copy of this function is available in gk/co.c,
+ * so drop it when possible.
+ */
/* We should avoid calling integer_log_base_2() with zero. */
static inline uint8_t
integer_log_base_2(uint64_t delta_time)
@@ -63,18 +63,22 @@ integer_log_base_2(uint64_t delta_time)
#endif
}
-/*
- * It converts the difference of time between the current packet and
- * the last seen packet into a given priority.
+/*
+ * TODO A copy of this function is available in gk/co.c,
+ * so drop it when possible.
*/
-static uint8_t
+/*
+ * It converts the difference of time between the current packet and
+ * the last seen packet into a given priority.
+ */
+static uint8_t
priority_from_delta_time(uint64_t present, uint64_t past)
{
uint64_t delta_time;
if (unlikely(present < past)) {
/*
- * This should never happen, but we handle it gracefully here
+ * This should never happen, but we handle it gracefully here
* in order to keep going.
*/
GK_LOG(ERR, "The present time smaller than the past time\n");
@@ -84,10 +88,14 @@ priority_from_delta_time(uint64_t present, uint64_t past)
delta_time = (present - past) * picosec_per_cycle;
if (unlikely(delta_time < 1))
return 0;
-
+
return integer_log_base_2(delta_time);
}
+/*
+ * TODO A copy of this function is available in gk/co.c,
+ * so drop it when possible.
+ */
static struct gk_fib *
look_up_fib(struct gk_lpm *ltbl, struct ip_flow *flow)
{
@@ -113,6 +121,10 @@ look_up_fib(struct gk_lpm *ltbl, struct ip_flow *flow)
return NULL; /* Unreachable. */
}
+/*
+ * TODO A copy of this function is available in gk/co.c,
+ * so drop it when possible.
+ */
static int
extract_packet_info(struct rte_mbuf *pkt, struct ipacket *packet)
{
@@ -180,41 +192,17 @@ extract_packet_info(struct rte_mbuf *pkt, struct ipacket *packet)
return ret;
}
-static inline void
-initialize_flow_entry(struct flow_entry *fe, struct ip_flow *flow,
- uint32_t flow_hash_val, struct gk_fib *grantor_fib)
-{
- /*
- * The flow table is a critical data structure, so,
- * whenever the size of entries grow too much,
- * one must look for alternatives before increasing
- * the limit below.
- */
- RTE_BUILD_BUG_ON(sizeof(*fe) > 128);
-
- rte_memcpy(&fe->flow, flow, sizeof(*flow));
-
- fe->in_use = true;
- fe->flow_hash_val = flow_hash_val;
- fe->state = GK_REQUEST;
- fe->u.request.last_packet_seen_at = rte_rdtsc();
- fe->u.request.last_priority = START_PRIORITY;
- fe->u.request.allowance = START_ALLOWANCE - 1;
- fe->grantor_fib = grantor_fib;
-}
-
-static inline void
-reinitialize_flow_entry(struct flow_entry *fe, uint64_t now)
-{
- fe->state = GK_REQUEST;
- fe->u.request.last_packet_seen_at = now;
- fe->u.request.last_priority = START_PRIORITY;
- fe->u.request.allowance = START_ALLOWANCE - 1;
-}
-
+/*
+ * TODO A copy of this typedef is available in gk/co.c,
+ * so drop it when possible.
+ */
typedef int (*packet_drop_cb_func)(struct rte_mbuf *pkt,
struct gk_instance *instance);
+/*
+ * TODO A copy of this function is available in gk/co.c,
+ * so drop it when possible.
+ */
static int
drop_packet_front(struct rte_mbuf *pkt, struct gk_instance *instance)
{
@@ -257,247 +245,6 @@ pkt_copy_cached_eth_header(struct rte_mbuf *pkt, struct ether_cache *eth_cache,
return stale;
}
-/*
- * When a flow entry is at request state, all the GK block processing
- * that entry does is to:
- * (1) compute the priority of the packet.
- * (2) encapsulate the packet as a request.
- * (3) put this encapsulated packet in the request queue.
- *
- * Returns a negative integer on error, or EINPROGRESS to indicate
- * that the request is being processed by another lcore, and should
- * not be forwarded or dropped on returning from this function.
- */
-static int
-gk_process_request(struct flow_entry *fe, struct ipacket *packet,
- struct rte_mbuf **req_bufs, uint16_t *num_reqs,
- struct sol_config *sol_conf)
-{
- int ret;
- uint64_t now = rte_rdtsc();
- uint8_t priority = priority_from_delta_time(now,
- fe->u.request.last_packet_seen_at);
- struct gk_fib *fib = fe->grantor_fib;
- struct ether_cache *eth_cache;
-
- fe->u.request.last_packet_seen_at = now;
-
- /*
- * The reason for using "<" instead of "<=" is that the equal case
- * means that the source has waited enough time to have the same
- * last priority, so it should be awarded with the allowance.
- */
- if (priority < fe->u.request.last_priority &&
- fe->u.request.allowance > 0) {
- fe->u.request.allowance--;
- priority = fe->u.request.last_priority;
- } else {
- fe->u.request.last_priority = priority;
- fe->u.request.allowance = START_ALLOWANCE - 1;
- }
-
- /*
- * Adjust @priority for the DSCP field.
- * DSCP 0 for legacy packets; 1 for granted packets;
- * 2 for capability renew; 3-63 for requests.
- */
- priority += PRIORITY_REQ_MIN;
- if (unlikely(priority > PRIORITY_MAX))
- priority = PRIORITY_MAX;
-
- /* The assigned priority is @priority. */
-
- /* Encapsulate the packet as a request. */
- ret = encapsulate(packet->pkt, priority,
- &sol_conf->net->back, &fib->u.grantor.gt_addr);
- if (ret < 0)
- return ret;
-
- eth_cache = fib->u.grantor.eth_cache;
- RTE_VERIFY(eth_cache != NULL);
- /* If needed, packet header space was adjusted by encapsulate(). */
- if (pkt_copy_cached_eth_header(packet->pkt, eth_cache,
- sol_conf->net->back.l2_len_out))
- return -1;
-
- req_bufs[*num_reqs] = packet->pkt;
- req_bufs[*num_reqs]->udata64 = priority;
- (*num_reqs)++;
-
- return EINPROGRESS;
-}
-
-/*
- * Returns:
- * * zero on success; the granted packet can be enqueued and forwarded
- * * a negative number on error or when the packet needs to be
- * otherwise dropped because it has exceeded its budget
- * * EINPROGRESS to indicate that the packet is now a request that
- * is being processed by another lcore, and should not
- * be forwarded or dropped on returning from this function.
- */
-static int
-gk_process_granted(struct flow_entry *fe, struct ipacket *packet,
- struct rte_mbuf **req_bufs, uint16_t *num_reqs,
- struct sol_config *sol_conf, struct gk_measurement_metrics *stats)
-{
- int ret;
- bool renew_cap;
- uint8_t priority = PRIORITY_GRANTED;
- uint64_t now = rte_rdtsc();
- struct rte_mbuf *pkt = packet->pkt;
- struct gk_fib *fib = fe->grantor_fib;
- struct ether_cache *eth_cache;
- uint32_t pkt_len;
-
- if (now >= fe->u.granted.cap_expire_at) {
- reinitialize_flow_entry(fe, now);
- return gk_process_request(fe, packet, req_bufs,
- num_reqs, sol_conf);
- }
-
- if (now >= fe->u.granted.budget_renew_at) {
- fe->u.granted.budget_renew_at = now + cycles_per_sec;
- fe->u.granted.budget_byte =
- (uint64_t)fe->u.granted.tx_rate_kib_cycle * 1024;
- }
-
- pkt_len = rte_pktmbuf_pkt_len(pkt);
- if (pkt_len > fe->u.granted.budget_byte) {
- stats->pkts_num_declined++;
- stats->pkts_size_declined += pkt_len;
- return -1;
- }
-
- fe->u.granted.budget_byte -= pkt_len;
- renew_cap = now >= fe->u.granted.send_next_renewal_at;
- if (renew_cap) {
- fe->u.granted.send_next_renewal_at = now +
- fe->u.granted.renewal_step_cycle;
- priority = PRIORITY_RENEW_CAP;
- }
-
- /*
- * Encapsulate packet as a granted packet,
- * mark it as a capability renewal request if @renew_cap is true,
- * enter destination according to @fe->grantor_fib.
- */
- ret = encapsulate(packet->pkt, priority,
- &sol_conf->net->back, &fib->u.grantor.gt_addr);
- if (ret < 0)
- return ret;
-
- eth_cache = fib->u.grantor.eth_cache;
- RTE_VERIFY(eth_cache != NULL);
- /* If needed, packet header space was adjusted by encapsulate(). */
- if (pkt_copy_cached_eth_header(packet->pkt, eth_cache,
- sol_conf->net->back.l2_len_out))
- return -1;
-
- stats->pkts_num_granted++;
- stats->pkts_size_granted += pkt_len;
- return 0;
-}
-
-/*
- * Returns:
- * * a negative number on error or when the packet needs to be
- * otherwise dropped because it is declined
- * * EINPROGRESS to indicate that the packet is now a request that
- * is being processed by another lcore, and should not
- * be forwarded or dropped on returning from this function.
- */
-static int
-gk_process_declined(struct flow_entry *fe, struct ipacket *packet,
- struct rte_mbuf **req_bufs, uint16_t *num_reqs,
- struct sol_config *sol_conf, struct gk_measurement_metrics *stats)
-{
- uint64_t now = rte_rdtsc();
-
- if (unlikely(now >= fe->u.declined.expire_at)) {
- reinitialize_flow_entry(fe, now);
- return gk_process_request(fe, packet, req_bufs,
- num_reqs, sol_conf);
- }
-
- stats->pkts_num_declined++;
- stats->pkts_size_declined += rte_pktmbuf_pkt_len(packet->pkt);
-
- return -1;
-}
-
-/*
- * Returns:
- * * zero on success; the packet can be enqueued and forwarded
- * * a negative number on error or when the packet needs to be
- * otherwise dropped because it has exceeded a limit
- * * EINPROGRESS to indicate that the packet is now a request that
- * is being processed by another lcore, and should not
- * be forwarded or dropped on returning from this function.
- */
-static int
-gk_process_bpf(struct flow_entry *fe, struct ipacket *packet,
- struct rte_mbuf **req_bufs, uint16_t *num_reqs,
- struct gk_config *gk_conf, struct gk_measurement_metrics *stats)
-{
- uint64_t bpf_ret;
- int program_index, rc;
- uint64_t now = rte_rdtsc();
-
- if (unlikely(now >= fe->u.bpf.expire_at))
- goto expired;
-
- program_index = fe->program_index;
- rc = gk_bpf_decide_pkt(gk_conf, program_index, fe, packet, now,
- &bpf_ret);
- if (unlikely(rc != 0)) {
- GK_LOG(WARNING,
- "The BPF program at index %u failed to run its function pkt\n",
- program_index);
- goto expired;
- }
-
- switch (bpf_ret) {
- case GK_BPF_PKT_RET_FORWARD: {
- struct ether_cache *eth_cache =
- fe->grantor_fib->u.grantor.eth_cache;
- RTE_VERIFY(eth_cache != NULL);
- /*
- * If needed, encapsulate() already adjusted
- * packet header space.
- */
- if (pkt_copy_cached_eth_header(packet->pkt, eth_cache,
- gk_conf->net->back.l2_len_out))
- return -1;
-
- stats->pkts_num_granted++;
- stats->pkts_size_granted += rte_pktmbuf_pkt_len(packet->pkt);
- return 0;
- }
- case GK_BPF_PKT_RET_DECLINE:
- stats->pkts_num_declined++;
- stats->pkts_size_declined += rte_pktmbuf_pkt_len(packet->pkt);
- return -1;
- case GK_BPF_PKT_RET_ERROR:
- GK_LOG(WARNING,
- "The function pkt of the BPF program at index %u returned GK_BPF_PKT_RET_ERROR\n",
- program_index);
- return -1;
- default:
- GK_LOG(WARNING,
- "The function pkt of the BPF program at index %u returned an invalid return: %" PRIu64 "\n",
- program_index, bpf_ret);
- return -1;
- }
-
- rte_panic("Unexpected condition at %s()", __func__);
-
-expired:
- reinitialize_flow_entry(fe, now);
- return gk_process_request(fe, packet, req_bufs, num_reqs,
- gk_conf->sol_conf);
-}
-
static int
get_block_idx(struct gk_config *gk_conf, unsigned int lcore_id)
{
@@ -510,6 +257,10 @@ get_block_idx(struct gk_config *gk_conf, unsigned int lcore_id)
return 0;
}
+/*
+ * TODO A copy of this function is available in gk/co.c,
+ * so drop it when possible.
+ */
static bool
is_flow_expired(struct flow_entry *fe, uint64_t now)
{
@@ -545,12 +296,17 @@ is_flow_expired(struct flow_entry *fe, uint64_t now)
}
static int
-gk_del_flow_entry_from_hash(struct rte_hash *h, struct flow_entry *fe)
+gk_del_flow_entry_from_hash(struct gk_instance *instance, struct flow_entry *fe)
{
- int ret = rte_hash_del_key_with_hash(h, &fe->flow, fe->flow_hash_val);
- if (likely(ret >= 0))
+
+ int ret = rte_hash_del_key_with_hash(instance->ip_flow_hash_table,
+ &fe->flow, fe->flow_hash_val);
+ if (likely(ret >= 0)) {
memset(fe, 0, sizeof(*fe));
- else {
+
+ if (instance->num_scan_del > 0)
+ instance->num_scan_del--;
+ } else {
GK_LOG(ERR,
"The GK block failed to delete a key from hash table at %s: %s\n",
__func__, strerror(-ret));
@@ -559,6 +315,56 @@ gk_del_flow_entry_from_hash(struct rte_hash *h, struct flow_entry *fe)
return ret;
}
+static void
+free_cos(struct gk_co *cos, unsigned int num)
+{
+ unsigned int i;
+
+ if (cos == NULL)
+ return;
+
+ for (i = 0; i < num; i++) {
+ struct gk_co *co = &cos[i];
+
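+		/*
+		 * A NULL stack pointer means this entry was never initialized;
+		 * skipping it lets free_cos() clean up an array that
+		 * alloc_cos() only partially filled before failing.
+		 */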
+ if (co->stack.sptr == NULL)
+ continue;
+
+ /* Free @co. */
+ coro_destroy(&co->coro);
+ coro_stack_free(&co->stack);
+ }
+
+ rte_free(cos);
+}
+
+static struct gk_co *
+alloc_cos(unsigned int num, unsigned int stack_size_byte)
+{
+ unsigned int stack_size_ptr = stack_size_byte / sizeof(void *);
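+	/*
+	 * Note: coro_stack_alloc() takes the stack size in units of
+	 * sizeof(void *), not in bytes, hence the conversion above.
+	 */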
+ unsigned int i;
+
+ struct gk_co *cos = rte_calloc(__func__, num, sizeof(*cos), 0);
+ if (cos == NULL)
+ return NULL;
+
+ for (i = 0; i < num; i++) {
+ struct gk_co *co = &cos[i];
+
+ if (unlikely(!coro_stack_alloc(&co->stack, stack_size_ptr))) {
+ free_cos(cos, num);
+ return NULL;
+ }
+
+ coro_create(&co->coro, gk_co_main, co,
+ co->stack.sptr, co->stack.ssze);
+ INIT_LIST_HEAD_WITH_POISON(&co->co_list);
+ INIT_LIST_HEAD(&co->task_queue);
+ co->work = NULL;
+ }
+
+ return cos;
+}
+
static int
setup_gk_instance(unsigned int lcore_id, struct gk_config *gk_conf)
{
@@ -586,7 +392,6 @@ setup_gk_instance(unsigned int lcore_id, struct gk_config *gk_conf)
GK_LOG(ERR,
"The GK block cannot create hash table at lcore %u\n",
lcore_id);
-
ret = -1;
goto out;
}
@@ -600,7 +405,6 @@ setup_gk_instance(unsigned int lcore_id, struct gk_config *gk_conf)
GK_LOG(ERR,
"The GK block can't create flow entry table at lcore %u\n",
lcore_id);
-
ret = -1;
goto flow_hash;
}
@@ -611,6 +415,19 @@ setup_gk_instance(unsigned int lcore_id, struct gk_config *gk_conf)
if (ret < 0)
goto flow_entry;
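+
+	/*
+	 * With coro, arg, sptr, and ssze all zero, coro_create() builds an
+	 * "empty" context that is only used as the initial source for
+	 * coro_transfer() when handing control to the working coroutines.
+	 */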
+ coro_create(&instance->coro_root, NULL, NULL, NULL, 0);
+
+ /* Allocate coroutines. */
+ instance->cos = alloc_cos(gk_conf->co_max_num,
+ gk_conf->co_stack_size_kb * 1024);
+ if (instance->cos == NULL) {
+ GK_LOG(ERR,
+ "The GK block can't allocate coroutines at lcore %u\n",
+ lcore_id);
+ ret = -1;
+ goto coro_root;
+ }
+
tb_ratelimit_state_init(&instance->front_icmp_rs,
gk_conf->front_icmp_msgs_per_sec,
gk_conf->front_icmp_msgs_burst);
@@ -621,6 +438,10 @@ setup_gk_instance(unsigned int lcore_id, struct gk_config *gk_conf)
ret = 0;
goto out;
+coro_root:
+ coro_destroy(&instance->coro_root);
+/*mailbox:*/
+ destroy_mailbox(&instance->mb);
flow_entry:
rte_free(instance->ip_flow_entry_table);
instance->ip_flow_entry_table = NULL;
@@ -730,8 +551,7 @@ flush_flow_table(struct ip_prefix *src,
}
if (matched) {
- gk_del_flow_entry_from_hash(
- instance->ip_flow_hash_table, fe);
+ gk_del_flow_entry_from_hash(instance, fe);
num_flushed_flows++;
}
@@ -872,10 +692,8 @@ gk_synchronize(struct gk_fib *fib, struct gk_instance *instance)
while (index >= 0) {
struct flow_entry *fe =
&instance->ip_flow_entry_table[index];
- if (fe->grantor_fib == fib) {
- gk_del_flow_entry_from_hash(
- instance->ip_flow_hash_table, fe);
- }
+ if (fe->grantor_fib == fib)
+ gk_del_flow_entry_from_hash(instance, fe);
index = rte_hash_iterate(instance->ip_flow_hash_table,
(void *)&key, &data, &next);
@@ -990,6 +808,10 @@ gk_setup_rss(struct gk_config *gk_conf)
return ret;
}
+/*
+ * TODO A copy of this function is available in gk/co.c,
+ * so drop it when possible.
+ */
static void
xmit_icmp(struct gatekeeper_if *iface, struct ipacket *packet,
uint16_t *num_pkts, struct rte_mbuf **icmp_bufs,
@@ -1065,6 +887,10 @@ xmit_icmp(struct gatekeeper_if *iface, struct ipacket *packet,
(*num_pkts)++;
}
+/*
+ * TODO A copy of this function is available in gk/co.c,
+ * so drop it when possible.
+ */
static void
xmit_icmpv6(struct gatekeeper_if *iface, struct ipacket *packet,
uint16_t *num_pkts, struct rte_mbuf **icmp_bufs,
@@ -1136,6 +962,10 @@ xmit_icmpv6(struct gatekeeper_if *iface, struct ipacket *packet,
(*num_pkts)++;
}
+/*
+ * TODO A copy of this function is available in gk/co.c,
+ * so drop it when possible.
+ */
/*
* For IPv4, according to the RFC 1812 section 5.3.1 Time to Live (TTL),
* if the TTL is reduced to zero (or less), the packet MUST be
@@ -1190,26 +1020,6 @@ update_ip_hop_count(struct gatekeeper_if *iface, struct ipacket *packet,
return 0;
}
-/*
- * This function is only to be called on flows that
- * are not backed by a flow entry.
- */
-static void
-send_request_to_grantor(struct ipacket *packet, uint32_t flow_hash_val,
- struct gk_fib *fib, struct rte_mbuf **req_bufs,
- uint16_t *num_reqs, struct gk_instance *instance,
- struct gk_config *gk_conf) {
- int ret;
- struct flow_entry temp_fe;
-
- initialize_flow_entry(&temp_fe, &packet->flow, flow_hash_val, fib);
-
- ret = gk_process_request(&temp_fe, packet, req_bufs,
- num_reqs, gk_conf->sol_conf);
- if (ret < 0)
- drop_packet_front(packet->pkt, instance);
-}
-
static void
lookup_fib_bulk(struct gk_lpm *ltbl, struct ip_flow **flows,
int num_flows, struct gk_fib *fibs[])
@@ -1289,111 +1099,70 @@ lookup_fib6_bulk(struct gk_lpm *ltbl, struct ip_flow **flows,
}
}
-static struct flow_entry *
-lookup_fe_from_lpm(struct ipacket *packet, uint32_t ip_flow_hash_val,
- struct gk_fib *fib, uint16_t *num_tx, struct rte_mbuf **tx_bufs,
+static void
+process_fib(struct ipacket *packet, struct gk_fib *fib,
+ uint16_t *num_tx, struct rte_mbuf **tx_bufs,
struct acl_search *acl4, struct acl_search *acl6,
uint16_t *num_pkts, struct rte_mbuf **icmp_bufs,
- struct rte_mbuf **req_bufs, uint16_t *num_reqs,
struct gatekeeper_if *front, struct gatekeeper_if *back,
- struct gk_instance *instance, struct gk_config *gk_conf) {
+ struct gk_instance *instance) {
struct rte_mbuf *pkt = packet->pkt;
struct ether_cache *eth_cache;
- struct gk_measurement_metrics *stats = &instance->traffic_stats;
-
- if (fib == NULL || fib->action == GK_FWD_NEIGHBOR_FRONT_NET) {
- if (packet->flow.proto == RTE_ETHER_TYPE_IPV4) {
- stats->tot_pkts_num_distributed++;
- stats->tot_pkts_size_distributed +=
- rte_pktmbuf_pkt_len(pkt);
+ if (fib == NULL || fib->action == GK_FWD_NEIGHBOR_BACK_NET) {
+ if (packet->flow.proto == RTE_ETHER_TYPE_IPV4)
add_pkt_acl(acl4, pkt);
- } else if (likely(packet->flow.proto ==
- RTE_ETHER_TYPE_IPV6)) {
- stats->tot_pkts_num_distributed++;
- stats->tot_pkts_size_distributed +=
- rte_pktmbuf_pkt_len(pkt);
-
+ else if (likely(packet->flow.proto ==
+ RTE_ETHER_TYPE_IPV6))
add_pkt_acl(acl6, pkt);
- } else {
+ else {
print_flow_err_msg(&packet->flow,
- "gk: failed to get the fib entry");
- drop_packet_front(pkt, instance);
+ "gk: failed to get the fib entry or it is not an IP packet");
+ drop_packet(pkt);
}
- return NULL;
+ return;
}
switch (fib->action) {
- case GK_FWD_GRANTOR: {
- struct flow_entry *fe;
- int ret = gk_hash_add_flow_entry(
- instance, &packet->flow,
- ip_flow_hash_val, gk_conf);
- if (ret == -ENOSPC) {
- /*
- * There is no room for a new
- * flow entry, but give this
- * flow a chance sending a
- * request to the grantor
- * server.
- */
- send_request_to_grantor(packet, ip_flow_hash_val,
- fib, req_bufs, num_reqs, instance, gk_conf);
- return NULL;
- }
- if (ret < 0) {
- drop_packet_front(pkt, instance);
- return NULL;
- }
-
- fe = &instance->ip_flow_entry_table[ret];
- initialize_flow_entry(fe,
- &packet->flow, ip_flow_hash_val, fib);
- return fe;
- }
-
- case GK_FWD_GATEWAY_BACK_NET: {
+ case GK_FWD_GATEWAY_FRONT_NET: {
/*
* The entry instructs to forward
* its packets to the gateway in
- * the back network, forward accordingly.
+ * the front network, forward accordingly.
*
- * BP block bypasses from the front to the
- * back interface are expected to bypass
- * ranges of IP addresses that should not
- * go through Gatekeeper.
+ * BP bypasses from the back to the front interface
+ * are expected to bypass the outgoing traffic
+ * from the AS to its peers.
*
* Notice that one needs to update
* the Ethernet header.
*/
-
eth_cache = fib->u.gateway.eth_cache;
RTE_VERIFY(eth_cache != NULL);
- if (adjust_pkt_len(pkt, back, 0) == NULL ||
+ if (adjust_pkt_len(pkt, front, 0) == NULL ||
pkt_copy_cached_eth_header(pkt,
eth_cache,
- back->l2_len_out)) {
- drop_packet_front(pkt, instance);
- return NULL;
+ front->l2_len_out)) {
+ drop_packet(pkt);
+ return;
}
- if (update_ip_hop_count(front, packet,
+ if (update_ip_hop_count(back, packet,
num_pkts, icmp_bufs,
- &instance->front_icmp_rs,
- instance,
- drop_packet_front) < 0)
- return NULL;
+ &instance->back_icmp_rs,
+ instance, drop_packet_back) < 0)
+ return;
tx_bufs[(*num_tx)++] = pkt;
- return NULL;
+ break;
}
- case GK_FWD_NEIGHBOR_BACK_NET: {
+ case GK_FWD_NEIGHBOR_FRONT_NET: {
/*
* The entry instructs to forward
* its packets to the neighbor in
- * the back network, forward accordingly.
+ * the front network, forward accordingly.
*/
if (packet->flow.proto == RTE_ETHER_TYPE_IPV4) {
eth_cache = lookup_ether_cache(
@@ -1407,455 +1176,45 @@ lookup_fe_from_lpm(struct ipacket *packet, uint32_t ip_flow_hash_val,
RTE_VERIFY(eth_cache != NULL);
- if (adjust_pkt_len(pkt, back, 0) == NULL ||
+ if (adjust_pkt_len(pkt, front, 0) == NULL ||
pkt_copy_cached_eth_header(pkt,
eth_cache,
- back->l2_len_out)) {
- drop_packet_front(pkt, instance);
- return NULL;
+ front->l2_len_out)) {
+ drop_packet(pkt);
+ return;
}
- if (update_ip_hop_count(front, packet,
+ if (update_ip_hop_count(back, packet,
num_pkts, icmp_bufs,
- &instance->front_icmp_rs,
- instance,
- drop_packet_front) < 0)
- return NULL;
+ &instance->back_icmp_rs,
+ instance, drop_packet_back) < 0)
+ return;
tx_bufs[(*num_tx)++] = pkt;
- return NULL;
+ break;
}
case GK_DROP:
- /* FALLTHROUGH */
+ drop_packet(pkt);
+ break;
+
default:
- drop_packet_front(pkt, instance);
- return NULL;
+ /* All other actions should log a warning. */
+ GK_LOG(WARNING,
+ "The fib entry has an unexpected action %u at %s\n",
+ fib->action, __func__);
+ drop_packet(pkt);
+ break;
}
-
- return NULL;
}
-static int
-process_flow_entry(struct flow_entry *fe, struct ipacket *packet,
- struct rte_mbuf **req_bufs, uint16_t *num_reqs,
- struct gk_config *gk_conf, struct gk_measurement_metrics *stats)
-{
- int ret;
-
- /*
- * Some notes regarding flow rates and units:
- *
- * Flows in the GK_REQUEST state are bandwidth limited
- * to an overall rate relative to the link. Therefore,
- * the Ethernet frame overhead is counted toward the
- * credits used by requests. The request channel rate
- * is measured in megabits (base 10) per second to
- * match the units used by hardware specifications.
- *
- * Granted flows (in state GK_GRANTED or sometimes
- * GK_BPF) are allocated budgets that are intended
- * to reflect the max throughput of the flow, and
- * therefore do not include the Ethernet frame overhead.
- * The budgets of granted flows are measured in
- * kibibytes (base 2).
- */
- switch (fe->state) {
- case GK_REQUEST:
- ret = gk_process_request(fe, packet,
- req_bufs, num_reqs, gk_conf->sol_conf);
- break;
-
- case GK_GRANTED:
- ret = gk_process_granted(fe, packet,
- req_bufs, num_reqs, gk_conf->sol_conf, stats);
- break;
-
- case GK_DECLINED:
- ret = gk_process_declined(fe, packet,
- req_bufs, num_reqs, gk_conf->sol_conf, stats);
- break;
-
- case GK_BPF:
- ret = gk_process_bpf(fe, packet,
- req_bufs, num_reqs, gk_conf, stats);
- break;
-
- default:
- ret = -1;
- GK_LOG(ERR, "Unknown flow state: %d\n", fe->state);
- break;
- }
-
- return ret;
-}
-
-static inline void
-prefetch_flow_entry(struct flow_entry *fe)
-{
-#if RTE_CACHE_LINE_SIZE == 64
- RTE_BUILD_BUG_ON(sizeof(*fe) <= RTE_CACHE_LINE_SIZE);
- RTE_BUILD_BUG_ON(sizeof(*fe) > 2 * RTE_CACHE_LINE_SIZE);
- rte_prefetch0(fe);
- rte_prefetch0(((char *)fe) + RTE_CACHE_LINE_SIZE);
-#elif RTE_CACHE_LINE_SIZE == 128
- RTE_BUILD_BUG_ON(sizeof(*fe) > RTE_CACHE_LINE_SIZE);
- rte_prefetch0(fe);
-#else
-#error "Unsupported cache line size"
-#endif
-}
-
-static void
-parse_packet(struct ipacket *packet, struct rte_mbuf *pkt,
- struct rte_mbuf **arp_bufs, uint16_t *num_arp,
- bool ipv4_configured_front, bool ipv6_configured_front,
- struct ip_flow **flow_arr, uint32_t *flow_hash_val_arr,
- int *num_ip_flows, struct gatekeeper_if *front,
- struct gk_instance *instance)
-{
- int ret;
- struct gk_measurement_metrics *stats = &instance->traffic_stats;
-
- stats->tot_pkts_size += rte_pktmbuf_pkt_len(pkt);
-
- ret = extract_packet_info(pkt, packet);
- if (ret < 0) {
- if (likely(packet->flow.proto == RTE_ETHER_TYPE_ARP)) {
- stats->tot_pkts_num_distributed++;
- stats->tot_pkts_size_distributed +=
- rte_pktmbuf_pkt_len(pkt);
-
- arp_bufs[(*num_arp)++] = pkt;
- return;
- }
-
- /* Drop non-IP and non-ARP packets. */
- drop_packet_front(pkt, instance);
- return;
- }
-
- if (unlikely((packet->flow.proto == RTE_ETHER_TYPE_IPV4 &&
- !ipv4_configured_front) ||
- (packet->flow.proto == RTE_ETHER_TYPE_IPV6 &&
- !ipv6_configured_front))) {
- drop_packet_front(pkt, instance);
- return;
- }
-
- flow_arr[*num_ip_flows] = &packet->flow;
- flow_hash_val_arr[*num_ip_flows] = likely(front->rss) ?
- pkt->hash.rss : rss_ip_flow_hf(&packet->flow, 0, 0);
- (*num_ip_flows)++;
-}
-
-#define PREFETCH_OFFSET (4)
-
-/* Process the packets on the front interface. */
-static void
-process_pkts_front(uint16_t port_front, uint16_t rx_queue_front,
- unsigned int lcore,
- uint16_t *tx_front_num_pkts, struct rte_mbuf **tx_front_pkts,
- uint16_t *tx_back_num_pkts, struct rte_mbuf **tx_back_pkts,
- struct gk_instance *instance, struct gk_config *gk_conf)
-{
- int i;
- int done_lookups;
- int ret;
- uint16_t num_rx;
- uint16_t num_arp = 0;
- uint16_t num_reqs = 0;
- uint16_t front_max_pkt_burst = gk_conf->front_max_pkt_burst;
- struct rte_mbuf *rx_bufs[front_max_pkt_burst];
- struct rte_mbuf *arp_bufs[front_max_pkt_burst];
- struct rte_mbuf *req_bufs[front_max_pkt_burst];
- DEFINE_ACL_SEARCH(acl4, front_max_pkt_burst);
- DEFINE_ACL_SEARCH(acl6, front_max_pkt_burst);
- struct gatekeeper_if *front = &gk_conf->net->front;
- struct gatekeeper_if *back = &gk_conf->net->back;
- struct gk_measurement_metrics *stats = &instance->traffic_stats;
- bool ipv4_configured_front = ipv4_if_configured(&gk_conf->net->front);
- bool ipv6_configured_front = ipv6_if_configured(&gk_conf->net->front);
- int num_ip_flows = 0;
- struct ipacket pkt_arr[front_max_pkt_burst];
- struct ip_flow *flow_arr[front_max_pkt_burst];
- uint32_t flow_hash_val_arr[front_max_pkt_burst];
- int num_lpm_lookups = 0;
- int num_lpm6_lookups = 0;
- struct ip_flow *flows[front_max_pkt_burst];
- struct ip_flow *flows6[front_max_pkt_burst];
- int32_t lpm_lookup_pos[front_max_pkt_burst];
- int32_t lpm6_lookup_pos[front_max_pkt_burst];
- int32_t pos_arr[front_max_pkt_burst];
- struct gk_fib *fibs[front_max_pkt_burst];
- struct gk_fib *fibs6[front_max_pkt_burst];
- struct flow_entry *fe_arr[front_max_pkt_burst];
-
- /* Load a set of packets from the front NIC. */
- num_rx = rte_eth_rx_burst(port_front, rx_queue_front, rx_bufs,
- front_max_pkt_burst);
-
- if (unlikely(num_rx == 0))
- return;
-
- stats->tot_pkts_num += num_rx;
-
- /*
- * This prefetch is enough to load Ethernet header (14 bytes),
- * optional Ethernet VLAN header (8 bytes), and either
- * an IPv4 header without options (20 bytes), or
- * an IPv6 header without options (40 bytes).
- * IPv4: 14 + 8 + 20 = 42
- * IPv6: 14 + 8 + 40 = 62
- */
- for (i = 0; i < PREFETCH_OFFSET && i < num_rx; i++)
- rte_prefetch0(rte_pktmbuf_mtod_offset(rx_bufs[i], void *, 0));
-
- /* Extract packet and flow information. */
- for (i = 0; i < (num_rx - PREFETCH_OFFSET); i++) {
- rte_prefetch0(rte_pktmbuf_mtod_offset(
- rx_bufs[i + PREFETCH_OFFSET], void *, 0));
-
- parse_packet(&pkt_arr[num_ip_flows], rx_bufs[i], arp_bufs,
- &num_arp, ipv4_configured_front, ipv6_configured_front,
- flow_arr, flow_hash_val_arr, &num_ip_flows, front,
- instance);
- }
-
- /* Extract the rest packet and flow information. */
- for (; i < num_rx; i++) {
- parse_packet(&pkt_arr[num_ip_flows], rx_bufs[i], arp_bufs,
- &num_arp, ipv4_configured_front, ipv6_configured_front,
- flow_arr, flow_hash_val_arr, &num_ip_flows, front,
- instance);
- }
-
- done_lookups = 0;
- while (done_lookups < num_ip_flows) {
- uint32_t num_keys = num_ip_flows - done_lookups;
- if (num_keys > RTE_HASH_LOOKUP_BULK_MAX)
- num_keys = RTE_HASH_LOOKUP_BULK_MAX;
-
- ret = rte_hash_lookup_bulk_with_hash(
- instance->ip_flow_hash_table,
- (const void **)&flow_arr[done_lookups],
- (hash_sig_t *)&flow_hash_val_arr[done_lookups],
- num_keys, &pos_arr[done_lookups]);
- if (ret != 0) {
- GK_LOG(NOTICE,
- "failed to find multiple keys in the hash table at lcore %u\n",
- rte_lcore_id());
- }
-
- done_lookups += num_keys;
- }
-
- for (i = 0; i < num_ip_flows; i++) {
- if (pos_arr[i] >= 0) {
- fe_arr[i] = &instance->ip_flow_entry_table[pos_arr[i]];
-
- prefetch_flow_entry(fe_arr[i]);
- } else {
- fe_arr[i] = NULL;
- if (flow_arr[i]->proto == RTE_ETHER_TYPE_IPV4) {
- lpm_lookup_pos[num_lpm_lookups] = i;
- flows[num_lpm_lookups] = flow_arr[i];
- num_lpm_lookups++;
- } else {
- lpm6_lookup_pos[num_lpm6_lookups] = i;
- flows6[num_lpm6_lookups] = flow_arr[i];
- num_lpm6_lookups++;
- }
- }
- }
-
- /* The remaining flows need LPM lookups. */
- lookup_fib_bulk(&gk_conf->lpm_tbl, flows, num_lpm_lookups, fibs);
- lookup_fib6_bulk(&gk_conf->lpm_tbl, flows6, num_lpm6_lookups, fibs6);
-
- for (i = 0; i < num_lpm_lookups; i++) {
- int fidx = lpm_lookup_pos[i];
-
- fe_arr[fidx] = lookup_fe_from_lpm(&pkt_arr[fidx],
- flow_hash_val_arr[fidx], fibs[i],
- tx_back_num_pkts, tx_back_pkts, &acl4, &acl6,
- tx_front_num_pkts, tx_front_pkts, req_bufs,
- &num_reqs, front, back, instance, gk_conf);
- }
-
- for (i = 0; i < num_lpm6_lookups; i++) {
- int fidx = lpm6_lookup_pos[i];
-
- fe_arr[fidx] = lookup_fe_from_lpm(&pkt_arr[fidx],
- flow_hash_val_arr[fidx], fibs6[i],
- tx_back_num_pkts, tx_back_pkts, &acl4, &acl6,
- tx_front_num_pkts, tx_front_pkts, req_bufs,
- &num_reqs, front, back, instance, gk_conf);
- }
-
- for (i = 0; i < num_ip_flows; i++) {
- if (fe_arr[i] == NULL)
- continue;
-
- ret = process_flow_entry(fe_arr[i], &pkt_arr[i], req_bufs,
- &num_reqs, gk_conf, stats);
- if (ret < 0)
- drop_packet_front(pkt_arr[i].pkt, instance);
- else if (ret == EINPROGRESS) {
- /* Request will be serviced by another lcore. */
- continue;
- } else if (likely(ret == 0))
- tx_back_pkts[(*tx_back_num_pkts)++] = pkt_arr[i].pkt;
- else
- rte_panic("Invalid return value (%d) from processing a packet in a flow with state %d",
- ret, fe_arr[i]->state);
- }
-
- if (num_reqs > 0) {
- uint64_t acc_size_request[num_reqs + 1];
-
- acc_size_request[0] = 0;
- for (i = 1; i <= num_reqs; i++) {
- acc_size_request[i] = acc_size_request[i - 1] +
- rte_pktmbuf_pkt_len(req_bufs[i - 1]);
- }
-
- ret = RTE_MAX(gk_solicitor_enqueue_bulk(gk_conf->sol_conf,
- req_bufs, num_reqs), 0);
- if (ret < num_reqs) {
- for (i = ret; i < num_reqs; i++)
- drop_packet_front(req_bufs[i], instance);
- }
-
- stats->pkts_num_request += ret;
- stats->pkts_size_request += acc_size_request[ret];
- }
-
- if (num_arp > 0)
- submit_arp(arp_bufs, num_arp, &gk_conf->net->front);
-
- process_pkts_acl(&gk_conf->net->front,
- lcore, &acl4, RTE_ETHER_TYPE_IPV4);
- process_pkts_acl(&gk_conf->net->front,
- lcore, &acl6, RTE_ETHER_TYPE_IPV6);
-}
-
-static void
-process_fib(struct ipacket *packet, struct gk_fib *fib,
- uint16_t *num_tx, struct rte_mbuf **tx_bufs,
- struct acl_search *acl4, struct acl_search *acl6,
- uint16_t *num_pkts, struct rte_mbuf **icmp_bufs,
- struct gatekeeper_if *front, struct gatekeeper_if *back,
- struct gk_instance *instance) {
- struct rte_mbuf *pkt = packet->pkt;
- struct ether_cache *eth_cache;
-
- if (fib == NULL || fib->action == GK_FWD_NEIGHBOR_BACK_NET) {
- if (packet->flow.proto == RTE_ETHER_TYPE_IPV4)
- add_pkt_acl(acl4, pkt);
- else if (likely(packet->flow.proto ==
- RTE_ETHER_TYPE_IPV6))
- add_pkt_acl(acl6, pkt);
- else {
- print_flow_err_msg(&packet->flow,
- "gk: failed to get the fib entry or it is not an IP packet");
- drop_packet(pkt);
- }
- return;
- }
-
- switch (fib->action) {
- case GK_FWD_GATEWAY_FRONT_NET: {
- /*
- * The entry instructs to forward
- * its packets to the gateway in
- * the front network, forward accordingly.
- *
- * BP bypasses from the back to the front interface
- * are expected to bypass the outgoing traffic
- * from the AS to its peers.
- *
- * Notice that one needs to update
- * the Ethernet header.
- */
- eth_cache = fib->u.gateway.eth_cache;
- RTE_VERIFY(eth_cache != NULL);
-
- if (adjust_pkt_len(pkt, front, 0) == NULL ||
- pkt_copy_cached_eth_header(pkt,
- eth_cache,
- front->l2_len_out)) {
- drop_packet(pkt);
- return;
- }
-
- if (update_ip_hop_count(back, packet,
- num_pkts, icmp_bufs,
- &instance->back_icmp_rs,
- instance, drop_packet_back) < 0)
- return;
-
- tx_bufs[(*num_tx)++] = pkt;
- break;
- }
-
- case GK_FWD_NEIGHBOR_FRONT_NET: {
- /*
- * The entry instructs to forward
- * its packets to the neighbor in
- * the front network, forward accordingly.
- */
- if (packet->flow.proto == RTE_ETHER_TYPE_IPV4) {
- eth_cache = lookup_ether_cache(
- &fib->u.neigh,
- &packet->flow.f.v4.dst);
- } else {
- eth_cache = lookup_ether_cache(
- &fib->u.neigh6,
- &packet->flow.f.v6.dst);
- }
-
- RTE_VERIFY(eth_cache != NULL);
-
- if (adjust_pkt_len(pkt, front, 0) == NULL ||
- pkt_copy_cached_eth_header(pkt,
- eth_cache,
- front->l2_len_out)) {
- drop_packet(pkt);
- return;
- }
-
- if (update_ip_hop_count(back, packet,
- num_pkts, icmp_bufs,
- &instance->back_icmp_rs,
- instance, drop_packet_back) < 0)
- return;
-
- tx_bufs[(*num_tx)++] = pkt;
- break;
- }
-
- case GK_DROP:
- drop_packet(pkt);
- break;
-
- default:
- /* All other actions should log a warning. */
- GK_LOG(WARNING,
- "The fib entry has an unexpected action %u at %s\n",
- fib->action, __func__);
- drop_packet(pkt);
- break;
- }
-}
-
-/* Process the packets on the back interface. */
-static void
-process_pkts_back(uint16_t port_back, uint16_t rx_queue_back,
- unsigned int lcore,
- uint16_t *tx_front_num_pkts, struct rte_mbuf **tx_front_pkts,
- uint16_t *tx_back_num_pkts, struct rte_mbuf **tx_back_pkts,
- struct gk_instance *instance, struct gk_config *gk_conf)
+/* Process the packets on the back interface. */
+static void
+process_pkts_back(uint16_t port_back, uint16_t rx_queue_back,
+ unsigned int lcore,
+ uint16_t *tx_front_num_pkts, struct rte_mbuf **tx_front_pkts,
+ uint16_t *tx_back_num_pkts, struct rte_mbuf **tx_back_pkts,
+ struct gk_instance *instance, struct gk_config *gk_conf)
{
int i;
int ret;
@@ -2153,6 +1512,340 @@ process_cmds_from_mailbox(
mb_free_entry_bulk(&instance->mb, (void * const *)gk_cmds, num_cmd);
}
+static void
+populate_front_tasks(struct gk_co_work *work,
+ uint16_t port_front, uint16_t rx_queue_front)
+{
+ uint16_t front_max_pkt_burst = work->gk_conf->front_max_pkt_burst;
+ struct rte_mbuf *rx_bufs[front_max_pkt_burst];
+ /* Load a set of packets from the front NIC. */
+ uint16_t num_rx = rte_eth_rx_burst(port_front, rx_queue_front, rx_bufs,
+ front_max_pkt_burst);
+ struct gk_measurement_metrics *stats;
+ bool has_rss;
+ int i;
+
+ if (unlikely(num_rx == 0))
+ return;
+
+ stats = &work->instance->traffic_stats;
+ stats->tot_pkts_num += num_rx;
+
+ has_rss = work->gk_conf->net->front.rss;
+ for (i = 0; i < num_rx; i++) {
+ struct gk_co_task *task = &work->all_tasks[work->task_num++];
+ struct rte_mbuf *pkt = rx_bufs[i];
+
+ stats->tot_pkts_size += rte_pktmbuf_pkt_len(pkt);
+
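+		/*
+		 * With hardware RSS, the flow hash is already available, so
+		 * the task can be scheduled to a coroutine by hash right away.
+		 * Without it, the hash must be computed in software inside a
+		 * coroutine, so the task is first handed to an arbitrary
+		 * coroutine and rescheduled once its hash is known (see the
+		 * caveat below).
+		 */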
+ if (likely(has_rss)) {
+ task->task_hash = pkt->hash.rss;
+ task->task_arg = pkt;
+ task->task_func = gk_co_process_front_pkt;
+ schedule_task(work, task);
+ } else {
+ struct ipacket *packet = &work->packets[i];
+ /*
+ * There is a chance that packets on the same flow
+ * are brought out of order. For example, consider that
+			 * (1) three packets arrive in the following order:
+			 *	pkt1, pkt2, pkt3;
+			 * (2) there are only two coroutines doing the work;
+			 * (3) the packets are mapped to
+			 *	the coroutines as follows:
+			 *	* pkt1 and pkt2 go to coroutine 1,
+			 *	* pkt3 goes to coroutine 2;
+			 * (4) packets pkt2 and pkt3 belong to the same flow.
+			 *
+			 * Packets pkt1 and pkt3 are processed in parallel,
+			 * receive their correct hashes, and are rescheduled.
+			 * Once pkt2 is rescheduled, it is going to be placed
+			 * after pkt3 in the task queue of
+			 * the assigned coroutine, that is, pkt3 is going to
+			 * be sent out before pkt2 (inverted order).
+ */
+ task->task_hash = 0; /* Dummy hash. */
+ /*
+ * Passing @packet instead of just @pkt so @packet
+ * can be carried over once the task is rescheduled.
+ */
+ packet->pkt = pkt;
+ task->task_arg = packet;
+ task->task_func = gk_co_process_front_pkt_software_rss;
+ schedule_task_to_any_co(work, task);
+ }
+ }
+}
+
+static void
+add_cos_to_work(struct gk_co_work *work, struct gk_config *gk_conf,
+ struct gk_instance *instance)
+{
+ unsigned int i;
+
+ work->gk_conf = gk_conf;
+ work->instance = instance;
+ work->cos = instance->cos;
+ work->co_max_num = gk_conf->co_max_num;
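+	/*
+	 * Start with two coroutines (or one, if only one is configured);
+	 * do_work() adapts this number at runtime from the measured
+	 * average cycles per task.
+	 */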
+ work->co_num = RTE_MIN(2, work->co_max_num);
+ work->front_ipv4_configured = ipv4_if_configured(&gk_conf->net->front);
+ work->front_ipv6_configured = ipv6_if_configured(&gk_conf->net->front);
+
+ RTE_VERIFY(work->co_num > 0);
+
+ for (i = 0; i < work->co_max_num; i++)
+ work->cos[i].work = work;
+}
+
+static void
+update_cos(struct gk_co_work *work)
+{
+ /*
+	 * The local variable @co_num is needed here so that the arithmetic
+	 * below can temporarily go above @work->co_max_num and below zero.
+ */
+ int32_t co_num = work->co_num;
+
+ if (work->co_delta_num > 0) {
+ /* @work->co_num is going up. */
+
+ if (unlikely(co_num >= work->co_max_num)) {
+ /*
+ * @work->co_num is at its maximum;
+ * Reverse direction.
+ */
+ RTE_VERIFY(co_num == work->co_max_num);
+ work->co_delta_num = - work->co_delta_num;
+ work->co_num = RTE_MAX(1, co_num + work->co_delta_num);
+ return;
+ }
+
+ work->co_num = RTE_MIN(work->co_max_num,
+ co_num + work->co_delta_num);
+ return;
+ }
+
+ /* @work->co_num is going down. */
+ RTE_VERIFY(work->co_delta_num < 0);
+
+ if (unlikely(co_num <= 1)) {
+ /* @work->co_num is at its minimum; reverse direction. */
+ RTE_VERIFY(co_num == 1);
+ work->co_delta_num = - work->co_delta_num;
+ work->co_num = RTE_MIN(work->co_max_num,
+ co_num + work->co_delta_num);
+ return;
+ }
+
+ work->co_num = RTE_MAX(1, co_num + work->co_delta_num);
+}
+
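+/*
+ * do_work() below and update_cos() above implement a simple hill-climbing
+ * adjustment of the number of active coroutines: after each batch, the
+ * measured average cycles per task is compared against the previous batch;
+ * a change that improved the average is kept and another step is taken in
+ * the same direction, otherwise the previous count is restored and the
+ * direction is reversed. update_cos() also reverses direction at 1 and at
+ * @co_max_num. Illustrative walk, assuming co_max_num = 4, an initial delta
+ * of +1, and that every step improves: 2, 3, 4, reverse, 3, 2, 1, reverse...
+ */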
+static void
+do_work(struct gk_co_work *work)
+{
+ uint16_t i, real_co_num = 0;
+ uint64_t cycles;
+ double avg_cycles_per_task;
+
+ /* Add coroutines with tasks to @work->working_cos. */
+ for (i = 0; i < work->co_num; i++) {
+ struct gk_co *co = &work->cos[i];
+ if (!list_empty(&co->task_queue)) {
+ list_add_tail(&co->co_list, &work->working_cos);
+ real_co_num++;
+ }
+ }
+
+ /* Is there any work to do? */
+ if (unlikely(list_empty(&work->working_cos))) {
+ RTE_VERIFY(real_co_num == 0);
+ RTE_VERIFY(work->task_num == 0);
+ return;
+ }
+ RTE_VERIFY(real_co_num > 0);
+ RTE_VERIFY(work->task_num > 0);
+
+ /* Do work. */
+ cycles = rte_rdtsc();
+ coro_transfer(&work->instance->coro_root,
+ &list_first_entry(&work->working_cos, struct gk_co, co_list)->
+ coro);
+ cycles = rte_rdtsc() - cycles;
+ avg_cycles_per_task = (double)cycles / work->task_num;
+
+ if (work->co_num != real_co_num) {
+ /* Workload changed; adjust quickly. */
+ RTE_VERIFY(work->co_num > real_co_num);
+ work->co_prv_num = real_co_num;
+ work->avg_cycles_per_task = avg_cycles_per_task;
+ work->co_num = real_co_num;
+ return update_cos(work);
+ }
+
+ if (work->co_prv_num == 0) {
+ /* Initialize the performance tracking fields. */
+ work->co_prv_num = real_co_num;
+ work->avg_cycles_per_task = avg_cycles_per_task;
+ return update_cos(work);
+ }
+
+ if (avg_cycles_per_task >= work->avg_cycles_per_task) {
+ /* The last change did not bring an improvement; go back. */
+ work->co_num = work->co_prv_num;
+ /* Reset measurement. */
+ work->co_prv_num = 0;
+ /* Change adjustment direction. */
+ work->co_delta_num = - work->co_delta_num;
+ return;
+ }
+
+ /* @real_co_num is an improvement. */
+ work->co_prv_num = real_co_num;
+ work->avg_cycles_per_task = avg_cycles_per_task;
+ update_cos(work);
+}
+
+static void
+flush_work(struct gk_co_work *work,
+ uint16_t port_front, uint16_t tx_queue_front,
+ uint16_t port_back, uint16_t tx_queue_back,
+ unsigned int lcore)
+{
+ struct gk_instance *instance = work->instance;
+
+ uint16_t front_max_pkt_burst = work->gk_conf->front_max_pkt_burst;
+ uint16_t back_max_pkt_burst = work->gk_conf->back_max_pkt_burst;
+ uint32_t max_pkt_burst = front_max_pkt_burst + back_max_pkt_burst;
+ struct gatekeeper_if *front = &work->gk_conf->net->front;
+
+ /*
+ * Flush packets.
+ */
+
+ send_pkts(port_front, tx_queue_front,
+ work->tx_front_num_pkts, work->tx_front_pkts);
+ RTE_VERIFY(work->tx_front_num_pkts <= max_pkt_burst);
+ work->tx_front_num_pkts = 0;
+
+ send_pkts(port_back, tx_queue_back,
+ work->tx_back_num_pkts, work->tx_back_pkts);
+ RTE_VERIFY(work->tx_back_num_pkts <= max_pkt_burst);
+ work->tx_back_num_pkts = 0;
+
+ /*
+ * Flush front.
+ */
+
+ if (work->front_num_req > 0) {
+ uint16_t num_req = work->front_num_req;
+ uint64_t acc_size_request[num_req + 1];
+ struct gk_measurement_metrics *stats = &instance->traffic_stats;
+ int i, ret;
+
+ /*
+ * The byte length of the packets must be computed before
+ * calling gk_solicitor_enqueue_bulk() because after it
+ * the GK block no longer owns the packets.
+ */
+ acc_size_request[0] = 0;
+ for (i = 1; i <= num_req; i++) {
+ acc_size_request[i] = acc_size_request[i - 1] +
+ rte_pktmbuf_pkt_len(
+ work->front_req_bufs[i - 1]
+ );
+ }
+
+ ret = RTE_MAX(
+ gk_solicitor_enqueue_bulk(work->gk_conf->sol_conf,
+ work->front_req_bufs, num_req),
+ 0);
+
+ stats->pkts_num_request += ret;
+ stats->pkts_size_request += acc_size_request[ret];
+
+ for (i = ret; i < num_req; i++)
+ drop_packet_front(work->front_req_bufs[i], instance);
+
+ RTE_VERIFY(num_req <= front_max_pkt_burst);
+ work->front_num_req = 0;
+ }
+
+ if (work->front_num_arp > 0) {
+ submit_arp(work->front_arp_bufs, work->front_num_arp, front);
+ RTE_VERIFY(work->front_num_arp <= front_max_pkt_burst);
+ work->front_num_arp = 0;
+ }
+
+ RTE_VERIFY(work->front_acl4.num <= front_max_pkt_burst);
+ RTE_VERIFY(work->front_acl6.num <= front_max_pkt_burst);
+ process_pkts_acl(front, lcore, &work->front_acl4, RTE_ETHER_TYPE_IPV4);
+ process_pkts_acl(front, lcore, &work->front_acl6, RTE_ETHER_TYPE_IPV6);
+
+ /*
+ * TODO Flush back.
+ */
+
+ /*
+ * Update flow table.
+ */
+
+ if (work->del_fe != NULL) {
+ RTE_VERIFY(work->del_fe->in_use);
+ /*
+		 * Test that the flow entry is expired once more because it
+		 * may have been updated between being found expired and
+		 * reaching this point.
+ */
+ if (likely(is_flow_expired(work->del_fe, rte_rdtsc())))
+ gk_del_flow_entry_from_hash(instance, work->del_fe);
+ work->del_fe = NULL;
+ }
+
+ /*
+	 * Adding new entries to the flow table should be one of the last
+	 * steps because, when the flow table is full,
+	 * rte_hash_cuckoo_make_space_mw() is going to be called, and that
+	 * function disrupts the cache of the running core.
+ * rte_hash_cuckoo_make_space_mw() may access up to 1000 buckets and,
+ * on 64-bit platforms, consumes about 32KB of execution stack.
+ */
+ if (work->temp_fes_num > 0) {
+ unsigned int i;
+ for (i = 0; i < work->temp_fes_num; i++) {
+ struct flow_entry *temp_fe = &work->temp_fes[i];
+ struct flow_entry *fe;
+ int ret = gk_hash_add_flow_entry(instance,
+ &temp_fe->flow, temp_fe->flow_hash_val,
+ work->gk_conf);
+ if (ret == -ENOSPC) {
+ /* Flow table is full. */
+ break;
+ }
+ if (unlikely(ret < 0)) {
+ GK_LOG(ERR,
+					"Failed to add a flow entry, ret=%i\n",
+ ret);
+ continue;
+ }
+ fe = &instance->ip_flow_entry_table[ret];
+ rte_memcpy(fe, temp_fe, sizeof(*fe));
+ }
+ RTE_VERIFY(work->temp_fes_num <= (front_max_pkt_burst +
+ work->gk_conf->mailbox_burst_size));
+ work->temp_fes_num = 0;
+ }
+
+ /*
+ * Reset fields of @work.
+ */
+
+ RTE_VERIFY(work->task_num <= work->task_total);
+ work->task_num = 0;
+ work->any_co_index = 0;
+ memset(work->leftover, 0,
+ sizeof(*work->leftover) * (work->leftover_mask + 1));
+}
+
static int
gk_proc(void *arg)
{
@@ -2168,13 +1861,6 @@ gk_proc(void *arg)
uint16_t rx_queue_back = instance->rx_queue_back;
uint16_t tx_queue_back = instance->tx_queue_back;
- uint16_t tx_front_num_pkts;
- uint16_t tx_back_num_pkts;
- uint16_t tx_max_num_pkts = gk_conf->front_max_pkt_burst +
- gk_conf->back_max_pkt_burst;
- struct rte_mbuf *tx_front_pkts[tx_max_num_pkts];
- struct rte_mbuf *tx_back_pkts[tx_max_num_pkts];
-
uint32_t entry_idx = 0;
uint64_t last_measure_tsc = rte_rdtsc();
uint64_t basic_measurement_logging_cycles =
@@ -2183,64 +1869,58 @@ gk_proc(void *arg)
uint32_t scan_iter = gk_conf->flow_table_scan_iter;
uint32_t iter_count = 0;
+ DEFINE_GK_CO_WORK(work, gk_conf->front_max_pkt_burst,
+ gk_conf->back_max_pkt_burst, gk_conf->mailbox_burst_size,
+ /*
+ * The 4* is intended to minimize collisions, whereas the -1 is
+ * intended to avoid doubling the size when
+ * the expression already is a power of 2.
+ */
+ rte_combine32ms1b(4 * (gk_conf->front_max_pkt_burst +
+ gk_conf->mailbox_burst_size) - 1),
+		1 /* One extra task for the full scan of the flow table. */
+ );
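+	/*
+	 * Sizing example with illustrative values: if front_max_pkt_burst is
+	 * 32 and mailbox_burst_size is 32, the argument above is
+	 * 4 * 64 - 1 = 255, rte_combine32ms1b(255) = 255, and the leftover
+	 * table gets 255 + 1 = 256 slots; without the -1, 256 would become
+	 * a mask of 511, i.e. 512 slots.
+	 */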
+
GK_LOG(NOTICE, "The GK block is running at lcore = %u\n", lcore);
gk_conf_hold(gk_conf);
+ add_cos_to_work(&work, gk_conf, instance);
while (likely(!exiting)) {
- struct flow_entry *fe = NULL;
- tx_front_num_pkts = 0;
- tx_back_num_pkts = 0;
+ populate_front_tasks(&work, port_front, rx_queue_front);
+ /*
+		 * Schedule the expiration test after all flow-related work to
+		 * give entries one more chance not to expire.
+ */
if (iter_count >= scan_iter) {
+ struct gk_co_task *task =
+ &work.all_tasks[work.task_num++];
entry_idx = (entry_idx + 1) % gk_conf->flow_ht_size;
- fe = &instance->ip_flow_entry_table[entry_idx];
- /*
- * Only one prefetch is needed here because we only
- * need the beginning of a struct flow_entry to
- * check if it's expired.
- */
- rte_prefetch_non_temporal(fe);
+
+ task->task_hash = 0; /* Dummy hash. */
+ task->task_arg =
+ &instance->ip_flow_entry_table[entry_idx];
+ task->task_func = gk_co_scan_flow_table;
+ schedule_task_to_any_co(&work, task);
iter_count = 0;
} else
iter_count++;
- process_pkts_front(port_front, rx_queue_front, lcore,
- &tx_front_num_pkts, tx_front_pkts,
- &tx_back_num_pkts, tx_back_pkts,
- instance, gk_conf);
+ do_work(&work);
process_pkts_back(port_back, rx_queue_back, lcore,
- &tx_front_num_pkts, tx_front_pkts,
- &tx_back_num_pkts, tx_back_pkts,
+ &work.tx_front_num_pkts, work.tx_front_pkts,
+ &work.tx_back_num_pkts, work.tx_back_pkts,
instance, gk_conf);
- if (fe != NULL && fe->in_use &&
- is_flow_expired(fe, rte_rdtsc())) {
- rte_hash_prefetch_buckets_non_temporal(
- instance->ip_flow_hash_table,
- fe->flow_hash_val);
- } else
- fe = NULL;
-
- send_pkts(port_front, tx_queue_front,
- tx_front_num_pkts, tx_front_pkts);
-
- send_pkts(port_back, tx_queue_back,
- tx_back_num_pkts, tx_back_pkts);
+ flush_work(&work, port_front, tx_queue_front,
+ port_back, tx_queue_back, lcore);
process_cmds_from_mailbox(instance, gk_conf);
- if (fe != NULL) {
- gk_del_flow_entry_from_hash(
- instance->ip_flow_hash_table, fe);
-
- if (instance->num_scan_del > 0)
- instance->num_scan_del--;
- }
-
if (rte_rdtsc() - last_measure_tsc >=
basic_measurement_logging_cycles) {
struct gk_measurement_metrics *stats =
@@ -2310,6 +1990,8 @@ cleanup_gk(struct gk_config *gk_conf)
}
destroy_mailbox(&gk_conf->instances[i].mb);
+ free_cos(gk_conf->instances[i].cos, gk_conf->co_max_num);
+ coro_destroy(&gk_conf->instances[i].coro_root);
}
if (gk_conf->lpm_tbl.fib_tbl != NULL) {
@@ -2518,6 +2200,12 @@ run_gk(struct net_config *net_conf, struct gk_config *gk_conf,
goto out;
}
+ if (gk_conf->co_max_num == 0) {
+ GK_LOG(ERR, "There must be at least one coroutine\n");
+ ret = -1;
+ goto out;
+ }
+
front_inc = gk_conf->front_max_pkt_burst * gk_conf->num_lcores;
net_conf->front.total_pkt_burst += front_inc;
back_inc = gk_conf->back_max_pkt_burst * gk_conf->num_lcores;
diff --git a/include/coro.h b/include/coro.h
new file mode 100644
index 000000000..7645d5029
--- /dev/null
+++ b/include/coro.h
@@ -0,0 +1,440 @@
+/*
+ * Copyright (c) 2001-2012,2015 Marc Alexander Lehmann
+ *
+ * Redistribution and use in source and binary forms, with or without modifica-
+ * tion, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
+ * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+ * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
+ * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * the GNU General Public License ("GPL") version 2 or any later version,
+ * in which case the provisions of the GPL are applicable instead of
+ * the above. If you wish to allow the use of your version of this file
+ * only under the terms of the GPL and not to allow others to use your
+ * version of this file under the BSD license, indicate your decision
+ * by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL. If you do not delete the
+ * provisions above, a recipient may use your version of this file under
+ * either the BSD or the GPL.
+ *
+ * This library is modelled strictly after Ralf S. Engelschalls article at
+ * http://www.gnu.org/software/pth/rse-pmt.ps. So most of the credit must
+ * go to Ralf S. Engelschall .
+ *
+ * This coroutine library is very much stripped down. You should either
+ * build your own process abstraction using it or - better - just use GNU
+ * Portable Threads, http://www.gnu.org/software/pth/.
+ *
+ */
+
+/*
+ * 2006-10-26 Include stddef.h on OS X to work around one of its bugs.
+ * Reported by Michael_G_Schwern.
+ * 2006-11-26 Use _setjmp instead of setjmp on GNU/Linux.
+ * 2007-04-27 Set unwind frame info if gcc 3+ and ELF is detected.
+ * Use _setjmp instead of setjmp on _XOPEN_SOURCE >= 600.
+ * 2007-05-02 Add assembly versions for x86 and amd64 (to avoid reliance
+ * on SIGUSR2 and sigaltstack in Crossfire).
+ * 2008-01-21 Disable CFI usage on anything but GNU/Linux.
+ * 2008-03-02 Switched to 2-clause BSD license with GPL exception.
+ * 2008-04-04 New (but highly unrecommended) pthreads backend.
+ * 2008-04-24 Reinstate CORO_LOSER (had wrong stack adjustments).
+ * 2008-10-30 Support assembly method on x86 with and without frame pointer.
+ * 2008-11-03 Use a global asm statement for CORO_ASM, idea by pippijn.
+ * 2008-11-05 Hopefully fix misaligned stacks with CORO_ASM/SETJMP.
+ * 2008-11-07 rbp wasn't saved in CORO_ASM on x86_64.
+ * introduce coro_destroy, which is a nop except for pthreads.
+ * speed up CORO_PTHREAD. Do no longer leak threads either.
+ * coro_create now allows one to create source coro_contexts.
+ * do not rely on makecontext passing a void * correctly.
+ * try harder to get _setjmp/_longjmp.
+ * major code cleanup/restructuring.
+ * 2008-11-10 the .cfi hacks are no longer needed.
+ * 2008-11-16 work around a freebsd pthread bug.
+ * 2008-11-19 define coro_*jmp symbols for easier porting.
+ * 2009-06-23 tentative win32-backend support for mingw32 (Yasuhiro Matsumoto).
+ * 2010-12-03 tentative support for uclibc (which lacks all sorts of things).
+ * 2011-05-30 set initial callee-saved-registers to zero with CORO_ASM.
+ * use .cfi_undefined rip on linux-amd64 for better backtraces.
+ * 2011-06-08 maybe properly implement weird windows amd64 calling conventions.
+ * 2011-07-03 rely on __GCC_HAVE_DWARF2_CFI_ASM for cfi detection.
+ * 2011-08-08 cygwin trashes stacks, use pthreads with double stack on cygwin.
+ * 2012-12-04 reduce misprediction penalty for x86/amd64 assembly switcher.
+ * 2012-12-05 experimental fiber backend (allocates stack twice).
+ * 2012-12-07 API version 3 - add coro_stack_alloc/coro_stack_free.
+ * 2012-12-21 valgrind stack registering was broken.
+ * 2015-12-05 experimental asm be for arm7, based on a patch by Nick Zavaritsky.
+ * use __name__ for predefined symbols, as in libecb.
+ * enable guard pages on arm, aarch64 and mips.
+ * 2016-08-27 try to disable _FORTIFY_SOURCE with CORO_SJLJ, as it
+ * breaks setjmp/longjmp. Also disable CORO_ASM for asm by default,
+ * as it was reported to crash.
+ * 2016-11-18 disable cfi_undefined again - backtraces might be worse, but
+ * compile compatibility is improved.
+ * 2018-08-14 use a completely different pthread strategy that should allow
+ * sharing of coroutines among different threads. this would
+ * undefined behaviour before as mutexes would be unlocked on
+ * a different thread. overall, this might be slower than
+ * using a pipe for synchronisation, but pipes eat fd's...
+ */
+
+#ifndef CORO_H
+#define CORO_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This library consists of only three files
+ * coro.h, coro.c and LICENSE (and optionally README)
+ *
+ * It implements what is known as coroutines, in a hopefully
+ * portable way.
+ *
+ * All compiletime symbols must be defined both when including coro.h
+ * (using libcoro) as well as when compiling coro.c (the implementation).
+ *
+ * You can manually specify which flavour you want. If you don't define
+ * any of these, libcoro tries to choose a safe and fast default:
+ *
+ * -DCORO_UCONTEXT
+ *
+ * This flavour uses SUSv2's get/set/swap/makecontext functions that
+ * unfortunately only some unices support, and is quite slow.
+ *
+ * -DCORO_SJLJ
+ *
+ * This flavour uses SUSv2's setjmp/longjmp and sigaltstack functions to
+ * do it's job. Coroutine creation is much slower than UCONTEXT, but
+ * context switching is a bit cheaper. It should work on almost all unices.
+ *
+ * -DCORO_LINUX
+ *
+ * CORO_SJLJ variant.
+ * Old GNU/Linux systems (<= glibc-2.1) only work with this implementation
+ * (it is very fast and therefore recommended over other methods, but
+ * doesn't work with anything newer).
+ *
+ * -DCORO_LOSER
+ *
+ * CORO_SJLJ variant.
+ * Microsoft's highly proprietary platform doesn't support sigaltstack, and
+ * this selects a suitable workaround for this platform. It might not work
+ * with your compiler though - it has only been tested with MSVC 6.
+ *
+ * -DCORO_FIBER
+ *
+ * Slower, but probably more portable variant for the Microsoft operating
+ * system, using fibers. Ignores the passed stack and allocates it internally.
+ * Also, due to bugs in cygwin, this does not work with cygwin.
+ *
+ * -DCORO_IRIX
+ *
+ * CORO_SJLJ variant.
+ * For SGI's version of Microsoft's NT ;)
+ *
+ * -DCORO_ASM
+ *
+ * Hand coded assembly, known to work only on a few architectures/ABI:
+ * GCC + arm7/x86/IA32/amd64/x86_64 + GNU/Linux and a few BSDs. Fastest
+ * choice, if it works.
+ *
+ * -DCORO_PTHREAD
+ *
+ * Use the pthread API. You have to provide <pthread.h> and -lpthread.
+ * This is likely the slowest backend, and it also does not support fork(),
+ * so avoid it at all costs.
+ *
+ * If you define neither of these symbols, coro.h will try to autodetect
+ * the best/safest model. To help with the autodetection, you should check
+ * (e.g. using autoconf) and define the following symbols: HAVE_UCONTEXT_H
+ * / HAVE_SETJMP_H / HAVE_SIGALTSTACK.
+ */
+
+/*
+ * Changes when the API changes incompatibly.
+ * This is ONLY the API version - there is no ABI compatibility between releases.
+ *
+ * Changes in API version 2:
+ * replaced bogus -DCORO_LOOSE with grammatically more correct -DCORO_LOSER
+ * Changes in API version 3:
+ * introduced stack management (CORO_STACKALLOC)
+ */
+#define CORO_VERSION 3
+
+#include <stddef.h>
+
+/*
+ * This is the type for the initialization function of a new coroutine.
+ */
+typedef void (*coro_func)(void *);
+
+/*
+ * A coroutine state is saved in the following structure. Treat it as an
+ * opaque type. errno and sigmask might be saved, but don't rely on it,
+ * implement your own switching primitive if you need that.
+ */
+typedef struct coro_context coro_context;
+
+/*
+ * This function creates a new coroutine. Apart from a pointer to an
+ * uninitialised coro_context, it expects a pointer to the entry function
+ * and the single pointer value that is given to it as argument.
+ *
+ * Allocating/deallocating the stack is your own responsibility.
+ *
+ * As a special case, if coro, arg, sptr and ssze are all zero,
+ * then an "empty" coro_context will be created that is suitable
+ * as an initial source for coro_transfer.
+ *
+ * This function is not reentrant, but putting a mutex around it
+ * will work.
+ */
+void coro_create (coro_context *ctx, /* an uninitialised coro_context */
+ coro_func coro, /* the coroutine code to be executed */
+ void *arg, /* a single pointer passed to the coro */
+ void *sptr, /* start of stack area */
+ size_t ssze); /* size of stack area in bytes */
+
+/*
+ * The following prototype defines the coroutine switching function. It is
+ * sometimes implemented as a macro, so watch out.
+ *
+ * This function is thread-safe and reentrant.
+ */
+#if 0
+void coro_transfer (coro_context *prev, coro_context *next);
+#endif
+
+/*
+ * The following prototype defines the coroutine destroy function. It
+ * is sometimes implemented as a macro, so watch out. It also serves no
+ * purpose unless you want to use the CORO_PTHREAD backend, where it is
+ * used to clean up the thread. You are responsible for freeing the stack
+ * and the context itself.
+ *
+ * This function is thread-safe and reentrant.
+ */
+#if 0
+void coro_destroy (coro_context *ctx);
+#endif
+
+/*****************************************************************************/
+/* optional stack management */
+/*****************************************************************************/
+/*
+ * You can disable all of the stack management functions by
+ * defining CORO_STACKALLOC to 0. Otherwise, they are enabled by default.
+ *
+ * If stack management is enabled, you can influence the implementation via these
+ * symbols:
+ *
+ * -DCORO_USE_VALGRIND
+ *
+ * If defined, then libcoro will include valgrind/valgrind.h and register
+ * and unregister stacks with valgrind.
+ *
+ * -DCORO_GUARDPAGES=n
+ *
+ * libcoro will try to use the specified number of guard pages to protect against
+ * stack overflow. If n is 0, then the feature will be disabled. If it isn't
+ * defined, then libcoro will choose a suitable default. If guardpages are not
+ * supported on the platform, then the feature will be silently disabled.
+ */
+#ifndef CORO_STACKALLOC
+# define CORO_STACKALLOC 1
+#endif
+
+#if CORO_STACKALLOC
+
+/*
+ * The only allowed operations on these struct members is to read the
+ * "sptr" and "ssze" members to pass it to coro_create, to read the "sptr"
+ * member to see if it is false, in which case the stack isn't allocated,
+ * and to set the "sptr" member to 0, to indicate to coro_stack_free to
+ * not actually do anything.
+ */
+
+struct coro_stack
+{
+ void *sptr;
+ size_t ssze;
+#ifdef CORO_USE_VALGRIND
+ int valgrind_id;
+#endif
+};
+
+/*
+ * Try to allocate a stack of at least the given size and return true if
+ * successful, or false otherwise.
+ *
+ * The size is *NOT* specified in bytes, but in units of sizeof (void *),
+ * i.e. the stack is typically 4(8) times larger on 32 bit(64 bit) platforms
+ * then the size passed in.
+ *
+ * If size is 0, then a "suitable" stack size is chosen (usually 1-2MB).
+ */
+int coro_stack_alloc (struct coro_stack *stack, unsigned int size);
+
+/*
+ * Free the stack allocated by coro_stack_alloc again. It is safe to
+ * call this function on the coro_stack structure even if coro_stack_alloc
+ * failed.
+ */
+void coro_stack_free (struct coro_stack *stack);
+
+#endif
+
+/*
+ * That was it. No other user-serviceable parts below here.
+ */
+
+/*****************************************************************************/
+
+#if !defined CORO_LOSER && !defined CORO_UCONTEXT \
+ && !defined CORO_SJLJ && !defined CORO_LINUX \
+ && !defined CORO_IRIX && !defined CORO_ASM \
+ && !defined CORO_PTHREAD && !defined CORO_FIBER
+# if defined WINDOWS && (defined __i386__ || (defined __x86_64__ && !defined __ILP32__) || defined _M_IX86 || defined _M_AMD64)
+# define CORO_ASM 1
+# elif defined WINDOWS || defined _WIN32
+# define CORO_LOSER 1 /* you don't win with windoze */
+# elif __linux && (__i386__ || (__x86_64__ && !__ILP32__) /*|| (__arm__ && __ARM_ARCH == 7), not working */)
+# define CORO_ASM 1
+# elif defined HAVE_UCONTEXT_H
+# define CORO_UCONTEXT 1
+# elif defined HAVE_SETJMP_H && defined HAVE_SIGALTSTACK
+# define CORO_SJLJ 1
+# else
+error unknown or unsupported architecture
+# endif
+#endif
+
+/*****************************************************************************/
+
+#ifdef CORO_UCONTEXT
+
+# include <ucontext.h>
+
+struct coro_context
+{
+ ucontext_t uc;
+};
+
+# define coro_transfer(p,n) swapcontext (&((p)->uc), &((n)->uc))
+# define coro_destroy(ctx) (void *)(ctx)
+
+#elif defined (CORO_SJLJ) || defined (CORO_LOSER) || defined (CORO_LINUX) || defined (CORO_IRIX)
+
+# if defined(CORO_LINUX) && !defined(_GNU_SOURCE)
+# define _GNU_SOURCE /* for glibc */
+# endif
+
+/* try to disable well-meant but buggy checks in some libcs */
+# ifdef _FORTIFY_SOURCE
+# undef _FORTIFY_SOURCE
+# undef __USE_FORTIFY_LEVEL /* helps some more when too much has been included already */
+# endif
+
+# if !CORO_LOSER
+# include <unistd.h>
+# endif
+
+/* solaris is hopelessly borked, it expands _XOPEN_UNIX to nothing */
+# if __sun
+# undef _XOPEN_UNIX
+# define _XOPEN_UNIX 1
+# endif
+
+# include <setjmp.h>
+
+# if _XOPEN_UNIX > 0 || defined (_setjmp)
+# define coro_jmp_buf jmp_buf
+# define coro_setjmp(env) _setjmp (env)
+# define coro_longjmp(env) _longjmp ((env), 1)
+# elif CORO_LOSER
+# define coro_jmp_buf jmp_buf
+# define coro_setjmp(env) setjmp (env)
+# define coro_longjmp(env) longjmp ((env), 1)
+# else
+# define coro_jmp_buf sigjmp_buf
+# define coro_setjmp(env) sigsetjmp (env, 0)
+# define coro_longjmp(env) siglongjmp ((env), 1)
+# endif
+
+struct coro_context
+{
+ coro_jmp_buf env;
+};
+
+# define coro_transfer(p,n) do { if (!coro_setjmp ((p)->env)) coro_longjmp ((n)->env); } while (0)
+# define coro_destroy(ctx) (void *)(ctx)
+
+#elif CORO_ASM
+
+struct coro_context
+{
+ void **sp; /* must be at offset 0 */
+};
+
+#if defined (__i386__) || defined (__x86_64__)
+void __attribute__ ((__noinline__, __regparm__(2)))
+#else
+void __attribute__ ((__noinline__))
+#endif
+coro_transfer (coro_context *prev, coro_context *next);
+
+# define coro_destroy(ctx) (void)(ctx)
+
+#elif CORO_PTHREAD
+
+# include <pthread.h>
+
+extern pthread_mutex_t coro_mutex;
+
+struct coro_context
+{
+ int flags;
+ pthread_cond_t cv;
+};
+
+void coro_transfer (coro_context *prev, coro_context *next);
+void coro_destroy (coro_context *ctx);
+
+#elif CORO_FIBER
+
+struct coro_context
+{
+ void *fiber;
+ /* only used for initialisation */
+ coro_func coro;
+ void *arg;
+};
+
+void coro_transfer (coro_context *prev, coro_context *next);
+void coro_destroy (coro_context *ctx);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/gatekeeper_acl.h b/include/gatekeeper_acl.h
index 59c4bbf42..61aa0d11e 100644
--- a/include/gatekeeper_acl.h
+++ b/include/gatekeeper_acl.h
@@ -32,16 +32,26 @@ struct acl_search {
struct rte_mbuf **mbufs;
};
-/* Declare and initialize a struct acl_search. */
-#define DEFINE_ACL_SEARCH(name, num_pkts) \
+#define DECLARE_ACL_SEARCH_VARIABLE_PART(name, num_pkts) \
const uint8_t *name##_data_array[(num_pkts)]; \
- struct rte_mbuf *name##_mbufs_array[(num_pkts)]; \
- struct acl_search name = { \
- .num = 0, \
- .data = name##_data_array, \
- .mbufs = name##_mbufs_array, \
+ struct rte_mbuf *name##_mbufs_array[(num_pkts)]
+
+/*
+ * This macro can only be used if the macro DECLARE_ACL_SEARCH_VARIABLE_PART()
+ * has been placed before it.
+ */
+#define ACL_SEARCH_INIT(name) \
+ { \
+ .num = 0, \
+ .data = name##_data_array, \
+ .mbufs = name##_mbufs_array, \
}
+/* Declare and initialize a struct acl_search. */
+#define DEFINE_ACL_SEARCH(name, num_pkts) \
+ DECLARE_ACL_SEARCH_VARIABLE_PART(name, num_pkts); \
+ struct acl_search name = ACL_SEARCH_INIT(name)
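+
+/*
+ * Splitting the array declarations from the initializer lets a
+ * struct acl_search that is embedded in a larger structure (e.g. the GK
+ * block's per-iteration work state) be initialized with ACL_SEARCH_INIT()
+ * while its backing arrays are declared separately with
+ * DECLARE_ACL_SEARCH_VARIABLE_PART().
+ */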
+
/* Classify batches of packets in @acl and invoke callback functions. */
int process_acl(struct gatekeeper_if *iface, unsigned int lcore_id,
struct acl_search *acl, struct acl_state *astate,
diff --git a/include/gatekeeper_gk.h b/include/gatekeeper_gk.h
index 95264d984..d732621ee 100644
--- a/include/gatekeeper_gk.h
+++ b/include/gatekeeper_gk.h
@@ -19,6 +19,8 @@
#ifndef _GATEKEEPER_GK_H_
#define _GATEKEEPER_GK_H_
+#include <coro.h>
+
#include
#include
@@ -98,6 +100,14 @@ struct gk_measurement_metrics {
struct gk_instance {
struct rte_hash *ip_flow_hash_table;
struct flow_entry *ip_flow_entry_table;
+ /*
+ * Coroutines.
+ *
+ * These structs must be here and not in struct gk_co_work because
+ * initialization functions (e.g. coro_create()) are not reentrant.
+ */
+ struct coro_context coro_root;
+ struct gk_co *cos;
/* RX queue on the front interface. */
uint16_t rx_queue_front;
/* TX queue on the front interface. */
@@ -201,6 +211,11 @@ struct gk_config {
/* Time for logging the basic measurements in ms. */
unsigned int basic_measurement_logging_ms;
+ /* Maximum number of coroutines running in parallel per GK instance. */
+ uint16_t co_max_num;
+ /* Size of the stack of each coroutine in KB. */
+ uint16_t co_stack_size_kb;
+
/*
* The fields below are for internal use.
* Configuration files should not refer to them.
diff --git a/include/gatekeeper_main.h b/include/gatekeeper_main.h
index 50aafa1fe..b9de610e4 100644
--- a/include/gatekeeper_main.h
+++ b/include/gatekeeper_main.h
@@ -20,6 +20,10 @@
#define _GATEKEEPER_MAIN_H_
#include
+#include <stdbool.h>
+
+#include <rte_mbuf.h>
+#include <rte_prefetch.h>
#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
#include <rte_hash_crc.h>
@@ -52,4 +56,49 @@ extern FILE *log_file;
char *rte_strdup(const char *type, const char *s);
int gatekeeper_log_init(void);
+/* XXX #52 This should be part of DPDK. */
+/**
+ * Prefetch the first part of the mbuf
+ *
+ * The first 64 bytes of the mbuf corresponds to fields that are used early
+ * in the receive path. If the cache line of the architecture is higher than
+ * 64B, the second part will also be prefetched.
+ *
+ * @param m
+ * The pointer to the mbuf.
+ */
+static inline void
+rte_mbuf_prefetch_part1_non_temporal(struct rte_mbuf *m)
+{
+ rte_prefetch_non_temporal(&m->cacheline0);
+}
+
+/* XXX #52 This should be part of DPDK. */
+/**
+ * Prefetch the second part of the mbuf
+ *
+ * The next 64 bytes of the mbuf corresponds to fields that are used in the
+ * transmit path. If the cache line of the architecture is higher than 64B,
+ * this function does nothing as it is expected that the full mbuf is
+ * already in cache.
+ *
+ * @param m
+ * The pointer to the mbuf.
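+ * @return
+ *   True if a prefetch was actually issued, false otherwise; with the
+ *   prefetch below commented out (see the TODO), it currently always
+ *   returns false.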
+ */
+static inline bool
+rte_mbuf_prefetch_part2_non_temporal(struct rte_mbuf *m)
+{
+#if RTE_CACHE_LINE_SIZE == 64
+ /* TODO Do we need this prefetch?
+ rte_prefetch_non_temporal(&m->cacheline1);
+ return true;
+ */
+ RTE_SET_USED(m);
+ return false;
+#else
+ RTE_SET_USED(m);
+ return false;
+#endif
+}
+
#endif /* _GATEKEEPER_MAIN_H_ */
diff --git a/include/list.h b/include/list.h
index e7fd442fa..c5adf7c51 100644
--- a/include/list.h
+++ b/include/list.h
@@ -34,6 +34,11 @@ struct list_head {
#define LIST_HEAD_INIT(name) { &(name), &(name) }
+#define LIST_POISON1 ((void *) 0x00100100)
+#define LIST_POISON2 ((void *) 0x00200200)
+
+#define LIST_HEAD_INIT_WITH_POISON(name) { LIST_POISON1, LIST_POISON2 }
+
static inline void
INIT_LIST_HEAD(struct list_head *list)
{
@@ -41,6 +46,13 @@ INIT_LIST_HEAD(struct list_head *list)
list->prev = list;
}
+static inline void
+INIT_LIST_HEAD_WITH_POISON(struct list_head *list)
+{
+ list->next = LIST_POISON1;
+ list->prev = LIST_POISON2;
+}
+
/**
* list_entry - get the struct for this entry
* @ptr: the &struct list_head pointer.
@@ -133,6 +145,16 @@ list_is_singular(const struct list_head *head)
return !list_empty(head) && (head->next == head->prev);
}
+/**
+ * list_poison - tests whether @entry has been poisoned.
+ * @entry: the entry to test.
+ */
+static inline int
+list_poison(const struct list_head *entry)
+{
+ return entry->next == LIST_POISON1 && entry->prev == LIST_POISON2;
+}
+
/*
* Insert a new entry between two known consecutive entries.
*
@@ -191,8 +213,6 @@ __list_del(struct list_head *prev, struct list_head *next)
prev->next = next;
}
-#define LIST_POISON1 ((void *) 0x00100100)
-#define LIST_POISON2 ((void *) 0x00200200)
/**
* list_del - deletes entry from list.
* @entry: the element to delete from the list.
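The poison helpers let an entry record whether it is currently linked into a list without an extra flag. A short sketch, assuming this list.h also provides list_add_tail() and that list_del() re-poisons the entry as in the Linux original:

struct work_item {
	struct list_head list;
	int value;
};

static void
init_item(struct work_item *item, int value)
{
	item->value = value;
	/* Mark the entry as "not on any list" until it is queued. */
	INIT_LIST_HEAD_WITH_POISON(&item->list);
}

static void
enqueue_once(struct list_head *queue, struct work_item *item)
{
	/* A poisoned entry is known not to be linked anywhere. */
	if (list_poison(&item->list))
		list_add_tail(&item->list, queue);
}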
diff --git a/lib/coro.c b/lib/coro.c
new file mode 100644
index 000000000..7817aab22
--- /dev/null
+++ b/lib/coro.c
@@ -0,0 +1,806 @@
+/*
+ * Copyright (c) 2001-2011 Marc Alexander Lehmann
+ *
+ * Redistribution and use in source and binary forms, with or without modifica-
+ * tion, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
+ * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+ * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
+ * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * the GNU General Public License ("GPL") version 2 or any later version,
+ * in which case the provisions of the GPL are applicable instead of
+ * the above. If you wish to allow the use of your version of this file
+ * only under the terms of the GPL and not to allow others to use your
+ * version of this file under the BSD license, indicate your decision
+ * by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL. If you do not delete the
+ * provisions above, a recipient may use your version of this file under
+ * either the BSD or the GPL.
+ *
+ * This library is modelled strictly after Ralf S. Engelschalls article at
+ * http://www.gnu.org/software/pth/rse-pmt.ps. So most of the credit must
+ * go to Ralf S. Engelschall .
+ */
+
+#include "coro.h"
+
+#include <stddef.h>
+#include <string.h>
+
+/*****************************************************************************/
+/* ucontext/setjmp/asm backends */
+/*****************************************************************************/
+#if defined (CORO_UCONTEXT) || defined (CORO_SJLJ) || defined (CORO_LOSER) || defined (CORO_LINUX) || defined (CORO_IRIX) || defined (CORO_ASM)
+
+# ifdef CORO_UCONTEXT
+# include <ucontext.h>
+# endif
+
+# if !defined(STACK_ADJUST_PTR)
+# ifdef __sgi
+/* IRIX is decidedly NON-unix */
+# define STACK_ADJUST_PTR(sp,ss) ((char *)(sp) + (ss) - 8)
+# define STACK_ADJUST_SIZE(sp,ss) ((ss) - 8)
+# elif (defined (__i386__) && defined (CORO_LINUX)) || (defined (_M_IX86) && defined (CORO_LOSER))
+# define STACK_ADJUST_PTR(sp,ss) ((char *)(sp) + (ss))
+# define STACK_ADJUST_SIZE(sp,ss) (ss)
+# elif (defined (__amd64__) && defined (CORO_LINUX)) || ((defined (_M_AMD64) || defined (_M_IA64)) && defined (CORO_LOSER))
+# define STACK_ADJUST_PTR(sp,ss) ((char *)(sp) + (ss) - 8)
+# define STACK_ADJUST_SIZE(sp,ss) (ss)
+# else
+# define STACK_ADJUST_PTR(sp,ss) (sp)
+# define STACK_ADJUST_SIZE(sp,ss) (ss)
+# endif
+# endif
+
+# include <stdlib.h>
+
+# ifdef CORO_SJLJ
+# include <stdio.h>
+# include <signal.h>
+# include <unistd.h>
+# endif
+
+static coro_func coro_init_func;
+static void *coro_init_arg;
+static coro_context *new_coro, *create_coro;
+
+static void
+coro_init (void)
+{
+ volatile coro_func func = coro_init_func;
+ volatile void *arg = coro_init_arg;
+
+ coro_transfer (new_coro, create_coro);
+
+#if defined (__GCC_HAVE_DWARF2_CFI_ASM) && defined (__amd64)
+ /*asm (".cfi_startproc");*/
+ /*asm (".cfi_undefined rip");*/
+#endif
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-qual"
+ func ((void *)arg);
+#pragma GCC diagnostic pop
+
+#if __GCC_HAVE_DWARF2_CFI_ASM && __amd64
+ /*asm (".cfi_endproc");*/
+#endif
+
+ /* the new coro returned. bad. just abort() for now */
+ abort ();
+}
+
+# ifdef CORO_SJLJ
+
+static volatile int trampoline_done;
+
+/* trampoline signal handler */
+static void
+trampoline (int sig)
+{
+ if (coro_setjmp (new_coro->env))
+ coro_init (); /* start it */
+ else
+ trampoline_done = 1;
+}
+
+# endif
+
+# if CORO_ASM
+
+ #if (defined __arm__) && \
+ (defined __ARM_ARCH_7__ || defined __ARM_ARCH_7A__ \
+ || defined __ARM_ARCH_7R__ || defined __ARM_ARCH_7M__ \
+ || __ARM_ARCH == 7)
+ #define CORO_ARM 1
+ #endif
+
+ #if defined (_WIN32) || defined (__CYGWIN__)
+ #define CORO_WIN_TIB 1
+ #endif
+
+ asm (
+ "\t.text\n"
+ #if defined (_WIN32) || defined (__CYGWIN__)
+ "\t.globl _coro_transfer\n"
+ "_coro_transfer:\n"
+ #else
+ "\t.globl coro_transfer\n"
+ "coro_transfer:\n"
+ #endif
+ /* windows, of course, gives a shit on the amd64 ABI and uses different registers */
+ /* http://blogs.msdn.com/freik/archive/2005/03/17/398200.aspx */
+ #ifdef __amd64
+
+ #if defined (_WIN32) || defined (__CYGWIN__)
+ #define NUM_SAVED 29
+ "\tsubq $168, %rsp\t" /* one dummy qword to improve alignment */
+ "\tmovaps %xmm6, (%rsp)\n"
+ "\tmovaps %xmm7, 16(%rsp)\n"
+ "\tmovaps %xmm8, 32(%rsp)\n"
+ "\tmovaps %xmm9, 48(%rsp)\n"
+ "\tmovaps %xmm10, 64(%rsp)\n"
+ "\tmovaps %xmm11, 80(%rsp)\n"
+ "\tmovaps %xmm12, 96(%rsp)\n"
+ "\tmovaps %xmm13, 112(%rsp)\n"
+ "\tmovaps %xmm14, 128(%rsp)\n"
+ "\tmovaps %xmm15, 144(%rsp)\n"
+ "\tpushq %rsi\n"
+ "\tpushq %rdi\n"
+ "\tpushq %rbp\n"
+ "\tpushq %rbx\n"
+ "\tpushq %r12\n"
+ "\tpushq %r13\n"
+ "\tpushq %r14\n"
+ "\tpushq %r15\n"
+ #if CORO_WIN_TIB
+ "\tpushq %fs:0x0\n"
+ "\tpushq %fs:0x8\n"
+ "\tpushq %fs:0xc\n"
+ #endif
+ "\tmovq %rsp, (%rcx)\n"
+ "\tmovq (%rdx), %rsp\n"
+ #if CORO_WIN_TIB
+ "\tpopq %fs:0xc\n"
+ "\tpopq %fs:0x8\n"
+ "\tpopq %fs:0x0\n"
+ #endif
+ "\tpopq %r15\n"
+ "\tpopq %r14\n"
+ "\tpopq %r13\n"
+ "\tpopq %r12\n"
+ "\tpopq %rbx\n"
+ "\tpopq %rbp\n"
+ "\tpopq %rdi\n"
+ "\tpopq %rsi\n"
+ "\tmovaps (%rsp), %xmm6\n"
+ "\tmovaps 16(%rsp), %xmm7\n"
+ "\tmovaps 32(%rsp), %xmm8\n"
+ "\tmovaps 48(%rsp), %xmm9\n"
+ "\tmovaps 64(%rsp), %xmm10\n"
+ "\tmovaps 80(%rsp), %xmm11\n"
+ "\tmovaps 96(%rsp), %xmm12\n"
+ "\tmovaps 112(%rsp), %xmm13\n"
+ "\tmovaps 128(%rsp), %xmm14\n"
+ "\tmovaps 144(%rsp), %xmm15\n"
+ "\taddq $168, %rsp\n"
+ #else
+ #define NUM_SAVED 6
+ "\tpushq %rbp\n"
+ "\tpushq %rbx\n"
+ "\tpushq %r12\n"
+ "\tpushq %r13\n"
+ "\tpushq %r14\n"
+ "\tpushq %r15\n"
+ "\tmovq %rsp, (%rdi)\n"
+ "\tmovq (%rsi), %rsp\n"
+ "\tpopq %r15\n"
+ "\tpopq %r14\n"
+ "\tpopq %r13\n"
+ "\tpopq %r12\n"
+ "\tpopq %rbx\n"
+ "\tpopq %rbp\n"
+ #endif
+ "\tpopq %rcx\n"
+ "\tjmpq *%rcx\n"
+
+ #elif __i386__
+
+ #define NUM_SAVED 4
+ "\tpushl %ebp\n"
+ "\tpushl %ebx\n"
+ "\tpushl %esi\n"
+ "\tpushl %edi\n"
+ #if CORO_WIN_TIB
+ #undef NUM_SAVED
+ #define NUM_SAVED 7
+ "\tpushl %fs:0\n"
+ "\tpushl %fs:4\n"
+ "\tpushl %fs:8\n"
+ #endif
+ "\tmovl %esp, (%eax)\n"
+ "\tmovl (%edx), %esp\n"
+ #if CORO_WIN_TIB
+ "\tpopl %fs:8\n"
+ "\tpopl %fs:4\n"
+ "\tpopl %fs:0\n"
+ #endif
+ "\tpopl %edi\n"
+ "\tpopl %esi\n"
+ "\tpopl %ebx\n"
+ "\tpopl %ebp\n"
+ "\tpopl %ecx\n"
+ "\tjmpl *%ecx\n"
+
+ #elif CORO_ARM /* untested, what about thumb, neon, iwmmxt? */
+
+ #if __ARM_PCS_VFP
+ "\tvpush {d8-d15}\n"
+ #define NUM_SAVED (9 + 8 * 2)
+ #else
+ #define NUM_SAVED 9
+ #endif
+ "\tpush {r4-r11,lr}\n"
+ "\tstr sp, [r0]\n"
+ "\tldr sp, [r1]\n"
+ "\tpop {r4-r11,lr}\n"
+ #if __ARM_PCS_VFP
+ "\tvpop {d8-d15}\n"
+ #endif
+ "\tmov r15, lr\n"
+
+ #elif __mips__ && 0 /* untested, 32 bit only */
+
+ #define NUM_SAVED (12 + 8 * 2)
+ /* TODO: n64/o64, lw=>ld */
+
+ "\t.set nomips16\n"
+ "\t.frame $sp,112,$31\n"
+ #if __mips_soft_float
+ "\taddiu $sp,$sp,-44\n"
+ #else
+ "\taddiu $sp,$sp,-112\n"
+ "\ts.d $f30,88($sp)\n"
+ "\ts.d $f28,80($sp)\n"
+ "\ts.d $f26,72($sp)\n"
+ "\ts.d $f24,64($sp)\n"
+ "\ts.d $f22,56($sp)\n"
+ "\ts.d $f20,48($sp)\n"
+ #endif
+ "\tsw $28,40($sp)\n"
+ "\tsw $31,36($sp)\n"
+ "\tsw $fp,32($sp)\n"
+ "\tsw $23,28($sp)\n"
+ "\tsw $22,24($sp)\n"
+ "\tsw $21,20($sp)\n"
+ "\tsw $20,16($sp)\n"
+ "\tsw $19,12($sp)\n"
+ "\tsw $18,8($sp)\n"
+ "\tsw $17,4($sp)\n"
+ "\tsw $16,0($sp)\n"
+ "\tsw $sp,0($4)\n"
+ "\tlw $sp,0($5)\n"
+ #if !__mips_soft_float
+ "\tl.d $f30,88($sp)\n"
+ "\tl.d $f28,80($sp)\n"
+ "\tl.d $f26,72($sp)\n"
+ "\tl.d $f24,64($sp)\n"
+ "\tl.d $f22,56($sp)\n"
+ "\tl.d $f20,48($sp)\n"
+ #endif
+ "\tlw $28,40($sp)\n"
+ "\tlw $31,36($sp)\n"
+ "\tlw $fp,32($sp)\n"
+ "\tlw $23,28($sp)\n"
+ "\tlw $22,24($sp)\n"
+ "\tlw $21,20($sp)\n"
+ "\tlw $20,16($sp)\n"
+ "\tlw $19,12($sp)\n"
+ "\tlw $18,8($sp)\n"
+ "\tlw $17,4($sp)\n"
+ "\tlw $16,0($sp)\n"
+ "\tj $31\n"
+ #if __mips_soft_float
+ "\taddiu $sp,$sp,44\n"
+ #else
+ "\taddiu $sp,$sp,112\n"
+ #endif
+
+ #else
+ #error unsupported architecture
+ #endif
+ );
+
+# endif
+
+void
+coro_create (coro_context *ctx, coro_func coro, void *arg, void *sptr, size_t ssize)
+{
+ coro_context nctx;
+# ifdef CORO_SJLJ
+ stack_t ostk, nstk;
+ struct sigaction osa, nsa;
+ sigset_t nsig, osig;
+# endif
+
+ if (!coro)
+ return;
+
+ coro_init_func = coro;
+ coro_init_arg = arg;
+
+ new_coro = ctx;
+ create_coro = &nctx;
+
+# ifdef CORO_SJLJ
+ /* we use SIGUSR2. first block it, then fiddle with it. */
+
+ sigemptyset (&nsig);
+ sigaddset (&nsig, SIGUSR2);
+ sigprocmask (SIG_BLOCK, &nsig, &osig);
+
+ nsa.sa_handler = trampoline;
+ sigemptyset (&nsa.sa_mask);
+ nsa.sa_flags = SA_ONSTACK;
+
+ if (sigaction (SIGUSR2, &nsa, &osa))
+ {
+ perror ("sigaction");
+ abort ();
+ }
+
+ /* set the new stack */
+ nstk.ss_sp = STACK_ADJUST_PTR (sptr, ssize); /* yes, some platforms (IRIX) get this wrong. */
+ nstk.ss_size = STACK_ADJUST_SIZE (sptr, ssize);
+ nstk.ss_flags = 0;
+
+ if (sigaltstack (&nstk, &ostk) < 0)
+ {
+ perror ("sigaltstack");
+ abort ();
+ }
+
+ trampoline_done = 0;
+ kill (getpid (), SIGUSR2);
+ sigfillset (&nsig); sigdelset (&nsig, SIGUSR2);
+
+ while (!trampoline_done)
+ sigsuspend (&nsig);
+
+ sigaltstack (0, &nstk);
+ nstk.ss_flags = SS_DISABLE;
+ if (sigaltstack (&nstk, 0) < 0)
+ perror ("sigaltstack");
+
+ sigaltstack (0, &nstk);
+ if (~nstk.ss_flags & SS_DISABLE)
+ abort ();
+
+ if (~ostk.ss_flags & SS_DISABLE)
+ sigaltstack (&ostk, 0);
+
+ sigaction (SIGUSR2, &osa, 0);
+ sigprocmask (SIG_SETMASK, &osig, 0);
+
+# elif defined (CORO_LOSER)
+
+ coro_setjmp (ctx->env);
+ #if __CYGWIN__ && __i386__
+ ctx->env[8] = (long) coro_init;
+ ctx->env[7] = (long) ((char *)sptr + ssize) - sizeof (long);
+ #elif __CYGWIN__ && __x86_64__
+ ctx->env[7] = (long) coro_init;
+ ctx->env[6] = (long) ((char *)sptr + ssize) - sizeof (long);
+ #elif defined __MINGW32__
+ ctx->env[5] = (long) coro_init;
+ ctx->env[4] = (long) ((char *)sptr + ssize) - sizeof (long);
+ #elif defined _M_IX86
+ ((_JUMP_BUFFER *)&ctx->env)->Eip = (long) coro_init;
+ ((_JUMP_BUFFER *)&ctx->env)->Esp = (long) STACK_ADJUST_PTR (sptr, ssize) - sizeof (long);
+ #elif defined _M_AMD64
+ ((_JUMP_BUFFER *)&ctx->env)->Rip = (__int64) coro_init;
+ ((_JUMP_BUFFER *)&ctx->env)->Rsp = (__int64) STACK_ADJUST_PTR (sptr, ssize) - sizeof (__int64);
+ #elif defined _M_IA64
+ ((_JUMP_BUFFER *)&ctx->env)->StIIP = (__int64) coro_init;
+ ((_JUMP_BUFFER *)&ctx->env)->IntSp = (__int64) STACK_ADJUST_PTR (sptr, ssize) - sizeof (__int64);
+ #else
+ #error "microsoft libc or architecture not supported"
+ #endif
+
+# elif defined (CORO_LINUX)
+
+ coro_setjmp (ctx->env);
+ #if __GLIBC__ >= 2 && __GLIBC_MINOR__ >= 0 && defined (JB_PC) && defined (JB_SP)
+ ctx->env[0].__jmpbuf[JB_PC] = (long) coro_init;
+ ctx->env[0].__jmpbuf[JB_SP] = (long) STACK_ADJUST_PTR (sptr, ssize) - sizeof (long);
+ #elif __GLIBC__ >= 2 && __GLIBC_MINOR__ >= 0 && defined (__mc68000__)
+ ctx->env[0].__jmpbuf[0].__aregs[0] = (long int)coro_init;
+ ctx->env[0].__jmpbuf[0].__sp = (int *) ((char *)sptr + ssize) - sizeof (long);
+ #elif defined (__GNU_LIBRARY__) && defined (__i386__)
+ ctx->env[0].__jmpbuf[0].__pc = (char *) coro_init;
+ ctx->env[0].__jmpbuf[0].__sp = (void *) ((char *)sptr + ssize) - sizeof (long);
+ #elif defined (__GNU_LIBRARY__) && defined (__x86_64__)
+ ctx->env[0].__jmpbuf[JB_PC] = (long) coro_init;
+ ctx->env[0].__jmpbuf[0].__sp = (void *) ((char *)sptr + ssize) - sizeof (long);
+ #else
+ #error "linux libc or architecture not supported"
+ #endif
+
+# elif defined (CORO_IRIX)
+
+ coro_setjmp (ctx->env, 0);
+ ctx->env[JB_PC] = (__uint64_t)coro_init;
+ ctx->env[JB_SP] = (__uint64_t)STACK_ADJUST_PTR (sptr, ssize) - sizeof (long);
+
+# elif CORO_ASM
+
+ #if defined (__i386__) || defined (__x86_64__)
+ ctx->sp = (void **)(ssize + (char *)sptr);
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-qual"
+ *--ctx->sp = (void *)abort; /* needed for alignment only */
+#pragma GCC diagnostic pop
+ *--ctx->sp = (void *)coro_init;
+ #ifdef CORO_WIN_TIB
+ *--ctx->sp = 0; /* ExceptionList */
+ *--ctx->sp = (char *)sptr + ssize; /* StackBase */
+ *--ctx->sp = sptr; /* StackLimit */
+ #endif
+ #elif CORO_ARM
+ /* return address stored in lr register, don't push anything */
+ #else
+ #error unsupported architecture
+ #endif
+
+ ctx->sp -= NUM_SAVED;
+ memset (ctx->sp, 0, sizeof (*ctx->sp) * NUM_SAVED);
+
+ #if defined (__i386__) || defined (__x86_64__)
+ /* done already */
+ #elif defined (CORO_ARM)
+ ctx->sp[0] = coro; /* r4 */
+ ctx->sp[1] = arg; /* r5 */
+ ctx->sp[8] = (char *)coro_init; /* lr */
+ #else
+ #error unsupported architecture
+ #endif
+
+# elif CORO_UCONTEXT
+
+ getcontext (&(ctx->uc));
+
+ ctx->uc.uc_link = 0;
+ ctx->uc.uc_stack.ss_sp = sptr;
+ ctx->uc.uc_stack.ss_size = (size_t)ssize;
+ ctx->uc.uc_stack.ss_flags = 0;
+
+ makecontext (&(ctx->uc), (void (*)())coro_init, 0);
+
+# endif
+
+ coro_transfer (create_coro, new_coro);
+}
+
+/*****************************************************************************/
+/* pthread backend */
+/*****************************************************************************/
+#elif CORO_PTHREAD
+
+/* this mutex will be locked by the running coroutine */
+pthread_mutex_t coro_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+struct coro_init_args
+{
+ coro_func func;
+ void *arg;
+ coro_context *self, *main;
+};
+
+static void *
+coro_init (void *args_)
+{
+ struct coro_init_args *args = (struct coro_init_args *)args_;
+ coro_func func = args->func;
+ void *arg = args->arg;
+
+ coro_transfer (args->self, args->main);
+ func (arg);
+
+ return 0;
+}
+
+void
+coro_transfer (coro_context *prev, coro_context *next)
+{
+ pthread_mutex_lock (&coro_mutex);
+
+ next->flags = 1;
+ pthread_cond_signal (&next->cv);
+
+ prev->flags = 0;
+
+ while (!prev->flags)
+ pthread_cond_wait (&prev->cv, &coro_mutex);
+
+ if (prev->flags == 2)
+ {
+ pthread_mutex_unlock (&coro_mutex);
+ pthread_cond_destroy (&prev->cv);
+ pthread_detach (pthread_self ());
+ pthread_exit (0);
+ }
+
+ pthread_mutex_unlock (&coro_mutex);
+}
+
+void
+coro_create (coro_context *ctx, coro_func coro, void *arg, void *sptr, size_t ssize)
+{
+ static coro_context nctx;
+ static int once;
+
+ if (!once)
+ {
+ once = 1;
+
+ pthread_cond_init (&nctx.cv, 0);
+ }
+
+ pthread_cond_init (&ctx->cv, 0);
+
+ if (coro)
+ {
+ pthread_attr_t attr;
+ struct coro_init_args args;
+ pthread_t id;
+
+ args.func = coro;
+ args.arg = arg;
+ args.self = ctx;
+ args.main = &nctx;
+
+ pthread_attr_init (&attr);
+#if __UCLIBC__
+ /* exists, but is borked */
+ /*pthread_attr_setstacksize (&attr, (size_t)ssize);*/
+#elif __CYGWIN__
+ /* POSIX, not here */
+ pthread_attr_setstacksize (&attr, (size_t)ssize);
+#else
+ pthread_attr_setstack (&attr, sptr, (size_t)ssize);
+#endif
+ pthread_attr_setscope (&attr, PTHREAD_SCOPE_PROCESS);
+ pthread_create (&id, &attr, coro_init, &args);
+
+ coro_transfer (args.main, args.self);
+ }
+}
+
+void
+coro_destroy (coro_context *ctx)
+{
+ pthread_mutex_lock (&coro_mutex);
+ ctx->flags = 2;
+ pthread_cond_signal (&ctx->cv);
+ pthread_mutex_unlock (&coro_mutex);
+}
+
+/*****************************************************************************/
+/* fiber backend */
+/*****************************************************************************/
+#elif CORO_FIBER
+
+#define WIN32_LEAN_AND_MEAN
+#if _WIN32_WINNT < 0x0400
+ #undef _WIN32_WINNT
+ #define _WIN32_WINNT 0x0400
+#endif
+#include <windows.h>
+
+VOID CALLBACK
+coro_init (PVOID arg)
+{
+ coro_context *ctx = (coro_context *)arg;
+
+ ctx->coro (ctx->arg);
+}
+
+void
+coro_transfer (coro_context *prev, coro_context *next)
+{
+ if (!prev->fiber)
+ {
+ prev->fiber = GetCurrentFiber ();
+
+ if (prev->fiber == 0 || prev->fiber == (void *)0x1e00)
+ prev->fiber = ConvertThreadToFiber (0);
+ }
+
+ SwitchToFiber (next->fiber);
+}
+
+void
+coro_create (coro_context *ctx, coro_func coro, void *arg, void *sptr, size_t ssize)
+{
+ ctx->fiber = 0;
+ ctx->coro = coro;
+ ctx->arg = arg;
+
+ if (!coro)
+ return;
+
+ ctx->fiber = CreateFiber (ssize, coro_init, ctx);
+}
+
+void
+coro_destroy (coro_context *ctx)
+{
+ DeleteFiber (ctx->fiber);
+}
+
+#else
+ #error unsupported backend
+#endif
+
+/*****************************************************************************/
+/* stack management */
+/*****************************************************************************/
+#if CORO_STACKALLOC
+
+#include <stdlib.h>
+
+#ifndef _WIN32
+# include <unistd.h>
+#endif
+
+#ifdef CORO_USE_VALGRIND
+# include <valgrind/valgrind.h>
+#endif
+
+#ifdef _POSIX_MAPPED_FILES
+# include <sys/mman.h>
+# define CORO_MMAP 1
+# ifndef MAP_ANONYMOUS
+# ifdef MAP_ANON
+# define MAP_ANONYMOUS MAP_ANON
+# else
+# undef CORO_MMAP
+# endif
+# endif
+# include <limits.h>
+#else
+# undef CORO_MMAP
+#endif
+
+#if _POSIX_MEMORY_PROTECTION
+# ifndef CORO_GUARDPAGES
+# define CORO_GUARDPAGES 4
+# endif
+#else
+# undef CORO_GUARDPAGES
+#endif
+
+#if !CORO_MMAP
+# undef CORO_GUARDPAGES
+#endif
+
+#if !defined (__i386__) && !defined (__x86_64__) && !defined (__powerpc__) && !defined (__arm__) && !defined (__aarch64__) && !defined (__m68k__) && !defined (__alpha__) && !defined (__mips__) && !defined (__sparc64__)
+# undef CORO_GUARDPAGES
+#endif
+
+#ifndef CORO_GUARDPAGES
+# define CORO_GUARDPAGES 0
+#endif
+
+#ifndef PAGESIZE
+ #if !CORO_MMAP
+ #define PAGESIZE 4096
+ #else
+ static size_t
+ coro_pagesize (void)
+ {
+ static size_t pagesize;
+
+ if (!pagesize)
+ pagesize = sysconf (_SC_PAGESIZE);
+
+ return pagesize;
+ }
+
+ #define PAGESIZE coro_pagesize ()
+ #endif
+#endif
+
+int
+coro_stack_alloc (struct coro_stack *stack, unsigned int size)
+{
+ if (!size)
+ size = 256 * 1024;
+
+ stack->sptr = 0;
+ stack->ssze = ((size_t)size * sizeof (void *) + PAGESIZE - 1) / PAGESIZE * PAGESIZE;
+
+#ifdef CORO_FIBER
+
+ stack->sptr = (void *)stack;
+ return 1;
+
+#else
+
+ size_t ssze = stack->ssze + CORO_GUARDPAGES * PAGESIZE;
+ void *base;
+
+ #if CORO_MMAP
+ /* mmap supposedly does allocate-on-write for us */
+ base = mmap (0, ssze, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ if (base == (void *)-1)
+ {
+ /* some systems don't let us have executable heap */
+ /* we assume they won't need executable stack in that case */
+ base = mmap (0, ssze, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ if (base == (void *)-1)
+ return 0;
+ }
+
+ #if CORO_GUARDPAGES
+ mprotect (base, CORO_GUARDPAGES * PAGESIZE, PROT_NONE);
+ #endif
+
+ base = (void*)((char *)base + CORO_GUARDPAGES * PAGESIZE);
+ #else
+ base = malloc (ssze);
+ if (!base)
+ return 0;
+ #endif
+
+ #ifdef CORO_USE_VALGRIND
+ stack->valgrind_id = VALGRIND_STACK_REGISTER ((char *)base, ((char *)base) + ssze - CORO_GUARDPAGES * PAGESIZE);
+ #endif
+
+ stack->sptr = base;
+ return 1;
+
+#endif
+}
+
+void
+coro_stack_free (struct coro_stack *stack)
+{
+#ifdef CORO_FIBER
+ /* nop */
+#else
+ #ifdef CORO_USE_VALGRIND
+ VALGRIND_STACK_DEREGISTER (stack->valgrind_id);
+ #endif
+
+ #if CORO_MMAP
+ if (stack->sptr)
+ munmap ((void*)((char *)stack->sptr - CORO_GUARDPAGES * PAGESIZE),
+ stack->ssze + CORO_GUARDPAGES * PAGESIZE);
+ #else
+ free (stack->sptr);
+ #endif
+#endif
+}
+
+#endif
+
diff --git a/lib/mailbox.c b/lib/mailbox.c
index 33bb242df..a78c53c0e 100644
--- a/lib/mailbox.c
+++ b/lib/mailbox.c
@@ -111,9 +111,13 @@ void
destroy_mailbox(struct mailbox *mb)
{
if (mb) {
- if (mb->ring)
+ if (mb->ring) {
rte_ring_free(mb->ring);
- if (mb->pool)
+ mb->ring = NULL;
+ }
+ if (mb->pool) {
rte_mempool_free(mb->pool);
+ mb->pool = NULL;
+ }
}
}
diff --git a/lua/gatekeeper/staticlib.lua b/lua/gatekeeper/staticlib.lua
index c3c1435dc..cae9c7b93 100644
--- a/lua/gatekeeper/staticlib.lua
+++ b/lua/gatekeeper/staticlib.lua
@@ -211,6 +211,8 @@ struct gk_config {
uint32_t log_ratelimit_interval_ms;
uint32_t log_ratelimit_burst;
unsigned int basic_measurement_logging_ms;
+ uint16_t co_max_num;
+ uint16_t co_stack_size_kb;
/* This struct has hidden fields. */
};
diff --git a/lua/gk.lua b/lua/gk.lua
index 057b98644..be3e452c4 100644
--- a/lua/gk.lua
+++ b/lua/gk.lua
@@ -42,8 +42,12 @@ return function (net_conf, lls_conf, sol_conf, gk_lcores)
local back_icmp_msgs_per_sec = 1000
local back_icmp_msgs_burst = 50
+ local co_max_num = 16
+
-- These variables are unlikely to need to be changed.
local bpf_enable_jit = true
+ -- CAUTION: stacks that are too small will crash the GK blocks.
+ local co_stack_size_kb = 16
--
-- End configuration of GK block.
@@ -100,6 +104,9 @@ return function (net_conf, lls_conf, sol_conf, gk_lcores)
gk_conf.back_max_pkt_burst =
staticlib.get_back_burst_config(max_pkt_burst_back, net_conf)
+ gk_conf.co_max_num = co_max_num
+ gk_conf.co_stack_size_kb = co_stack_size_kb
+
-- The maximum number of ARP or ND packets in LLS submitted by
-- GK or GT. The code below ensures that this parameter is at least
-- as large as the maximum value configured for GK.