diff --git a/Makefile b/Makefile index efc5b8950..eb33b01dc 100644 --- a/Makefile +++ b/Makefile @@ -36,7 +36,7 @@ SRCS-y := main/main.c SRCS-y += config/static.c config/dynamic.c SRCS-y += cps/main.c cps/kni.c cps/elf.c SRCS-y += ggu/main.c -SRCS-y += gk/main.c gk/fib.c gk/bpf.c +SRCS-y += gk/main.c gk/fib.c gk/bpf.c gk/co.c SRCS-y += gt/main.c gt/lua_lpm.c SRCS-y += lls/main.c lls/cache.c lls/arp.c lls/nd.c SRCS-y += sol/main.c @@ -44,12 +44,12 @@ SRCS-y += sol/main.c # Libraries. SRCS-y += lib/mailbox.c lib/net.c lib/flow.c lib/ipip.c \ lib/luajit-ffi-cdata.c lib/launch.c lib/lpm.c lib/acl.c lib/varip.c \ - lib/l2.c lib/ratelimit.c lib/memblock.c lib/log_ratelimit.c + lib/l2.c lib/ratelimit.c lib/memblock.c lib/log_ratelimit.c lib/coro.c LDLIBS += $(LDIR) -Bstatic -lluajit-5.1 -Bdynamic -lm -lmnl -lkmod CFLAGS += $(WERROR_FLAGS) -I${GATEKEEPER}/include -I/usr/local/include/luajit-2.0/ EXTRA_CFLAGS += -O3 -g -Wfatal-errors -DALLOW_EXPERIMENTAL_API \ - -Wno-deprecated-declarations + -Wno-deprecated-declarations -DCORO_ASM include $(RTE_SDK)/mk/rte.extapp.mk diff --git a/dependencies/dpdk b/dependencies/dpdk index bcc1e4fce..c637f7cd4 160000 --- a/dependencies/dpdk +++ b/dependencies/dpdk @@ -1 +1 @@ -Subproject commit bcc1e4fce82336ca39108ed4d54fb501af4a1b5a +Subproject commit c637f7cd452d750d6eb51bb2abf9de92a111fe60 diff --git a/gk/bpf.c b/gk/bpf.c index 16b09963b..2ffcdd913 100644 --- a/gk/bpf.c +++ b/gk/bpf.c @@ -106,12 +106,13 @@ static const struct rte_bpf_xsym flow_handler_init_xsym[] = { }; struct gk_bpf_pkt_frame { - uint64_t password; - struct flow_entry *fe; - struct ipacket *packet; - struct gk_config *gk_conf; - bool ready_to_tx; - struct gk_bpf_pkt_ctx ctx; + uint64_t password; + struct flow_entry *fe; + struct ipacket *packet; + struct gk_co *this_co; + bool pkt_part2_prefetched; + bool ready_to_tx; + struct gk_bpf_pkt_ctx ctx; }; static const uint64_t pkt_password = 0xa2e329ba8b15af05; @@ -199,6 +200,7 @@ gk_bpf_prep_for_tx(struct gk_bpf_pkt_ctx *ctx, int priority, int direct_if_possible) { int ret; + struct gatekeeper_if *back; struct gk_bpf_pkt_frame *frame = pkt_ctx_to_frame(ctx); if (unlikely(frame == NULL)) return -EINVAL; @@ -208,11 +210,18 @@ gk_bpf_prep_for_tx(struct gk_bpf_pkt_ctx *ctx, int priority, if (unlikely(priority < 0 || priority > PRIORITY_MAX)) return -EINVAL; + /* Prepare packet for transmission if needed. */ + if (likely(!frame->pkt_part2_prefetched)) { + frame->pkt_part2_prefetched = true; + if (likely(rte_mbuf_prefetch_part2_non_temporal( + frame->packet->pkt))) + gk_yield_next(frame->this_co); + } + + back = &frame->this_co->work->gk_conf->net->back; ret = (direct_if_possible != 0 && priority == PRIORITY_GRANTED) - ? update_pkt_priority(frame->packet, priority, - &frame->gk_conf->net->back) - : encapsulate(frame->packet->pkt, priority, - &frame->gk_conf->net->back, + ? 
update_pkt_priority(frame->packet, priority, back) + : encapsulate(frame->packet->pkt, priority, back, &frame->fe->grantor_fib->u.grantor.gt_addr); frame->ready_to_tx = ret == 0; @@ -486,7 +495,7 @@ parse_packet_further(struct ipacket *packet, struct gk_bpf_pkt_ctx *ctx) } int -gk_bpf_decide_pkt(struct gk_config *gk_conf, uint8_t program_index, +gk_bpf_decide_pkt(struct gk_co *this_co, uint8_t program_index, struct flow_entry *fe, struct ipacket *packet, uint64_t now, uint64_t *p_bpf_ret) { @@ -494,7 +503,8 @@ gk_bpf_decide_pkt(struct gk_config *gk_conf, uint8_t program_index, .password = pkt_password, .fe = fe, .packet = packet, - .gk_conf = gk_conf, + .this_co = this_co, + .pkt_part2_prefetched = false, .ready_to_tx = false, .ctx = { .now = now, @@ -502,7 +512,7 @@ gk_bpf_decide_pkt(struct gk_config *gk_conf, uint8_t program_index, }, }; const struct gk_bpf_flow_handler *handler = - &gk_conf->flow_handlers[program_index]; + &this_co->work->gk_conf->flow_handlers[program_index]; if (unlikely(handler->f_pkt == NULL)) { GK_LOG(WARNING, diff --git a/gk/bpf.h b/gk/bpf.h index f5c93e9ec..05cfd7f6d 100644 --- a/gk/bpf.h +++ b/gk/bpf.h @@ -20,6 +20,7 @@ #define _GATEKEEPER_GK_BPF_H_ #include "gatekeeper_gk.h" +#include "co.h" /* * Load the BPF program that handles flows into @gk_conf at @@ -32,7 +33,7 @@ int gk_load_bpf_flow_handler(struct gk_config *gk_conf, unsigned int index, const char *filename, int jit); -int gk_bpf_decide_pkt(struct gk_config *gk_conf, uint8_t program_index, +int gk_bpf_decide_pkt(struct gk_co *this_co, uint8_t program_index, struct flow_entry *fe, struct ipacket *packet, uint64_t now, uint64_t *p_bpf_ret); diff --git a/gk/co.c b/gk/co.c new file mode 100644 index 000000000..35ad7d941 --- /dev/null +++ b/gk/co.c @@ -0,0 +1,1121 @@ +/* + * Gatekeeper - DoS protection system. + * Copyright (C) 2016 Digirati LTDA. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include + +#include "gatekeeper_lls.h" + +#include "bpf.h" +#include "co.h" + +static struct gk_co * +get_next_co(struct gk_co *this_co) +{ + /* + * It is unlikely because as long as there is more than + * one working coroutine, there is at least 50% chance that + * @this_co is not the last working coroutine. + */ + if (unlikely(this_co->co_list.next == &this_co->work->working_cos)) { + /* @this_co is the last working co. */ + return list_first_entry(&this_co->work->working_cos, + struct gk_co, co_list); + } + return list_next_entry(this_co, co_list); +} + +void +gk_yield_next(struct gk_co *this_co) +{ + struct gk_co *next_co = get_next_co(this_co); + if (unlikely(this_co == next_co)) + return; + coro_transfer(&this_co->coro, &next_co->coro); +} + +/* + * If @task is added to @this_co->task_queue without a proper @task->task_hash, + * @task must be rescheduled once the proper @task->task_hash becomes known + * in order to avoid race conditions related to the proper @task->task_hash. 
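To make the new coroutine primitives easier to follow: gk_yield_next() above is what turns a pending cache miss into useful work, and gk_bpf_prep_for_tx() in gk/bpf.c now wraps it around rte_mbuf_prefetch_part2_non_temporal(). Below is a minimal sketch of the idiom; example_task() and do_real_work() are hypothetical names, only the helpers and fields come from this patch.

static void
example_task(struct gk_co *this_co, struct gk_co_task *task)
{
	struct flow_entry *fe = task->task_arg;

	/* Start pulling @fe into cache without polluting it. */
	rte_prefetch_non_temporal(fe);
	/* Run a sibling coroutine while the prefetch is in flight. */
	gk_yield_next(this_co);
	/* By the time control returns here, @fe is likely cached. */
	do_real_work(this_co, fe);
}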
+ * + * NOTICE: while a task is running without a proper @task->task_hash, + * the task must not use the leftover available because the task is likely + * running under a task hash that is different of its proper @task->task_hash. + */ +static void +reschedule_task(struct gk_co *this_co, struct gk_co_task *task) +{ + struct gk_co_work *work = this_co->work; + struct gk_co *task_owner_co = get_task_owner_co(work, task); + + __schedule_task(task_owner_co, task); + + if (list_poison(&task_owner_co->co_list)) + list_add_tail(&task_owner_co->co_list, &work->working_cos); +} + +static int +extract_packet_info(struct rte_mbuf *pkt, struct ipacket *packet) +{ + int ret = 0; + uint16_t ether_type; + size_t ether_len; + struct rte_ether_hdr *eth_hdr; + struct rte_ipv4_hdr *ip4_hdr; + struct rte_ipv6_hdr *ip6_hdr; + uint16_t pkt_len = rte_pktmbuf_data_len(pkt); + + eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *); + ether_type = rte_be_to_cpu_16(pkt_in_skip_l2(pkt, eth_hdr, + &packet->l3_hdr)); + ether_len = pkt_in_l2_hdr_len(pkt); + + switch (ether_type) { + case RTE_ETHER_TYPE_IPV4: + if (pkt_len < ether_len + sizeof(*ip4_hdr)) { + packet->flow.proto = 0; + GK_LOG(NOTICE, + "Packet is too short to be IPv4 (%" PRIu16 ")\n", + pkt_len); + ret = -1; + goto out; + } + + ip4_hdr = packet->l3_hdr; + packet->flow.proto = RTE_ETHER_TYPE_IPV4; + packet->flow.f.v4.src.s_addr = ip4_hdr->src_addr; + packet->flow.f.v4.dst.s_addr = ip4_hdr->dst_addr; + break; + + case RTE_ETHER_TYPE_IPV6: + if (pkt_len < ether_len + sizeof(*ip6_hdr)) { + packet->flow.proto = 0; + GK_LOG(NOTICE, + "Packet is too short to be IPv6 (%" PRIu16 ")\n", + pkt_len); + ret = -1; + goto out; + } + + ip6_hdr = packet->l3_hdr; + packet->flow.proto = RTE_ETHER_TYPE_IPV6; + rte_memcpy(packet->flow.f.v6.src.s6_addr, ip6_hdr->src_addr, + sizeof(packet->flow.f.v6.src.s6_addr)); + rte_memcpy(packet->flow.f.v6.dst.s6_addr, ip6_hdr->dst_addr, + sizeof(packet->flow.f.v6.dst.s6_addr)); + break; + + case RTE_ETHER_TYPE_ARP: + packet->flow.proto = RTE_ETHER_TYPE_ARP; + ret = -1; + break; + + default: + packet->flow.proto = 0; + log_unknown_l2("gk", ether_type); + ret = -1; + break; + } +out: + packet->pkt = pkt; + return ret; +} + +static int +drop_packet_front(struct rte_mbuf *pkt, struct gk_instance *instance) +{ + instance->traffic_stats.tot_pkts_num_dropped++; + instance->traffic_stats.tot_pkts_size_dropped += + rte_pktmbuf_pkt_len(pkt); + + return drop_packet(pkt); +} + +static int +parse_front_pkt(struct gk_co *this_co, + struct ipacket *packet, struct rte_mbuf *pkt) +{ + struct gk_co_work *work = this_co->work; + int ret; + + /* TODO Does this prefetch improve performance? + rte_mbuf_prefetch_part1_non_temporal(pkt); + gk_yield_next(this_co); + */ + /* + * This prefetch is enough to load Ethernet header (14 bytes), + * optional Ethernet VLAN header (8 bytes), and either + * an IPv4 header without options (20 bytes), or + * an IPv6 header without options (40 bytes). + * IPv4: 14 + 8 + 20 = 42 + * IPv6: 14 + 8 + 40 = 62 + rte_prefetch_non_temporal(rte_pktmbuf_mtod_offset(pkt, void *, 0)); + gk_yield_next(this_co); + */ + + ret = extract_packet_info(pkt, packet); + if (ret < 0) { + if (likely(packet->flow.proto == RTE_ETHER_TYPE_ARP)) { + struct gk_measurement_metrics *stats = + &work->instance->traffic_stats; + + stats->tot_pkts_num_distributed++; + stats->tot_pkts_size_distributed += + rte_pktmbuf_pkt_len(pkt); + + work->front_arp_bufs[work->front_num_arp++] = pkt; + return -1; + } + + /* Drop non-IP and non-ARP packets. 
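The rescheduling contract above is easiest to see with a task split into two halves, the same shape gk_co_process_front_pkt_software_rss() takes later in this file. The example_* names below are hypothetical; the helpers and fields are the ones introduced by this patch.

static void example_finish_with_hash(struct gk_co *this_co,
	struct gk_co_task *task);

static void
example_start_without_hash(struct gk_co *this_co, struct gk_co_task *task)
{
	struct ipacket *packet = task->task_arg;

	/* Hash-independent work only: parsing, validation, counters. */

	/* The proper hash is now known; finish under its owner coroutine. */
	task->task_hash = rss_ip_flow_hf(&packet->flow, 0, 0);
	task->task_func = example_finish_with_hash;
	reschedule_task(this_co, task);
}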
*/ + drop_packet_front(pkt, work->instance); + return -1; + } + + if (unlikely((packet->flow.proto == RTE_ETHER_TYPE_IPV4 && + !work->front_ipv4_configured) || + (packet->flow.proto == RTE_ETHER_TYPE_IPV6 && + !work->front_ipv6_configured))) { + drop_packet_front(pkt, work->instance); + return -1; + } + + return 0; +} + +#define START_PRIORITY (38) +/* Set @START_ALLOWANCE as the double size of a large DNS reply. */ +#define START_ALLOWANCE (8) + +static void +initialize_flow_entry(struct flow_entry *fe, struct ip_flow *flow, + uint32_t flow_hash_val, struct gk_fib *grantor_fib) +{ + /* + * The flow table is a critical data structure, so, + * whenever the size of entries grow too much, + * one must look for alternatives before increasing + * the limit below. + */ + RTE_BUILD_BUG_ON(sizeof(*fe) > 128); + + rte_memcpy(&fe->flow, flow, sizeof(*flow)); + + fe->in_use = true; + fe->flow_hash_val = flow_hash_val; + fe->state = GK_REQUEST; + fe->u.request.last_packet_seen_at = rte_rdtsc(); + fe->u.request.last_priority = START_PRIORITY; + fe->u.request.allowance = START_ALLOWANCE - 1; + fe->grantor_fib = grantor_fib; +} + +static inline void +reinitialize_flow_entry(struct flow_entry *fe, uint64_t now) +{ + fe->state = GK_REQUEST; + fe->u.request.last_packet_seen_at = now; + fe->u.request.last_priority = START_PRIORITY; + fe->u.request.allowance = START_ALLOWANCE - 1; +} + +static inline void +prefetch_flow_entry(struct flow_entry *fe) +{ +#if RTE_CACHE_LINE_SIZE == 64 + RTE_BUILD_BUG_ON(sizeof(*fe) <= RTE_CACHE_LINE_SIZE); + RTE_BUILD_BUG_ON(sizeof(*fe) > 2 * RTE_CACHE_LINE_SIZE); + rte_prefetch0(fe); + rte_prefetch0(((char *)fe) + RTE_CACHE_LINE_SIZE); +#elif RTE_CACHE_LINE_SIZE == 128 + RTE_BUILD_BUG_ON(sizeof(*fe) > RTE_CACHE_LINE_SIZE); + rte_prefetch0(fe); +#else +#error "Unsupported cache line size" +#endif +} + +/* We should avoid calling integer_log_base_2() with zero. */ +static inline uint8_t +integer_log_base_2(uint64_t delta_time) +{ +#if __WORDSIZE == 64 + return (8 * sizeof(uint64_t) - 1) - __builtin_clzl(delta_time); +#else + return (8 * sizeof(uint64_t) - 1) - __builtin_clzll(delta_time); +#endif +} + +/* + * It converts the difference of time between the current packet and + * the last seen packet into a given priority. + */ +static uint8_t +priority_from_delta_time(uint64_t present, uint64_t past) +{ + uint64_t delta_time; + + if (unlikely(present < past)) { + /* + * This should never happen, but we handle it gracefully here + * in order to keep going. + */ + GK_LOG(ERR, "The present time smaller than the past time\n"); + return 0; + } + + delta_time = (present - past) * picosec_per_cycle; + if (unlikely(delta_time < 1)) + return 0; + + return integer_log_base_2(delta_time); +} + +/* + * When a flow entry is at request state, all the GK block processing + * that entry does is to: + * (1) compute the priority of the packet. + * (2) encapsulate the packet as a request. + * (3) put this encapsulated packet in the request queue. 
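As a worked example of priority_from_delta_time() above (numbers purely illustrative): a request packet arriving 1 microsecond after the previous one gives delta_time = 10^6 picoseconds, and floor(log2(10^6)) = 19, so the packet is assigned priority 19 before the DSCP adjustment applied below; a source has to wait roughly twice as long between packets to move up by one priority level.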
+ */ +static void +gk_process_request(struct gk_co *this_co, struct flow_entry *fe, + struct ipacket *packet) +{ + int ret; + uint64_t now = rte_rdtsc(); + uint8_t priority = priority_from_delta_time(now, + fe->u.request.last_packet_seen_at); + struct rte_mbuf *pkt = packet->pkt; + struct gk_co_work *work = this_co->work; + struct gatekeeper_if *back = &work->gk_conf->net->back; + struct gk_fib *fib = fe->grantor_fib; + struct ether_cache *eth_cache; + + fe->u.request.last_packet_seen_at = now; + + /* + * The reason for using "<" instead of "<=" is that the equal case + * means that the source has waited enough time to have the same + * last priority, so it should be awarded with the allowance. + */ + if (priority < fe->u.request.last_priority && + fe->u.request.allowance > 0) { + fe->u.request.allowance--; + priority = fe->u.request.last_priority; + } else { + fe->u.request.last_priority = priority; + fe->u.request.allowance = START_ALLOWANCE - 1; + } + + /* + * Adjust @priority for the DSCP field. + * DSCP 0 for legacy packets; 1 for granted packets; + * 2 for capability renew; 3-63 for requests. + */ + priority += PRIORITY_REQ_MIN; + if (unlikely(priority > PRIORITY_MAX)) + priority = PRIORITY_MAX; + + /* The assigned priority is @priority. */ + + /* Prepare packet for transmission. */ + if (likely(rte_mbuf_prefetch_part2_non_temporal(pkt))) + gk_yield_next(this_co); + + /* Encapsulate the packet as a request. */ + ret = encapsulate(pkt, priority, back, &fib->u.grantor.gt_addr); + if (ret < 0) + goto drop_pkt; + + eth_cache = fib->u.grantor.eth_cache; + RTE_VERIFY(eth_cache != NULL); + /* If needed, packet header space was adjusted by encapsulate(). */ + if (pkt_copy_cached_eth_header(pkt, eth_cache, back->l2_len_out)) + goto drop_pkt; + + pkt->udata64 = priority; + work->front_req_bufs[work->front_num_req++] = pkt; + return; + +drop_pkt: + drop_packet_front(pkt, work->instance); +} + +static void +gk_process_granted(struct gk_co *this_co, struct flow_entry *fe, + struct ipacket *packet) +{ + int ret; + bool renew_cap; + uint8_t priority = PRIORITY_GRANTED; + uint64_t now = rte_rdtsc(); + struct rte_mbuf *pkt = packet->pkt; + struct gk_fib *fib = fe->grantor_fib; + struct gk_co_work *work = this_co->work; + struct gatekeeper_if *back = &work->gk_conf->net->back; + struct gk_measurement_metrics *stats; + struct ether_cache *eth_cache; + uint32_t pkt_len; + + if (now >= fe->u.granted.cap_expire_at) { + reinitialize_flow_entry(fe, now); + return gk_process_request(this_co, fe, packet); + } + + if (now >= fe->u.granted.budget_renew_at) { + fe->u.granted.budget_renew_at = now + cycles_per_sec; + fe->u.granted.budget_byte = + (uint64_t)fe->u.granted.tx_rate_kib_cycle * 1024; + } + + stats = &work->instance->traffic_stats; + + pkt_len = rte_pktmbuf_pkt_len(pkt); + if (pkt_len > fe->u.granted.budget_byte) { + stats->pkts_num_declined++; + stats->pkts_size_declined += pkt_len; + goto drop_pkt; + } + + fe->u.granted.budget_byte -= pkt_len; + renew_cap = now >= fe->u.granted.send_next_renewal_at; + if (renew_cap) { + fe->u.granted.send_next_renewal_at = now + + fe->u.granted.renewal_step_cycle; + priority = PRIORITY_RENEW_CAP; + } + + /* Prepare packet for transmission. */ + if (likely(rte_mbuf_prefetch_part2_non_temporal(pkt))) + gk_yield_next(this_co); + + /* + * Encapsulate packet as a granted packet, + * mark it as a capability renewal request if @renew_cap is true, + * enter destination according to @fe->grantor_fib. 
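A worked example of the allowance logic above (values illustrative): a flow with last_priority = 20 and allowance = 3 that sends a packet computing to priority 18 keeps priority 20 and drops its allowance to 2; a packet computing to 20 or more keeps the newly computed priority, stores it as last_priority, and resets the allowance to START_ALLOWANCE - 1 = 7. The DSCP carried on the wire is then that priority shifted by PRIORITY_REQ_MIN and capped at PRIORITY_MAX.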
+ */ + ret = encapsulate(pkt, priority, back, &fib->u.grantor.gt_addr); + if (ret < 0) + goto drop_pkt; + + eth_cache = fib->u.grantor.eth_cache; + RTE_VERIFY(eth_cache != NULL); + /* If needed, packet header space was adjusted by encapsulate(). */ + if (pkt_copy_cached_eth_header(pkt, eth_cache, back->l2_len_out)) + goto drop_pkt; + + stats->pkts_num_granted++; + stats->pkts_size_granted += pkt_len; + work->tx_back_pkts[work->tx_back_num_pkts++] = pkt; + return; + +drop_pkt: + drop_packet_front(pkt, work->instance); +} + +static void +gk_process_declined(struct gk_co *this_co, struct flow_entry *fe, + struct ipacket *packet) +{ + uint64_t now = rte_rdtsc(); + struct gk_co_work *work = this_co->work; + struct gk_measurement_metrics *stats; + + if (unlikely(now >= fe->u.declined.expire_at)) { + reinitialize_flow_entry(fe, now); + return gk_process_request(this_co, fe, packet); + } + + stats = &work->instance->traffic_stats; + stats->pkts_num_declined++; + stats->pkts_size_declined += rte_pktmbuf_pkt_len(packet->pkt); + drop_packet_front(packet->pkt, work->instance); +} + +static void +gk_process_bpf(struct gk_co *this_co, struct flow_entry *fe, + struct ipacket *packet) +{ + struct rte_mbuf *pkt = packet->pkt; + struct gk_co_work *work = this_co->work; + struct gk_measurement_metrics *stats; + uint64_t bpf_ret; + int program_index, rc; + uint64_t now = rte_rdtsc(); + + if (unlikely(now >= fe->u.bpf.expire_at)) + goto expired; + + program_index = fe->program_index; + rc = gk_bpf_decide_pkt(this_co, program_index, fe, packet, now, + &bpf_ret); + if (unlikely(rc != 0)) { + GK_LOG(WARNING, + "The BPF program at index %u failed to run its function pkt\n", + program_index); + goto expired; + } + + stats = &work->instance->traffic_stats; + switch (bpf_ret) { + case GK_BPF_PKT_RET_FORWARD: { + struct ether_cache *eth_cache = + fe->grantor_fib->u.grantor.eth_cache; + RTE_VERIFY(eth_cache != NULL); + /* + * If needed, encapsulate() already adjusted + * packet header space. + */ + if (pkt_copy_cached_eth_header(pkt, eth_cache, + work->gk_conf->net->back.l2_len_out)) + goto drop_pkt; + + stats->pkts_num_granted++; + stats->pkts_size_granted += rte_pktmbuf_pkt_len(pkt); + work->tx_back_pkts[work->tx_back_num_pkts++] = pkt; + return; + } + case GK_BPF_PKT_RET_DECLINE: + stats->pkts_num_declined++; + stats->pkts_size_declined += rte_pktmbuf_pkt_len(pkt); + goto drop_pkt; + case GK_BPF_PKT_RET_ERROR: + GK_LOG(WARNING, + "The function pkt of the BPF program at index %u returned GK_BPF_PKT_RET_ERROR\n", + program_index); + goto drop_pkt; + default: + GK_LOG(WARNING, + "The function pkt of the BPF program at index %u returned an invalid return: %" PRIu64 "\n", + program_index, bpf_ret); + goto drop_pkt; + } + + rte_panic("Unexpected condition at %s()", __func__); + +expired: + reinitialize_flow_entry(fe, now); + return gk_process_request(this_co, fe, packet); + +drop_pkt: + drop_packet_front(pkt, work->instance); +} + +static void +process_flow_entry(struct gk_co *this_co, struct flow_entry *fe, + struct ipacket *packet) +{ + /* + * Some notes regarding flow rates and units: + * + * Flows in the GK_REQUEST state are bandwidth limited + * to an overall rate relative to the link. Therefore, + * the Ethernet frame overhead is counted toward the + * credits used by requests. The request channel rate + * is measured in megabits (base 10) per second to + * match the units used by hardware specifications. 
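To make the granted-flow budget arithmetic in gk_process_granted() above concrete (illustrative numbers): with tx_rate_kib_cycle = 100, each renewal sets budget_byte = 100 * 1024 = 102,400 bytes and pushes budget_renew_at one second of TSC cycles ahead; a 1,500-byte packet then leaves 100,900 bytes for the remainder of that second, and a packet larger than the remaining budget is declined outright rather than partially charged.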
+ * + * Granted flows (in state GK_GRANTED or sometimes + * GK_BPF) are allocated budgets that are intended + * to reflect the max throughput of the flow, and + * therefore do not include the Ethernet frame overhead. + * The budgets of granted flows are measured in + * kibibytes (base 2). + */ + switch (fe->state) { + case GK_REQUEST: + return gk_process_request(this_co, fe, packet); + + case GK_GRANTED: + return gk_process_granted(this_co, fe, packet); + + case GK_DECLINED: + return gk_process_declined(this_co, fe, packet); + + case GK_BPF: + return gk_process_bpf(this_co, fe, packet); + + default: + GK_LOG(ERR, "Unknown flow state: %d\n", fe->state); + drop_packet_front(packet->pkt, this_co->work->instance); + return; + } + + rte_panic("Unexpected condition at %s()\n", __func__); +} + +typedef int (*packet_drop_cb_func)(struct rte_mbuf *pkt, + struct gk_instance *instance); + +static void +xmit_icmp(struct gatekeeper_if *iface, struct ipacket *packet, + uint16_t *num_pkts, struct rte_mbuf **icmp_bufs, + struct gk_instance *instance, packet_drop_cb_func cb_f) +{ + struct rte_ether_addr eth_addr_tmp; + struct rte_ether_hdr *icmp_eth; + struct rte_ipv4_hdr *icmp_ipv4; + struct rte_icmp_hdr *icmph; + struct rte_mbuf *pkt = packet->pkt; + int icmp_pkt_len = iface->l2_len_out + sizeof(struct rte_ipv4_hdr) + + sizeof(struct rte_icmp_hdr); + if (pkt->data_len >= icmp_pkt_len) { + int ret = rte_pktmbuf_trim(pkt, pkt->data_len - icmp_pkt_len); + if (ret < 0) { + GK_LOG(ERR, + "Failed to remove %d bytes of data at the end of the mbuf at %s", + pkt->data_len - icmp_pkt_len, __func__); + cb_f(pkt, instance); + return; + } + + icmp_eth = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *); + } else { + icmp_eth = (struct rte_ether_hdr *)rte_pktmbuf_append(pkt, + icmp_pkt_len - pkt->data_len); + if (icmp_eth == NULL) { + GK_LOG(ERR, + "Failed to append %d bytes of new data: not enough headroom space in the first segment at %s\n", + icmp_pkt_len - pkt->data_len, __func__); + cb_f(pkt, instance); + return; + } + } + + rte_ether_addr_copy(&icmp_eth->s_addr, ð_addr_tmp); + rte_ether_addr_copy(&icmp_eth->d_addr, &icmp_eth->s_addr); + rte_ether_addr_copy(ð_addr_tmp, &icmp_eth->d_addr); + if (iface->vlan_insert) { + fill_vlan_hdr(icmp_eth, iface->vlan_tag_be, + RTE_ETHER_TYPE_IPV4); + } + + icmp_ipv4 = (struct rte_ipv4_hdr *)pkt_out_skip_l2(iface, icmp_eth); + icmp_ipv4->version_ihl = IP_VHL_DEF; + icmp_ipv4->type_of_service = 0; + icmp_ipv4->packet_id = 0; + icmp_ipv4->fragment_offset = IP_DN_FRAGMENT_FLAG; + icmp_ipv4->time_to_live = IP_DEFTTL; + icmp_ipv4->next_proto_id = IPPROTO_ICMP; + icmp_ipv4->src_addr = packet->flow.f.v4.dst.s_addr; + icmp_ipv4->dst_addr = packet->flow.f.v4.src.s_addr; + icmp_ipv4->total_length = rte_cpu_to_be_16(pkt->data_len - + iface->l2_len_out); + /* + * The IP header checksum filed must be set to 0 + * in order to offload the checksum calculation. 
+ */ + icmp_ipv4->hdr_checksum = 0; + pkt->l2_len = iface->l2_len_out; + pkt->l3_len = sizeof(struct rte_ipv4_hdr); + pkt->ol_flags |= PKT_TX_IPV4 | PKT_TX_IP_CKSUM; + + icmph = (struct rte_icmp_hdr *)&icmp_ipv4[1]; + icmph->icmp_type = ICMP_TIME_EXCEEDED; + icmph->icmp_code = ICMP_EXC_TTL; + icmph->icmp_cksum = 0; + icmph->icmp_ident = 0; + icmph->icmp_seq_nb = 0; + icmph->icmp_cksum = icmp_cksum(icmph, sizeof(*icmph)); + + icmp_bufs[*num_pkts] = pkt; + (*num_pkts)++; +} + +static void +xmit_icmpv6(struct gatekeeper_if *iface, struct ipacket *packet, + uint16_t *num_pkts, struct rte_mbuf **icmp_bufs, + struct gk_instance *instance, packet_drop_cb_func cb_f) +{ + struct rte_ether_addr eth_addr_tmp; + struct rte_ether_hdr *icmp_eth; + struct rte_ipv6_hdr *icmp_ipv6; + struct icmpv6_hdr *icmpv6_hdr; + struct rte_mbuf *pkt = packet->pkt; + int icmpv6_pkt_len = iface->l2_len_out + sizeof(struct rte_ipv6_hdr) + + sizeof(struct icmpv6_hdr); + if (pkt->data_len >= icmpv6_pkt_len) { + int ret = rte_pktmbuf_trim(pkt, + pkt->data_len - icmpv6_pkt_len); + if (ret < 0) { + GK_LOG(ERR, + "Failed to remove %d bytes of data at the end of the mbuf at %s", + pkt->data_len - icmpv6_pkt_len, __func__); + cb_f(pkt, instance); + return; + } + + icmp_eth = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *); + } else { + icmp_eth = (struct rte_ether_hdr *)rte_pktmbuf_append(pkt, + icmpv6_pkt_len - pkt->data_len); + if (icmp_eth == NULL) { + GK_LOG(ERR, + "Failed to append %d bytes of new data: not enough headroom space in the first segment at %s\n", + icmpv6_pkt_len - pkt->data_len, __func__); + cb_f(pkt, instance); + return; + } + } + + rte_ether_addr_copy(&icmp_eth->s_addr, ð_addr_tmp); + rte_ether_addr_copy(&icmp_eth->d_addr, &icmp_eth->s_addr); + rte_ether_addr_copy(ð_addr_tmp, &icmp_eth->d_addr); + if (iface->vlan_insert) { + fill_vlan_hdr(icmp_eth, iface->vlan_tag_be, + RTE_ETHER_TYPE_IPV6); + } + + /* Set-up IPv6 header. */ + icmp_ipv6 = (struct rte_ipv6_hdr *)pkt_out_skip_l2(iface, icmp_eth); + icmp_ipv6->vtc_flow = rte_cpu_to_be_32(IPv6_DEFAULT_VTC_FLOW); + icmp_ipv6->payload_len = rte_cpu_to_be_16(sizeof(*icmpv6_hdr)); + icmp_ipv6->proto = IPPROTO_ICMPV6; + /* + * The IP Hop Limit field must be 255 as required by + * RFC 4861, sections 7.1.1 and 7.1.2. + */ + icmp_ipv6->hop_limits = 255; + rte_memcpy(icmp_ipv6->src_addr, packet->flow.f.v6.dst.s6_addr, + sizeof(icmp_ipv6->src_addr)); + rte_memcpy(icmp_ipv6->dst_addr, packet->flow.f.v6.src.s6_addr, + sizeof(icmp_ipv6->dst_addr)); + + /* Set-up ICMPv6 header. */ + icmpv6_hdr = (struct icmpv6_hdr *)&icmp_ipv6[1]; + icmpv6_hdr->type = ICMPV6_TIME_EXCEED; + icmpv6_hdr->code = ICMPV6_EXC_HOPLIMIT; + icmpv6_hdr->cksum = 0; /* Calculated below. */ + + icmpv6_hdr->cksum = rte_ipv6_icmpv6_cksum(icmp_ipv6, icmpv6_hdr); + + icmp_bufs[*num_pkts] = pkt; + (*num_pkts)++; +} + +/* + * For IPv4, according to the RFC 1812 section 5.3.1 Time to Live (TTL), + * if the TTL is reduced to zero (or less), the packet MUST be + * discarded, and if the destination is not a multicast address the + * router MUST send an ICMP Time Exceeded message, Code 0 (TTL Exceeded + * in Transit) message to the source. + * + * For IPv6, according to the RFC 1883 section 4.4, + * if the IPv6 Hop Limit is less than or equal to 1, then the router needs to + * send an ICMP Time Exceeded -- Hop Limit Exceeded in Transit message to + * the Source Address and discard the packet. 
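A detail of update_ip_hop_count() below that is easy to miss: when the IPv4 TTL is decremented, the header checksum is patched in place with ++hdr_checksum instead of being recomputed. A rough justification, stated as an assumption about little-endian hosts and glossing over one's-complement corner cases (RFC 1624):

/*
 * TTL shares a 16-bit header word with the protocol field and sits in
 * its high-order byte, so --time_to_live lowers that big-endian word by
 * 0x0100. On a little-endian host, ++hdr_checksum raises the stored
 * big-endian checksum by 0x0100, compensating for the change; rare
 * one's-complement wrap-around cases are ignored by this shortcut.
 */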
+ */ +static int +update_ip_hop_count(struct gatekeeper_if *iface, struct ipacket *packet, + uint16_t *num_pkts, struct rte_mbuf **icmp_bufs, + struct token_bucket_ratelimit_state *rs, struct gk_instance *instance, + packet_drop_cb_func cb_f) +{ + if (packet->flow.proto == RTE_ETHER_TYPE_IPV4) { + struct rte_ipv4_hdr *ipv4_hdr = packet->l3_hdr; + if (ipv4_hdr->time_to_live <= 1) { + if (tb_ratelimit_allow(rs)) { + xmit_icmp(iface, packet, num_pkts, + icmp_bufs, instance, cb_f); + } else + cb_f(packet->pkt, instance); + return -ETIMEDOUT; + } + + --(ipv4_hdr->time_to_live); + ++(ipv4_hdr->hdr_checksum); + } else if (likely(packet->flow.proto == RTE_ETHER_TYPE_IPV6)) { + struct rte_ipv6_hdr *ipv6_hdr = packet->l3_hdr; + if (ipv6_hdr->hop_limits <= 1) { + if (tb_ratelimit_allow(rs)) { + xmit_icmpv6(iface, packet, num_pkts, + icmp_bufs, instance, cb_f); + } else + cb_f(packet->pkt, instance); + return -ETIMEDOUT; + } + + --(ipv6_hdr->hop_limits); + } else { + GK_LOG(WARNING, + "Unexpected condition at %s: unknown flow type %hu\n", + __func__, packet->flow.proto); + cb_f(packet->pkt, instance); + return -EINVAL; + } + + return 0; +} + +static void +forward_pkt_to_back(struct ipacket *packet, struct ether_cache *eth_cache, + struct gk_co_work *work) +{ + struct rte_mbuf *pkt = packet->pkt; + struct gatekeeper_if *front = &work->gk_conf->net->front; + struct gatekeeper_if *back = &work->gk_conf->net->back; + + if (adjust_pkt_len(pkt, back, 0) == NULL || + pkt_copy_cached_eth_header(pkt, eth_cache, + back->l2_len_out)) { + drop_packet_front(pkt, work->instance); + return; + } + + if (update_ip_hop_count(front, packet, + &work->tx_front_num_pkts, work->tx_front_pkts, + &work->instance->front_icmp_rs, work->instance, + drop_packet_front) < 0) + return; + + work->tx_back_pkts[work->tx_back_num_pkts++] = pkt; +} + +static struct gk_fib * +look_up_fib(struct gk_lpm *ltbl, struct ip_flow *flow) +{ + int fib_id; + + if (flow->proto == RTE_ETHER_TYPE_IPV4) { + fib_id = lpm_lookup_ipv4(ltbl->lpm, flow->f.v4.dst.s_addr); + if (fib_id < 0) + return NULL; + return <bl->fib_tbl[fib_id]; + } + + if (likely(flow->proto == RTE_ETHER_TYPE_IPV6)) { + fib_id = lpm_lookup_ipv6(ltbl->lpm6, &flow->f.v6.dst); + if (fib_id < 0) + return NULL; + return <bl->fib_tbl6[fib_id]; + } + + rte_panic("Unexpected condition at %s: unknown flow type %hu\n", + __func__, flow->proto); + + return NULL; /* Unreachable. */ +} + +static struct flow_entry * +lookup_fe_from_lpm(struct ipacket *packet, uint32_t ip_flow_hash_val, + struct gk_co_work *work) +{ + struct rte_mbuf *pkt = packet->pkt; + + /* + * A prefetch is not needed here because current deployments of + * Gatekeeper servers have only a couple of FIB entries forwarding + * traffic from front to back interfaces. 
+ */ + struct gk_fib *fib = look_up_fib(&work->gk_conf->lpm_tbl, + &packet->flow); + + if (fib == NULL || fib->action == GK_FWD_NEIGHBOR_FRONT_NET) { + struct gk_measurement_metrics *stats = + &work->instance->traffic_stats; + if (packet->flow.proto == RTE_ETHER_TYPE_IPV4) { + stats->tot_pkts_num_distributed++; + stats->tot_pkts_size_distributed += + rte_pktmbuf_pkt_len(pkt); + add_pkt_acl(&work->front_acl4, pkt); + } else if (likely(packet->flow.proto == + RTE_ETHER_TYPE_IPV6)) { + stats->tot_pkts_num_distributed++; + stats->tot_pkts_size_distributed += + rte_pktmbuf_pkt_len(pkt); + add_pkt_acl(&work->front_acl6, pkt); + } else { + print_flow_err_msg(&packet->flow, + "gk: failed to get the fib entry"); + drop_packet_front(pkt, work->instance); + } + return NULL; + } + + switch (fib->action) { + case GK_FWD_GRANTOR: { + struct flow_entry *fe = &work->temp_fes[work->temp_fes_num++]; + initialize_flow_entry(fe, &packet->flow, ip_flow_hash_val, fib); + return fe; + } + + case GK_FWD_GATEWAY_BACK_NET: { + /* + * The entry instructs to forward its packets to + * the gateway in the back network. + */ + struct ether_cache *eth_cache = fib->u.gateway.eth_cache; + RTE_VERIFY(eth_cache != NULL); + forward_pkt_to_back(packet, eth_cache, work); + return NULL; + } + + case GK_FWD_NEIGHBOR_BACK_NET: { + /* + * The entry instructs to forward its packets to + * the neighbor in the back network. + */ + struct ether_cache *eth_cache = + (packet->flow.proto == RTE_ETHER_TYPE_IPV4) + ? lookup_ether_cache(&fib->u.neigh, + &packet->flow.f.v4.dst) + : lookup_ether_cache(&fib->u.neigh6, + &packet->flow.f.v6.dst); + RTE_VERIFY(eth_cache != NULL); + forward_pkt_to_back(packet, eth_cache, work); + return NULL; + } + + case GK_DROP: + /* FALLTHROUGH */ + default: + drop_packet_front(pkt, work->instance); + return NULL; + } + + return NULL; +} + +static void +prefetch_and_yield(void *addr, void *this_co) +{ + rte_prefetch_non_temporal(addr); + gk_yield_next(this_co); +} + +static void +gk_co_process_front_pkt_final(struct gk_co *this_co, struct gk_co_task *task) +{ + struct ipacket *packet = task->task_arg; + struct gk_co_work *work = this_co->work; + uint32_t ip_flow_hash_val = task->task_hash; + struct flow_entry *fe_leftover = + get_fe_leftover(work, ip_flow_hash_val); + struct flow_entry *fe; + int ret; + + /* Is leftover useful? */ + if (fe_leftover != NULL && + fe_leftover->flow_hash_val == ip_flow_hash_val && + ip_flow_cmp_eq(&fe_leftover->flow, + &packet->flow, 0) == 0) { + /* Jackpot! Deal with @pkt right away. */ + process_flow_entry(this_co, fe_leftover, packet); + return; + } + + /* Look up flow entry. */ + ret = rte_hash_lookup_and_yield_with_hash( + work->instance->ip_flow_hash_table, &packet->flow, + ip_flow_hash_val, prefetch_and_yield, this_co); + if (ret >= 0) { + fe = &work->instance->ip_flow_entry_table[ret]; + /* TODO Break this prefetch into part1 and part2. 
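Stepping back, gk_co_process_front_pkt_final() above is a three-level search: the per-batch leftover cache, then the flow hash table (prefetched bucket by bucket while yielding to sibling coroutines), then the LPM/FIB via lookup_fe_from_lpm(). The leftover cache is deliberately tiny: with an illustrative leftover_mask of 7 it has 8 buckets, an entry lands in bucket flow_hash_val & 7, and save_fe_leftover() lets a newer entry silently evict an older one.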
*/ + prefetch_flow_entry(fe); + gk_yield_next(this_co); + process_flow_entry(this_co, fe, packet); + save_fe_leftover(work, fe); + return; + } + if (unlikely(ret != -ENOENT)) { + char err_msg[1024]; + + ret = snprintf(err_msg, sizeof(err_msg), + "gk: failed to look up flow state at %s with lcore %u: %i\n", + __func__, rte_lcore_id(), ret); + + RTE_VERIFY(ret > 0 && ret < (int)sizeof(err_msg)); + print_flow_err_msg(&packet->flow, err_msg); + return; + } + + fe = lookup_fe_from_lpm(packet, ip_flow_hash_val, work); + if (fe == NULL) + return; + process_flow_entry(this_co, fe, packet); + save_fe_leftover(work, fe); +} + +void +gk_co_process_front_pkt_software_rss(struct gk_co *this_co, + struct gk_co_task *task) +{ + struct ipacket *packet = task->task_arg; + + if (parse_front_pkt(this_co, packet, packet->pkt) != 0) + return; + + /* Finish up the work with the correct hash value. */ + task->task_hash = rss_ip_flow_hf(&packet->flow, 0, 0); + task->task_func = gk_co_process_front_pkt_final; + reschedule_task(this_co, task); +} + +void +gk_co_process_front_pkt(struct gk_co *this_co, struct gk_co_task *task) +{ + struct ipacket packet; + + if (parse_front_pkt(this_co, &packet, task->task_arg) != 0) + return; + task->task_arg = &packet; + gk_co_process_front_pkt_final(this_co, task); +} + +static void +gk_co_scan_flow_table_final(struct gk_co *this_co, struct gk_co_task *task) +{ + struct gk_co_work *work = this_co->work; + struct flow_entry *fe = task->task_arg; + struct flow_entry **leftover_bucket = get_fe_leftover_bucket(work, fe); + + RTE_VERIFY(work->del_fe == NULL); + work->del_fe = fe; + + /* Deal with the leftover. */ + if (unlikely(*leftover_bucket == fe)) { + /* One does not need to look up again. */ + return; + } + *leftover_bucket = fe; + + /* Prefetch buckets to remove the flow entry later. */ + rte_hash_lookup_and_yield_with_hash(work->instance->ip_flow_hash_table, + &fe->flow, fe->flow_hash_val, prefetch_and_yield, this_co); +} + +static bool +is_flow_expired(struct flow_entry *fe, uint64_t now) +{ + switch(fe->state) { + case GK_REQUEST: + if (fe->u.request.last_packet_seen_at > now) { + char err_msg[128]; + int ret = snprintf(err_msg, sizeof(err_msg), + "gk: buggy condition at %s: wrong timestamp", + __func__); + RTE_VERIFY(ret > 0 && ret < (int)sizeof(err_msg)); + print_flow_err_msg(&fe->flow, err_msg); + return true; + } + + /* + * A request entry is considered expired if it is not + * doubling its waiting time. We use +2 instead of +1 in + * the test below to account for random delays in the network. + */ + return priority_from_delta_time(now, + fe->u.request.last_packet_seen_at) > + fe->u.request.last_priority + 2; + case GK_GRANTED: + return now >= fe->u.granted.cap_expire_at; + case GK_DECLINED: + return now >= fe->u.declined.expire_at; + case GK_BPF: + return now >= fe->u.bpf.expire_at; + default: + return true; + } +} + +void +gk_co_scan_flow_table(struct gk_co *this_co, struct gk_co_task *task) +{ + struct flow_entry *fe = task->task_arg; + + /* + * Only one prefetch is needed here because one only needs + * the beginning of a struct flow_entry to + * check if it's expired. + */ + rte_prefetch_non_temporal(fe); + gk_yield_next(this_co); + + if (!fe->in_use || !is_flow_expired(fe, rte_rdtsc())) + return; + + /* Finish up the work with the correct hash value. 
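A concrete reading of the request-state expiry rule in is_flow_expired() above (numbers illustrative): a request flow whose last packet earned priority 19 is only treated as expired once priority_from_delta_time() exceeds 21, i.e. once the source has stayed silent for at least four times the spacing that earned it priority 19; the "+2" slack absorbs ordinary network jitter.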
*/ + task->task_hash = fe->flow_hash_val; + task->task_func = gk_co_scan_flow_table_final; + reschedule_task(this_co, task); +} + +static struct gk_co_task * +next_task(struct gk_co *this_co) +{ + while (true) { + struct gk_co *next_co; + + /* + * This test is likely because if @this_co has at least + * one task, there's at least 50% that it will be true because + * this function is called twice. + */ + if (likely(!list_empty(&this_co->task_queue))) { + /* + * @this_co has assigned tasks. + * Return the first assigned task. + */ + struct gk_co_task *task = list_first_entry( + &this_co->task_queue, struct gk_co_task, + task_list); + list_del(&task->task_list); + return task; + } + + /* There is no more tasks assigned to @this_co. */ + + next_co = get_next_co(this_co); + + /* Make @this_co idle. */ + list_del(&this_co->co_list); + + /* Transfer control to another coroutine. */ + if (likely(this_co != next_co)) { + /* + * @this_co is NOT the last working coroutine. + * Yield to the next coroutine. + */ + coro_transfer(&this_co->coro, &next_co->coro); + } else { + /* + * No more work and no more working coroutines; + * @this_co is the last working coroutine. + * Return to the main coroutine. + */ + coro_transfer(&this_co->coro, + &this_co->work->instance->coro_root); + } + } +} + +void +gk_co_main(void *arg) +{ + struct gk_co *this_co = arg; + struct gk_co_task *task = next_task(this_co); + + while (likely(task != NULL)) { + task->task_func(this_co, task); + task = next_task(this_co); + } + + rte_panic("%s() terminated\n", __func__); +} diff --git a/gk/co.h b/gk/co.h new file mode 100644 index 000000000..6ed27033a --- /dev/null +++ b/gk/co.h @@ -0,0 +1,290 @@ +/* + * Gatekeeper - DoS protection system. + * Copyright (C) 2016 Digirati LTDA. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef _GATEKEEPER_GK_CO_H_ +#define _GATEKEEPER_GK_CO_H_ + +#include +#include +#include +#include + +#include "gatekeeper_gk.h" +#include "gatekeeper_acl.h" + +struct gk_co { + /* + * Attach this coroutine to work->working_cos while + * this coroutine is working. + */ + struct list_head co_list; + /* structs from libcoro. */ + struct coro_stack stack; + struct coro_context coro; + /* Task assigned to this coroutine. */ + struct list_head task_queue; + struct gk_co_work *work; +}; + +struct gk_co_task *task; + +typedef void (*gk_co_task_func_t)(struct gk_co *this_co, + struct gk_co_task *task); + +struct gk_co_task { + /* + * Once the task is assigned to a coroutine, + * attach this task to co->task_queue. + */ + struct list_head task_list; + /* + * @task_hash is used to assign the task to a coroutine. + * + * This is important to avoid race conditions between coroutines. 
+ * For example, assume that two tasks that are going to work on + * the same flow entry are assigned to two different coroutines, and + * that the corresponding flow entry is not available in + * the flow table, both coroutines may try to add the same flow entry. + * If these two tasks share the same task hash, both tasks are going to + * be assigned to the same coroutine. + */ + uint32_t task_hash; + void *task_arg; + gk_co_task_func_t task_func; +}; + +struct gk_co_work { + /* The coroutines working on the tasks. */ + struct list_head working_cos; + /* Coroutines available to do the work. */ + struct gk_co *cos; + /* Number of coroutines available for the next batch of work. */ + uint16_t co_num; + /* Total number of coroutines available at field @cos. */ + uint16_t co_max_num; + /* Index of the next coroutine to use when a task has no task hash. */ + uint16_t any_co_index; + /* How field @co_num will change for the next batch of work. */ + int16_t co_delta_num; + /* + * Previous value of field @co_num. + * When the value of this field is zero, an invalid value for @co_num, + * the value of field @avg_cycles_per_task is not meaningful. + */ + uint16_t co_prv_num; + /* + * Average number of cycles per task when @co_num was equal to + * @co_prv_num. + */ + double avg_cycles_per_task; + + struct gk_config *gk_conf; + struct gk_instance *instance; + + /* All preallocated tasks available to do work. */ + struct gk_co_task *all_tasks; + /* The total number of taks available at field @all_tasks. */ + const uint32_t task_total; + /* Current number of tasks used at field @all_tasks. */ + uint32_t task_num; + + /* Fields for front packets and mailbox messages. */ + /* + * This is a single-entry-per-bucket hash table. + * This flow entries are reused between tasks assigned to + * the same coroutine. + */ + struct flow_entry ** const leftover; + /* + * Flow entries that has not been inserted in the flow table, but + * they may be present in @leftover. + */ + struct flow_entry * const temp_fes; + /* Number of entries in used in @temp_fes. */ + uint16_t temp_fes_num; + /* + * Mask for the hash table @leftover. + * It must be of the form (2^n - 1) for any n >= 0. + */ + const uint32_t leftover_mask; + /* + * The following fields release the coroutines of acquiring + * a writer lock on the flow table. + */ + /* If different of NULL, free this entry in flush_work(). */ + struct flow_entry *del_fe; + + /* Fields for front and back packets. */ + uint16_t tx_front_num_pkts; + uint16_t tx_back_num_pkts; + struct rte_mbuf ** const tx_front_pkts; + struct rte_mbuf ** const tx_back_pkts; + /* + * The following field is only needed when the RSS hash is not + * available. + */ + struct ipacket * const packets; + + /* Fields for the front packets only. */ + uint16_t front_num_req; + uint16_t front_num_arp; + struct rte_mbuf ** const front_req_bufs; + struct rte_mbuf ** const front_arp_bufs; + struct acl_search front_acl4; + struct acl_search front_acl6; + bool front_ipv4_configured; + bool front_ipv6_configured; + + /* Fields for the front packets only. */ + uint16_t back_num_arp; + struct rte_mbuf ** const back_arp_bufs; + struct acl_search back_acl4; + struct acl_search back_acl6; +}; + +/* Declare and initialize a struct gk_co_work. 
*/ +#define DEFINE_GK_CO_WORK(name, max_front_pkts, max_back_pkts, \ + max_mailbox, lo_mask, task_extra) \ + struct gk_co_task name##_all_tasks_array[(max_front_pkts) + \ + (max_back_pkts) + (max_mailbox) + (task_extra)]; \ + struct flow_entry *name##_leftover_array[(lo_mask) + 1]; \ + struct flow_entry name##_temp_fes_array[ \ + (max_front_pkts) + (max_mailbox)]; \ + struct rte_mbuf *name##_tx_front_pkts_array[ \ + (max_front_pkts) + (max_back_pkts)]; \ + struct rte_mbuf *name##_tx_back_pkts_array[ \ + (max_front_pkts) + (max_back_pkts)]; \ + struct ipacket name##_packets_array[ \ + (max_front_pkts) + (max_back_pkts)]; \ + struct rte_mbuf *name##_front_req_bufs_array[(max_front_pkts)]; \ + struct rte_mbuf *name##_front_arp_bufs_array[(max_front_pkts)]; \ + DECLARE_ACL_SEARCH_VARIABLE_PART(front_acl4, (max_front_pkts)); \ + DECLARE_ACL_SEARCH_VARIABLE_PART(front_acl6, (max_front_pkts)); \ + struct rte_mbuf *name##_back_arp_bufs_array[(max_back_pkts)]; \ + DECLARE_ACL_SEARCH_VARIABLE_PART(back_acl4, (max_back_pkts)); \ + DECLARE_ACL_SEARCH_VARIABLE_PART(back_acl6, (max_back_pkts)); \ + struct gk_co_work name = { \ + .working_cos = LIST_HEAD_INIT(name.working_cos), \ + .cos = NULL, \ + .co_num = 0, \ + .co_max_num = 0, \ + .any_co_index = 0, \ + .co_delta_num = 1, \ + .co_prv_num = 0, \ + .avg_cycles_per_task = 0, \ + .gk_conf = NULL, \ + .instance = NULL, \ + .all_tasks = name##_all_tasks_array, \ + .task_total = (max_front_pkts) + (max_back_pkts) + \ + (max_mailbox) + (task_extra), \ + .task_num = 0, \ + .leftover = memset(name##_leftover_array, 0, \ + sizeof(name##_leftover_array)), \ + .temp_fes = name##_temp_fes_array, \ + .temp_fes_num = 0, \ + .leftover_mask = (lo_mask), \ + .del_fe = NULL, \ + .tx_front_num_pkts = 0, \ + .tx_back_num_pkts = 0, \ + .tx_front_pkts = name##_tx_front_pkts_array, \ + .tx_back_pkts = name##_tx_back_pkts_array, \ + .packets = name##_packets_array, \ + .front_num_req = 0, \ + .front_num_arp = 0, \ + .front_req_bufs = name##_front_req_bufs_array, \ + .front_arp_bufs = name##_front_arp_bufs_array, \ + .front_acl4 = ACL_SEARCH_INIT(front_acl4), \ + .front_acl6 = ACL_SEARCH_INIT(front_acl6), \ + .front_ipv4_configured = false, \ + .front_ipv6_configured = false, \ + .back_num_arp = 0, \ + .back_arp_bufs = name##_back_arp_bufs_array, \ + .back_acl4 = ACL_SEARCH_INIT(back_acl4), \ + .back_acl6 = ACL_SEARCH_INIT(back_acl6), \ + } + +static inline struct gk_co * +get_task_owner_co(struct gk_co_work *work, struct gk_co_task *task) +{ + return &work->cos[task->task_hash % work->co_num]; +} + +static inline void +__schedule_task(struct gk_co *task_owner_co, struct gk_co_task *task) +{ + list_add_tail(&task->task_list, &task_owner_co->task_queue); +} + +static inline void +schedule_task(struct gk_co_work *work, struct gk_co_task *task) +{ + __schedule_task(get_task_owner_co(work, task), task); +} + +/* Uniformly distribuite tasks with no task hash among coroutines. 
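A hypothetical caller-side sketch of how DEFINE_GK_CO_WORK() and the scheduling helpers above fit together. The burst size, leftover mask, use of the NIC-provided RSS hash, and the function itself are assumptions for illustration; the real wiring lives in gk/main.c outside this excerpt.

static void
example_front_batch(struct gk_instance *instance, struct gk_config *gk_conf,
	struct rte_mbuf **bufs, uint16_t num_bufs)
{
	/* 64 front slots, 32 back slots, 16 mailbox slots,
	 * 16 leftover buckets (mask 15), no extra tasks. */
	DEFINE_GK_CO_WORK(work, 64, 32, 16, 15, 0);
	uint16_t i;

	work.gk_conf = gk_conf;
	work.instance = instance;
	work.cos = instance->cos;
	work.co_num = gk_conf->co_max_num;

	for (i = 0; i < num_bufs; i++) {
		struct gk_co_task *task = &work.all_tasks[work.task_num++];

		/* Assume the NIC already computed the RSS hash. */
		task->task_hash = bufs[i]->hash.rss;
		task->task_arg = bufs[i];
		task->task_func = gk_co_process_front_pkt;
		schedule_task(&work, task);
	}

	/* ... add the owner coroutines to work.working_cos, transfer
	 * control to the first one, then flush the TX/ACL arrays ... */
}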
*/ +static inline void +schedule_task_to_any_co(struct gk_co_work *work, struct gk_co_task *task) +{ + __schedule_task(&work->cos[work->any_co_index], task); + work->any_co_index = (work->any_co_index + 1) % work->co_num; +} + +static inline struct flow_entry ** +__get_fe_leftover_bucket(struct gk_co_work *work, uint32_t hash) +{ + return &work->leftover[hash & work->leftover_mask]; +} + +static inline struct flow_entry ** +get_fe_leftover_bucket(struct gk_co_work *work, struct flow_entry *fe) +{ + return __get_fe_leftover_bucket(work, fe->flow_hash_val); +} + +static inline struct flow_entry * +get_fe_leftover(struct gk_co_work *work, uint32_t hash) +{ + return *__get_fe_leftover_bucket(work, hash); +} + +/* + * Notice that if the bucket is not empty, that reference will be lost. + * That is, the code favors the newer entry over the older entry. + */ +static inline void +save_fe_leftover(struct gk_co_work *work, struct flow_entry *fe) +{ + *get_fe_leftover_bucket(work, fe) = fe; +} + +void +gk_co_main(void *arg); + +void +gk_co_scan_flow_table(struct gk_co *this_co, struct gk_co_task *task); + +void +gk_co_process_front_pkt(struct gk_co *this_co, struct gk_co_task *task); +void +gk_co_process_front_pkt_software_rss(struct gk_co *this_co, + struct gk_co_task *task); + +void +gk_yield_next(struct gk_co *this_co); + +#endif /* _GATEKEEPER_GK_CO_H_ */ diff --git a/gk/main.c b/gk/main.c index bce203456..bb38be214 100644 --- a/gk/main.c +++ b/gk/main.c @@ -44,14 +44,14 @@ #include "gatekeeper_sol.h" #include "gatekeeper_flow_bpf.h" -#include "bpf.h" - -#define START_PRIORITY (38) -/* Set @START_ALLOWANCE as the double size of a large DNS reply. */ -#define START_ALLOWANCE (8) +#include "co.h" int gk_logtype; +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. + */ /* We should avoid calling integer_log_base_2() with zero. */ static inline uint8_t integer_log_base_2(uint64_t delta_time) @@ -63,18 +63,22 @@ integer_log_base_2(uint64_t delta_time) #endif } -/* - * It converts the difference of time between the current packet and - * the last seen packet into a given priority. +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. */ -static uint8_t +/* + * It converts the difference of time between the current packet and + * the last seen packet into a given priority. + */ +static uint8_t priority_from_delta_time(uint64_t present, uint64_t past) { uint64_t delta_time; if (unlikely(present < past)) { /* - * This should never happen, but we handle it gracefully here + * This should never happen, but we handle it gracefully here * in order to keep going. */ GK_LOG(ERR, "The present time smaller than the past time\n"); @@ -84,10 +88,14 @@ priority_from_delta_time(uint64_t present, uint64_t past) delta_time = (present - past) * picosec_per_cycle; if (unlikely(delta_time < 1)) return 0; - + return integer_log_base_2(delta_time); } +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. + */ static struct gk_fib * look_up_fib(struct gk_lpm *ltbl, struct ip_flow *flow) { @@ -113,6 +121,10 @@ look_up_fib(struct gk_lpm *ltbl, struct ip_flow *flow) return NULL; /* Unreachable. */ } +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. 
+ */ static int extract_packet_info(struct rte_mbuf *pkt, struct ipacket *packet) { @@ -180,41 +192,17 @@ extract_packet_info(struct rte_mbuf *pkt, struct ipacket *packet) return ret; } -static inline void -initialize_flow_entry(struct flow_entry *fe, struct ip_flow *flow, - uint32_t flow_hash_val, struct gk_fib *grantor_fib) -{ - /* - * The flow table is a critical data structure, so, - * whenever the size of entries grow too much, - * one must look for alternatives before increasing - * the limit below. - */ - RTE_BUILD_BUG_ON(sizeof(*fe) > 128); - - rte_memcpy(&fe->flow, flow, sizeof(*flow)); - - fe->in_use = true; - fe->flow_hash_val = flow_hash_val; - fe->state = GK_REQUEST; - fe->u.request.last_packet_seen_at = rte_rdtsc(); - fe->u.request.last_priority = START_PRIORITY; - fe->u.request.allowance = START_ALLOWANCE - 1; - fe->grantor_fib = grantor_fib; -} - -static inline void -reinitialize_flow_entry(struct flow_entry *fe, uint64_t now) -{ - fe->state = GK_REQUEST; - fe->u.request.last_packet_seen_at = now; - fe->u.request.last_priority = START_PRIORITY; - fe->u.request.allowance = START_ALLOWANCE - 1; -} - +/* + * TODO A copy of this typedef is available in gk/co.c, + * so drop it when possible. + */ typedef int (*packet_drop_cb_func)(struct rte_mbuf *pkt, struct gk_instance *instance); +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. + */ static int drop_packet_front(struct rte_mbuf *pkt, struct gk_instance *instance) { @@ -257,247 +245,6 @@ pkt_copy_cached_eth_header(struct rte_mbuf *pkt, struct ether_cache *eth_cache, return stale; } -/* - * When a flow entry is at request state, all the GK block processing - * that entry does is to: - * (1) compute the priority of the packet. - * (2) encapsulate the packet as a request. - * (3) put this encapsulated packet in the request queue. - * - * Returns a negative integer on error, or EINPROGRESS to indicate - * that the request is being processed by another lcore, and should - * not be forwarded or dropped on returning from this function. - */ -static int -gk_process_request(struct flow_entry *fe, struct ipacket *packet, - struct rte_mbuf **req_bufs, uint16_t *num_reqs, - struct sol_config *sol_conf) -{ - int ret; - uint64_t now = rte_rdtsc(); - uint8_t priority = priority_from_delta_time(now, - fe->u.request.last_packet_seen_at); - struct gk_fib *fib = fe->grantor_fib; - struct ether_cache *eth_cache; - - fe->u.request.last_packet_seen_at = now; - - /* - * The reason for using "<" instead of "<=" is that the equal case - * means that the source has waited enough time to have the same - * last priority, so it should be awarded with the allowance. - */ - if (priority < fe->u.request.last_priority && - fe->u.request.allowance > 0) { - fe->u.request.allowance--; - priority = fe->u.request.last_priority; - } else { - fe->u.request.last_priority = priority; - fe->u.request.allowance = START_ALLOWANCE - 1; - } - - /* - * Adjust @priority for the DSCP field. - * DSCP 0 for legacy packets; 1 for granted packets; - * 2 for capability renew; 3-63 for requests. - */ - priority += PRIORITY_REQ_MIN; - if (unlikely(priority > PRIORITY_MAX)) - priority = PRIORITY_MAX; - - /* The assigned priority is @priority. */ - - /* Encapsulate the packet as a request. 
*/ - ret = encapsulate(packet->pkt, priority, - &sol_conf->net->back, &fib->u.grantor.gt_addr); - if (ret < 0) - return ret; - - eth_cache = fib->u.grantor.eth_cache; - RTE_VERIFY(eth_cache != NULL); - /* If needed, packet header space was adjusted by encapsulate(). */ - if (pkt_copy_cached_eth_header(packet->pkt, eth_cache, - sol_conf->net->back.l2_len_out)) - return -1; - - req_bufs[*num_reqs] = packet->pkt; - req_bufs[*num_reqs]->udata64 = priority; - (*num_reqs)++; - - return EINPROGRESS; -} - -/* - * Returns: - * * zero on success; the granted packet can be enqueued and forwarded - * * a negative number on error or when the packet needs to be - * otherwise dropped because it has exceeded its budget - * * EINPROGRESS to indicate that the packet is now a request that - * is being processed by another lcore, and should not - * be forwarded or dropped on returning from this function. - */ -static int -gk_process_granted(struct flow_entry *fe, struct ipacket *packet, - struct rte_mbuf **req_bufs, uint16_t *num_reqs, - struct sol_config *sol_conf, struct gk_measurement_metrics *stats) -{ - int ret; - bool renew_cap; - uint8_t priority = PRIORITY_GRANTED; - uint64_t now = rte_rdtsc(); - struct rte_mbuf *pkt = packet->pkt; - struct gk_fib *fib = fe->grantor_fib; - struct ether_cache *eth_cache; - uint32_t pkt_len; - - if (now >= fe->u.granted.cap_expire_at) { - reinitialize_flow_entry(fe, now); - return gk_process_request(fe, packet, req_bufs, - num_reqs, sol_conf); - } - - if (now >= fe->u.granted.budget_renew_at) { - fe->u.granted.budget_renew_at = now + cycles_per_sec; - fe->u.granted.budget_byte = - (uint64_t)fe->u.granted.tx_rate_kib_cycle * 1024; - } - - pkt_len = rte_pktmbuf_pkt_len(pkt); - if (pkt_len > fe->u.granted.budget_byte) { - stats->pkts_num_declined++; - stats->pkts_size_declined += pkt_len; - return -1; - } - - fe->u.granted.budget_byte -= pkt_len; - renew_cap = now >= fe->u.granted.send_next_renewal_at; - if (renew_cap) { - fe->u.granted.send_next_renewal_at = now + - fe->u.granted.renewal_step_cycle; - priority = PRIORITY_RENEW_CAP; - } - - /* - * Encapsulate packet as a granted packet, - * mark it as a capability renewal request if @renew_cap is true, - * enter destination according to @fe->grantor_fib. - */ - ret = encapsulate(packet->pkt, priority, - &sol_conf->net->back, &fib->u.grantor.gt_addr); - if (ret < 0) - return ret; - - eth_cache = fib->u.grantor.eth_cache; - RTE_VERIFY(eth_cache != NULL); - /* If needed, packet header space was adjusted by encapsulate(). */ - if (pkt_copy_cached_eth_header(packet->pkt, eth_cache, - sol_conf->net->back.l2_len_out)) - return -1; - - stats->pkts_num_granted++; - stats->pkts_size_granted += pkt_len; - return 0; -} - -/* - * Returns: - * * a negative number on error or when the packet needs to be - * otherwise dropped because it is declined - * * EINPROGRESS to indicate that the packet is now a request that - * is being processed by another lcore, and should not - * be forwarded or dropped on returning from this function. 
- */ -static int -gk_process_declined(struct flow_entry *fe, struct ipacket *packet, - struct rte_mbuf **req_bufs, uint16_t *num_reqs, - struct sol_config *sol_conf, struct gk_measurement_metrics *stats) -{ - uint64_t now = rte_rdtsc(); - - if (unlikely(now >= fe->u.declined.expire_at)) { - reinitialize_flow_entry(fe, now); - return gk_process_request(fe, packet, req_bufs, - num_reqs, sol_conf); - } - - stats->pkts_num_declined++; - stats->pkts_size_declined += rte_pktmbuf_pkt_len(packet->pkt); - - return -1; -} - -/* - * Returns: - * * zero on success; the packet can be enqueued and forwarded - * * a negative number on error or when the packet needs to be - * otherwise dropped because it has exceeded a limit - * * EINPROGRESS to indicate that the packet is now a request that - * is being processed by another lcore, and should not - * be forwarded or dropped on returning from this function. - */ -static int -gk_process_bpf(struct flow_entry *fe, struct ipacket *packet, - struct rte_mbuf **req_bufs, uint16_t *num_reqs, - struct gk_config *gk_conf, struct gk_measurement_metrics *stats) -{ - uint64_t bpf_ret; - int program_index, rc; - uint64_t now = rte_rdtsc(); - - if (unlikely(now >= fe->u.bpf.expire_at)) - goto expired; - - program_index = fe->program_index; - rc = gk_bpf_decide_pkt(gk_conf, program_index, fe, packet, now, - &bpf_ret); - if (unlikely(rc != 0)) { - GK_LOG(WARNING, - "The BPF program at index %u failed to run its function pkt\n", - program_index); - goto expired; - } - - switch (bpf_ret) { - case GK_BPF_PKT_RET_FORWARD: { - struct ether_cache *eth_cache = - fe->grantor_fib->u.grantor.eth_cache; - RTE_VERIFY(eth_cache != NULL); - /* - * If needed, encapsulate() already adjusted - * packet header space. - */ - if (pkt_copy_cached_eth_header(packet->pkt, eth_cache, - gk_conf->net->back.l2_len_out)) - return -1; - - stats->pkts_num_granted++; - stats->pkts_size_granted += rte_pktmbuf_pkt_len(packet->pkt); - return 0; - } - case GK_BPF_PKT_RET_DECLINE: - stats->pkts_num_declined++; - stats->pkts_size_declined += rte_pktmbuf_pkt_len(packet->pkt); - return -1; - case GK_BPF_PKT_RET_ERROR: - GK_LOG(WARNING, - "The function pkt of the BPF program at index %u returned GK_BPF_PKT_RET_ERROR\n", - program_index); - return -1; - default: - GK_LOG(WARNING, - "The function pkt of the BPF program at index %u returned an invalid return: %" PRIu64 "\n", - program_index, bpf_ret); - return -1; - } - - rte_panic("Unexpected condition at %s()", __func__); - -expired: - reinitialize_flow_entry(fe, now); - return gk_process_request(fe, packet, req_bufs, num_reqs, - gk_conf->sol_conf); -} - static int get_block_idx(struct gk_config *gk_conf, unsigned int lcore_id) { @@ -510,6 +257,10 @@ get_block_idx(struct gk_config *gk_conf, unsigned int lcore_id) return 0; } +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. 
+ */ static bool is_flow_expired(struct flow_entry *fe, uint64_t now) { @@ -545,12 +296,17 @@ is_flow_expired(struct flow_entry *fe, uint64_t now) } static int -gk_del_flow_entry_from_hash(struct rte_hash *h, struct flow_entry *fe) +gk_del_flow_entry_from_hash(struct gk_instance *instance, struct flow_entry *fe) { - int ret = rte_hash_del_key_with_hash(h, &fe->flow, fe->flow_hash_val); - if (likely(ret >= 0)) + + int ret = rte_hash_del_key_with_hash(instance->ip_flow_hash_table, + &fe->flow, fe->flow_hash_val); + if (likely(ret >= 0)) { memset(fe, 0, sizeof(*fe)); - else { + + if (instance->num_scan_del > 0) + instance->num_scan_del--; + } else { GK_LOG(ERR, "The GK block failed to delete a key from hash table at %s: %s\n", __func__, strerror(-ret)); @@ -559,6 +315,56 @@ gk_del_flow_entry_from_hash(struct rte_hash *h, struct flow_entry *fe) return ret; } +static void +free_cos(struct gk_co *cos, unsigned int num) +{ + unsigned int i; + + if (cos == NULL) + return; + + for (i = 0; i < num; i++) { + struct gk_co *co = &cos[i]; + + if (co->stack.sptr == NULL) + continue; + + /* Free @co. */ + coro_destroy(&co->coro); + coro_stack_free(&co->stack); + } + + rte_free(cos); +} + +static struct gk_co * +alloc_cos(unsigned int num, unsigned int stack_size_byte) +{ + unsigned int stack_size_ptr = stack_size_byte / sizeof(void *); + unsigned int i; + + struct gk_co *cos = rte_calloc(__func__, num, sizeof(*cos), 0); + if (cos == NULL) + return NULL; + + for (i = 0; i < num; i++) { + struct gk_co *co = &cos[i]; + + if (unlikely(!coro_stack_alloc(&co->stack, stack_size_ptr))) { + free_cos(cos, num); + return NULL; + } + + coro_create(&co->coro, gk_co_main, co, + co->stack.sptr, co->stack.ssze); + INIT_LIST_HEAD_WITH_POISON(&co->co_list); + INIT_LIST_HEAD(&co->task_queue); + co->work = NULL; + } + + return cos; +} + static int setup_gk_instance(unsigned int lcore_id, struct gk_config *gk_conf) { @@ -586,7 +392,6 @@ setup_gk_instance(unsigned int lcore_id, struct gk_config *gk_conf) GK_LOG(ERR, "The GK block cannot create hash table at lcore %u\n", lcore_id); - ret = -1; goto out; } @@ -600,7 +405,6 @@ setup_gk_instance(unsigned int lcore_id, struct gk_config *gk_conf) GK_LOG(ERR, "The GK block can't create flow entry table at lcore %u\n", lcore_id); - ret = -1; goto flow_hash; } @@ -611,6 +415,19 @@ setup_gk_instance(unsigned int lcore_id, struct gk_config *gk_conf) if (ret < 0) goto flow_entry; + coro_create(&instance->coro_root, NULL, NULL, NULL, 0); + + /* Allocate coroutines. 
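+	 * The configured gk_conf->co_stack_size_kb is multiplied by 1024
+	 * to obtain bytes; alloc_cos() then divides by sizeof(void *)
+	 * because coro_stack_alloc() takes the stack size in
+	 * pointer-sized units. For example, with the hypothetical values
+	 * co_max_num = 8 and co_stack_size_kb = 64, eight coroutines
+	 * with 64 KiB of stack each would be allocated.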
*/ + instance->cos = alloc_cos(gk_conf->co_max_num, + gk_conf->co_stack_size_kb * 1024); + if (instance->cos == NULL) { + GK_LOG(ERR, + "The GK block can't allocate coroutines at lcore %u\n", + lcore_id); + ret = -1; + goto coro_root; + } + tb_ratelimit_state_init(&instance->front_icmp_rs, gk_conf->front_icmp_msgs_per_sec, gk_conf->front_icmp_msgs_burst); @@ -621,6 +438,10 @@ setup_gk_instance(unsigned int lcore_id, struct gk_config *gk_conf) ret = 0; goto out; +coro_root: + coro_destroy(&instance->coro_root); +/*mailbox:*/ + destroy_mailbox(&instance->mb); flow_entry: rte_free(instance->ip_flow_entry_table); instance->ip_flow_entry_table = NULL; @@ -730,8 +551,7 @@ flush_flow_table(struct ip_prefix *src, } if (matched) { - gk_del_flow_entry_from_hash( - instance->ip_flow_hash_table, fe); + gk_del_flow_entry_from_hash(instance, fe); num_flushed_flows++; } @@ -872,10 +692,8 @@ gk_synchronize(struct gk_fib *fib, struct gk_instance *instance) while (index >= 0) { struct flow_entry *fe = &instance->ip_flow_entry_table[index]; - if (fe->grantor_fib == fib) { - gk_del_flow_entry_from_hash( - instance->ip_flow_hash_table, fe); - } + if (fe->grantor_fib == fib) + gk_del_flow_entry_from_hash(instance, fe); index = rte_hash_iterate(instance->ip_flow_hash_table, (void *)&key, &data, &next); @@ -990,6 +808,10 @@ gk_setup_rss(struct gk_config *gk_conf) return ret; } +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. + */ static void xmit_icmp(struct gatekeeper_if *iface, struct ipacket *packet, uint16_t *num_pkts, struct rte_mbuf **icmp_bufs, @@ -1065,6 +887,10 @@ xmit_icmp(struct gatekeeper_if *iface, struct ipacket *packet, (*num_pkts)++; } +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. + */ static void xmit_icmpv6(struct gatekeeper_if *iface, struct ipacket *packet, uint16_t *num_pkts, struct rte_mbuf **icmp_bufs, @@ -1136,6 +962,10 @@ xmit_icmpv6(struct gatekeeper_if *iface, struct ipacket *packet, (*num_pkts)++; } +/* + * TODO A copy of this function is available in gk/co.c, + * so drop it when possible. + */ /* * For IPv4, according to the RFC 1812 section 5.3.1 Time to Live (TTL), * if the TTL is reduced to zero (or less), the packet MUST be @@ -1190,26 +1020,6 @@ update_ip_hop_count(struct gatekeeper_if *iface, struct ipacket *packet, return 0; } -/* - * This function is only to be called on flows that - * are not backed by a flow entry. 
- */ -static void -send_request_to_grantor(struct ipacket *packet, uint32_t flow_hash_val, - struct gk_fib *fib, struct rte_mbuf **req_bufs, - uint16_t *num_reqs, struct gk_instance *instance, - struct gk_config *gk_conf) { - int ret; - struct flow_entry temp_fe; - - initialize_flow_entry(&temp_fe, &packet->flow, flow_hash_val, fib); - - ret = gk_process_request(&temp_fe, packet, req_bufs, - num_reqs, gk_conf->sol_conf); - if (ret < 0) - drop_packet_front(packet->pkt, instance); -} - static void lookup_fib_bulk(struct gk_lpm *ltbl, struct ip_flow **flows, int num_flows, struct gk_fib *fibs[]) @@ -1289,111 +1099,70 @@ lookup_fib6_bulk(struct gk_lpm *ltbl, struct ip_flow **flows, } } -static struct flow_entry * -lookup_fe_from_lpm(struct ipacket *packet, uint32_t ip_flow_hash_val, - struct gk_fib *fib, uint16_t *num_tx, struct rte_mbuf **tx_bufs, +static void +process_fib(struct ipacket *packet, struct gk_fib *fib, + uint16_t *num_tx, struct rte_mbuf **tx_bufs, struct acl_search *acl4, struct acl_search *acl6, uint16_t *num_pkts, struct rte_mbuf **icmp_bufs, - struct rte_mbuf **req_bufs, uint16_t *num_reqs, struct gatekeeper_if *front, struct gatekeeper_if *back, - struct gk_instance *instance, struct gk_config *gk_conf) { + struct gk_instance *instance) { struct rte_mbuf *pkt = packet->pkt; struct ether_cache *eth_cache; - struct gk_measurement_metrics *stats = &instance->traffic_stats; - - if (fib == NULL || fib->action == GK_FWD_NEIGHBOR_FRONT_NET) { - if (packet->flow.proto == RTE_ETHER_TYPE_IPV4) { - stats->tot_pkts_num_distributed++; - stats->tot_pkts_size_distributed += - rte_pktmbuf_pkt_len(pkt); + if (fib == NULL || fib->action == GK_FWD_NEIGHBOR_BACK_NET) { + if (packet->flow.proto == RTE_ETHER_TYPE_IPV4) add_pkt_acl(acl4, pkt); - } else if (likely(packet->flow.proto == - RTE_ETHER_TYPE_IPV6)) { - stats->tot_pkts_num_distributed++; - stats->tot_pkts_size_distributed += - rte_pktmbuf_pkt_len(pkt); - + else if (likely(packet->flow.proto == + RTE_ETHER_TYPE_IPV6)) add_pkt_acl(acl6, pkt); - } else { + else { print_flow_err_msg(&packet->flow, - "gk: failed to get the fib entry"); - drop_packet_front(pkt, instance); + "gk: failed to get the fib entry or it is not an IP packet"); + drop_packet(pkt); } - return NULL; + return; } switch (fib->action) { - case GK_FWD_GRANTOR: { - struct flow_entry *fe; - int ret = gk_hash_add_flow_entry( - instance, &packet->flow, - ip_flow_hash_val, gk_conf); - if (ret == -ENOSPC) { - /* - * There is no room for a new - * flow entry, but give this - * flow a chance sending a - * request to the grantor - * server. - */ - send_request_to_grantor(packet, ip_flow_hash_val, - fib, req_bufs, num_reqs, instance, gk_conf); - return NULL; - } - if (ret < 0) { - drop_packet_front(pkt, instance); - return NULL; - } - - fe = &instance->ip_flow_entry_table[ret]; - initialize_flow_entry(fe, - &packet->flow, ip_flow_hash_val, fib); - return fe; - } - - case GK_FWD_GATEWAY_BACK_NET: { + case GK_FWD_GATEWAY_FRONT_NET: { /* * The entry instructs to forward * its packets to the gateway in - * the back network, forward accordingly. + * the front network, forward accordingly. * - * BP block bypasses from the front to the - * back interface are expected to bypass - * ranges of IP addresses that should not - * go through Gatekeeper. + * BP bypasses from the back to the front interface + * are expected to bypass the outgoing traffic + * from the AS to its peers. * * Notice that one needs to update * the Ethernet header. 
*/ - eth_cache = fib->u.gateway.eth_cache; RTE_VERIFY(eth_cache != NULL); - if (adjust_pkt_len(pkt, back, 0) == NULL || + if (adjust_pkt_len(pkt, front, 0) == NULL || pkt_copy_cached_eth_header(pkt, eth_cache, - back->l2_len_out)) { - drop_packet_front(pkt, instance); - return NULL; + front->l2_len_out)) { + drop_packet(pkt); + return; } - if (update_ip_hop_count(front, packet, + if (update_ip_hop_count(back, packet, num_pkts, icmp_bufs, - &instance->front_icmp_rs, - instance, - drop_packet_front) < 0) - return NULL; + &instance->back_icmp_rs, + instance, drop_packet_back) < 0) + return; tx_bufs[(*num_tx)++] = pkt; - return NULL; + break; } - case GK_FWD_NEIGHBOR_BACK_NET: { + case GK_FWD_NEIGHBOR_FRONT_NET: { /* * The entry instructs to forward * its packets to the neighbor in - * the back network, forward accordingly. + * the front network, forward accordingly. */ if (packet->flow.proto == RTE_ETHER_TYPE_IPV4) { eth_cache = lookup_ether_cache( @@ -1407,455 +1176,45 @@ lookup_fe_from_lpm(struct ipacket *packet, uint32_t ip_flow_hash_val, RTE_VERIFY(eth_cache != NULL); - if (adjust_pkt_len(pkt, back, 0) == NULL || + if (adjust_pkt_len(pkt, front, 0) == NULL || pkt_copy_cached_eth_header(pkt, eth_cache, - back->l2_len_out)) { - drop_packet_front(pkt, instance); - return NULL; + front->l2_len_out)) { + drop_packet(pkt); + return; } - if (update_ip_hop_count(front, packet, + if (update_ip_hop_count(back, packet, num_pkts, icmp_bufs, - &instance->front_icmp_rs, - instance, - drop_packet_front) < 0) - return NULL; + &instance->back_icmp_rs, + instance, drop_packet_back) < 0) + return; tx_bufs[(*num_tx)++] = pkt; - return NULL; + break; } case GK_DROP: - /* FALLTHROUGH */ + drop_packet(pkt); + break; + default: - drop_packet_front(pkt, instance); - return NULL; + /* All other actions should log a warning. */ + GK_LOG(WARNING, + "The fib entry has an unexpected action %u at %s\n", + fib->action, __func__); + drop_packet(pkt); + break; } - - return NULL; } -static int -process_flow_entry(struct flow_entry *fe, struct ipacket *packet, - struct rte_mbuf **req_bufs, uint16_t *num_reqs, - struct gk_config *gk_conf, struct gk_measurement_metrics *stats) -{ - int ret; - - /* - * Some notes regarding flow rates and units: - * - * Flows in the GK_REQUEST state are bandwidth limited - * to an overall rate relative to the link. Therefore, - * the Ethernet frame overhead is counted toward the - * credits used by requests. The request channel rate - * is measured in megabits (base 10) per second to - * match the units used by hardware specifications. - * - * Granted flows (in state GK_GRANTED or sometimes - * GK_BPF) are allocated budgets that are intended - * to reflect the max throughput of the flow, and - * therefore do not include the Ethernet frame overhead. - * The budgets of granted flows are measured in - * kibibytes (base 2). 
- */ - switch (fe->state) { - case GK_REQUEST: - ret = gk_process_request(fe, packet, - req_bufs, num_reqs, gk_conf->sol_conf); - break; - - case GK_GRANTED: - ret = gk_process_granted(fe, packet, - req_bufs, num_reqs, gk_conf->sol_conf, stats); - break; - - case GK_DECLINED: - ret = gk_process_declined(fe, packet, - req_bufs, num_reqs, gk_conf->sol_conf, stats); - break; - - case GK_BPF: - ret = gk_process_bpf(fe, packet, - req_bufs, num_reqs, gk_conf, stats); - break; - - default: - ret = -1; - GK_LOG(ERR, "Unknown flow state: %d\n", fe->state); - break; - } - - return ret; -} - -static inline void -prefetch_flow_entry(struct flow_entry *fe) -{ -#if RTE_CACHE_LINE_SIZE == 64 - RTE_BUILD_BUG_ON(sizeof(*fe) <= RTE_CACHE_LINE_SIZE); - RTE_BUILD_BUG_ON(sizeof(*fe) > 2 * RTE_CACHE_LINE_SIZE); - rte_prefetch0(fe); - rte_prefetch0(((char *)fe) + RTE_CACHE_LINE_SIZE); -#elif RTE_CACHE_LINE_SIZE == 128 - RTE_BUILD_BUG_ON(sizeof(*fe) > RTE_CACHE_LINE_SIZE); - rte_prefetch0(fe); -#else -#error "Unsupported cache line size" -#endif -} - -static void -parse_packet(struct ipacket *packet, struct rte_mbuf *pkt, - struct rte_mbuf **arp_bufs, uint16_t *num_arp, - bool ipv4_configured_front, bool ipv6_configured_front, - struct ip_flow **flow_arr, uint32_t *flow_hash_val_arr, - int *num_ip_flows, struct gatekeeper_if *front, - struct gk_instance *instance) -{ - int ret; - struct gk_measurement_metrics *stats = &instance->traffic_stats; - - stats->tot_pkts_size += rte_pktmbuf_pkt_len(pkt); - - ret = extract_packet_info(pkt, packet); - if (ret < 0) { - if (likely(packet->flow.proto == RTE_ETHER_TYPE_ARP)) { - stats->tot_pkts_num_distributed++; - stats->tot_pkts_size_distributed += - rte_pktmbuf_pkt_len(pkt); - - arp_bufs[(*num_arp)++] = pkt; - return; - } - - /* Drop non-IP and non-ARP packets. */ - drop_packet_front(pkt, instance); - return; - } - - if (unlikely((packet->flow.proto == RTE_ETHER_TYPE_IPV4 && - !ipv4_configured_front) || - (packet->flow.proto == RTE_ETHER_TYPE_IPV6 && - !ipv6_configured_front))) { - drop_packet_front(pkt, instance); - return; - } - - flow_arr[*num_ip_flows] = &packet->flow; - flow_hash_val_arr[*num_ip_flows] = likely(front->rss) ? - pkt->hash.rss : rss_ip_flow_hf(&packet->flow, 0, 0); - (*num_ip_flows)++; -} - -#define PREFETCH_OFFSET (4) - -/* Process the packets on the front interface. 
*/ -static void -process_pkts_front(uint16_t port_front, uint16_t rx_queue_front, - unsigned int lcore, - uint16_t *tx_front_num_pkts, struct rte_mbuf **tx_front_pkts, - uint16_t *tx_back_num_pkts, struct rte_mbuf **tx_back_pkts, - struct gk_instance *instance, struct gk_config *gk_conf) -{ - int i; - int done_lookups; - int ret; - uint16_t num_rx; - uint16_t num_arp = 0; - uint16_t num_reqs = 0; - uint16_t front_max_pkt_burst = gk_conf->front_max_pkt_burst; - struct rte_mbuf *rx_bufs[front_max_pkt_burst]; - struct rte_mbuf *arp_bufs[front_max_pkt_burst]; - struct rte_mbuf *req_bufs[front_max_pkt_burst]; - DEFINE_ACL_SEARCH(acl4, front_max_pkt_burst); - DEFINE_ACL_SEARCH(acl6, front_max_pkt_burst); - struct gatekeeper_if *front = &gk_conf->net->front; - struct gatekeeper_if *back = &gk_conf->net->back; - struct gk_measurement_metrics *stats = &instance->traffic_stats; - bool ipv4_configured_front = ipv4_if_configured(&gk_conf->net->front); - bool ipv6_configured_front = ipv6_if_configured(&gk_conf->net->front); - int num_ip_flows = 0; - struct ipacket pkt_arr[front_max_pkt_burst]; - struct ip_flow *flow_arr[front_max_pkt_burst]; - uint32_t flow_hash_val_arr[front_max_pkt_burst]; - int num_lpm_lookups = 0; - int num_lpm6_lookups = 0; - struct ip_flow *flows[front_max_pkt_burst]; - struct ip_flow *flows6[front_max_pkt_burst]; - int32_t lpm_lookup_pos[front_max_pkt_burst]; - int32_t lpm6_lookup_pos[front_max_pkt_burst]; - int32_t pos_arr[front_max_pkt_burst]; - struct gk_fib *fibs[front_max_pkt_burst]; - struct gk_fib *fibs6[front_max_pkt_burst]; - struct flow_entry *fe_arr[front_max_pkt_burst]; - - /* Load a set of packets from the front NIC. */ - num_rx = rte_eth_rx_burst(port_front, rx_queue_front, rx_bufs, - front_max_pkt_burst); - - if (unlikely(num_rx == 0)) - return; - - stats->tot_pkts_num += num_rx; - - /* - * This prefetch is enough to load Ethernet header (14 bytes), - * optional Ethernet VLAN header (8 bytes), and either - * an IPv4 header without options (20 bytes), or - * an IPv6 header without options (40 bytes). - * IPv4: 14 + 8 + 20 = 42 - * IPv6: 14 + 8 + 40 = 62 - */ - for (i = 0; i < PREFETCH_OFFSET && i < num_rx; i++) - rte_prefetch0(rte_pktmbuf_mtod_offset(rx_bufs[i], void *, 0)); - - /* Extract packet and flow information. */ - for (i = 0; i < (num_rx - PREFETCH_OFFSET); i++) { - rte_prefetch0(rte_pktmbuf_mtod_offset( - rx_bufs[i + PREFETCH_OFFSET], void *, 0)); - - parse_packet(&pkt_arr[num_ip_flows], rx_bufs[i], arp_bufs, - &num_arp, ipv4_configured_front, ipv6_configured_front, - flow_arr, flow_hash_val_arr, &num_ip_flows, front, - instance); - } - - /* Extract the rest packet and flow information. 
*/ - for (; i < num_rx; i++) { - parse_packet(&pkt_arr[num_ip_flows], rx_bufs[i], arp_bufs, - &num_arp, ipv4_configured_front, ipv6_configured_front, - flow_arr, flow_hash_val_arr, &num_ip_flows, front, - instance); - } - - done_lookups = 0; - while (done_lookups < num_ip_flows) { - uint32_t num_keys = num_ip_flows - done_lookups; - if (num_keys > RTE_HASH_LOOKUP_BULK_MAX) - num_keys = RTE_HASH_LOOKUP_BULK_MAX; - - ret = rte_hash_lookup_bulk_with_hash( - instance->ip_flow_hash_table, - (const void **)&flow_arr[done_lookups], - (hash_sig_t *)&flow_hash_val_arr[done_lookups], - num_keys, &pos_arr[done_lookups]); - if (ret != 0) { - GK_LOG(NOTICE, - "failed to find multiple keys in the hash table at lcore %u\n", - rte_lcore_id()); - } - - done_lookups += num_keys; - } - - for (i = 0; i < num_ip_flows; i++) { - if (pos_arr[i] >= 0) { - fe_arr[i] = &instance->ip_flow_entry_table[pos_arr[i]]; - - prefetch_flow_entry(fe_arr[i]); - } else { - fe_arr[i] = NULL; - if (flow_arr[i]->proto == RTE_ETHER_TYPE_IPV4) { - lpm_lookup_pos[num_lpm_lookups] = i; - flows[num_lpm_lookups] = flow_arr[i]; - num_lpm_lookups++; - } else { - lpm6_lookup_pos[num_lpm6_lookups] = i; - flows6[num_lpm6_lookups] = flow_arr[i]; - num_lpm6_lookups++; - } - } - } - - /* The remaining flows need LPM lookups. */ - lookup_fib_bulk(&gk_conf->lpm_tbl, flows, num_lpm_lookups, fibs); - lookup_fib6_bulk(&gk_conf->lpm_tbl, flows6, num_lpm6_lookups, fibs6); - - for (i = 0; i < num_lpm_lookups; i++) { - int fidx = lpm_lookup_pos[i]; - - fe_arr[fidx] = lookup_fe_from_lpm(&pkt_arr[fidx], - flow_hash_val_arr[fidx], fibs[i], - tx_back_num_pkts, tx_back_pkts, &acl4, &acl6, - tx_front_num_pkts, tx_front_pkts, req_bufs, - &num_reqs, front, back, instance, gk_conf); - } - - for (i = 0; i < num_lpm6_lookups; i++) { - int fidx = lpm6_lookup_pos[i]; - - fe_arr[fidx] = lookup_fe_from_lpm(&pkt_arr[fidx], - flow_hash_val_arr[fidx], fibs6[i], - tx_back_num_pkts, tx_back_pkts, &acl4, &acl6, - tx_front_num_pkts, tx_front_pkts, req_bufs, - &num_reqs, front, back, instance, gk_conf); - } - - for (i = 0; i < num_ip_flows; i++) { - if (fe_arr[i] == NULL) - continue; - - ret = process_flow_entry(fe_arr[i], &pkt_arr[i], req_bufs, - &num_reqs, gk_conf, stats); - if (ret < 0) - drop_packet_front(pkt_arr[i].pkt, instance); - else if (ret == EINPROGRESS) { - /* Request will be serviced by another lcore. 
*/ - continue; - } else if (likely(ret == 0)) - tx_back_pkts[(*tx_back_num_pkts)++] = pkt_arr[i].pkt; - else - rte_panic("Invalid return value (%d) from processing a packet in a flow with state %d", - ret, fe_arr[i]->state); - } - - if (num_reqs > 0) { - uint64_t acc_size_request[num_reqs + 1]; - - acc_size_request[0] = 0; - for (i = 1; i <= num_reqs; i++) { - acc_size_request[i] = acc_size_request[i - 1] + - rte_pktmbuf_pkt_len(req_bufs[i - 1]); - } - - ret = RTE_MAX(gk_solicitor_enqueue_bulk(gk_conf->sol_conf, - req_bufs, num_reqs), 0); - if (ret < num_reqs) { - for (i = ret; i < num_reqs; i++) - drop_packet_front(req_bufs[i], instance); - } - - stats->pkts_num_request += ret; - stats->pkts_size_request += acc_size_request[ret]; - } - - if (num_arp > 0) - submit_arp(arp_bufs, num_arp, &gk_conf->net->front); - - process_pkts_acl(&gk_conf->net->front, - lcore, &acl4, RTE_ETHER_TYPE_IPV4); - process_pkts_acl(&gk_conf->net->front, - lcore, &acl6, RTE_ETHER_TYPE_IPV6); -} - -static void -process_fib(struct ipacket *packet, struct gk_fib *fib, - uint16_t *num_tx, struct rte_mbuf **tx_bufs, - struct acl_search *acl4, struct acl_search *acl6, - uint16_t *num_pkts, struct rte_mbuf **icmp_bufs, - struct gatekeeper_if *front, struct gatekeeper_if *back, - struct gk_instance *instance) { - struct rte_mbuf *pkt = packet->pkt; - struct ether_cache *eth_cache; - - if (fib == NULL || fib->action == GK_FWD_NEIGHBOR_BACK_NET) { - if (packet->flow.proto == RTE_ETHER_TYPE_IPV4) - add_pkt_acl(acl4, pkt); - else if (likely(packet->flow.proto == - RTE_ETHER_TYPE_IPV6)) - add_pkt_acl(acl6, pkt); - else { - print_flow_err_msg(&packet->flow, - "gk: failed to get the fib entry or it is not an IP packet"); - drop_packet(pkt); - } - return; - } - - switch (fib->action) { - case GK_FWD_GATEWAY_FRONT_NET: { - /* - * The entry instructs to forward - * its packets to the gateway in - * the front network, forward accordingly. - * - * BP bypasses from the back to the front interface - * are expected to bypass the outgoing traffic - * from the AS to its peers. - * - * Notice that one needs to update - * the Ethernet header. - */ - eth_cache = fib->u.gateway.eth_cache; - RTE_VERIFY(eth_cache != NULL); - - if (adjust_pkt_len(pkt, front, 0) == NULL || - pkt_copy_cached_eth_header(pkt, - eth_cache, - front->l2_len_out)) { - drop_packet(pkt); - return; - } - - if (update_ip_hop_count(back, packet, - num_pkts, icmp_bufs, - &instance->back_icmp_rs, - instance, drop_packet_back) < 0) - return; - - tx_bufs[(*num_tx)++] = pkt; - break; - } - - case GK_FWD_NEIGHBOR_FRONT_NET: { - /* - * The entry instructs to forward - * its packets to the neighbor in - * the front network, forward accordingly. - */ - if (packet->flow.proto == RTE_ETHER_TYPE_IPV4) { - eth_cache = lookup_ether_cache( - &fib->u.neigh, - &packet->flow.f.v4.dst); - } else { - eth_cache = lookup_ether_cache( - &fib->u.neigh6, - &packet->flow.f.v6.dst); - } - - RTE_VERIFY(eth_cache != NULL); - - if (adjust_pkt_len(pkt, front, 0) == NULL || - pkt_copy_cached_eth_header(pkt, - eth_cache, - front->l2_len_out)) { - drop_packet(pkt); - return; - } - - if (update_ip_hop_count(back, packet, - num_pkts, icmp_bufs, - &instance->back_icmp_rs, - instance, drop_packet_back) < 0) - return; - - tx_bufs[(*num_tx)++] = pkt; - break; - } - - case GK_DROP: - drop_packet(pkt); - break; - - default: - /* All other actions should log a warning. 
*/ - GK_LOG(WARNING, - "The fib entry has an unexpected action %u at %s\n", - fib->action, __func__); - drop_packet(pkt); - break; - } -} - -/* Process the packets on the back interface. */ -static void -process_pkts_back(uint16_t port_back, uint16_t rx_queue_back, - unsigned int lcore, - uint16_t *tx_front_num_pkts, struct rte_mbuf **tx_front_pkts, - uint16_t *tx_back_num_pkts, struct rte_mbuf **tx_back_pkts, - struct gk_instance *instance, struct gk_config *gk_conf) +/* Process the packets on the back interface. */ +static void +process_pkts_back(uint16_t port_back, uint16_t rx_queue_back, + unsigned int lcore, + uint16_t *tx_front_num_pkts, struct rte_mbuf **tx_front_pkts, + uint16_t *tx_back_num_pkts, struct rte_mbuf **tx_back_pkts, + struct gk_instance *instance, struct gk_config *gk_conf) { int i; int ret; @@ -2153,6 +1512,340 @@ process_cmds_from_mailbox( mb_free_entry_bulk(&instance->mb, (void * const *)gk_cmds, num_cmd); } +static void +populate_front_tasks(struct gk_co_work *work, + uint16_t port_front, uint16_t rx_queue_front) +{ + uint16_t front_max_pkt_burst = work->gk_conf->front_max_pkt_burst; + struct rte_mbuf *rx_bufs[front_max_pkt_burst]; + /* Load a set of packets from the front NIC. */ + uint16_t num_rx = rte_eth_rx_burst(port_front, rx_queue_front, rx_bufs, + front_max_pkt_burst); + struct gk_measurement_metrics *stats; + bool has_rss; + int i; + + if (unlikely(num_rx == 0)) + return; + + stats = &work->instance->traffic_stats; + stats->tot_pkts_num += num_rx; + + has_rss = work->gk_conf->net->front.rss; + for (i = 0; i < num_rx; i++) { + struct gk_co_task *task = &work->all_tasks[work->task_num++]; + struct rte_mbuf *pkt = rx_bufs[i]; + + stats->tot_pkts_size += rte_pktmbuf_pkt_len(pkt); + + if (likely(has_rss)) { + task->task_hash = pkt->hash.rss; + task->task_arg = pkt; + task->task_func = gk_co_process_front_pkt; + schedule_task(work, task); + } else { + struct ipacket *packet = &work->packets[i]; + /* + * There is a chance that packets on the same flow + * are brought out of order. For example, consider that + * (1) three packets arrive on the following order: + * pkt1, pkt2, pkt3; + * (2) there are only two coroutines doing the work; + * (3) The packets are mapped to + * the coroutines as follow: + * * pkt1 and pkt2 goes coroutine 1, + * * pkt3 goes to coroutine 2; + * (4) Packets pkt2 and pkt3 belong to the same flow. + * + * Packet pkt1 and ptk3 are processed in parallel, + * receive their correct hashes, and are rescheduled. + * Once pk2 is recheduled, it is going to be placed + * after pk3 in the task queue of + * the assigned coroutine, that is, pk3 is going to + * be sent out before pkt2 (inverted order). + */ + task->task_hash = 0; /* Dummy hash. */ + /* + * Passing @packet instead of just @pkt so @packet + * can be carried over once the task is rescheduled. 
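+			 * gk_co_process_front_pkt_software_rss() is then
+			 * expected to compute the flow hash in software
+			 * (e.g. with rss_ip_flow_hf()) and to reschedule
+			 * the task under that hash, with @packet carrying
+			 * the packet state across that rescheduling.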
+ */ + packet->pkt = pkt; + task->task_arg = packet; + task->task_func = gk_co_process_front_pkt_software_rss; + schedule_task_to_any_co(work, task); + } + } +} + +static void +add_cos_to_work(struct gk_co_work *work, struct gk_config *gk_conf, + struct gk_instance *instance) +{ + unsigned int i; + + work->gk_conf = gk_conf; + work->instance = instance; + work->cos = instance->cos; + work->co_max_num = gk_conf->co_max_num; + work->co_num = RTE_MIN(2, work->co_max_num); + work->front_ipv4_configured = ipv4_if_configured(&gk_conf->net->front); + work->front_ipv6_configured = ipv6_if_configured(&gk_conf->net->front); + + RTE_VERIFY(work->co_num > 0); + + for (i = 0; i < work->co_max_num; i++) + work->cos[i].work = work; +} + +static void +update_cos(struct gk_co_work *work) +{ + /* + * The local variable @co_num is needed here to enable one to go + * above @work->co_max_num and below zero if needed. + */ + int32_t co_num = work->co_num; + + if (work->co_delta_num > 0) { + /* @work->co_num is going up. */ + + if (unlikely(co_num >= work->co_max_num)) { + /* + * @work->co_num is at its maximum; + * Reverse direction. + */ + RTE_VERIFY(co_num == work->co_max_num); + work->co_delta_num = - work->co_delta_num; + work->co_num = RTE_MAX(1, co_num + work->co_delta_num); + return; + } + + work->co_num = RTE_MIN(work->co_max_num, + co_num + work->co_delta_num); + return; + } + + /* @work->co_num is going down. */ + RTE_VERIFY(work->co_delta_num < 0); + + if (unlikely(co_num <= 1)) { + /* @work->co_num is at its minimum; reverse direction. */ + RTE_VERIFY(co_num == 1); + work->co_delta_num = - work->co_delta_num; + work->co_num = RTE_MIN(work->co_max_num, + co_num + work->co_delta_num); + return; + } + + work->co_num = RTE_MAX(1, co_num + work->co_delta_num); +} + +static void +do_work(struct gk_co_work *work) +{ + uint16_t i, real_co_num = 0; + uint64_t cycles; + double avg_cycles_per_task; + + /* Add coroutines with tasks to @work->working_cos. */ + for (i = 0; i < work->co_num; i++) { + struct gk_co *co = &work->cos[i]; + if (!list_empty(&co->task_queue)) { + list_add_tail(&co->co_list, &work->working_cos); + real_co_num++; + } + } + + /* Is there any work to do? */ + if (unlikely(list_empty(&work->working_cos))) { + RTE_VERIFY(real_co_num == 0); + RTE_VERIFY(work->task_num == 0); + return; + } + RTE_VERIFY(real_co_num > 0); + RTE_VERIFY(work->task_num > 0); + + /* Do work. */ + cycles = rte_rdtsc(); + coro_transfer(&work->instance->coro_root, + &list_first_entry(&work->working_cos, struct gk_co, co_list)-> + coro); + cycles = rte_rdtsc() - cycles; + avg_cycles_per_task = (double)cycles / work->task_num; + + if (work->co_num != real_co_num) { + /* Workload changed; adjust quickly. */ + RTE_VERIFY(work->co_num > real_co_num); + work->co_prv_num = real_co_num; + work->avg_cycles_per_task = avg_cycles_per_task; + work->co_num = real_co_num; + return update_cos(work); + } + + if (work->co_prv_num == 0) { + /* Initialize the performance tracking fields. */ + work->co_prv_num = real_co_num; + work->avg_cycles_per_task = avg_cycles_per_task; + return update_cos(work); + } + + if (avg_cycles_per_task >= work->avg_cycles_per_task) { + /* The last change did not bring an improvement; go back. */ + work->co_num = work->co_prv_num; + /* Reset measurement. */ + work->co_prv_num = 0; + /* Change adjustment direction. */ + work->co_delta_num = - work->co_delta_num; + return; + } + + /* @real_co_num is an improvement. 
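+	 * Keep this measurement as the new baseline and let update_cos()
+	 * keep moving the number of coroutines in the same direction;
+	 * update_cos() reverses @co_delta_num whenever the count reaches
+	 * 1 or @work->co_max_num, so the search effectively oscillates
+	 * around the best-performing value.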
*/ + work->co_prv_num = real_co_num; + work->avg_cycles_per_task = avg_cycles_per_task; + update_cos(work); +} + +static void +flush_work(struct gk_co_work *work, + uint16_t port_front, uint16_t tx_queue_front, + uint16_t port_back, uint16_t tx_queue_back, + unsigned int lcore) +{ + struct gk_instance *instance = work->instance; + + uint16_t front_max_pkt_burst = work->gk_conf->front_max_pkt_burst; + uint16_t back_max_pkt_burst = work->gk_conf->back_max_pkt_burst; + uint32_t max_pkt_burst = front_max_pkt_burst + back_max_pkt_burst; + struct gatekeeper_if *front = &work->gk_conf->net->front; + + /* + * Flush packets. + */ + + send_pkts(port_front, tx_queue_front, + work->tx_front_num_pkts, work->tx_front_pkts); + RTE_VERIFY(work->tx_front_num_pkts <= max_pkt_burst); + work->tx_front_num_pkts = 0; + + send_pkts(port_back, tx_queue_back, + work->tx_back_num_pkts, work->tx_back_pkts); + RTE_VERIFY(work->tx_back_num_pkts <= max_pkt_burst); + work->tx_back_num_pkts = 0; + + /* + * Flush front. + */ + + if (work->front_num_req > 0) { + uint16_t num_req = work->front_num_req; + uint64_t acc_size_request[num_req + 1]; + struct gk_measurement_metrics *stats = &instance->traffic_stats; + int i, ret; + + /* + * The byte length of the packets must be computed before + * calling gk_solicitor_enqueue_bulk() because after it + * the GK block no longer owns the packets. + */ + acc_size_request[0] = 0; + for (i = 1; i <= num_req; i++) { + acc_size_request[i] = acc_size_request[i - 1] + + rte_pktmbuf_pkt_len( + work->front_req_bufs[i - 1] + ); + } + + ret = RTE_MAX( + gk_solicitor_enqueue_bulk(work->gk_conf->sol_conf, + work->front_req_bufs, num_req), + 0); + + stats->pkts_num_request += ret; + stats->pkts_size_request += acc_size_request[ret]; + + for (i = ret; i < num_req; i++) + drop_packet_front(work->front_req_bufs[i], instance); + + RTE_VERIFY(num_req <= front_max_pkt_burst); + work->front_num_req = 0; + } + + if (work->front_num_arp > 0) { + submit_arp(work->front_arp_bufs, work->front_num_arp, front); + RTE_VERIFY(work->front_num_arp <= front_max_pkt_burst); + work->front_num_arp = 0; + } + + RTE_VERIFY(work->front_acl4.num <= front_max_pkt_burst); + RTE_VERIFY(work->front_acl6.num <= front_max_pkt_burst); + process_pkts_acl(front, lcore, &work->front_acl4, RTE_ETHER_TYPE_IPV4); + process_pkts_acl(front, lcore, &work->front_acl6, RTE_ETHER_TYPE_IPV6); + + /* + * TODO Flush back. + */ + + /* + * Update flow table. + */ + + if (work->del_fe != NULL) { + RTE_VERIFY(work->del_fe->in_use); + /* + * Test that the flow entry is expired once more because + * it may have been update since it tested as expired and + * arriving here. + */ + if (likely(is_flow_expired(work->del_fe, rte_rdtsc()))) + gk_del_flow_entry_from_hash(instance, work->del_fe); + work->del_fe = NULL; + } + + /* + * Adding new entries to the flow table should be among the last steps + * to do because when the flow table is full, + * rte_hash_cuckoo_make_space_mw() is going to be called. And + * this function disrupts the cache of the running core. + * rte_hash_cuckoo_make_space_mw() may access up to 1000 buckets and, + * on 64-bit platforms, consumes about 32KB of execution stack. + */ + if (work->temp_fes_num > 0) { + unsigned int i; + for (i = 0; i < work->temp_fes_num; i++) { + struct flow_entry *temp_fe = &work->temp_fes[i]; + struct flow_entry *fe; + int ret = gk_hash_add_flow_entry(instance, + &temp_fe->flow, temp_fe->flow_hash_val, + work->gk_conf); + if (ret == -ENOSPC) { + /* Flow table is full. 
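+				 * Give up on the remaining temporary
+				 * entries for this round; they would
+				 * most likely fail with -ENOSPC too.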
*/ + break; + } + if (unlikely(ret < 0)) { + GK_LOG(ERR, + "Failed to add a flow entry ret=%i\n", + ret); + continue; + } + fe = &instance->ip_flow_entry_table[ret]; + rte_memcpy(fe, temp_fe, sizeof(*fe)); + } + RTE_VERIFY(work->temp_fes_num <= (front_max_pkt_burst + + work->gk_conf->mailbox_burst_size)); + work->temp_fes_num = 0; + } + + /* + * Reset fields of @work. + */ + + RTE_VERIFY(work->task_num <= work->task_total); + work->task_num = 0; + work->any_co_index = 0; + memset(work->leftover, 0, + sizeof(*work->leftover) * (work->leftover_mask + 1)); +} + static int gk_proc(void *arg) { @@ -2168,13 +1861,6 @@ gk_proc(void *arg) uint16_t rx_queue_back = instance->rx_queue_back; uint16_t tx_queue_back = instance->tx_queue_back; - uint16_t tx_front_num_pkts; - uint16_t tx_back_num_pkts; - uint16_t tx_max_num_pkts = gk_conf->front_max_pkt_burst + - gk_conf->back_max_pkt_burst; - struct rte_mbuf *tx_front_pkts[tx_max_num_pkts]; - struct rte_mbuf *tx_back_pkts[tx_max_num_pkts]; - uint32_t entry_idx = 0; uint64_t last_measure_tsc = rte_rdtsc(); uint64_t basic_measurement_logging_cycles = @@ -2183,64 +1869,58 @@ uint32_t scan_iter = gk_conf->flow_table_scan_iter; uint32_t iter_count = 0; + DEFINE_GK_CO_WORK(work, gk_conf->front_max_pkt_burst, + gk_conf->back_max_pkt_burst, gk_conf->mailbox_burst_size, + /* + * The 4* is intended to minimize collisions, whereas the -1 is + * intended to avoid doubling the size when + * the expression already is a power of 2. + */ + rte_combine32ms1b(4 * (gk_conf->front_max_pkt_burst + + gk_conf->mailbox_burst_size) - 1), + 1 /* One extra task for the full scanning of the flow table. */ + ); + GK_LOG(NOTICE, "The GK block is running at lcore = %u\n", lcore); gk_conf_hold(gk_conf); + add_cos_to_work(&work, gk_conf, instance); while (likely(!exiting)) { - struct flow_entry *fe = NULL; - tx_front_num_pkts = 0; - tx_back_num_pkts = 0; + populate_front_tasks(&work, port_front, rx_queue_front); + + /* + * Have the expiration test after all flow-related work to + * give entries one more chance not to expire. + */ if (iter_count >= scan_iter) { + struct gk_co_task *task = + &work.all_tasks[work.task_num++]; entry_idx = (entry_idx + 1) % gk_conf->flow_ht_size; - fe = &instance->ip_flow_entry_table[entry_idx]; - /* - * Only one prefetch is needed here because we only - * need the beginning of a struct flow_entry to - * check if it's expired. - */ - rte_prefetch_non_temporal(fe); + + task->task_hash = 0; /* Dummy hash.
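+			 * Like the software-RSS path in
+			 * populate_front_tasks(), this task is handed to
+			 * schedule_task_to_any_co(), so the hash value is
+			 * only a placeholder here.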
*/ + task->task_arg = + &instance->ip_flow_entry_table[entry_idx]; + task->task_func = gk_co_scan_flow_table; + schedule_task_to_any_co(&work, task); iter_count = 0; } else iter_count++; - process_pkts_front(port_front, rx_queue_front, lcore, - &tx_front_num_pkts, tx_front_pkts, - &tx_back_num_pkts, tx_back_pkts, - instance, gk_conf); + do_work(&work); process_pkts_back(port_back, rx_queue_back, lcore, - &tx_front_num_pkts, tx_front_pkts, - &tx_back_num_pkts, tx_back_pkts, + &work.tx_front_num_pkts, work.tx_front_pkts, + &work.tx_back_num_pkts, work.tx_back_pkts, instance, gk_conf); - if (fe != NULL && fe->in_use && - is_flow_expired(fe, rte_rdtsc())) { - rte_hash_prefetch_buckets_non_temporal( - instance->ip_flow_hash_table, - fe->flow_hash_val); - } else - fe = NULL; - - send_pkts(port_front, tx_queue_front, - tx_front_num_pkts, tx_front_pkts); - - send_pkts(port_back, tx_queue_back, - tx_back_num_pkts, tx_back_pkts); + flush_work(&work, port_front, tx_queue_front, + port_back, tx_queue_back, lcore); process_cmds_from_mailbox(instance, gk_conf); - if (fe != NULL) { - gk_del_flow_entry_from_hash( - instance->ip_flow_hash_table, fe); - - if (instance->num_scan_del > 0) - instance->num_scan_del--; - } - if (rte_rdtsc() - last_measure_tsc >= basic_measurement_logging_cycles) { struct gk_measurement_metrics *stats = @@ -2310,6 +1990,8 @@ cleanup_gk(struct gk_config *gk_conf) } destroy_mailbox(&gk_conf->instances[i].mb); + free_cos(gk_conf->instances[i].cos, gk_conf->co_max_num); + coro_destroy(&gk_conf->instances[i].coro_root); } if (gk_conf->lpm_tbl.fib_tbl != NULL) { @@ -2518,6 +2200,12 @@ run_gk(struct net_config *net_conf, struct gk_config *gk_conf, goto out; } + if (gk_conf->co_max_num == 0) { + GK_LOG(ERR, "There must be at least one coroutine\n"); + ret = -1; + goto out; + } + front_inc = gk_conf->front_max_pkt_burst * gk_conf->num_lcores; net_conf->front.total_pkt_burst += front_inc; back_inc = gk_conf->back_max_pkt_burst * gk_conf->num_lcores; diff --git a/include/coro.h b/include/coro.h new file mode 100644 index 000000000..7645d5029 --- /dev/null +++ b/include/coro.h @@ -0,0 +1,440 @@ +/* + * Copyright (c) 2001-2012,2015 Marc Alexander Lehmann + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. + * + * This library is modelled strictly after Ralf S. Engelschalls article at + * http://www.gnu.org/software/pth/rse-pmt.ps. So most of the credit must + * go to Ralf S. Engelschall . + * + * This coroutine library is very much stripped down. You should either + * build your own process abstraction using it or - better - just use GNU + * Portable Threads, http://www.gnu.org/software/pth/. + * + */ + +/* + * 2006-10-26 Include stddef.h on OS X to work around one of its bugs. + * Reported by Michael_G_Schwern. + * 2006-11-26 Use _setjmp instead of setjmp on GNU/Linux. + * 2007-04-27 Set unwind frame info if gcc 3+ and ELF is detected. + * Use _setjmp instead of setjmp on _XOPEN_SOURCE >= 600. + * 2007-05-02 Add assembly versions for x86 and amd64 (to avoid reliance + * on SIGUSR2 and sigaltstack in Crossfire). + * 2008-01-21 Disable CFI usage on anything but GNU/Linux. + * 2008-03-02 Switched to 2-clause BSD license with GPL exception. + * 2008-04-04 New (but highly unrecommended) pthreads backend. + * 2008-04-24 Reinstate CORO_LOSER (had wrong stack adjustments). + * 2008-10-30 Support assembly method on x86 with and without frame pointer. + * 2008-11-03 Use a global asm statement for CORO_ASM, idea by pippijn. + * 2008-11-05 Hopefully fix misaligned stacks with CORO_ASM/SETJMP. + * 2008-11-07 rbp wasn't saved in CORO_ASM on x86_64. + * introduce coro_destroy, which is a nop except for pthreads. + * speed up CORO_PTHREAD. Do no longer leak threads either. + * coro_create now allows one to create source coro_contexts. + * do not rely on makecontext passing a void * correctly. + * try harder to get _setjmp/_longjmp. + * major code cleanup/restructuring. + * 2008-11-10 the .cfi hacks are no longer needed. + * 2008-11-16 work around a freebsd pthread bug. + * 2008-11-19 define coro_*jmp symbols for easier porting. + * 2009-06-23 tentative win32-backend support for mingw32 (Yasuhiro Matsumoto). + * 2010-12-03 tentative support for uclibc (which lacks all sorts of things). + * 2011-05-30 set initial callee-saved-registers to zero with CORO_ASM. + * use .cfi_undefined rip on linux-amd64 for better backtraces. + * 2011-06-08 maybe properly implement weird windows amd64 calling conventions. + * 2011-07-03 rely on __GCC_HAVE_DWARF2_CFI_ASM for cfi detection. + * 2011-08-08 cygwin trashes stacks, use pthreads with double stack on cygwin. + * 2012-12-04 reduce misprediction penalty for x86/amd64 assembly switcher. + * 2012-12-05 experimental fiber backend (allocates stack twice). + * 2012-12-07 API version 3 - add coro_stack_alloc/coro_stack_free. + * 2012-12-21 valgrind stack registering was broken. + * 2015-12-05 experimental asm be for arm7, based on a patch by Nick Zavaritsky. + * use __name__ for predefined symbols, as in libecb. + * enable guard pages on arm, aarch64 and mips. 
+ * 2016-08-27 try to disable _FORTIFY_SOURCE with CORO_SJLJ, as it + * breaks setjmp/longjmp. Also disable CORO_ASM for asm by default, + * as it was reported to crash. + * 2016-11-18 disable cfi_undefined again - backtraces might be worse, but + * compile compatibility is improved. + * 2018-08-14 use a completely different pthread strategy that should allow + * sharing of coroutines among different threads. this would + * undefined behaviour before as mutexes would be unlocked on + * a different thread. overall, this might be slower than + * using a pipe for synchronisation, but pipes eat fd's... + */ + +#ifndef CORO_H +#define CORO_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This library consists of only three files + * coro.h, coro.c and LICENSE (and optionally README) + * + * It implements what is known as coroutines, in a hopefully + * portable way. + * + * All compiletime symbols must be defined both when including coro.h + * (using libcoro) as well as when compiling coro.c (the implementation). + * + * You can manually specify which flavour you want. If you don't define + * any of these, libcoro tries to choose a safe and fast default: + * + * -DCORO_UCONTEXT + * + * This flavour uses SUSv2's get/set/swap/makecontext functions that + * unfortunately only some unices support, and is quite slow. + * + * -DCORO_SJLJ + * + * This flavour uses SUSv2's setjmp/longjmp and sigaltstack functions to + * do it's job. Coroutine creation is much slower than UCONTEXT, but + * context switching is a bit cheaper. It should work on almost all unices. + * + * -DCORO_LINUX + * + * CORO_SJLJ variant. + * Old GNU/Linux systems (<= glibc-2.1) only work with this implementation + * (it is very fast and therefore recommended over other methods, but + * doesn't work with anything newer). + * + * -DCORO_LOSER + * + * CORO_SJLJ variant. + * Microsoft's highly proprietary platform doesn't support sigaltstack, and + * this selects a suitable workaround for this platform. It might not work + * with your compiler though - it has only been tested with MSVC 6. + * + * -DCORO_FIBER + * + * Slower, but probably more portable variant for the Microsoft operating + * system, using fibers. Ignores the passed stack and allocates it internally. + * Also, due to bugs in cygwin, this does not work with cygwin. + * + * -DCORO_IRIX + * + * CORO_SJLJ variant. + * For SGI's version of Microsoft's NT ;) + * + * -DCORO_ASM + * + * Hand coded assembly, known to work only on a few architectures/ABI: + * GCC + arm7/x86/IA32/amd64/x86_64 + GNU/Linux and a few BSDs. Fastest + * choice, if it works. + * + * -DCORO_PTHREAD + * + * Use the pthread API. You have to provide and -lpthread. + * This is likely the slowest backend, and it also does not support fork(), + * so avoid it at all costs. + * + * If you define neither of these symbols, coro.h will try to autodetect + * the best/safest model. To help with the autodetection, you should check + * (e.g. using autoconf) and define the following symbols: HAVE_UCONTEXT_H + * / HAVE_SETJMP_H / HAVE_SIGALTSTACK. + */ + +/* + * Changes when the API changes incompatibly. + * This is ONLY the API version - there is no ABI compatibility between releases. + * + * Changes in API version 2: + * replaced bogus -DCORO_LOOSE with grammatically more correct -DCORO_LOSER + * Changes in API version 3: + * introduced stack management (CORO_STACKALLOC) + */ +#define CORO_VERSION 3 + +#include + +/* + * This is the type for the initialization function of a new coroutine. 
+ */ +typedef void (*coro_func)(void *); + +/* + * A coroutine state is saved in the following structure. Treat it as an + * opaque type. errno and sigmask might be saved, but don't rely on it, + * implement your own switching primitive if you need that. + */ +typedef struct coro_context coro_context; + +/* + * This function creates a new coroutine. Apart from a pointer to an + * uninitialised coro_context, it expects a pointer to the entry function + * and the single pointer value that is given to it as argument. + * + * Allocating/deallocating the stack is your own responsibility. + * + * As a special case, if coro, arg, sptr and ssze are all zero, + * then an "empty" coro_context will be created that is suitable + * as an initial source for coro_transfer. + * + * This function is not reentrant, but putting a mutex around it + * will work. + */ +void coro_create (coro_context *ctx, /* an uninitialised coro_context */ + coro_func coro, /* the coroutine code to be executed */ + void *arg, /* a single pointer passed to the coro */ + void *sptr, /* start of stack area */ + size_t ssze); /* size of stack area in bytes */ + +/* + * The following prototype defines the coroutine switching function. It is + * sometimes implemented as a macro, so watch out. + * + * This function is thread-safe and reentrant. + */ +#if 0 +void coro_transfer (coro_context *prev, coro_context *next); +#endif + +/* + * The following prototype defines the coroutine destroy function. It + * is sometimes implemented as a macro, so watch out. It also serves no + * purpose unless you want to use the CORO_PTHREAD backend, where it is + * used to clean up the thread. You are responsible for freeing the stack + * and the context itself. + * + * This function is thread-safe and reentrant. + */ +#if 0 +void coro_destroy (coro_context *ctx); +#endif + +/*****************************************************************************/ +/* optional stack management */ +/*****************************************************************************/ +/* + * You can disable all of the stack management functions by + * defining CORO_STACKALLOC to 0. Otherwise, they are enabled by default. + * + * If stack management is enabled, you can influence the implementation via these + * symbols: + * + * -DCORO_USE_VALGRIND + * + * If defined, then libcoro will include valgrind/valgrind.h and register + * and unregister stacks with valgrind. + * + * -DCORO_GUARDPAGES=n + * + * libcoro will try to use the specified number of guard pages to protect against + * stack overflow. If n is 0, then the feature will be disabled. If it isn't + * defined, then libcoro will choose a suitable default. If guardpages are not + * supported on the platform, then the feature will be silently disabled. + */ +#ifndef CORO_STACKALLOC +# define CORO_STACKALLOC 1 +#endif + +#if CORO_STACKALLOC + +/* + * The only allowed operations on these struct members is to read the + * "sptr" and "ssze" members to pass it to coro_create, to read the "sptr" + * member to see if it is false, in which case the stack isn't allocated, + * and to set the "sptr" member to 0, to indicate to coro_stack_free to + * not actually do anything. + */ + +struct coro_stack +{ + void *sptr; + size_t ssze; +#ifdef CORO_USE_VALGRIND + int valgrind_id; +#endif +}; + +/* + * Try to allocate a stack of at least the given size and return true if + * successful, or false otherwise. + * + * The size is *NOT* specified in bytes, but in units of sizeof (void *), + * i.e. 
the stack is typically 4(8) times larger on 32 bit(64 bit) platforms + * then the size passed in. + * + * If size is 0, then a "suitable" stack size is chosen (usually 1-2MB). + */ +int coro_stack_alloc (struct coro_stack *stack, unsigned int size); + +/* + * Free the stack allocated by coro_stack_alloc again. It is safe to + * call this function on the coro_stack structure even if coro_stack_alloc + * failed. + */ +void coro_stack_free (struct coro_stack *stack); + +#endif + +/* + * That was it. No other user-serviceable parts below here. + */ + +/*****************************************************************************/ + +#if !defined CORO_LOSER && !defined CORO_UCONTEXT \ + && !defined CORO_SJLJ && !defined CORO_LINUX \ + && !defined CORO_IRIX && !defined CORO_ASM \ + && !defined CORO_PTHREAD && !defined CORO_FIBER +# if defined WINDOWS && (defined __i386__ || (__x86_64__ || defined _M_IX86 || defined _M_AMD64) +# define CORO_ASM 1 +# elif defined WINDOWS || defined _WIN32 +# define CORO_LOSER 1 /* you don't win with windoze */ +# elif __linux && (__i386__ || (__x86_64__ && !__ILP32__) /*|| (__arm__ && __ARM_ARCH == 7)), not working */ +# define CORO_ASM 1 +# elif defined HAVE_UCONTEXT_H +# define CORO_UCONTEXT 1 +# elif defined HAVE_SETJMP_H && defined HAVE_SIGALTSTACK +# define CORO_SJLJ 1 +# else +error unknown or unsupported architecture +# endif +#endif + +/*****************************************************************************/ + +#ifdef CORO_UCONTEXT + +# include + +struct coro_context +{ + ucontext_t uc; +}; + +# define coro_transfer(p,n) swapcontext (&((p)->uc), &((n)->uc)) +# define coro_destroy(ctx) (void *)(ctx) + +#elif defined (CORO_SJLJ) || defined (CORO_LOSER) || defined (CORO_LINUX) || defined (CORO_IRIX) + +# if defined(CORO_LINUX) && !defined(_GNU_SOURCE) +# define _GNU_SOURCE /* for glibc */ +# endif + +/* try to disable well-meant but buggy checks in some libcs */ +# ifdef _FORTIFY_SOURCE +# undef _FORTIFY_SOURCE +# undef __USE_FORTIFY_LEVEL /* helps some more when too much has been included already */ +# endif + +# if !CORO_LOSER +# include +# endif + +/* solaris is hopelessly borked, it expands _XOPEN_UNIX to nothing */ +# if __sun +# undef _XOPEN_UNIX +# define _XOPEN_UNIX 1 +# endif + +# include + +# if _XOPEN_UNIX > 0 || defined (_setjmp) +# define coro_jmp_buf jmp_buf +# define coro_setjmp(env) _setjmp (env) +# define coro_longjmp(env) _longjmp ((env), 1) +# elif CORO_LOSER +# define coro_jmp_buf jmp_buf +# define coro_setjmp(env) setjmp (env) +# define coro_longjmp(env) longjmp ((env), 1) +# else +# define coro_jmp_buf sigjmp_buf +# define coro_setjmp(env) sigsetjmp (env, 0) +# define coro_longjmp(env) siglongjmp ((env), 1) +# endif + +struct coro_context +{ + coro_jmp_buf env; +}; + +# define coro_transfer(p,n) do { if (!coro_setjmp ((p)->env)) coro_longjmp ((n)->env); } while (0) +# define coro_destroy(ctx) (void *)(ctx) + +#elif CORO_ASM + +struct coro_context +{ + void **sp; /* must be at offset 0 */ +}; + +#if defined (__i386__) || defined (__x86_64__) +void __attribute__ ((__noinline__, __regparm__(2))) +#else +void __attribute__ ((__noinline__)) +#endif +coro_transfer (coro_context *prev, coro_context *next); + +# define coro_destroy(ctx) (void)(ctx) + +#elif CORO_PTHREAD + +# include + +extern pthread_mutex_t coro_mutex; + +struct coro_context +{ + int flags; + pthread_cond_t cv; +}; + +void coro_transfer (coro_context *prev, coro_context *next); +void coro_destroy (coro_context *ctx); + +#elif CORO_FIBER + +struct coro_context +{ + void 
*fiber; + /* only used for initialisation */ + coro_func coro; + void *arg; +}; + +void coro_transfer (coro_context *prev, coro_context *next); +void coro_destroy (coro_context *ctx); + +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/gatekeeper_acl.h b/include/gatekeeper_acl.h index 59c4bbf42..61aa0d11e 100644 --- a/include/gatekeeper_acl.h +++ b/include/gatekeeper_acl.h @@ -32,16 +32,26 @@ struct acl_search { struct rte_mbuf **mbufs; }; -/* Declare and initialize a struct acl_search. */ -#define DEFINE_ACL_SEARCH(name, num_pkts) \ +#define DECLARE_ACL_SEARCH_VARIABLE_PART(name, num_pkts) \ const uint8_t *name##_data_array[(num_pkts)]; \ - struct rte_mbuf *name##_mbufs_array[(num_pkts)]; \ - struct acl_search name = { \ - .num = 0, \ - .data = name##_data_array, \ - .mbufs = name##_mbufs_array, \ + struct rte_mbuf *name##_mbufs_array[(num_pkts)] + +/* + * This macro can only be used if the macro DECLARE_ACL_SEARCH_VARIABLE_PART() + * has been placed before it. + */ +#define ACL_SEARCH_INIT(name) \ + { \ + .num = 0, \ + .data = name##_data_array, \ + .mbufs = name##_mbufs_array, \ } +/* Declare and initialize a struct acl_search. */ +#define DEFINE_ACL_SEARCH(name, num_pkts) \ + DECLARE_ACL_SEARCH_VARIABLE_PART(name, num_pkts); \ + struct acl_search name = ACL_SEARCH_INIT(name) + /* Classify batches of packets in @acl and invoke callback functions. */ int process_acl(struct gatekeeper_if *iface, unsigned int lcore_id, struct acl_search *acl, struct acl_state *astate, diff --git a/include/gatekeeper_gk.h b/include/gatekeeper_gk.h index 95264d984..d732621ee 100644 --- a/include/gatekeeper_gk.h +++ b/include/gatekeeper_gk.h @@ -19,6 +19,8 @@ #ifndef _GATEKEEPER_GK_H_ #define _GATEKEEPER_GK_H_ +#include + #include #include @@ -98,6 +100,14 @@ struct gk_measurement_metrics { struct gk_instance { struct rte_hash *ip_flow_hash_table; struct flow_entry *ip_flow_entry_table; + /* + * Coroutines. + * + * These structs must be here and not in struct gk_co_work because + * initialization functions (e.g. coro_create()) are not reentrant. + */ + struct coro_context coro_root; + struct gk_co *cos; /* RX queue on the front interface. */ uint16_t rx_queue_front; /* TX queue on the front interface. */ @@ -201,6 +211,11 @@ struct gk_config { /* Time for logging the basic measurements in ms. */ unsigned int basic_measurement_logging_ms; + /* Maximum number of coroutines running in parallel per GK instance. */ + uint16_t co_max_num; + /* Size of the stack of each coroutine in KB. */ + uint16_t co_stack_size_kb; + /* * The fields below are for internal use. * Configuration files should not refer to them. diff --git a/include/gatekeeper_main.h b/include/gatekeeper_main.h index 50aafa1fe..b9de610e4 100644 --- a/include/gatekeeper_main.h +++ b/include/gatekeeper_main.h @@ -20,6 +20,10 @@ #define _GATEKEEPER_MAIN_H_ #include +#include + +#include +#include #ifdef RTE_MACHINE_CPUFLAG_SSE4_2 #include @@ -52,4 +56,49 @@ extern FILE *log_file; char *rte_strdup(const char *type, const char *s); int gatekeeper_log_init(void); +/* XXX #52 This should be part of DPDK. */ +/** + * Prefetch the first part of the mbuf + * + * The first 64 bytes of the mbuf corresponds to fields that are used early + * in the receive path. If the cache line of the architecture is higher than + * 64B, the second part will also be prefetched. + * + * @param m + * The pointer to the mbuf. 
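+ *
+ * An illustrative use (hypothetical names, not taken from this patch)
+ * is to prefetch a received burst before parsing it:
+ *
+ *	for (i = 0; i < num_rx; i++)
+ *		rte_mbuf_prefetch_part1_non_temporal(rx_bufs[i]);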
+ */ +static inline void +rte_mbuf_prefetch_part1_non_temporal(struct rte_mbuf *m) +{ + rte_prefetch_non_temporal(&m->cacheline0); +} + +/* XXX #52 This should be part of DPDK. */ +/** + * Prefetch the second part of the mbuf + * + * The next 64 bytes of the mbuf corresponds to fields that are used in the + * transmit path. If the cache line of the architecture is higher than 64B, + * this function does nothing as it is expected that the full mbuf is + * already in cache. + * + * @param m + * The pointer to the mbuf. + */ +static inline bool +rte_mbuf_prefetch_part2_non_temporal(struct rte_mbuf *m) +{ +#if RTE_CACHE_LINE_SIZE == 64 + /* TODO Do we need this prefetch? + rte_prefetch_non_temporal(&m->cacheline1); + return true; + */ + RTE_SET_USED(m); + return false; +#else + RTE_SET_USED(m); + return false; +#endif +} + #endif /* _GATEKEEPER_MAIN_H_ */ diff --git a/include/list.h b/include/list.h index e7fd442fa..c5adf7c51 100644 --- a/include/list.h +++ b/include/list.h @@ -34,6 +34,11 @@ struct list_head { #define LIST_HEAD_INIT(name) { &(name), &(name) } +#define LIST_POISON1 ((void *) 0x00100100) +#define LIST_POISON2 ((void *) 0x00200200) + +#define LIST_HEAD_INIT_WITH_POISON(name) { LIST_POISON1, LIST_POISON2 } + static inline void INIT_LIST_HEAD(struct list_head *list) { @@ -41,6 +46,13 @@ INIT_LIST_HEAD(struct list_head *list) list->prev = list; } +static inline void +INIT_LIST_HEAD_WITH_POISON(struct list_head *list) +{ + list->next = LIST_POISON1; + list->prev = LIST_POISON2; +} + /** * list_entry - get the struct for this entry * @ptr: the &struct list_head pointer. @@ -133,6 +145,16 @@ list_is_singular(const struct list_head *head) return !list_empty(head) && (head->next == head->prev); } +/** + * list_poison - tests whether @entry has been poisoned. + * @entry: the entry to test. + */ +static inline int +list_poison(const struct list_head *entry) +{ + return entry->next == LIST_POISON1 && entry->prev == LIST_POISON2; +} + /* * Insert a new entry between two known consecutive entries. * @@ -191,8 +213,6 @@ __list_del(struct list_head *prev, struct list_head *next) prev->next = next; } -#define LIST_POISON1 ((void *) 0x00100100) -#define LIST_POISON2 ((void *) 0x00200200) /** * list_del - deletes entry from list. * @entry: the element to delete from the list. diff --git a/lib/coro.c b/lib/coro.c new file mode 100644 index 000000000..7817aab22 --- /dev/null +++ b/lib/coro.c @@ -0,0 +1,806 @@ +/* + * Copyright (c) 2001-2011 Marc Alexander Lehmann + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. + * + * This library is modelled strictly after Ralf S. Engelschalls article at + * http://www.gnu.org/software/pth/rse-pmt.ps. So most of the credit must + * go to Ralf S. Engelschall . + */ + +#include "coro.h" + +#include +#include + +/*****************************************************************************/ +/* ucontext/setjmp/asm backends */ +/*****************************************************************************/ +#if defined (CORO_UCONTEXT) || defined (CORO_SJLJ) || defined (CORO_LOSER) || defined (CORO_LINUX) || defined (CORO_IRIX) || defined (CORO_ASM) + +# ifdef CORO_UCONTEXT +# include +# endif + +# if !defined(STACK_ADJUST_PTR) +# ifdef __sgi +/* IRIX is decidedly NON-unix */ +# define STACK_ADJUST_PTR(sp,ss) ((char *)(sp) + (ss) - 8) +# define STACK_ADJUST_SIZE(sp,ss) ((ss) - 8) +# elif (defined (__i386__) && defined (CORO_LINUX)) || (defined (_M_IX86) && defined (CORO_LOSER)) +# define STACK_ADJUST_PTR(sp,ss) ((char *)(sp) + (ss)) +# define STACK_ADJUST_SIZE(sp,ss) (ss) +# elif (defined (__amd64__) && defined (CORO_LINUX)) || ((defined (_M_AMD64) || defined (_M_IA64)) && defined (CORO_LOSER)) +# define STACK_ADJUST_PTR(sp,ss) ((char *)(sp) + (ss) - 8) +# define STACK_ADJUST_SIZE(sp,ss) (ss) +# else +# define STACK_ADJUST_PTR(sp,ss) (sp) +# define STACK_ADJUST_SIZE(sp,ss) (ss) +# endif +# endif + +# include + +# ifdef CORO_SJLJ +# include +# include +# include +# endif + +static coro_func coro_init_func; +static void *coro_init_arg; +static coro_context *new_coro, *create_coro; + +static void +coro_init (void) +{ + volatile coro_func func = coro_init_func; + volatile void *arg = coro_init_arg; + + coro_transfer (new_coro, create_coro); + +#if defined (__GCC_HAVE_DWARF2_CFI_ASM) && defined (__amd64) + /*asm (".cfi_startproc");*/ + /*asm (".cfi_undefined rip");*/ +#endif + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcast-qual" + func ((void *)arg); +#pragma GCC diagnostic pop + +#if __GCC_HAVE_DWARF2_CFI_ASM && __amd64 + /*asm (".cfi_endproc");*/ +#endif + + /* the new coro returned. bad. 
just abort() for now */ + abort (); +} + +# ifdef CORO_SJLJ + +static volatile int trampoline_done; + +/* trampoline signal handler */ +static void +trampoline (int sig) +{ + if (coro_setjmp (new_coro->env)) + coro_init (); /* start it */ + else + trampoline_done = 1; +} + +# endif + +# if CORO_ASM + + #if (defined __arm__) && \ + (defined __ARM_ARCH_7__ || defined __ARM_ARCH_7A__ \ + || defined __ARM_ARCH_7R__ || defined __ARM_ARCH_7M__ \ + || __ARM_ARCH == 7) + #define CORO_ARM 1 + #endif + + #if defined (_WIN32) || defined (__CYGWIN__) + #define CORO_WIN_TIB 1 + #endif + + asm ( + "\t.text\n" + #if defined (_WIN32) || defined (__CYGWIN__) + "\t.globl _coro_transfer\n" + "_coro_transfer:\n" + #else + "\t.globl coro_transfer\n" + "coro_transfer:\n" + #endif + /* windows, of course, gives a shit on the amd64 ABI and uses different registers */ + /* http://blogs.msdn.com/freik/archive/2005/03/17/398200.aspx */ + #ifdef __amd64 + + #if defined (_WIN32) || defined (__CYGWIN__) + #define NUM_SAVED 29 + "\tsubq $168, %rsp\t" /* one dummy qword to improve alignment */ + "\tmovaps %xmm6, (%rsp)\n" + "\tmovaps %xmm7, 16(%rsp)\n" + "\tmovaps %xmm8, 32(%rsp)\n" + "\tmovaps %xmm9, 48(%rsp)\n" + "\tmovaps %xmm10, 64(%rsp)\n" + "\tmovaps %xmm11, 80(%rsp)\n" + "\tmovaps %xmm12, 96(%rsp)\n" + "\tmovaps %xmm13, 112(%rsp)\n" + "\tmovaps %xmm14, 128(%rsp)\n" + "\tmovaps %xmm15, 144(%rsp)\n" + "\tpushq %rsi\n" + "\tpushq %rdi\n" + "\tpushq %rbp\n" + "\tpushq %rbx\n" + "\tpushq %r12\n" + "\tpushq %r13\n" + "\tpushq %r14\n" + "\tpushq %r15\n" + #if CORO_WIN_TIB + "\tpushq %fs:0x0\n" + "\tpushq %fs:0x8\n" + "\tpushq %fs:0xc\n" + #endif + "\tmovq %rsp, (%rcx)\n" + "\tmovq (%rdx), %rsp\n" + #if CORO_WIN_TIB + "\tpopq %fs:0xc\n" + "\tpopq %fs:0x8\n" + "\tpopq %fs:0x0\n" + #endif + "\tpopq %r15\n" + "\tpopq %r14\n" + "\tpopq %r13\n" + "\tpopq %r12\n" + "\tpopq %rbx\n" + "\tpopq %rbp\n" + "\tpopq %rdi\n" + "\tpopq %rsi\n" + "\tmovaps (%rsp), %xmm6\n" + "\tmovaps 16(%rsp), %xmm7\n" + "\tmovaps 32(%rsp), %xmm8\n" + "\tmovaps 48(%rsp), %xmm9\n" + "\tmovaps 64(%rsp), %xmm10\n" + "\tmovaps 80(%rsp), %xmm11\n" + "\tmovaps 96(%rsp), %xmm12\n" + "\tmovaps 112(%rsp), %xmm13\n" + "\tmovaps 128(%rsp), %xmm14\n" + "\tmovaps 144(%rsp), %xmm15\n" + "\taddq $168, %rsp\n" + #else + #define NUM_SAVED 6 + "\tpushq %rbp\n" + "\tpushq %rbx\n" + "\tpushq %r12\n" + "\tpushq %r13\n" + "\tpushq %r14\n" + "\tpushq %r15\n" + "\tmovq %rsp, (%rdi)\n" + "\tmovq (%rsi), %rsp\n" + "\tpopq %r15\n" + "\tpopq %r14\n" + "\tpopq %r13\n" + "\tpopq %r12\n" + "\tpopq %rbx\n" + "\tpopq %rbp\n" + #endif + "\tpopq %rcx\n" + "\tjmpq *%rcx\n" + + #elif __i386__ + + #define NUM_SAVED 4 + "\tpushl %ebp\n" + "\tpushl %ebx\n" + "\tpushl %esi\n" + "\tpushl %edi\n" + #if CORO_WIN_TIB + #undef NUM_SAVED + #define NUM_SAVED 7 + "\tpushl %fs:0\n" + "\tpushl %fs:4\n" + "\tpushl %fs:8\n" + #endif + "\tmovl %esp, (%eax)\n" + "\tmovl (%edx), %esp\n" + #if CORO_WIN_TIB + "\tpopl %fs:8\n" + "\tpopl %fs:4\n" + "\tpopl %fs:0\n" + #endif + "\tpopl %edi\n" + "\tpopl %esi\n" + "\tpopl %ebx\n" + "\tpopl %ebp\n" + "\tpopl %ecx\n" + "\tjmpl *%ecx\n" + + #elif CORO_ARM /* untested, what about thumb, neon, iwmmxt? 
*/ + + #if __ARM_PCS_VFP + "\tvpush {d8-d15}\n" + #define NUM_SAVED (9 + 8 * 2) + #else + #define NUM_SAVED 9 + #endif + "\tpush {r4-r11,lr}\n" + "\tstr sp, [r0]\n" + "\tldr sp, [r1]\n" + "\tpop {r4-r11,lr}\n" + #if __ARM_PCS_VFP + "\tvpop {d8-d15}\n" + #endif + "\tmov r15, lr\n" + + #elif __mips__ && 0 /* untested, 32 bit only */ + + #define NUM_SAVED (12 + 8 * 2) + /* TODO: n64/o64, lw=>ld */ + + "\t.set nomips16\n" + "\t.frame $sp,112,$31\n" + #if __mips_soft_float + "\taddiu $sp,$sp,-44\n" + #else + "\taddiu $sp,$sp,-112\n" + "\ts.d $f30,88($sp)\n" + "\ts.d $f28,80($sp)\n" + "\ts.d $f26,72($sp)\n" + "\ts.d $f24,64($sp)\n" + "\ts.d $f22,56($sp)\n" + "\ts.d $f20,48($sp)\n" + #endif + "\tsw $28,40($sp)\n" + "\tsw $31,36($sp)\n" + "\tsw $fp,32($sp)\n" + "\tsw $23,28($sp)\n" + "\tsw $22,24($sp)\n" + "\tsw $21,20($sp)\n" + "\tsw $20,16($sp)\n" + "\tsw $19,12($sp)\n" + "\tsw $18,8($sp)\n" + "\tsw $17,4($sp)\n" + "\tsw $16,0($sp)\n" + "\tsw $sp,0($4)\n" + "\tlw $sp,0($5)\n" + #if !__mips_soft_float + "\tl.d $f30,88($sp)\n" + "\tl.d $f28,80($sp)\n" + "\tl.d $f26,72($sp)\n" + "\tl.d $f24,64($sp)\n" + "\tl.d $f22,56($sp)\n" + "\tl.d $f20,48($sp)\n" + #endif + "\tlw $28,40($sp)\n" + "\tlw $31,36($sp)\n" + "\tlw $fp,32($sp)\n" + "\tlw $23,28($sp)\n" + "\tlw $22,24($sp)\n" + "\tlw $21,20($sp)\n" + "\tlw $20,16($sp)\n" + "\tlw $19,12($sp)\n" + "\tlw $18,8($sp)\n" + "\tlw $17,4($sp)\n" + "\tlw $16,0($sp)\n" + "\tj $31\n" + #if __mips_soft_float + "\taddiu $sp,$sp,44\n" + #else + "\taddiu $sp,$sp,112\n" + #endif + + #else + #error unsupported architecture + #endif + ); + +# endif + +void +coro_create (coro_context *ctx, coro_func coro, void *arg, void *sptr, size_t ssize) +{ + coro_context nctx; +# ifdef CORO_SJLJ + stack_t ostk, nstk; + struct sigaction osa, nsa; + sigset_t nsig, osig; +# endif + + if (!coro) + return; + + coro_init_func = coro; + coro_init_arg = arg; + + new_coro = ctx; + create_coro = &nctx; + +# ifdef CORO_SJLJ + /* we use SIGUSR2. first block it, then fiddle with it. */ + + sigemptyset (&nsig); + sigaddset (&nsig, SIGUSR2); + sigprocmask (SIG_BLOCK, &nsig, &osig); + + nsa.sa_handler = trampoline; + sigemptyset (&nsa.sa_mask); + nsa.sa_flags = SA_ONSTACK; + + if (sigaction (SIGUSR2, &nsa, &osa)) + { + perror ("sigaction"); + abort (); + } + + /* set the new stack */ + nstk.ss_sp = STACK_ADJUST_PTR (sptr, ssize); /* yes, some platforms (IRIX) get this wrong. 
*/ + nstk.ss_size = STACK_ADJUST_SIZE (sptr, ssize); + nstk.ss_flags = 0; + + if (sigaltstack (&nstk, &ostk) < 0) + { + perror ("sigaltstack"); + abort (); + } + + trampoline_done = 0; + kill (getpid (), SIGUSR2); + sigfillset (&nsig); sigdelset (&nsig, SIGUSR2); + + while (!trampoline_done) + sigsuspend (&nsig); + + sigaltstack (0, &nstk); + nstk.ss_flags = SS_DISABLE; + if (sigaltstack (&nstk, 0) < 0) + perror ("sigaltstack"); + + sigaltstack (0, &nstk); + if (~nstk.ss_flags & SS_DISABLE) + abort (); + + if (~ostk.ss_flags & SS_DISABLE) + sigaltstack (&ostk, 0); + + sigaction (SIGUSR2, &osa, 0); + sigprocmask (SIG_SETMASK, &osig, 0); + +# elif defined (CORO_LOSER) + + coro_setjmp (ctx->env); + #if __CYGWIN__ && __i386__ + ctx->env[8] = (long) coro_init; + ctx->env[7] = (long) ((char *)sptr + ssize) - sizeof (long); + #elif __CYGWIN__ && __x86_64__ + ctx->env[7] = (long) coro_init; + ctx->env[6] = (long) ((char *)sptr + ssize) - sizeof (long); + #elif defined __MINGW32__ + ctx->env[5] = (long) coro_init; + ctx->env[4] = (long) ((char *)sptr + ssize) - sizeof (long); + #elif defined _M_IX86 + ((_JUMP_BUFFER *)&ctx->env)->Eip = (long) coro_init; + ((_JUMP_BUFFER *)&ctx->env)->Esp = (long) STACK_ADJUST_PTR (sptr, ssize) - sizeof (long); + #elif defined _M_AMD64 + ((_JUMP_BUFFER *)&ctx->env)->Rip = (__int64) coro_init; + ((_JUMP_BUFFER *)&ctx->env)->Rsp = (__int64) STACK_ADJUST_PTR (sptr, ssize) - sizeof (__int64); + #elif defined _M_IA64 + ((_JUMP_BUFFER *)&ctx->env)->StIIP = (__int64) coro_init; + ((_JUMP_BUFFER *)&ctx->env)->IntSp = (__int64) STACK_ADJUST_PTR (sptr, ssize) - sizeof (__int64); + #else + #error "microsoft libc or architecture not supported" + #endif + +# elif defined (CORO_LINUX) + + coro_setjmp (ctx->env); + #if __GLIBC__ >= 2 && __GLIBC_MINOR__ >= 0 && defined (JB_PC) && defined (JB_SP) + ctx->env[0].__jmpbuf[JB_PC] = (long) coro_init; + ctx->env[0].__jmpbuf[JB_SP] = (long) STACK_ADJUST_PTR (sptr, ssize) - sizeof (long); + #elif __GLIBC__ >= 2 && __GLIBC_MINOR__ >= 0 && defined (__mc68000__) + ctx->env[0].__jmpbuf[0].__aregs[0] = (long int)coro_init; + ctx->env[0].__jmpbuf[0].__sp = (int *) ((char *)sptr + ssize) - sizeof (long); + #elif defined (__GNU_LIBRARY__) && defined (__i386__) + ctx->env[0].__jmpbuf[0].__pc = (char *) coro_init; + ctx->env[0].__jmpbuf[0].__sp = (void *) ((char *)sptr + ssize) - sizeof (long); + #elif defined (__GNU_LIBRARY__) && defined (__x86_64__) + ctx->env[0].__jmpbuf[JB_PC] = (long) coro_init; + ctx->env[0].__jmpbuf[0].__sp = (void *) ((char *)sptr + ssize) - sizeof (long); + #else + #error "linux libc or architecture not supported" + #endif + +# elif defined (CORO_IRIX) + + coro_setjmp (ctx->env, 0); + ctx->env[JB_PC] = (__uint64_t)coro_init; + ctx->env[JB_SP] = (__uint64_t)STACK_ADJUST_PTR (sptr, ssize) - sizeof (long); + +# elif CORO_ASM + + #if defined (__i386__) || defined (__x86_64__) + ctx->sp = (void **)(ssize + (char *)sptr); +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcast-qual" + *--ctx->sp = (void *)abort; /* needed for alignment only */ +#pragma GCC diagnostic pop + *--ctx->sp = (void *)coro_init; + #ifdef CORO_WIN_TIB + *--ctx->sp = 0; /* ExceptionList */ + *--ctx->sp = (char *)sptr + ssize; /* StackBase */ + *--ctx->sp = sptr; /* StackLimit */ + #endif + #elif CORO_ARM + /* return address stored in lr register, don't push anything */ + #else + #error unsupported architecture + #endif + + ctx->sp -= NUM_SAVED; + memset (ctx->sp, 0, sizeof (*ctx->sp) * NUM_SAVED); + + #if defined (__i386__) || defined 
(__x86_64__) + /* done already */ + #elif defined (CORO_ARM) + ctx->sp[0] = coro; /* r4 */ + ctx->sp[1] = arg; /* r5 */ + ctx->sp[8] = (char *)coro_init; /* lr */ + #else + #error unsupported architecture + #endif + +# elif CORO_UCONTEXT + + getcontext (&(ctx->uc)); + + ctx->uc.uc_link = 0; + ctx->uc.uc_stack.ss_sp = sptr; + ctx->uc.uc_stack.ss_size = (size_t)ssize; + ctx->uc.uc_stack.ss_flags = 0; + + makecontext (&(ctx->uc), (void (*)())coro_init, 0); + +# endif + + coro_transfer (create_coro, new_coro); +} + +/*****************************************************************************/ +/* pthread backend */ +/*****************************************************************************/ +#elif CORO_PTHREAD + +/* this mutex will be locked by the running coroutine */ +pthread_mutex_t coro_mutex = PTHREAD_MUTEX_INITIALIZER; + +struct coro_init_args +{ + coro_func func; + void *arg; + coro_context *self, *main; +}; + +static void * +coro_init (void *args_) +{ + struct coro_init_args *args = (struct coro_init_args *)args_; + coro_func func = args->func; + void *arg = args->arg; + + coro_transfer (args->self, args->main); + func (arg); + + return 0; +} + +void +coro_transfer (coro_context *prev, coro_context *next) +{ + pthread_mutex_lock (&coro_mutex); + + next->flags = 1; + pthread_cond_signal (&next->cv); + + prev->flags = 0; + + while (!prev->flags) + pthread_cond_wait (&prev->cv, &coro_mutex); + + if (prev->flags == 2) + { + pthread_mutex_unlock (&coro_mutex); + pthread_cond_destroy (&prev->cv); + pthread_detach (pthread_self ()); + pthread_exit (0); + } + + pthread_mutex_unlock (&coro_mutex); +} + +void +coro_create (coro_context *ctx, coro_func coro, void *arg, void *sptr, size_t ssize) +{ + static coro_context nctx; + static int once; + + if (!once) + { + once = 1; + + pthread_cond_init (&nctx.cv, 0); + } + + pthread_cond_init (&ctx->cv, 0); + + if (coro) + { + pthread_attr_t attr; + struct coro_init_args args; + pthread_t id; + + args.func = coro; + args.arg = arg; + args.self = ctx; + args.main = &nctx; + + pthread_attr_init (&attr); +#if __UCLIBC__ + /* exists, but is borked */ + /*pthread_attr_setstacksize (&attr, (size_t)ssize);*/ +#elif __CYGWIN__ + /* POSIX, not here */ + pthread_attr_setstacksize (&attr, (size_t)ssize); +#else + pthread_attr_setstack (&attr, sptr, (size_t)ssize); +#endif + pthread_attr_setscope (&attr, PTHREAD_SCOPE_PROCESS); + pthread_create (&id, &attr, coro_init, &args); + + coro_transfer (args.main, args.self); + } +} + +void +coro_destroy (coro_context *ctx) +{ + pthread_mutex_lock (&coro_mutex); + ctx->flags = 2; + pthread_cond_signal (&ctx->cv); + pthread_mutex_unlock (&coro_mutex); +} + +/*****************************************************************************/ +/* fiber backend */ +/*****************************************************************************/ +#elif CORO_FIBER + +#define WIN32_LEAN_AND_MEAN +#if _WIN32_WINNT < 0x0400 + #undef _WIN32_WINNT + #define _WIN32_WINNT 0x0400 +#endif +#include + +VOID CALLBACK +coro_init (PVOID arg) +{ + coro_context *ctx = (coro_context *)arg; + + ctx->coro (ctx->arg); +} + +void +coro_transfer (coro_context *prev, coro_context *next) +{ + if (!prev->fiber) + { + prev->fiber = GetCurrentFiber (); + + if (prev->fiber == 0 || prev->fiber == (void *)0x1e00) + prev->fiber = ConvertThreadToFiber (0); + } + + SwitchToFiber (next->fiber); +} + +void +coro_create (coro_context *ctx, coro_func coro, void *arg, void *sptr, size_t ssize) +{ + ctx->fiber = 0; + ctx->coro = coro; + ctx->arg = arg; + + if (!coro) 
+ return; + + ctx->fiber = CreateFiber (ssize, coro_init, ctx); +} + +void +coro_destroy (coro_context *ctx) +{ + DeleteFiber (ctx->fiber); +} + +#else + #error unsupported backend +#endif + +/*****************************************************************************/ +/* stack management */ +/*****************************************************************************/ +#if CORO_STACKALLOC + +#include + +#ifndef _WIN32 +# include +#endif + +#ifdef CORO_USE_VALGRIND +# include +#endif + +#ifdef _POSIX_MAPPED_FILES +# include +# define CORO_MMAP 1 +# ifndef MAP_ANONYMOUS +# ifdef MAP_ANON +# define MAP_ANONYMOUS MAP_ANON +# else +# undef CORO_MMAP +# endif +# endif +# include +#else +# undef CORO_MMAP +#endif + +#if _POSIX_MEMORY_PROTECTION +# ifndef CORO_GUARDPAGES +# define CORO_GUARDPAGES 4 +# endif +#else +# undef CORO_GUARDPAGES +#endif + +#if !CORO_MMAP +# undef CORO_GUARDPAGES +#endif + +#if !defined (__i386__) && !defined (__x86_64__) && !defined (__powerpc__) && !defined (__arm__) && !defined (__aarch64__) && !defined (__m68k__) && !defined (__alpha__) && !defined (__mips__) && !defined (__sparc64__) +# undef CORO_GUARDPAGES +#endif + +#ifndef CORO_GUARDPAGES +# define CORO_GUARDPAGES 0 +#endif + +#ifndef PAGESIZE + #if !CORO_MMAP + #define PAGESIZE 4096 + #else + static size_t + coro_pagesize (void) + { + static size_t pagesize; + + if (!pagesize) + pagesize = sysconf (_SC_PAGESIZE); + + return pagesize; + } + + #define PAGESIZE coro_pagesize () + #endif +#endif + +int +coro_stack_alloc (struct coro_stack *stack, unsigned int size) +{ + if (!size) + size = 256 * 1024; + + stack->sptr = 0; + stack->ssze = ((size_t)size * sizeof (void *) + PAGESIZE - 1) / PAGESIZE * PAGESIZE; + +#ifdef CORO_FIBER + + stack->sptr = (void *)stack; + return 1; + +#else + + size_t ssze = stack->ssze + CORO_GUARDPAGES * PAGESIZE; + void *base; + + #if CORO_MMAP + /* mmap supposedly does allocate-on-write for us */ + base = mmap (0, ssze, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (base == (void *)-1) + { + /* some systems don't let us have executable heap */ + /* we assume they won't need executable stack in that case */ + base = mmap (0, ssze, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (base == (void *)-1) + return 0; + } + + #if CORO_GUARDPAGES + mprotect (base, CORO_GUARDPAGES * PAGESIZE, PROT_NONE); + #endif + + base = (void*)((char *)base + CORO_GUARDPAGES * PAGESIZE); + #else + base = malloc (ssze); + if (!base) + return 0; + #endif + + #ifdef CORO_USE_VALGRIND + stack->valgrind_id = VALGRIND_STACK_REGISTER ((char *)base, ((char *)base) + ssze - CORO_GUARDPAGES * PAGESIZE); + #endif + + stack->sptr = base; + return 1; + +#endif +} + +void +coro_stack_free (struct coro_stack *stack) +{ +#ifdef CORO_FIBER + /* nop */ +#else + #ifdef CORO_USE_VALGRIND + VALGRIND_STACK_DEREGISTER (stack->valgrind_id); + #endif + + #if CORO_MMAP + if (stack->sptr) + munmap ((void*)((char *)stack->sptr - CORO_GUARDPAGES * PAGESIZE), + stack->ssze + CORO_GUARDPAGES * PAGESIZE); + #else + free (stack->sptr); + #endif +#endif +} + +#endif + diff --git a/lib/mailbox.c b/lib/mailbox.c index 33bb242df..a78c53c0e 100644 --- a/lib/mailbox.c +++ b/lib/mailbox.c @@ -111,9 +111,13 @@ void destroy_mailbox(struct mailbox *mb) { if (mb) { - if (mb->ring) + if (mb->ring) { rte_ring_free(mb->ring); - if (mb->pool) + mb->ring = NULL; + } + if (mb->pool) { rte_mempool_free(mb->pool); + mb->pool = NULL; + } } } diff --git a/lua/gatekeeper/staticlib.lua 
b/lua/gatekeeper/staticlib.lua index c3c1435dc..cae9c7b93 100644 --- a/lua/gatekeeper/staticlib.lua +++ b/lua/gatekeeper/staticlib.lua @@ -211,6 +211,8 @@ struct gk_config { uint32_t log_ratelimit_interval_ms; uint32_t log_ratelimit_burst; unsigned int basic_measurement_logging_ms; + uint16_t co_max_num; + uint16_t co_stack_size_kb; /* This struct has hidden fields. */ }; diff --git a/lua/gk.lua b/lua/gk.lua index 057b98644..be3e452c4 100644 --- a/lua/gk.lua +++ b/lua/gk.lua @@ -42,8 +42,12 @@ return function (net_conf, lls_conf, sol_conf, gk_lcores) local back_icmp_msgs_per_sec = 1000 local back_icmp_msgs_burst = 50 + local co_max_num = 16 + -- These variables are unlikely to need to be changed. local bpf_enable_jit = true + -- CAUTION: stacks too small will crash the GK blocks. + local co_stack_size_kb = 16 -- -- End configuration of GK block. @@ -100,6 +104,9 @@ return function (net_conf, lls_conf, sol_conf, gk_lcores) gk_conf.back_max_pkt_burst = staticlib.get_back_burst_config(max_pkt_burst_back, net_conf) + gk_conf.co_max_num = co_max_num + gk_conf.co_stack_size_kb = co_stack_size_kb + -- The maximum number of ARP or ND packets in LLS submitted by -- GK or GT. The code below makes sure that the parameter should -- be at least the same with the maximum configured value of GK.
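The patch splits DEFINE_ACL_SEARCH() into DECLARE_ACL_SEARCH_VARIABLE_PART() and ACL_SEARCH_INIT() so the backing arrays can be declared separately from the struct acl_search initializer (for example, when the struct is embedded in a larger structure while the arrays stay outside it). Below is a minimal sketch of both forms; it is not part of the patch, and the function name and burst size are illustrative only.

#include "gatekeeper_acl.h"

static void
acl_macro_example(void)
{
	/* One-shot form: declares the arrays and the struct together. */
	DEFINE_ACL_SEARCH(acl4, 32);

	/* Split form: arrays first, initializer wherever it is needed. */
	DECLARE_ACL_SEARCH_VARIABLE_PART(acl6, 32);
	struct acl_search acl6 = ACL_SEARCH_INIT(acl6);

	(void)acl4;
	(void)acl6;
}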
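The new poison helpers in include/list.h make "this entry is on no list" an explicitly testable state: an entry starts out (or is left by list_del()) holding LIST_POISON1/LIST_POISON2, and list_poison() checks for exactly those values. A short illustration follows, assuming the usual list_add_tail()/list_del() helpers from this header; the task struct and function are hypothetical and not part of the patch.

#include "list.h"

/* Hypothetical container used only for illustration. */
struct example_task {
	struct list_head list;
};

static void
list_poison_example(struct example_task *task, struct list_head *queue)
{
	/* Freshly created entries are marked as being on no list. */
	INIT_LIST_HEAD_WITH_POISON(&task->list);

	/* Queue the entry only if it is not already queued somewhere. */
	if (list_poison(&task->list))
		list_add_tail(&task->list, queue);

	/*
	 * list_del() is expected to write the poison values back,
	 * so the list_poison() test works again afterwards.
	 */
	list_del(&task->list);
}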
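For readers unfamiliar with the vendored libcoro API (coro_create()/coro_transfer() above, built with the CORO_ASM backend selected by -DCORO_ASM in the Makefile), here is a self-contained sketch of the create/transfer lifecycle. It is not part of the patch; error handling is omitted, and the pthread/fiber backends would additionally require coro_destroy().

#include <stdio.h>
#include <stdlib.h>
#include "coro.h"

static coro_context main_ctx, worker_ctx;
static char msg[] = "hello";

static void
worker(void *arg)
{
	printf("worker sees \"%s\"\n", (const char *)arg);
	/* Yield back to the caller; falling off the end would abort(). */
	coro_transfer(&worker_ctx, &main_ctx);
}

int
main(void)
{
	size_t stack_size = 64 * 1024;
	void *stack = malloc(stack_size);

	/* An empty context for the caller, filled on the first transfer. */
	coro_create(&main_ctx, NULL, NULL, NULL, 0);
	/* A coroutine that runs worker(msg) on its own stack. */
	coro_create(&worker_ctx, worker, msg, stack, stack_size);

	/* Run worker() until it transfers control back. */
	coro_transfer(&main_ctx, &worker_ctx);

	free(stack);
	return 0;
}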
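The new gk_config knobs (co_max_num, co_stack_size_kb, also exposed through lua/gk.lua above) together with the gk_instance members coro_root and cos suggest a setup pattern like the one below. The actual initialization lives in gk/main.c and gk/co.c, which this section does not show, so treat this strictly as an assumption: co_main is a hypothetical entry point, and the real code would more likely use rte_malloc() or coro_stack_alloc() rather than plain malloc(), and would initialize the remaining gk_co fields as well.

#include <stdlib.h>

#include "gatekeeper_gk.h"
#include "co.h"

static int
example_gk_co_setup(struct gk_instance *instance,
	const struct gk_config *gk_conf, coro_func co_main)
{
	uint16_t i;
	size_t stack_size = (size_t)gk_conf->co_stack_size_kb * 1024;

	instance->cos = calloc(gk_conf->co_max_num, sizeof(*instance->cos));
	if (instance->cos == NULL)
		return -1;

	/* Empty context standing in for the GK lcore itself. */
	coro_create(&instance->coro_root, NULL, NULL, NULL, 0);

	for (i = 0; i < gk_conf->co_max_num; i++) {
		void *stack = malloc(stack_size);
		if (stack == NULL)
			return -1;
		/* Each coroutine runs co_main() with its own gk_co as argument. */
		coro_create(&instance->cos[i].coro, co_main,
			&instance->cos[i], stack, stack_size);
	}
	return 0;
}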