[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <f2f4134d-a01f-8d14-26c3-2ca82405a960@iogearbox.net>
Date: Fri, 15 Feb 2019 17:37:44 +0100
From: Daniel Borkmann <daniel@...earbox.net>
To: Magnus Karlsson <magnus.karlsson@...el.com>, bjorn.topel@...el.com,
ast@...nel.org, netdev@...r.kernel.org,
jakub.kicinski@...ronome.com, bjorn.topel@...il.com,
qi.z.zhang@...el.com
Cc: brouer@...hat.com, xiaolong.ye@...el.com
Subject: Re: [PATCH bpf-next v4 1/2] libbpf: add support for using AF_XDP
sockets
On 02/08/2019 02:05 PM, Magnus Karlsson wrote:
> This commit adds AF_XDP support to libbpf. The main reason for this is
> to facilitate writing applications that use AF_XDP by offering
> higher-level APIs that hide many of the details of the AF_XDP
> uapi. This is in the same vein as libbpf facilitates XDP adoption by
> offering easy-to-use higher level interfaces of XDP
> functionality. Hopefully this will facilitate adoption of AF_XDP, make
> applications using it simpler and smaller, and finally also make it
> possible for applications to benefit from optimizations in the AF_XDP
> user space access code. Previously, people just copied and pasted the
> code from the sample application into their application, which is not
> desirable.
>
> The interface is composed of two parts:
>
> * Low-level access interface to the four rings and the packet
> * High-level control plane interface for creating and setting
> up umems and af_xdp sockets as well as a simple XDP program.
>
> Tested-by: Björn Töpel <bjorn.topel@...el.com>
> Signed-off-by: Magnus Karlsson <magnus.karlsson@...el.com>
[...]
> diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
> new file mode 100644
> index 0000000..a982a76
> --- /dev/null
> +++ b/tools/lib/bpf/xsk.c
> @@ -0,0 +1,742 @@
> +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
> +
> +/*
> + * AF_XDP user-space access library.
> + *
> + * Copyright(c) 2018 - 2019 Intel Corporation.
> + *
> + * Author(s): Magnus Karlsson <magnus.karlsson@...el.com>
> + */
> +
> +#include <errno.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <arpa/inet.h>
> +#include <asm/barrier.h>
> +#include <linux/compiler.h>
> +#include <linux/filter.h>
> +#include <linux/if_ether.h>
> +#include <linux/if_link.h>
> +#include <linux/if_packet.h>
> +#include <linux/if_xdp.h>
> +#include <linux/rtnetlink.h>
> +#include <net/if.h>
> +#include <sys/mman.h>
> +#include <sys/socket.h>
> +#include <sys/types.h>
> +
> +#include "bpf.h"
> +#include "libbpf.h"
> +#include "libbpf_util.h"
> +#include "nlattr.h"
> +#include "xsk.h"
> +
> +#ifndef SOL_XDP
> + #define SOL_XDP 283
> +#endif
> +
> +#ifndef AF_XDP
> + #define AF_XDP 44
> +#endif
> +
> +#ifndef PF_XDP
> + #define PF_XDP AF_XDP
> +#endif
> +
> +struct xsk_umem {
> + struct xsk_ring_prod *fill;
> + struct xsk_ring_cons *comp;
> + char *umem_area;
> + struct xsk_umem_config config;
> + int fd;
> + int refcount;
> +};
> +
> +struct xsk_socket {
> + struct xsk_ring_cons *rx;
> + struct xsk_ring_prod *tx;
> + __u64 outstanding_tx;
> + struct xsk_umem *umem;
> + struct xsk_socket_config config;
> + int fd;
> + int xsks_map;
> + int ifindex;
> + int prog_fd;
> + int qidconf_map_fd;
> + int xsks_map_fd;
> + __u32 queue_id;
> +};
> +
> +struct xsk_nl_info {
> + bool xdp_prog_attached;
> + int ifindex;
> + int fd;
> +};
> +
> +#define MAX_QUEUES 128
Why is this a fixed constant here, shouldn't this be dynamic due to being NIC
specific anyway?
[...]
> +void *xsk_umem__get_data(struct xsk_umem *umem, __u64 addr)
> +{
> + return &((char *)(umem->umem_area))[addr];
> +}
There's also a xsk_umem__get_data_raw() doing the same. Why having both, resp.
when to choose which? ;)
> +int xsk_umem__fd(const struct xsk_umem *umem)
> +{
> + return umem ? umem->fd : -EINVAL;
> +}
> +
> +int xsk_socket__fd(const struct xsk_socket *xsk)
> +{
> + return xsk ? xsk->fd : -EINVAL;
> +}
> +
> +static bool xsk_page_aligned(void *buffer)
> +{
> + unsigned long addr = (unsigned long)buffer;
> +
> + return !(addr & (getpagesize() - 1));
> +}
> +
> +static void xsk_set_umem_config(struct xsk_umem_config *cfg,
> + const struct xsk_umem_config *usr_cfg)
> +{
> + if (!usr_cfg) {
> + cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
> + cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
> + cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
> + cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
> + return;
> + }
> +
> + cfg->fill_size = usr_cfg->fill_size;
> + cfg->comp_size = usr_cfg->comp_size;
> + cfg->frame_size = usr_cfg->frame_size;
> + cfg->frame_headroom = usr_cfg->frame_headroom;
Just optional nit, might be a bit nicer to have it in this form:
cfg->fill_size = usr_cfg ? usr_cfg->fill_size :
XSK_RING_PROD__DEFAULT_NUM_DESCS;
> +}
> +
> +static void xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
> + const struct xsk_socket_config *usr_cfg)
> +{
> + if (!usr_cfg) {
> + cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
> + cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
> + cfg->libbpf_flags = 0;
> + cfg->xdp_flags = 0;
> + cfg->bind_flags = 0;
> + return;
> + }
> +
> + cfg->rx_size = usr_cfg->rx_size;
> + cfg->tx_size = usr_cfg->tx_size;
> + cfg->libbpf_flags = usr_cfg->libbpf_flags;
> + cfg->xdp_flags = usr_cfg->xdp_flags;
> + cfg->bind_flags = usr_cfg->bind_flags;
(Ditto)
> +}
> +
> +int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
> + struct xsk_ring_prod *fill, struct xsk_ring_cons *comp,
> + const struct xsk_umem_config *usr_config)
> +{
> + struct xdp_mmap_offsets off;
> + struct xdp_umem_reg mr;
> + struct xsk_umem *umem;
> + socklen_t optlen;
> + void *map;
> + int err;
> +
> + if (!umem_area || !umem_ptr || !fill || !comp)
> + return -EFAULT;
> + if (!size && !xsk_page_aligned(umem_area))
> + return -EINVAL;
> +
> + umem = calloc(1, sizeof(*umem));
> + if (!umem)
> + return -ENOMEM;
> +
> + umem->fd = socket(AF_XDP, SOCK_RAW, 0);
> + if (umem->fd < 0) {
> + err = -errno;
> + goto out_umem_alloc;
> + }
> +
> + umem->umem_area = umem_area;
> + xsk_set_umem_config(&umem->config, usr_config);
> +
> + mr.addr = (uintptr_t)umem_area;
> + mr.len = size;
> + mr.chunk_size = umem->config.frame_size;
> + mr.headroom = umem->config.frame_headroom;
> +
> + err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
> + if (err) {
> + err = -errno;
> + goto out_socket;
> + }
> + err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_FILL_RING,
> + &umem->config.fill_size,
> + sizeof(umem->config.fill_size));
> + if (err) {
> + err = -errno;
> + goto out_socket;
> + }
> + err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
> + &umem->config.comp_size,
> + sizeof(umem->config.comp_size));
> + if (err) {
> + err = -errno;
> + goto out_socket;
> + }
> +
> + optlen = sizeof(off);
> + err = getsockopt(umem->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
> + if (err) {
> + err = -errno;
> + goto out_socket;
> + }
> +
> + map = xsk_mmap(NULL, off.fr.desc +
> + umem->config.fill_size * sizeof(__u64),
> + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
> + umem->fd, XDP_UMEM_PGOFF_FILL_RING);
> + if (map == MAP_FAILED) {
> + err = -errno;
> + goto out_socket;
> + }
> +
> + umem->fill = fill;
> + fill->mask = umem->config.fill_size - 1;
> + fill->size = umem->config.fill_size;
> + fill->producer = map + off.fr.producer;
> + fill->consumer = map + off.fr.consumer;
> + fill->ring = map + off.fr.desc;
> + fill->cached_cons = umem->config.fill_size;
> +
> + map = xsk_mmap(NULL,
> + off.cr.desc + umem->config.comp_size * sizeof(__u64),
> + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
> + umem->fd, XDP_UMEM_PGOFF_COMPLETION_RING);
> + if (map == MAP_FAILED) {
> + err = -errno;
> + goto out_mmap;
> + }
> +
> + umem->comp = comp;
> + comp->mask = umem->config.comp_size - 1;
> + comp->size = umem->config.comp_size;
> + comp->producer = map + off.cr.producer;
> + comp->consumer = map + off.cr.consumer;
> + comp->ring = map + off.cr.desc;
> +
> + *umem_ptr = umem;
> + return 0;
> +
> +out_mmap:
> + munmap(umem->fill,
> + off.fr.desc + umem->config.fill_size * sizeof(__u64));
> +out_socket:
> + close(umem->fd);
> +out_umem_alloc:
> + free(umem);
> + return err;
> +}
> +
> +static int xsk_parse_nl(void *cookie, void *msg, struct nlattr **tb)
> +{
> + struct nlattr *tb_parsed[IFLA_XDP_MAX + 1];
> + struct xsk_nl_info *nl_info = cookie;
> + struct ifinfomsg *ifinfo = msg;
> + unsigned char mode;
> + int err;
> +
> + if (nl_info->ifindex && nl_info->ifindex != ifinfo->ifi_index)
> + return 0;
> +
> + if (!tb[IFLA_XDP])
> + return 0;
> +
> + err = libbpf_nla_parse_nested(tb_parsed, IFLA_XDP_MAX, tb[IFLA_XDP],
> + NULL);
> + if (err)
> + return err;
> +
> + if (!tb_parsed[IFLA_XDP_ATTACHED] || !tb_parsed[IFLA_XDP_FD])
> + return 0;
> +
> + mode = libbpf_nla_getattr_u8(tb_parsed[IFLA_XDP_ATTACHED]);
> + if (mode == XDP_ATTACHED_NONE)
> + return 0;
> +
> + nl_info->xdp_prog_attached = true;
> + nl_info->fd = libbpf_nla_getattr_u32(tb_parsed[IFLA_XDP_FD]);
Hm, I don't think this works if I read the intention of this helper correctly.
IFLA_XDP_FD is never set for retrieving the prog from the kernel. So the
above is a bug.
We also have bpf_get_link_xdp_id(). This should probably just be reused in
this context here.
> + return 0;
> +}
> +
> +static bool xsk_xdp_prog_attached(struct xsk_socket *xsk)
> +{
> + struct xsk_nl_info nl_info;
> + unsigned int nl_pid;
> + char err_buf[256];
> + int sock, err;
> +
> + sock = libbpf_netlink_open(&nl_pid);
> + if (sock < 0)
> + return false;
> +
> + nl_info.xdp_prog_attached = false;
> + nl_info.ifindex = xsk->ifindex;
> + nl_info.fd = -1;
> +
> + err = libbpf_nl_get_link(sock, nl_pid, xsk_parse_nl, &nl_info);
> + if (err) {
> + libbpf_strerror(err, err_buf, sizeof(err_buf));
> + pr_warning("Error:\n%s\n", err_buf);
> + close(sock);
> + return false;
> + }
> +
> + close(sock);
> + xsk->prog_fd = nl_info.fd;
> + return nl_info.xdp_prog_attached;
> +}
(See bpf_get_link_xdp_id().)
> +
> +static int xsk_load_xdp_prog(struct xsk_socket *xsk)
> +{
> + char bpf_log_buf[BPF_LOG_BUF_SIZE];
> + int err, prog_fd;
> +
> + /* This is the C-program:
> + * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
> + * {
> + * int *qidconf, index = ctx->rx_queue_index;
[...]
> +
> +int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
> + __u32 queue_id, struct xsk_umem *umem,
> + struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
> + const struct xsk_socket_config *usr_config)
> +{
> + struct sockaddr_xdp sxdp = {};
> + struct xdp_mmap_offsets off;
> + struct xsk_socket *xsk;
> + socklen_t optlen;
> + void *map;
> + int err;
> +
> + if (!umem || !xsk_ptr || !rx || !tx)
> + return -EFAULT;
> +
> + if (umem->refcount) {
> + pr_warning("Error: shared umems not supported by libbpf.\n");
> + return -EBUSY;
> + }
> +
> + xsk = calloc(1, sizeof(*xsk));
> + if (!xsk)
> + return -ENOMEM;
> +
> + if (umem->refcount++ > 0) {
Should this refcount rather be atomic actually?
> + xsk->fd = socket(AF_XDP, SOCK_RAW, 0);
> + if (xsk->fd < 0) {
> + err = -errno;
> + goto out_xsk_alloc;
> + }
> + } else {
> + xsk->fd = umem->fd;
> + }
> +
> + xsk->outstanding_tx = 0;
> + xsk->queue_id = queue_id;
> + xsk->umem = umem;
> + xsk->ifindex = if_nametoindex(ifname);
> + if (!xsk->ifindex) {
> + err = -errno;
> + goto out_socket;
> + }
> +
> + xsk_set_xdp_socket_config(&xsk->config, usr_config);
[...]
Powered by blists - more mailing lists