lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <f2f4134d-a01f-8d14-26c3-2ca82405a960@iogearbox.net>
Date:   Fri, 15 Feb 2019 17:37:44 +0100
From:   Daniel Borkmann <daniel@...earbox.net>
To:     Magnus Karlsson <magnus.karlsson@...el.com>, bjorn.topel@...el.com,
        ast@...nel.org, netdev@...r.kernel.org,
        jakub.kicinski@...ronome.com, bjorn.topel@...il.com,
        qi.z.zhang@...el.com
Cc:     brouer@...hat.com, xiaolong.ye@...el.com
Subject: Re: [PATCH bpf-next v4 1/2] libbpf: add support for using AF_XDP
 sockets

On 02/08/2019 02:05 PM, Magnus Karlsson wrote:
> This commit adds AF_XDP support to libbpf. The main reason for this is
> to facilitate writing applications that use AF_XDP by offering
> higher-level APIs that hide many of the details of the AF_XDP
> uapi. This is in the same vein as libbpf facilitates XDP adoption by
> offering easy-to-use higher level interfaces of XDP
> functionality. Hopefully this will facilitate adoption of AF_XDP, make
> applications using it simpler and smaller, and finally also make it
> possible for applications to benefit from optimizations in the AF_XDP
> user space access code. Previously, people just copied and pasted the
> code from the sample application into their application, which is not
> desirable.
> 
> The interface is composed of two parts:
> 
> * Low-level access interface to the four rings and the packet
> * High-level control plane interface for creating and setting
>   up umems and af_xdp sockets as well as a simple XDP program.
> 
> Tested-by: Björn Töpel <bjorn.topel@...el.com>
> Signed-off-by: Magnus Karlsson <magnus.karlsson@...el.com>
[...]
> diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
> new file mode 100644
> index 0000000..a982a76
> --- /dev/null
> +++ b/tools/lib/bpf/xsk.c
> @@ -0,0 +1,742 @@
> +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
> +
> +/*
> + * AF_XDP user-space access library.
> + *
> + * Copyright(c) 2018 - 2019 Intel Corporation.
> + *
> + * Author(s): Magnus Karlsson <magnus.karlsson@...el.com>
> + */
> +
> +#include <errno.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <arpa/inet.h>
> +#include <asm/barrier.h>
> +#include <linux/compiler.h>
> +#include <linux/filter.h>
> +#include <linux/if_ether.h>
> +#include <linux/if_link.h>
> +#include <linux/if_packet.h>
> +#include <linux/if_xdp.h>
> +#include <linux/rtnetlink.h>
> +#include <net/if.h>
> +#include <sys/mman.h>
> +#include <sys/socket.h>
> +#include <sys/types.h>
> +
> +#include "bpf.h"
> +#include "libbpf.h"
> +#include "libbpf_util.h"
> +#include "nlattr.h"
> +#include "xsk.h"
> +
> +#ifndef SOL_XDP
> + #define SOL_XDP 283
> +#endif
> +
> +#ifndef AF_XDP
> + #define AF_XDP 44
> +#endif
> +
> +#ifndef PF_XDP
> + #define PF_XDP AF_XDP
> +#endif
> +
> +struct xsk_umem {
> +	struct xsk_ring_prod *fill;
> +	struct xsk_ring_cons *comp;
> +	char *umem_area;
> +	struct xsk_umem_config config;
> +	int fd;
> +	int refcount;
> +};
> +
> +struct xsk_socket {
> +	struct xsk_ring_cons *rx;
> +	struct xsk_ring_prod *tx;
> +	__u64 outstanding_tx;
> +	struct xsk_umem *umem;
> +	struct xsk_socket_config config;
> +	int fd;
> +	int xsks_map;
> +	int ifindex;
> +	int prog_fd;
> +	int qidconf_map_fd;
> +	int xsks_map_fd;
> +	__u32 queue_id;
> +};
> +
> +struct xsk_nl_info {
> +	bool xdp_prog_attached;
> +	int ifindex;
> +	int fd;
> +};
> +
> +#define MAX_QUEUES 128

Why is this a fixed constant here, shouldn't this be dynamic due to being NIC
specific anyway?

[...]
> +void *xsk_umem__get_data(struct xsk_umem *umem, __u64 addr)
> +{
> +	return &((char *)(umem->umem_area))[addr];
> +}

There's also a xsk_umem__get_data_raw() doing the same. Why having both, resp.
when to choose which? ;)

> +int xsk_umem__fd(const struct xsk_umem *umem)
> +{
> +	return umem ? umem->fd : -EINVAL;
> +}
> +
> +int xsk_socket__fd(const struct xsk_socket *xsk)
> +{
> +	return xsk ? xsk->fd : -EINVAL;
> +}
> +
> +static bool xsk_page_aligned(void *buffer)
> +{
> +	unsigned long addr = (unsigned long)buffer;
> +
> +	return !(addr & (getpagesize() - 1));
> +}
> +
> +static void xsk_set_umem_config(struct xsk_umem_config *cfg,
> +				const struct xsk_umem_config *usr_cfg)
> +{
> +	if (!usr_cfg) {
> +		cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
> +		cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
> +		cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
> +		cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
> +		return;
> +	}
> +
> +	cfg->fill_size = usr_cfg->fill_size;
> +	cfg->comp_size = usr_cfg->comp_size;
> +	cfg->frame_size = usr_cfg->frame_size;
> +	cfg->frame_headroom = usr_cfg->frame_headroom;

Just optional nit, might be a bit nicer to have it in this form:

	cfg->fill_size = usr_cfg ? usr_cfg->fill_size :
		         XSK_RING_PROD__DEFAULT_NUM_DESCS;


> +}
> +
> +static void xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
> +				      const struct xsk_socket_config *usr_cfg)
> +{
> +	if (!usr_cfg) {
> +		cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
> +		cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
> +		cfg->libbpf_flags = 0;
> +		cfg->xdp_flags = 0;
> +		cfg->bind_flags = 0;
> +		return;
> +	}
> +
> +	cfg->rx_size = usr_cfg->rx_size;
> +	cfg->tx_size = usr_cfg->tx_size;
> +	cfg->libbpf_flags = usr_cfg->libbpf_flags;
> +	cfg->xdp_flags = usr_cfg->xdp_flags;
> +	cfg->bind_flags = usr_cfg->bind_flags;

(Ditto)

> +}
> +
> +int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
> +		     struct xsk_ring_prod *fill, struct xsk_ring_cons *comp,
> +		     const struct xsk_umem_config *usr_config)
> +{
> +	struct xdp_mmap_offsets off;
> +	struct xdp_umem_reg mr;
> +	struct xsk_umem *umem;
> +	socklen_t optlen;
> +	void *map;
> +	int err;
> +
> +	if (!umem_area || !umem_ptr || !fill || !comp)
> +		return -EFAULT;
> +	if (!size && !xsk_page_aligned(umem_area))
> +		return -EINVAL;
> +
> +	umem = calloc(1, sizeof(*umem));
> +	if (!umem)
> +		return -ENOMEM;
> +
> +	umem->fd = socket(AF_XDP, SOCK_RAW, 0);
> +	if (umem->fd < 0) {
> +		err = -errno;
> +		goto out_umem_alloc;
> +	}
> +
> +	umem->umem_area = umem_area;
> +	xsk_set_umem_config(&umem->config, usr_config);
> +
> +	mr.addr = (uintptr_t)umem_area;
> +	mr.len = size;
> +	mr.chunk_size = umem->config.frame_size;
> +	mr.headroom = umem->config.frame_headroom;
> +
> +	err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
> +	if (err) {
> +		err = -errno;
> +		goto out_socket;
> +	}
> +	err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_FILL_RING,
> +			 &umem->config.fill_size,
> +			 sizeof(umem->config.fill_size));
> +	if (err) {
> +		err = -errno;
> +		goto out_socket;
> +	}
> +	err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
> +			 &umem->config.comp_size,
> +			 sizeof(umem->config.comp_size));
> +	if (err) {
> +		err = -errno;
> +		goto out_socket;
> +	}
> +
> +	optlen = sizeof(off);
> +	err = getsockopt(umem->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
> +	if (err) {
> +		err = -errno;
> +		goto out_socket;
> +	}
> +
> +	map = xsk_mmap(NULL, off.fr.desc +
> +		       umem->config.fill_size * sizeof(__u64),
> +		       PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
> +		       umem->fd, XDP_UMEM_PGOFF_FILL_RING);
> +	if (map == MAP_FAILED) {
> +		err = -errno;
> +		goto out_socket;
> +	}
> +
> +	umem->fill = fill;
> +	fill->mask = umem->config.fill_size - 1;
> +	fill->size = umem->config.fill_size;
> +	fill->producer = map + off.fr.producer;
> +	fill->consumer = map + off.fr.consumer;
> +	fill->ring = map + off.fr.desc;
> +	fill->cached_cons = umem->config.fill_size;
> +
> +	map = xsk_mmap(NULL,
> +		       off.cr.desc + umem->config.comp_size * sizeof(__u64),
> +		       PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
> +		       umem->fd, XDP_UMEM_PGOFF_COMPLETION_RING);
> +	if (map == MAP_FAILED) {
> +		err = -errno;
> +		goto out_mmap;
> +	}
> +
> +	umem->comp = comp;
> +	comp->mask = umem->config.comp_size - 1;
> +	comp->size = umem->config.comp_size;
> +	comp->producer = map + off.cr.producer;
> +	comp->consumer = map + off.cr.consumer;
> +	comp->ring = map + off.cr.desc;
> +
> +	*umem_ptr = umem;
> +	return 0;
> +
> +out_mmap:
> +	munmap(umem->fill,
> +	       off.fr.desc + umem->config.fill_size * sizeof(__u64));
> +out_socket:
> +	close(umem->fd);
> +out_umem_alloc:
> +	free(umem);
> +	return err;
> +}
> +
> +static int xsk_parse_nl(void *cookie, void *msg, struct nlattr **tb)
> +{
> +	struct nlattr *tb_parsed[IFLA_XDP_MAX + 1];
> +	struct xsk_nl_info *nl_info = cookie;
> +	struct ifinfomsg *ifinfo = msg;
> +	unsigned char mode;
> +	int err;
> +
> +	if (nl_info->ifindex && nl_info->ifindex != ifinfo->ifi_index)
> +		return 0;
> +
> +	if (!tb[IFLA_XDP])
> +		return 0;
> +
> +	err = libbpf_nla_parse_nested(tb_parsed, IFLA_XDP_MAX, tb[IFLA_XDP],
> +				      NULL);
> +	if (err)
> +		return err;
> +
> +	if (!tb_parsed[IFLA_XDP_ATTACHED] || !tb_parsed[IFLA_XDP_FD])
> +		return 0;
> +
> +	mode = libbpf_nla_getattr_u8(tb_parsed[IFLA_XDP_ATTACHED]);
> +	if (mode == XDP_ATTACHED_NONE)
> +		return 0;
> +
> +	nl_info->xdp_prog_attached = true;
> +	nl_info->fd = libbpf_nla_getattr_u32(tb_parsed[IFLA_XDP_FD]);

Hm, I don't think this works if I read the intention of this helper correctly.

IFLA_XDP_FD is never set for retrieving the prog from the kernel. So the
above is a bug.

We also have bpf_get_link_xdp_id(). This should probably just be reused in
this context here.

> +	return 0;
> +}
> +
> +static bool xsk_xdp_prog_attached(struct xsk_socket *xsk)
> +{
> +	struct xsk_nl_info nl_info;
> +	unsigned int nl_pid;
> +	char err_buf[256];
> +	int sock, err;
> +
> +	sock = libbpf_netlink_open(&nl_pid);
> +	if (sock < 0)
> +		return false;
> +
> +	nl_info.xdp_prog_attached = false;
> +	nl_info.ifindex = xsk->ifindex;
> +	nl_info.fd = -1;
> +
> +	err = libbpf_nl_get_link(sock, nl_pid, xsk_parse_nl, &nl_info);
> +	if (err) {
> +		libbpf_strerror(err, err_buf, sizeof(err_buf));
> +		pr_warning("Error:\n%s\n", err_buf);
> +		close(sock);
> +		return false;
> +	}
> +
> +	close(sock);
> +	xsk->prog_fd = nl_info.fd;
> +	return nl_info.xdp_prog_attached;
> +}

(See bpf_get_link_xdp_id().)

> +
> +static int xsk_load_xdp_prog(struct xsk_socket *xsk)
> +{
> +	char bpf_log_buf[BPF_LOG_BUF_SIZE];
> +	int err, prog_fd;
> +
> +	/* This is the C-program:
> +	 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
> +	 * {
> +	 *     int *qidconf, index = ctx->rx_queue_index;
[...]
> +
> +int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
> +		       __u32 queue_id, struct xsk_umem *umem,
> +		       struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
> +		       const struct xsk_socket_config *usr_config)
> +{
> +	struct sockaddr_xdp sxdp = {};
> +	struct xdp_mmap_offsets off;
> +	struct xsk_socket *xsk;
> +	socklen_t optlen;
> +	void *map;
> +	int err;
> +
> +	if (!umem || !xsk_ptr || !rx || !tx)
> +		return -EFAULT;
> +
> +	if (umem->refcount) {
> +		pr_warning("Error: shared umems not supported by libbpf.\n");
> +		return -EBUSY;
> +	}
> +
> +	xsk = calloc(1, sizeof(*xsk));
> +	if (!xsk)
> +		return -ENOMEM;
> +
> +	if (umem->refcount++ > 0) {

Should this refcount rather be atomic actually?

> +		xsk->fd = socket(AF_XDP, SOCK_RAW, 0);
> +		if (xsk->fd < 0) {
> +			err = -errno;
> +			goto out_xsk_alloc;
> +		}
> +	} else {
> +		xsk->fd = umem->fd;
> +	}
> +
> +	xsk->outstanding_tx = 0;
> +	xsk->queue_id = queue_id;
> +	xsk->umem = umem;
> +	xsk->ifindex = if_nametoindex(ifname);
> +	if (!xsk->ifindex) {
> +		err = -errno;
> +		goto out_socket;
> +	}
> +
> +	xsk_set_xdp_socket_config(&xsk->config, usr_config);
[...]

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ