Message-ID: <CAJ8uoz04xv_yBxX7uzsu7ftkgs8ufC97Jn8MbzXtgfN9fZUdGg@mail.gmail.com>
Date: Thu, 31 Jan 2019 14:55:06 +0100
From: Magnus Karlsson <magnus.karlsson@...il.com>
To: Daniel Borkmann <daniel@...earbox.net>
Cc: Magnus Karlsson <magnus.karlsson@...el.com>,
Björn Töpel <bjorn.topel@...el.com>,
ast@...nel.org, Network Development <netdev@...r.kernel.org>,
Jakub Kicinski <jakub.kicinski@...ronome.com>,
Björn Töpel <bjorn.topel@...il.com>,
"Zhang, Qi Z" <qi.z.zhang@...el.com>,
Jesper Dangaard Brouer <brouer@...hat.com>
Subject: Re: [PATCH bpf-next v3 2/3] libbpf: add support for using AF_XDP sockets
On Thu, Jan 31, 2019 at 11:31 AM Daniel Borkmann <daniel@...earbox.net> wrote:
>
> On 01/29/2019 04:12 PM, Magnus Karlsson wrote:
> > This commit adds AF_XDP support to libbpf. The main reason for this is
> > to facilitate writing applications that use AF_XDP by offering
> > higher-level APIs that hide many of the details of the AF_XDP
> > uapi. This is in the same vein as libbpf easing XDP adoption by
> > offering easy-to-use, higher-level interfaces to XDP
> > functionality. Hopefully this will speed up adoption of AF_XDP, make
> > applications using it simpler and smaller, and finally also make it
> > possible for applications to benefit from optimizations in the AF_XDP
> > user space access code. Previously, people just copied and pasted the
> > code from the sample application into their application, which is not
> > desirable.
> >
> > The interface is composed of two parts:
> >
> > * Low-level access interface to the four rings and the packet
> > * High-level control plane interface for creating and setting
> > up umems and af_xdp sockets as well as a simple XDP program.
> >
> > Signed-off-by: Magnus Karlsson <magnus.karlsson@...el.com>
> [...]
> > +
> > +static __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb)
> > +{
> > + __u32 free_entries = r->cached_cons - r->cached_prod;
> > +
> > + if (free_entries >= nb)
> > + return free_entries;
> > +
> > + /* Refresh the local tail pointer.
> > + * cached_cons is r->size bigger than the real consumer pointer so
> > + * that this addition can be avoided in the more frequently
> > + * executed code that computes free_entries at the beginning of
> > + * this function. Without this optimization it would have been
> > + * free_entries = r->cached_cons - r->cached_prod + r->size.
> > + */
> > + r->cached_cons = *r->consumer + r->size;
> > +
> > + return r->cached_cons - r->cached_prod;
> > +}
> > +
> > +static __u32 xsk_cons_nb_avail(struct xsk_ring_cons *r, __u32 nb)
> > +{
> > + __u32 entries = r->cached_prod - r->cached_cons;
> > +
> > + if (entries == 0) {
> > + r->cached_prod = *r->producer;
> > + entries = r->cached_prod - r->cached_cons;
> > + }
> > +
> > + return (entries > nb) ? nb : entries;
> > +}
> > +
> > +size_t xsk_ring_prod__reserve(struct xsk_ring_prod *prod, size_t nb, __u32 *idx)
> > +{
> > + if (unlikely(xsk_prod_nb_free(prod, nb) < nb))
> > + return 0;
> > +
> > + *idx = prod->cached_prod;
> > + prod->cached_prod += nb;
> > +
> > + return nb;
> > +}
> > +
> > +void xsk_ring_prod__submit(struct xsk_ring_prod *prod, size_t nb)
> > +{
> > + /* Make sure everything has been written to the ring before signalling
> > + * this to the kernel.
> > + */
> > + smp_wmb();
> > +
> > + *prod->producer += nb;
> > +}
> > +
> > +size_t xsk_ring_cons__peek(struct xsk_ring_cons *cons, size_t nb, __u32 *idx)
> > +{
> > + size_t entries = xsk_cons_nb_avail(cons, nb);
> > +
> > + if (likely(entries > 0)) {
> > + /* Make sure we do not speculatively read the data before
> > + * we have received the packet buffers from the ring.
> > + */
> > + smp_rmb();
> > +
> > + *idx = cons->cached_cons;
> > + cons->cached_cons += entries;
> > + }
> > +
> > + return entries;
> > +}
> > +
> > +void xsk_ring_cons__release(struct xsk_ring_cons *cons, size_t nb)
> > +{
> > + *cons->consumer += nb;
> > +}
> > +
> > +void *xsk_umem__get_data_raw(void *umem_area, __u64 addr)
> > +{
> > + return &((char *)umem_area)[addr];
> > +}
> > +
> > +void *xsk_umem__get_data(struct xsk_umem *umem, __u64 addr)
> > +{
> > + return &((char *)(umem->umem_area))[addr];
> > +}
>
> Shouldn't some of the above helpers for critical path be exposed as static
> inline functions instead?
Could be. I will run some experiments to see how much performance
improves. I kept them non-static so that they could be replaced if
someone came up with a faster way of doing these operations, but that
might be unnecessary, since we can still change parts of the
implementation even if they are static. I will measure and get back to you.
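If inlining does help, the idea would be to move the hot-path helpers
into the installed header as static inlines. A rough, untested sketch
of what that would look like for xsk_prod_nb_free(), body identical to
the patch, only the linkage changed:

static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb)
{
	__u32 free_entries = r->cached_cons - r->cached_prod;

	if (free_entries >= nb)
		return free_entries;

	/* Refresh the local tail pointer */
	r->cached_cons = *r->consumer + r->size;
	return r->cached_cons - r->cached_prod;
}

That way the compiler can fold the common fast path (the first branch)
straight into the caller.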
> [...]
> > +static int xsk_parse_nl(void *cookie, void *msg, struct nlattr **tb)
> > +{
> > + struct nlattr *tb_parsed[IFLA_XDP_MAX + 1];
> > + struct xsk_nl_info *nl_info = cookie;
> > + unsigned char mode;
> > + int err;
> > +
> > + (void)msg;
>
> Unused?
Yes, it is unused. I will remove it.
> > + nl_info->xdp_prog_attached = false;
> > + if (!tb[IFLA_XDP])
> > + return 0;
> > +
> > + err = libbpf_nla_parse_nested(tb_parsed, IFLA_XDP_MAX, tb[IFLA_XDP],
> > + NULL);
> > + if (err)
> > + return err;
> > +
> > + if (!tb_parsed[IFLA_XDP_ATTACHED] || !tb_parsed[IFLA_XDP_FD])
> > + return 0;
> > +
> > + mode = libbpf_nla_getattr_u8(tb_parsed[IFLA_XDP_ATTACHED]);
> > + if (mode == XDP_ATTACHED_NONE)
> > + return 0;
>
> Probably good to memset and/or init the passed struct xsk_nl_info (e.g.
> fd to -1 etc) such that if some user did not we won't end up with garbage
> values on return 0.
Will do.
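Something like this at the top of xsk_parse_nl(), right after the
cookie cast (untested):

	memset(nl_info, 0, sizeof(*nl_info));
	nl_info->fd = -1;

so that a return 0 can never hand garbage values back to the caller.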
> > + nl_info->xdp_prog_attached = true;
> > + nl_info->fd = libbpf_nla_getattr_u32(tb_parsed[IFLA_XDP_FD]);
> > + return 0;
> > +}
> > +
> > +static bool xsk_xdp_prog_attached(struct xsk_socket *xsk)
> > +{
> > + struct xsk_nl_info nl_info;
> > + unsigned int nl_pid;
> > + char err_buf[256];
> > + int sock, err;
> > +
> > + sock = libbpf_netlink_open(&nl_pid);
> > + if (sock < 0)
> > + return false;
> > +
> > + nl_info.xdp_prog_attached = false;
> > + nl_info.fd = 0;
> > +
> > + err = libbpf_nl_get_link(sock, nl_pid, xsk_parse_nl, &nl_info);
> > + if (err) {
> > + libbpf_strerror(err, err_buf, sizeof(err_buf));
> > + pr_warning("Error:\n%s\n", err_buf);
> > + return false;
> > + }
> > +
> > + xsk->prog_fd = nl_info.fd;
> > + return nl_info.xdp_prog_attached;
>
> Don't we leak sock here and in error above?
Will fix the fd leaks that you have pointed out here and below, and go
through the code to see if there are more. Do I ever learn ;-).
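For xsk_xdp_prog_attached() above I am thinking of something like this
(untested), closing the netlink socket unconditionally once the query
is done so that neither the error path nor the success path leaks it:

	err = libbpf_nl_get_link(sock, nl_pid, xsk_parse_nl, &nl_info);
	close(sock);
	if (err) {
		libbpf_strerror(err, err_buf, sizeof(err_buf));
		pr_warning("Error:\n%s\n", err_buf);
		return false;
	}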
Thanks for your review. Appreciated.
/Magnus
> > +}
> > +
> > +static int xsk_load_xdp_prog(struct xsk_socket *xsk)
> > +{
> > + char bpf_log_buf[BPF_LOG_BUF_SIZE];
> > + int err, prog_fd;
> > +
> > + /* This is the C-program:
> > + * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
> > + * {
> > + * int *qidconf, index = ctx->rx_queue_index;
> > + *
> > + * // A set entry here means that the corresponding queue_id
> > + * // has an active AF_XDP socket bound to it.
> > + * qidconf = bpf_map_lookup_elem(&qidconf_map, &index);
> > + * if (!qidconf)
> > + * return XDP_ABORTED;
> > + *
> > + * if (*qidconf)
> > + * return bpf_redirect_map(&xsks_map, index, 0);
> > + *
> > + * return XDP_PASS;
> > + * }
> > + */
> > + struct bpf_insn prog[] = {
> > + /* r1 = *(u32 *)(r1 + 16) */
> > + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 16),
> > + /* *(u32 *)(r10 - 4) = r1 */
> > + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_1, -4),
> > + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
> > + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
> > + BPF_LD_MAP_FD(BPF_REG_1, xsk->qidconf_map_fd),
> > + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
> > + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
> > + BPF_MOV32_IMM(BPF_REG_0, 0),
> > + /* if r1 == 0 goto +8 */
> > + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 8),
> > + BPF_MOV32_IMM(BPF_REG_0, 2),
> > + /* r1 = *(u32 *)(r1 + 0) */
> > + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0),
> > + /* if r1 == 0 goto +5 */
> > + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5),
> > + /* r2 = *(u32 *)(r10 - 4) */
> > + BPF_LD_MAP_FD(BPF_REG_1, xsk->xsks_map_fd),
> > + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_10, -4),
> > + BPF_MOV32_IMM(BPF_REG_3, 0),
> > + BPF_EMIT_CALL(BPF_FUNC_redirect_map),
> > + /* The jumps are to this instruction */
> > + BPF_EXIT_INSN(),
> > + };
> > + size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
> > +
> > + prog_fd = bpf_load_program(BPF_PROG_TYPE_XDP, prog, insns_cnt,
> > + "LGPL-2.1 or BSD-2-Clause", 0, bpf_log_buf,
> > + BPF_LOG_BUF_SIZE);
> > + if (prog_fd < 0) {
> > + pr_warning("BPF log buffer:\n%s", bpf_log_buf);
> > + return prog_fd;
> > + }
> > +
> > + err = bpf_set_link_xdp_fd(xsk->ifindex, prog_fd, xsk->config.xdp_flags);
> > + if (err)
> > + return err;
>
> Leaks prog_fd on error.
>
> > + xsk->prog_fd = prog_fd;
> > + return 0;
> > +}
> > +
> > +static int xsk_create_bpf_maps(struct xsk_socket *xsk)
> > +{
> > + int fd;
> > +
> > + fd = bpf_create_map_name(BPF_MAP_TYPE_ARRAY, "qidconf_map",
> > + sizeof(int), sizeof(int), MAX_QUEUES, 0);
> > + if (fd < 0)
> > + return fd;
> > + xsk->qidconf_map_fd = fd;
> > +
> > + fd = bpf_create_map_name(BPF_MAP_TYPE_XSKMAP, "xsks_map",
> > + sizeof(int), sizeof(int), MAX_QUEUES, 0);
> > + if (fd < 0)
> > + return fd;
>
> Leaks first map fd on error.
>
> > + xsk->xsks_map_fd = fd;
> > +
> > + return 0;
> > +}
> > +
> > +static int xsk_update_bpf_maps(struct xsk_socket *xsk, int qidconf_value,
> > + int xsks_value)
> > +{
> > + bool qidconf_map_updated = false, xsks_map_updated = false;
> > + struct bpf_prog_info prog_info = {};
> > + __u32 prog_len = sizeof(prog_info);
> > + struct bpf_map_info map_info;
> > + __u32 map_len = sizeof(map_info);
> > + __u32 *map_ids;
> > + int reset_value = 0;
> > + __u32 num_maps;
> > + unsigned int i;
> > + int err;
> > +
> > + err = bpf_obj_get_info_by_fd(xsk->prog_fd, &prog_info, &prog_len);
> > + if (err)
> > + return err;
> > +
> > + num_maps = prog_info.nr_map_ids;
> > +
> > + map_ids = malloc(prog_info.nr_map_ids * sizeof(*map_ids));
>
> calloc()?
>
> > + if (!map_ids)
> > + return -ENOMEM;
> > +
> > + memset(&prog_info, 0, prog_len);
> > + prog_info.nr_map_ids = num_maps;
> > + prog_info.map_ids = (__u64)(unsigned long)map_ids;
> > +
> > + err = bpf_obj_get_info_by_fd(xsk->prog_fd, &prog_info, &prog_len);
> > + if (err)
> > + return err;
>
> Leaks map_ids on error.
>
> > +
> > + for (i = 0; i < prog_info.nr_map_ids; i++) {
> > + int fd;
> > +
> > + fd = bpf_map_get_fd_by_id(map_ids[i]);
> > + if (fd < 0) {
> > + err = -errno;
> > + goto out;
> > + }
> > +
> > + err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len);
> > + if (err)
> > + goto out;
>
> close(fd) on error, also the case for below, please double check everything
> so that no fd leaks by accident into the app.
>
> > +
> > + if (!strcmp(map_info.name, "qidconf_map")) {
> > + err = bpf_map_update_elem(fd, &xsk->queue_id,
> > + &qidconf_value, 0);
> > + if (err)
> > + goto out;
> > + qidconf_map_updated = true;
> > + xsk->qidconf_map_fd = fd;
> > + } else if (!strcmp(map_info.name, "xsks_map")) {
> > + err = bpf_map_update_elem(fd, &xsk->queue_id,
> > + &xsks_value, 0);
> > + if (err)
> > + goto out;
> > + xsks_map_updated = true;
> > + xsk->xsks_map_fd = fd;
> > + }
> > +
> > + if (qidconf_map_updated && xsks_map_updated)
> > + break;
> > + }
> > +
> > + if (!(qidconf_map_updated && xsks_map_updated)) {
> > + err = -ENOENT;
> > + goto out;
> > + }
> > +
> > + return 0;
> > +
> > +out:
> > + if (qidconf_map_updated)
> > + (void)bpf_map_update_elem(xsk->qidconf_map_fd, &xsk->queue_id,
> > + &reset_value, 0);
> > + if (xsks_map_updated)
> > + (void)bpf_map_update_elem(xsk->xsks_map_fd, &xsk->queue_id,
> > + &reset_value, 0);
> > + return err;
> > +}
> > +
> > +static int xsk_setup_xdp_prog(struct xsk_socket *xsk)
> > +{
> > + bool prog_loaded = false;
> > + int err;
> > +
> > + if (!xsk_xdp_prog_attached(xsk)) {
> > + err = xsk_create_bpf_maps(xsk);
> > + if (err)
> > + goto out_load;
> > +
> > + err = xsk_load_xdp_prog(xsk);
> > + if (err)
> > + return err;
>
> Needs to undo created maps on error.
>
> > + prog_loaded = true;
> > + }
> > +
> > + err = xsk_update_bpf_maps(xsk, true, xsk->fd);
> > + if (err)
> > + goto out_load;
> > +
> > + return 0;
> > +
> > +out_load:
> > + if (prog_loaded)
> > + close(xsk->prog_fd);
>
> Ditto
>
> > + return err;
> > +}
> > +
> > +int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
> > + __u32 queue_id, struct xsk_umem *umem,
> > + struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
> > + const struct xsk_socket_config *usr_config)
> > +{
> > + struct sockaddr_xdp sxdp = {};
> > + struct xdp_mmap_offsets off;
> > + struct xsk_socket *xsk;
> > + socklen_t optlen;
> > + void *map;