[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240702180908.0eccf78f@kernel.org>
Date: Tue, 2 Jul 2024 18:09:08 -0700
From: Jakub Kicinski <kuba@...nel.org>
To: Mina Almasry <almasrymina@...gle.com>
Cc: netdev@...r.kernel.org, linux-kernel@...r.kernel.org,
linux-doc@...r.kernel.org, linux-alpha@...r.kernel.org,
linux-mips@...r.kernel.org, linux-parisc@...r.kernel.org,
sparclinux@...r.kernel.org, linux-trace-kernel@...r.kernel.org,
linux-arch@...r.kernel.org, bpf@...r.kernel.org,
linux-kselftest@...r.kernel.org, linux-media@...r.kernel.org,
dri-devel@...ts.freedesktop.org, "David S. Miller" <davem@...emloft.net>,
Eric Dumazet <edumazet@...gle.com>, Paolo Abeni <pabeni@...hat.com>, Donald
Hunter <donald.hunter@...il.com>, Jonathan Corbet <corbet@....net>, Richard
Henderson <richard.henderson@...aro.org>, Ivan Kokshaysky
<ink@...assic.park.msu.ru>, Matt Turner <mattst88@...il.com>, Thomas
Bogendoerfer <tsbogend@...ha.franken.de>, "James E.J. Bottomley"
<James.Bottomley@...senPartnership.com>, Helge Deller <deller@....de>,
Andreas Larsson <andreas@...sler.com>, Jesper Dangaard Brouer
<hawk@...nel.org>, Ilias Apalodimas <ilias.apalodimas@...aro.org>, Steven
Rostedt <rostedt@...dmis.org>, Masami Hiramatsu <mhiramat@...nel.org>,
Mathieu Desnoyers <mathieu.desnoyers@...icios.com>, Arnd Bergmann
<arnd@...db.de>, Alexei Starovoitov <ast@...nel.org>, Daniel Borkmann
<daniel@...earbox.net>, Andrii Nakryiko <andrii@...nel.org>, Martin KaFai
Lau <martin.lau@...ux.dev>, Eduard Zingerman <eddyz87@...il.com>, Song Liu
<song@...nel.org>, Yonghong Song <yonghong.song@...ux.dev>, John Fastabend
<john.fastabend@...il.com>, KP Singh <kpsingh@...nel.org>, Stanislav
Fomichev <sdf@...ichev.me>, Hao Luo <haoluo@...gle.com>, Jiri Olsa
<jolsa@...nel.org>, Steffen Klassert <steffen.klassert@...unet.com>,
Herbert Xu <herbert@...dor.apana.org.au>, David Ahern <dsahern@...nel.org>,
Willem de Bruijn <willemdebruijn.kernel@...il.com>, Shuah Khan
<shuah@...nel.org>, Sumit Semwal <sumit.semwal@...aro.org>, "Christian
König" <christian.koenig@....com>, Bagas Sanjaya
<bagasdotme@...il.com>, Christoph Hellwig <hch@...radead.org>, Nikolay
Aleksandrov <razor@...ckwall.org>, Pavel Begunkov <asml.silence@...il.com>,
David Wei <dw@...idwei.uk>, Jason Gunthorpe <jgg@...pe.ca>, Yunsheng Lin
<linyunsheng@...wei.com>, Shailend Chand <shailend@...gle.com>, Harshitha
Ramamurthy <hramamurthy@...gle.com>, Shakeel Butt <shakeel.butt@...ux.dev>,
Jeroen de Borst <jeroendb@...gle.com>, Praveen Kaligineedi
<pkaligineedi@...gle.com>, Willem de Bruijn <willemb@...gle.com>, Kaiyuan
Zhang <kaiyuanz@...gle.com>
Subject: Re: [PATCH net-next v15 03/14] netdev: support binding dma-buf to
netdevice
On Fri, 28 Jun 2024 00:32:40 +0000 Mina Almasry wrote:
> +/* Protected by rtnl_lock() */
> +static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1);
> +
> +void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
> +{
> + struct netdev_rx_queue *rxq;
> + unsigned long xa_idx;
> + unsigned int rxq_idx;
> +
> + if (!binding)
> + return;
nit: I don't see how it can happen, no defensive programming, please
> + if (binding->list.next)
> + list_del(&binding->list);
> +
> + xa_for_each(&binding->bound_rxq_list, xa_idx, rxq) {
nit: s/bound_rxq_list/bound_rxqs/ ? it's not a list
> + if (rxq->mp_params.mp_priv == binding) {
> + /* We hold the rtnl_lock while binding/unbinding
> + * dma-buf, so we can't race with another thread that
> + * is also modifying this value. However, the page_pool
> + * may read this config while it's creating its
> + * rx-queues. WRITE_ONCE() here to match the
> + * READ_ONCE() in the page_pool.
> + */
> + WRITE_ONCE(rxq->mp_params.mp_priv, NULL);
Is this really sufficient in terms of locking? @binding is not
RCU-protected and neither is the reader guaranteed to be in
an RCU critical section. Actually the "reader" tries to take a ref
and use this struct so it's not even a pure reader.
Let's add a lock or use one of the existing locks.
Or perhaps it's time to add a mutex to struct net_device.
> + rxq_idx = get_netdev_rx_queue_index(rxq);
> +
> + netdev_rx_queue_restart(binding->dev, rxq_idx);
> + }
> + }
> +
> + xa_erase(&net_devmem_dmabuf_bindings, binding->id);
> +
> + net_devmem_dmabuf_binding_put(binding);
> +}
> +
> +int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
> + struct net_devmem_dmabuf_binding *binding)
> +{
> + struct netdev_rx_queue *rxq;
> + u32 xa_idx;
> + int err;
> +
> + if (rxq_idx >= dev->num_rx_queues)
> + return -ERANGE;
> +
> + rxq = __netif_get_rx_queue(dev, rxq_idx);
> + if (rxq->mp_params.mp_priv)
> + return -EEXIST;
Makes me wonder - do we need an API to unbind, or do we assume the
application will only have one binding per socket and close
it every time? I guess that's fine for future extension.
> + err = xa_alloc(&binding->bound_rxq_list, &xa_idx, rxq, xa_limit_32b,
> + GFP_KERNEL);
> + if (err)
> + return err;
> +
> + /* We hold the rtnl_lock while binding/unbinding dma-buf, so we can't
> + * race with another thread that is also modifying this value. However,
> + * the driver may read this config while it's creating its rx-queues.
> + * WRITE_ONCE() here to match the READ_ONCE() in the driver.
> + */
> + WRITE_ONCE(rxq->mp_params.mp_priv, binding);
> +
> + err = netdev_rx_queue_restart(dev, rxq_idx);
> + if (err)
> + goto err_xa_erase;
> +
> + return 0;
> +
> +err_xa_erase:
> + WRITE_ONCE(rxq->mp_params.mp_priv, NULL);
> + xa_erase(&binding->bound_rxq_list, xa_idx);
> +
> + return err;
> +}
> +
> +int net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
> + struct net_devmem_dmabuf_binding **out)
> +{
> + struct net_devmem_dmabuf_binding *binding;
> + static u32 id_alloc_next;
> + struct scatterlist *sg;
> + struct dma_buf *dmabuf;
> + unsigned int sg_idx, i;
> + unsigned long virtual;
> + int err;
> +
> + dmabuf = dma_buf_get(dmabuf_fd);
> + if (IS_ERR(dmabuf))
> + return -EBADFD;
nit: I think error pointers are nicer than **out parameters :(
you can ERR_CAST() all the DMABUF errors
> + binding = kzalloc_node(sizeof(*binding), GFP_KERNEL,
> + dev_to_node(&dev->dev));
> + if (!binding) {
> + err = -ENOMEM;
> + goto err_put_dmabuf;
> + }
> +
> + binding->dev = dev;
> +
> + err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id,
> + binding, xa_limit_32b, &id_alloc_next,
> + GFP_KERNEL);
> + if (err < 0)
> + goto err_free_binding;
> +
> + xa_init_flags(&binding->bound_rxq_list, XA_FLAGS_ALLOC);
> +
> + refcount_set(&binding->ref, 1);
> +
> + binding->dmabuf = dmabuf;
> +
> + binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent);
> + if (IS_ERR(binding->attachment)) {
> + err = PTR_ERR(binding->attachment);
> + goto err_free_id;
> + }
> -/* Stub */
> int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
> {
> - return 0;
> + struct nlattr *tb[ARRAY_SIZE(netdev_queue_dmabuf_nl_policy)];
> + struct net_devmem_dmabuf_binding *out_binding;
> + struct list_head *sock_binding_list;
> + u32 ifindex, dmabuf_fd, rxq_idx;
> + struct net_device *netdev;
> + struct sk_buff *rsp;
> + struct nlattr *attr;
> + int rem, err = 0;
> + void *hdr;
> +
> + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) ||
> + GENL_REQ_ATTR_CHECK(info, NETDEV_A_BIND_DMABUF_DMABUF_FD) ||
> + GENL_REQ_ATTR_CHECK(info, NETDEV_A_BIND_DMABUF_QUEUES))
> + return -EINVAL;
> +
> + ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);
> + dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_BIND_DMABUF_DMABUF_FD]);
> +
> + rtnl_lock();
> +
> + netdev = __dev_get_by_index(genl_info_net(info), ifindex);
> + if (!netdev) {
Suggest extending the check above with: || !netif_device_present(netdev)
> + err = -ENODEV;
> + goto err_unlock;
> + }
> +
> + err = net_devmem_bind_dmabuf(netdev, dmabuf_fd, &out_binding);
> + if (err)
> + goto err_unlock;
> +
> + nla_for_each_attr(attr, genlmsg_data(info->genlhdr),
> + genlmsg_len(info->genlhdr), rem) {
> +
> + if (nla_type(attr) != NETDEV_A_BIND_DMABUF_QUEUES)
> + continue;
nit: nla_for_each_attr_type()
> + err = nla_parse_nested(
> + tb, ARRAY_SIZE(netdev_queue_dmabuf_nl_policy) - 1, attr,
> + netdev_queue_dmabuf_nl_policy, info->extack);
> + if (err < 0)
> + goto err_unbind;
> +
> + rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_DMABUF_IDX]);
> +
> + err = net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx,
> + out_binding);
> + if (err)
> + goto err_unbind;
> + }
> +
> + sock_binding_list = genl_sk_priv_get(&netdev_nl_family,
> + NETLINK_CB(skb).sk);
> + if (IS_ERR(sock_binding_list)) {
> + err = PTR_ERR(sock_binding_list);
> + goto err_unbind;
> + }
> +
> + list_add(&out_binding->list, sock_binding_list);
> +
> + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
> + if (!rsp) {
> + err = -ENOMEM;
> + goto err_unbind;
> + }
> +
> + hdr = genlmsg_iput(rsp, info);
> + if (!hdr) {
> + err = -EMSGSIZE;
> + goto err_genlmsg_free;
> + }
I'd move genl_sk_priv_get(), genlmsg_new() and genlmsg_iput() before we
take rtnl_lock(), but I admit it's a bit late for this sort of
feedback.. :)
> + nla_put_u32(rsp, NETDEV_A_BIND_DMABUF_DMABUF_ID, out_binding->id);
> + genlmsg_end(rsp, hdr);
> +
> + rtnl_unlock();
> +
> + return genlmsg_reply(rsp, info);
> +
> +err_genlmsg_free:
> + nlmsg_free(rsp);
> +err_unbind:
> + net_devmem_unbind_dmabuf(out_binding);
> +err_unlock:
> + rtnl_unlock();
> + return err;
> }
Powered by blists - more mailing lists