lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <2aa643d2-5cef-bab5-d399-4d5330eb7a21@mellanox.com>
Date:   Wed, 8 Jul 2020 18:00:38 +0300
From:   Maxim Mikityanskiy <maximmi@...lanox.com>
To:     Magnus Karlsson <magnus.karlsson@...el.com>
Cc:     bjorn.topel@...el.com, ast@...nel.org, daniel@...earbox.net,
        netdev@...r.kernel.org, jonathan.lemon@...il.com,
        bpf@...r.kernel.org, jeffrey.t.kirsher@...el.com,
        maciej.fijalkowski@...el.com, maciejromanfijalkowski@...il.com,
        cristian.dumitrescu@...el.com
Subject: Re: [PATCH bpf-next 03/14] xsk: create and free context independently
 from umem

On 2020-07-02 15:19, Magnus Karlsson wrote:
> Create and free the buffer pool independently from the umem. Move
> these operations that are performed on the buffer pool from the
> umem create and destroy functions to new create and destroy
> functions just for the buffer pool. This so that in later commits
> we can instantiate multiple buffer pools per umem when sharing a
> umem between HW queues and/or devices. We also erradicate the
> back pointer from the umem to the buffer pool as this will not
> work when we introduce the possibility to have multiple buffer
> pools per umem.
> 
> It might seem a bit odd that we create an empty buffer pool first
> and then recreate it with its right size when we bind to a device
> and umem. But the page pool will in later commits be used to
> carry information before it has been assigned to a umem and its
> size decided.

What kind of information? I'm looking at the final code: on socket 
creation you just fill the pool with zeros, then we may have setsockopt 
for FQ and CQ, then xsk_bind replaces the pool with the real one. So the 
only information carried from the old pool to the new one is FQ and CQ, 
or did I miss anything?

I don't quite like this design, it's kind of a hack to support the 
initialization order that we have, but it complicates things: when you 
copy the old pool into the new one, it's not clear which fields we care 
about, and which are ignored/overwritten.

Regarding FQ and CQ, for shared UMEM, they won't be filled, so there is 
no point in the temporary pool in this case (unless it also stores 
something that I missed).

I suggest to add a pointer to some kind of a configuration struct to xs. 
All things configured with setsockopt go to that struct. xsk_bind will 
call a function to validate the config struct, and if it's OK, it will 
create the pool (once), fill the fields and free the config struct. 
Config struct can be a union with the pool to save space in xs. Probably 
we will also be able to drop a few fields from xs (such as umem?). How 
do you feel about this idea?

> 
> Signed-off-by: Magnus Karlsson <magnus.karlsson@...el.com>
> ---
>   include/net/xdp_sock.h      |   3 +-
>   include/net/xsk_buff_pool.h |  14 +++-
>   net/xdp/xdp_umem.c          | 164 ++++----------------------------------------
>   net/xdp/xdp_umem.h          |   4 +-
>   net/xdp/xsk.c               |  83 +++++++++++++++++++---
>   net/xdp/xsk.h               |   3 +
>   net/xdp/xsk_buff_pool.c     | 154 +++++++++++++++++++++++++++++++++++++----
>   net/xdp/xsk_queue.h         |  12 ++--
>   8 files changed, 250 insertions(+), 187 deletions(-)
> 
> diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> index 6eb9628..b9bb118 100644
> --- a/include/net/xdp_sock.h
> +++ b/include/net/xdp_sock.h
> @@ -20,13 +20,12 @@ struct xdp_buff;
>   struct xdp_umem {
>   	struct xsk_queue *fq;
>   	struct xsk_queue *cq;
> -	struct xsk_buff_pool *pool;
>   	u64 size;
>   	u32 headroom;
>   	u32 chunk_size;
> +	u32 chunks;
>   	struct user_struct *user;
>   	refcount_t users;
> -	struct work_struct work;
>   	struct page **pgs;
>   	u32 npgs;
>   	u16 queue_id;
> diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
> index a6dec9c..cda8ced 100644
> --- a/include/net/xsk_buff_pool.h
> +++ b/include/net/xsk_buff_pool.h
> @@ -14,6 +14,7 @@ struct xdp_rxq_info;
>   struct xsk_queue;
>   struct xdp_desc;
>   struct xdp_umem;
> +struct xdp_sock;
>   struct device;
>   struct page;
>   
> @@ -46,16 +47,23 @@ struct xsk_buff_pool {
>   	struct xdp_umem *umem;
>   	void *addrs;
>   	struct device *dev;
> +	refcount_t users;
> +	struct work_struct work;
>   	struct xdp_buff_xsk *free_heads[];
>   };
>   
>   /* AF_XDP core. */
> -struct xsk_buff_pool *xp_create(struct xdp_umem *umem, u32 chunks,
> -				u32 chunk_size, u32 headroom, u64 size,
> -				bool unaligned);
> +struct xsk_buff_pool *xp_create(void);
> +struct xsk_buff_pool *xp_assign_umem(struct xsk_buff_pool *pool,
> +				     struct xdp_umem *umem);
> +int xp_assign_dev(struct xsk_buff_pool *pool, struct xdp_sock *xs,
> +		  struct net_device *dev, u16 queue_id, u16 flags);
>   void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq);
>   void xp_destroy(struct xsk_buff_pool *pool);
>   void xp_release(struct xdp_buff_xsk *xskb);
> +void xp_get_pool(struct xsk_buff_pool *pool);
> +void xp_put_pool(struct xsk_buff_pool *pool);
> +void xp_clear_dev(struct xsk_buff_pool *pool);
>   
>   /* AF_XDP, and XDP core. */
>   void xp_free(struct xdp_buff_xsk *xskb);
> diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
> index adde4d5..f290345 100644
> --- a/net/xdp/xdp_umem.c
> +++ b/net/xdp/xdp_umem.c
> @@ -47,160 +47,41 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
>   	spin_unlock_irqrestore(&umem->xsk_tx_list_lock, flags);
>   }
>   
> -/* The umem is stored both in the _rx struct and the _tx struct as we do
> - * not know if the device has more tx queues than rx, or the opposite.
> - * This might also change during run time.
> - */
> -static int xsk_reg_pool_at_qid(struct net_device *dev,
> -			       struct xsk_buff_pool *pool,
> -			       u16 queue_id)
> -{
> -	if (queue_id >= max_t(unsigned int,
> -			      dev->real_num_rx_queues,
> -			      dev->real_num_tx_queues))
> -		return -EINVAL;
> -
> -	if (queue_id < dev->real_num_rx_queues)
> -		dev->_rx[queue_id].pool = pool;
> -	if (queue_id < dev->real_num_tx_queues)
> -		dev->_tx[queue_id].pool = pool;
> -
> -	return 0;
> -}
> -
> -struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
> -					    u16 queue_id)
> +static void xdp_umem_unpin_pages(struct xdp_umem *umem)
>   {
> -	if (queue_id < dev->real_num_rx_queues)
> -		return dev->_rx[queue_id].pool;
> -	if (queue_id < dev->real_num_tx_queues)
> -		return dev->_tx[queue_id].pool;
> +	unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true);
>   
> -	return NULL;
> +	kfree(umem->pgs);
> +	umem->pgs = NULL;
>   }
> -EXPORT_SYMBOL(xsk_get_pool_from_qid);
>   
> -static void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
> +static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
>   {
> -	if (queue_id < dev->real_num_rx_queues)
> -		dev->_rx[queue_id].pool = NULL;
> -	if (queue_id < dev->real_num_tx_queues)
> -		dev->_tx[queue_id].pool = NULL;
> +	if (umem->user) {
> +		atomic_long_sub(umem->npgs, &umem->user->locked_vm);
> +		free_uid(umem->user);
> +	}
>   }
>   
> -int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
> -			u16 queue_id, u16 flags)
> +void xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
> +			 u16 queue_id)
>   {
> -	bool force_zc, force_copy;
> -	struct netdev_bpf bpf;
> -	int err = 0;
> -
> -	ASSERT_RTNL();
> -
> -	force_zc = flags & XDP_ZEROCOPY;
> -	force_copy = flags & XDP_COPY;
> -
> -	if (force_zc && force_copy)
> -		return -EINVAL;
> -
> -	if (xsk_get_pool_from_qid(dev, queue_id))
> -		return -EBUSY;
> -
> -	err = xsk_reg_pool_at_qid(dev, umem->pool, queue_id);
> -	if (err)
> -		return err;
> -
>   	umem->dev = dev;
>   	umem->queue_id = queue_id;
>   
> -	if (flags & XDP_USE_NEED_WAKEUP) {
> -		umem->flags |= XDP_UMEM_USES_NEED_WAKEUP;
> -		/* Tx needs to be explicitly woken up the first time.
> -		 * Also for supporting drivers that do not implement this
> -		 * feature. They will always have to call sendto().
> -		 */
> -		xsk_set_tx_need_wakeup(umem->pool);
> -	}
> -
>   	dev_hold(dev);
> -
> -	if (force_copy)
> -		/* For copy-mode, we are done. */
> -		return 0;
> -
> -	if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_wakeup) {
> -		err = -EOPNOTSUPP;
> -		goto err_unreg_umem;
> -	}
> -
> -	bpf.command = XDP_SETUP_XSK_POOL;
> -	bpf.xsk.pool = umem->pool;
> -	bpf.xsk.queue_id = queue_id;
> -
> -	err = dev->netdev_ops->ndo_bpf(dev, &bpf);
> -	if (err)
> -		goto err_unreg_umem;
> -
> -	umem->zc = true;
> -	return 0;
> -
> -err_unreg_umem:
> -	if (!force_zc)
> -		err = 0; /* fallback to copy mode */
> -	if (err)
> -		xsk_clear_pool_at_qid(dev, queue_id);
> -	return err;
>   }
>   
>   void xdp_umem_clear_dev(struct xdp_umem *umem)
>   {
> -	struct netdev_bpf bpf;
> -	int err;
> -
> -	ASSERT_RTNL();
> -
> -	if (!umem->dev)
> -		return;
> -
> -	if (umem->zc) {
> -		bpf.command = XDP_SETUP_XSK_POOL;
> -		bpf.xsk.pool = NULL;
> -		bpf.xsk.queue_id = umem->queue_id;
> -
> -		err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
> -
> -		if (err)
> -			WARN(1, "failed to disable umem!\n");
> -	}
> -
> -	xsk_clear_pool_at_qid(umem->dev, umem->queue_id);
> -
>   	dev_put(umem->dev);
>   	umem->dev = NULL;
>   	umem->zc = false;
>   }
>   
> -static void xdp_umem_unpin_pages(struct xdp_umem *umem)
> -{
> -	unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true);
> -
> -	kfree(umem->pgs);
> -	umem->pgs = NULL;
> -}
> -
> -static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
> -{
> -	if (umem->user) {
> -		atomic_long_sub(umem->npgs, &umem->user->locked_vm);
> -		free_uid(umem->user);
> -	}
> -}
> -
>   static void xdp_umem_release(struct xdp_umem *umem)
>   {
> -	rtnl_lock();
>   	xdp_umem_clear_dev(umem);
> -	rtnl_unlock();
>   
>   	ida_simple_remove(&umem_ida, umem->id);
>   
> @@ -214,20 +95,12 @@ static void xdp_umem_release(struct xdp_umem *umem)
>   		umem->cq = NULL;
>   	}
>   
> -	xp_destroy(umem->pool);
>   	xdp_umem_unpin_pages(umem);
>   
>   	xdp_umem_unaccount_pages(umem);
>   	kfree(umem);
>   }
>   
> -static void xdp_umem_release_deferred(struct work_struct *work)
> -{
> -	struct xdp_umem *umem = container_of(work, struct xdp_umem, work);
> -
> -	xdp_umem_release(umem);
> -}
> -
>   void xdp_get_umem(struct xdp_umem *umem)
>   {
>   	refcount_inc(&umem->users);
> @@ -238,10 +111,8 @@ void xdp_put_umem(struct xdp_umem *umem)
>   	if (!umem)
>   		return;
>   
> -	if (refcount_dec_and_test(&umem->users)) {
> -		INIT_WORK(&umem->work, xdp_umem_release_deferred);
> -		schedule_work(&umem->work);
> -	}
> +	if (refcount_dec_and_test(&umem->users))
> +		xdp_umem_release(umem);
>   }
>   
>   static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
> @@ -357,6 +228,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
>   	umem->size = size;
>   	umem->headroom = headroom;
>   	umem->chunk_size = chunk_size;
> +	umem->chunks = chunks;
>   	umem->npgs = (u32)npgs;
>   	umem->pgs = NULL;
>   	umem->user = NULL;
> @@ -374,16 +246,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
>   	if (err)
>   		goto out_account;
>   
> -	umem->pool = xp_create(umem, chunks, chunk_size, headroom, size,
> -			       unaligned_chunks);
> -	if (!umem->pool) {
> -		err = -ENOMEM;
> -		goto out_pin;
> -	}
>   	return 0;
>   
> -out_pin:
> -	xdp_umem_unpin_pages(umem);
>   out_account:
>   	xdp_umem_unaccount_pages(umem);
>   	return err;
> diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
> index 32067fe..93e96be 100644
> --- a/net/xdp/xdp_umem.h
> +++ b/net/xdp/xdp_umem.h
> @@ -8,8 +8,8 @@
>   
>   #include <net/xdp_sock_drv.h>
>   
> -int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
> -			u16 queue_id, u16 flags);
> +void xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
> +			 u16 queue_id);
>   void xdp_umem_clear_dev(struct xdp_umem *umem);
>   bool xdp_umem_validate_queues(struct xdp_umem *umem);
>   void xdp_get_umem(struct xdp_umem *umem);
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 7551f5b..b12a832 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -105,6 +105,46 @@ bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
>   }
>   EXPORT_SYMBOL(xsk_uses_need_wakeup);
>   
> +struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
> +					    u16 queue_id)
> +{
> +	if (queue_id < dev->real_num_rx_queues)
> +		return dev->_rx[queue_id].pool;
> +	if (queue_id < dev->real_num_tx_queues)
> +		return dev->_tx[queue_id].pool;
> +
> +	return NULL;
> +}
> +EXPORT_SYMBOL(xsk_get_pool_from_qid);
> +
> +void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
> +{
> +	if (queue_id < dev->real_num_rx_queues)
> +		dev->_rx[queue_id].pool = NULL;
> +	if (queue_id < dev->real_num_tx_queues)
> +		dev->_tx[queue_id].pool = NULL;
> +}
> +
> +/* The buffer pool is stored both in the _rx struct and the _tx struct as we do
> + * not know if the device has more tx queues than rx, or the opposite.
> + * This might also change during run time.
> + */
> +int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
> +			u16 queue_id)
> +{
> +	if (queue_id >= max_t(unsigned int,
> +			      dev->real_num_rx_queues,
> +			      dev->real_num_tx_queues))
> +		return -EINVAL;
> +
> +	if (queue_id < dev->real_num_rx_queues)
> +		dev->_rx[queue_id].pool = pool;
> +	if (queue_id < dev->real_num_tx_queues)
> +		dev->_tx[queue_id].pool = pool;
> +
> +	return 0;
> +}
> +
>   void xp_release(struct xdp_buff_xsk *xskb)
>   {
>   	xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb;
> @@ -281,7 +321,7 @@ bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
>   
>   	rcu_read_lock();
>   	list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
> -		if (!xskq_cons_peek_desc(xs->tx, desc, umem))
> +		if (!xskq_cons_peek_desc(xs->tx, desc, pool))
>   			continue;
>   
>   		/* This is the backpressure mechanism for the Tx path.
> @@ -347,7 +387,7 @@ static int xsk_generic_xmit(struct sock *sk)
>   	if (xs->queue_id >= xs->dev->real_num_tx_queues)
>   		goto out;
>   
> -	while (xskq_cons_peek_desc(xs->tx, &desc, xs->umem)) {
> +	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
>   		char *buffer;
>   		u64 addr;
>   		u32 len;
> @@ -629,6 +669,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
>   	qid = sxdp->sxdp_queue_id;
>   
>   	if (flags & XDP_SHARED_UMEM) {
> +		struct xsk_buff_pool *curr_pool;
>   		struct xdp_sock *umem_xs;
>   		struct socket *sock;
>   
> @@ -663,6 +704,11 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
>   			goto out_unlock;
>   		}
>   
> +		/* Share the buffer pool with the other socket. */
> +		xp_get_pool(umem_xs->pool);
> +		curr_pool = xs->pool;
> +		xs->pool = umem_xs->pool;
> +		xp_destroy(curr_pool);
>   		xdp_get_umem(umem_xs->umem);
>   		WRITE_ONCE(xs->umem, umem_xs->umem);
>   		sockfd_put(sock);
> @@ -670,10 +716,24 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
>   		err = -EINVAL;
>   		goto out_unlock;
>   	} else {
> +		struct xsk_buff_pool *new_pool;
> +
>   		/* This xsk has its own umem. */
> -		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
> -		if (err)
> +		xdp_umem_assign_dev(xs->umem, dev, qid);
> +		new_pool = xp_assign_umem(xs->pool, xs->umem);

It looks like the old pool (xs->pool) is never freed.

> +		if (!new_pool) {
> +			err = -ENOMEM;
> +			xdp_umem_clear_dev(xs->umem);
> +			goto out_unlock;
> +		}
> +
> +		err = xp_assign_dev(new_pool, xs, dev, qid, flags);
> +		if (err) {
> +			xp_destroy(new_pool);
> +			xdp_umem_clear_dev(xs->umem);
>   			goto out_unlock;
> +		}
> +		xs->pool = new_pool;
>   	}
>   
>   	xs->dev = dev;
> @@ -765,8 +825,6 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
>   			return PTR_ERR(umem);
>   		}
>   
> -		xs->pool = umem->pool;
> -
>   		/* Make sure umem is ready before it can be seen by others */
>   		smp_wmb();
>   		WRITE_ONCE(xs->umem, umem);
> @@ -796,7 +854,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
>   			&xs->umem->cq;
>   		err = xsk_init_queue(entries, q, true);
>   		if (optname == XDP_UMEM_FILL_RING)
> -			xp_set_fq(xs->umem->pool, *q);
> +			xp_set_fq(xs->pool, *q);
>   		mutex_unlock(&xs->mutex);
>   		return err;
>   	}
> @@ -1002,7 +1060,8 @@ static int xsk_notifier(struct notifier_block *this,
>   
>   				xsk_unbind_dev(xs);
>   
> -				/* Clear device references in umem. */
> +				/* Clear device references. */
> +				xp_clear_dev(xs->pool);
>   				xdp_umem_clear_dev(xs->umem);
>   			}
>   			mutex_unlock(&xs->mutex);
> @@ -1047,7 +1106,7 @@ static void xsk_destruct(struct sock *sk)
>   	if (!sock_flag(sk, SOCK_DEAD))
>   		return;
>   
> -	xdp_put_umem(xs->umem);
> +	xp_put_pool(xs->pool);
>   
>   	sk_refcnt_debug_dec(sk);
>   }
> @@ -1055,8 +1114,8 @@ static void xsk_destruct(struct sock *sk)
>   static int xsk_create(struct net *net, struct socket *sock, int protocol,
>   		      int kern)
>   {
> -	struct sock *sk;
>   	struct xdp_sock *xs;
> +	struct sock *sk;
>   
>   	if (!ns_capable(net->user_ns, CAP_NET_RAW))
>   		return -EPERM;
> @@ -1092,6 +1151,10 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
>   	INIT_LIST_HEAD(&xs->map_list);
>   	spin_lock_init(&xs->map_list_lock);
>   
> +	xs->pool = xp_create();
> +	if (!xs->pool)
> +		return -ENOMEM;
> +
>   	mutex_lock(&net->xdp.lock);
>   	sk_add_node_rcu(sk, &net->xdp.list);
>   	mutex_unlock(&net->xdp.lock);
> diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h
> index 455ddd4..a00e3e2 100644
> --- a/net/xdp/xsk.h
> +++ b/net/xdp/xsk.h
> @@ -51,5 +51,8 @@ void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
>   			     struct xdp_sock **map_entry);
>   int xsk_map_inc(struct xsk_map *map);
>   void xsk_map_put(struct xsk_map *map);
> +void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id);
> +int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
> +			u16 queue_id);
>   
>   #endif /* XSK_H_ */
> diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> index c57f0bb..da93b36 100644
> --- a/net/xdp/xsk_buff_pool.c
> +++ b/net/xdp/xsk_buff_pool.c
> @@ -2,11 +2,14 @@
>   
>   #include <net/xsk_buff_pool.h>
>   #include <net/xdp_sock.h>
> +#include <net/xdp_sock_drv.h>
>   #include <linux/dma-direct.h>
>   #include <linux/dma-noncoherent.h>
>   #include <linux/swiotlb.h>
>   
>   #include "xsk_queue.h"
> +#include "xdp_umem.h"
> +#include "xsk.h"
>   
>   static void xp_addr_unmap(struct xsk_buff_pool *pool)
>   {
> @@ -32,39 +35,48 @@ void xp_destroy(struct xsk_buff_pool *pool)
>   	kvfree(pool);
>   }
>   
> -struct xsk_buff_pool *xp_create(struct xdp_umem *umem, u32 chunks,
> -				u32 chunk_size, u32 headroom, u64 size,
> -				bool unaligned)
> +struct xsk_buff_pool *xp_create(void)
> +{
> +	return kvzalloc(sizeof(struct xsk_buff_pool), GFP_KERNEL);
> +}
> +
> +struct xsk_buff_pool *xp_assign_umem(struct xsk_buff_pool *pool_old,
> +				     struct xdp_umem *umem)
>   {
>   	struct xsk_buff_pool *pool;
>   	struct xdp_buff_xsk *xskb;
>   	int err;
>   	u32 i;
>   
> -	pool = kvzalloc(struct_size(pool, free_heads, chunks), GFP_KERNEL);
> +	pool = kvzalloc(struct_size(pool, free_heads, umem->chunks),
> +			GFP_KERNEL);
>   	if (!pool)
>   		goto out;
>   
> -	pool->heads = kvcalloc(chunks, sizeof(*pool->heads), GFP_KERNEL);
> +	memcpy(pool, pool_old, sizeof(*pool_old));
> +
> +	pool->heads = kvcalloc(umem->chunks, sizeof(*pool->heads), GFP_KERNEL);
>   	if (!pool->heads)
>   		goto out;
>   
> -	pool->chunk_mask = ~((u64)chunk_size - 1);
> -	pool->addrs_cnt = size;
> -	pool->heads_cnt = chunks;
> -	pool->free_heads_cnt = chunks;
> -	pool->headroom = headroom;
> -	pool->chunk_size = chunk_size;
> +	pool->chunk_mask = ~((u64)umem->chunk_size - 1);
> +	pool->addrs_cnt = umem->size;
> +	pool->heads_cnt = umem->chunks;
> +	pool->free_heads_cnt = umem->chunks;
> +	pool->headroom = umem->headroom;
> +	pool->chunk_size = umem->chunk_size;
>   	pool->cheap_dma = true;
> -	pool->unaligned = unaligned;
> -	pool->frame_len = chunk_size - headroom - XDP_PACKET_HEADROOM;
> +	pool->unaligned = umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
> +	pool->frame_len = umem->chunk_size - umem->headroom -
> +		XDP_PACKET_HEADROOM;
>   	pool->umem = umem;
>   	INIT_LIST_HEAD(&pool->free_list);
> +	refcount_set(&pool->users, 1);
>   
>   	for (i = 0; i < pool->free_heads_cnt; i++) {
>   		xskb = &pool->heads[i];
>   		xskb->pool = pool;
> -		xskb->xdp.frame_sz = chunk_size - headroom;
> +		xskb->xdp.frame_sz = umem->chunk_size - umem->headroom;
>   		pool->free_heads[i] = xskb;
>   	}
>   
> @@ -91,6 +103,120 @@ void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq)
>   }
>   EXPORT_SYMBOL(xp_set_rxq_info);
>   
> +int xp_assign_dev(struct xsk_buff_pool *pool, struct xdp_sock *xs,
> +		  struct net_device *dev, u16 queue_id, u16 flags)
> +{
> +	struct xdp_umem *umem = pool->umem;
> +	bool force_zc, force_copy;
> +	struct netdev_bpf bpf;
> +	int err = 0;
> +
> +	ASSERT_RTNL();
> +
> +	force_zc = flags & XDP_ZEROCOPY;
> +	force_copy = flags & XDP_COPY;
> +
> +	if (force_zc && force_copy)
> +		return -EINVAL;
> +
> +	if (xsk_get_pool_from_qid(dev, queue_id))
> +		return -EBUSY;
> +
> +	err = xsk_reg_pool_at_qid(dev, pool, queue_id);
> +	if (err)
> +		return err;
> +
> +	if ((flags & XDP_USE_NEED_WAKEUP) && xs->tx) {
> +		umem->flags |= XDP_UMEM_USES_NEED_WAKEUP;
> +		/* Tx needs to be explicitly woken up the first time.
> +		 * Also for supporting drivers that do not implement this
> +		 * feature. They will always have to call sendto().
> +		 */
> +		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
> +	}
> +
> +	if (force_copy)
> +		/* For copy-mode, we are done. */
> +		return 0;
> +
> +	if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_wakeup) {
> +		err = -EOPNOTSUPP;
> +		goto err_unreg_pool;
> +	}
> +
> +	bpf.command = XDP_SETUP_XSK_POOL;
> +	bpf.xsk.pool = pool;
> +	bpf.xsk.queue_id = queue_id;
> +
> +	err = dev->netdev_ops->ndo_bpf(dev, &bpf);
> +	if (err)
> +		goto err_unreg_pool;
> +
> +	umem->zc = true;
> +	return 0;
> +
> +err_unreg_pool:
> +	if (!force_zc)
> +		err = 0; /* fallback to copy mode */
> +	if (err)
> +		xsk_clear_pool_at_qid(dev, queue_id);
> +	return err;
> +}
> +
> +void xp_clear_dev(struct xsk_buff_pool *pool)
> +{
> +	struct xdp_umem *umem = pool->umem;
> +	struct netdev_bpf bpf;
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	if (!umem->dev)
> +		return;
> +
> +	if (umem->zc) {
> +		bpf.command = XDP_SETUP_XSK_POOL;
> +		bpf.xsk.pool = NULL;
> +		bpf.xsk.queue_id = umem->queue_id;
> +
> +		err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
> +
> +		if (err)
> +			WARN(1, "failed to disable umem!\n");
> +	}
> +
> +	xsk_clear_pool_at_qid(umem->dev, umem->queue_id);
> +}
> +
> +static void xp_release_deferred(struct work_struct *work)
> +{
> +	struct xsk_buff_pool *pool = container_of(work, struct xsk_buff_pool,
> +						  work);
> +
> +	rtnl_lock();
> +	xp_clear_dev(pool);
> +	rtnl_unlock();
> +
> +	xdp_put_umem(pool->umem);
> +	xp_destroy(pool);
> +}
> +
> +void xp_get_pool(struct xsk_buff_pool *pool)
> +{
> +	refcount_inc(&pool->users);
> +}
> +
> +void xp_put_pool(struct xsk_buff_pool *pool)
> +{
> +	if (!pool)
> +		return;
> +
> +	if (refcount_dec_and_test(&pool->users)) {
> +		INIT_WORK(&pool->work, xp_release_deferred);
> +		schedule_work(&pool->work);
> +	}
> +}
> +
>   void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
>   {
>   	dma_addr_t *dma;
> diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
> index 5b5d24d..75f1853 100644
> --- a/net/xdp/xsk_queue.h
> +++ b/net/xdp/xsk_queue.h
> @@ -165,9 +165,9 @@ static inline bool xp_validate_desc(struct xsk_buff_pool *pool,
>   
>   static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q,
>   					   struct xdp_desc *d,
> -					   struct xdp_umem *umem)
> +					   struct xsk_buff_pool *pool)
>   {
> -	if (!xp_validate_desc(umem->pool, d)) {
> +	if (!xp_validate_desc(pool, d)) {
>   		q->invalid_descs++;
>   		return false;
>   	}
> @@ -176,14 +176,14 @@ static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q,
>   
>   static inline bool xskq_cons_read_desc(struct xsk_queue *q,
>   				       struct xdp_desc *desc,
> -				       struct xdp_umem *umem)
> +				       struct xsk_buff_pool *pool)
>   {
>   	while (q->cached_cons != q->cached_prod) {
>   		struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
>   		u32 idx = q->cached_cons & q->ring_mask;
>   
>   		*desc = ring->desc[idx];
> -		if (xskq_cons_is_valid_desc(q, desc, umem))
> +		if (xskq_cons_is_valid_desc(q, desc, pool))
>   			return true;
>   
>   		q->cached_cons++;
> @@ -235,11 +235,11 @@ static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr)
>   
>   static inline bool xskq_cons_peek_desc(struct xsk_queue *q,
>   				       struct xdp_desc *desc,
> -				       struct xdp_umem *umem)
> +				       struct xsk_buff_pool *pool)
>   {
>   	if (q->cached_prod == q->cached_cons)
>   		xskq_cons_get_entries(q);
> -	return xskq_cons_read_desc(q, desc, umem);
> +	return xskq_cons_read_desc(q, desc, pool);
>   }
>   
>   static inline void xskq_cons_release(struct xsk_queue *q)
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ