lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAHS8izOTzLVxQ_rYt1vyhb=tgs2GAtuSZUWkZ183=7J3wEEzjQ@mail.gmail.com>
Date: Wed, 25 Oct 2023 12:56:44 -0700
From: Mina Almasry <almasrymina@...gle.com>
To: Jakub Kicinski <kuba@...nel.org>
Cc: davem@...emloft.net, netdev@...r.kernel.org, edumazet@...gle.com, 
	pabeni@...hat.com, hawk@...nel.org, ilias.apalodimas@...aro.org
Subject: Re: [PATCH net-next 05/15] net: page_pool: record pools per netdev

On Tue, Oct 24, 2023 at 9:02 AM Jakub Kicinski <kuba@...nel.org> wrote:
>
> Link the page pools with netdevs. This needs to be netns compatible
> so we have two options. Either we record the pools per netns and
> have to worry about moving them as the netdev gets moved.
> Or we record them directly on the netdev so they move with the netdev
> without any extra work.
>
> Implement the latter option. Since pools may outlast netdev we need
> a place to store orphans. In time honored tradition use loopback
> for this purpose.
>
> Signed-off-by: Jakub Kicinski <kuba@...nel.org>
> ---
> v1: fix race between page pool and netdev disappearing (Simon)
> ---
>  include/linux/list.h          | 20 ++++++++
>  include/linux/netdevice.h     |  4 ++
>  include/linux/poison.h        |  2 +
>  include/net/page_pool/types.h |  4 ++
>  net/core/page_pool_user.c     | 90 +++++++++++++++++++++++++++++++++++
>  5 files changed, 120 insertions(+)
>
> diff --git a/include/linux/list.h b/include/linux/list.h
> index 164b4d0e9d2a..3d8884472164 100644
> --- a/include/linux/list.h
> +++ b/include/linux/list.h
> @@ -1111,6 +1111,26 @@ static inline void hlist_move_list(struct hlist_head *old,
>         old->first = NULL;
>  }
>
> +/**
> + * hlist_splice_init() - move all entries from one list to another
> + * @from: hlist_head from which entries will be moved
> + * @last: last entry on the @from list
> + * @to:   hlist_head to which entries will be moved
> + *
> + * @to can be empty, @from must contain at least @last.
> + */
> +static inline void hlist_splice_init(struct hlist_head *from,
> +                                    struct hlist_node *last,
> +                                    struct hlist_head *to)
> +{
> +       if (to->first)
> +               to->first->pprev = &last->next;
> +       last->next = to->first;
> +       to->first = from->first;
> +       from->first->pprev = &to->first;
> +       from->first = NULL;
> +}
> +
>  #define hlist_entry(ptr, type, member) container_of(ptr,type,member)
>
>  #define hlist_for_each(pos, head) \
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index b8bf669212cc..224ee4680a31 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -2431,6 +2431,10 @@ struct net_device {
>  #if IS_ENABLED(CONFIG_DPLL)
>         struct dpll_pin         *dpll_pin;
>  #endif
> +#if IS_ENABLED(CONFIG_PAGE_POOL)
> +       /** @page_pools: page pools created for this netdevice */
> +       struct hlist_head       page_pools;
> +#endif

I wonder if this per-netdev field is really necessary. Is it not
possible to do the same by simply looping over the (global) page_pools
xarray? Or is that too silly of an idea. I guess on some systems you
may end up with 100s or 1000s of active or orphaned page pools and
then globally iterating over the whole page_pools xarray can be really
slow...

>  };
>  #define to_net_dev(d) container_of(d, struct net_device, dev)
>
> diff --git a/include/linux/poison.h b/include/linux/poison.h
> index 851a855d3868..27a7dad17eef 100644
> --- a/include/linux/poison.h
> +++ b/include/linux/poison.h
> @@ -83,6 +83,8 @@
>
>  /********** net/core/skbuff.c **********/
>  #define SKB_LIST_POISON_NEXT   ((void *)(0x800 + POISON_POINTER_DELTA))
> +/********** net/ **********/
> +#define NET_PTR_POISON         ((void *)(0x801 + POISON_POINTER_DELTA))
>
>  /********** kernel/bpf/ **********/
>  #define BPF_PTR_POISON ((void *)(0xeB9FUL + POISON_POINTER_DELTA))
> diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
> index c19f0df3bf0b..b258a571201e 100644
> --- a/include/net/page_pool/types.h
> +++ b/include/net/page_pool/types.h
> @@ -5,6 +5,7 @@
>
>  #include <linux/dma-direction.h>
>  #include <linux/ptr_ring.h>
> +#include <linux/types.h>
>
>  #define PP_FLAG_DMA_MAP                BIT(0) /* Should page_pool do the DMA
>                                         * map/unmap
> @@ -48,6 +49,7 @@ struct pp_alloc_cache {
>   * @pool_size: size of the ptr_ring
>   * @nid:       NUMA node id to allocate from pages from
>   * @dev:       device, for DMA pre-mapping purposes
> + * @netdev:    netdev this pool will serve (leave as NULL if none or multiple)

Is this an existing use case (page_pools that serve null or multiple
netdevs), or a future use case? My understanding is that currently
page_pools serve at most 1 rx-queue. Spot checking a few drivers that
seems to be true.

>   * @napi:      NAPI which is the sole consumer of pages, otherwise NULL
>   * @dma_dir:   DMA mapping direction
>   * @max_len:   max DMA sync memory size for PP_FLAG_DMA_SYNC_DEV
> @@ -66,6 +68,7 @@ struct page_pool_params {
>                 unsigned int    offset;
>         );
>         struct_group_tagged(page_pool_params_slow, slow,
> +               struct net_device *netdev;
>  /* private: used by test code only */
>                 void (*init_callback)(struct page *page, void *arg);
>                 void *init_arg;
> @@ -189,6 +192,7 @@ struct page_pool {
>         struct page_pool_params_slow slow;
>         /* User-facing fields, protected by page_pools_lock */
>         struct {
> +               struct hlist_node list;
>                 u32 id;
>         } user;
>  };
> diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c
> index 630d1eeecf2a..7cd6f416b87a 100644
> --- a/net/core/page_pool_user.c
> +++ b/net/core/page_pool_user.c
> @@ -1,14 +1,31 @@
>  // SPDX-License-Identifier: GPL-2.0
>
>  #include <linux/mutex.h>
> +#include <linux/netdevice.h>
>  #include <linux/xarray.h>
> +#include <net/net_debug.h>
>  #include <net/page_pool/types.h>
>
>  #include "page_pool_priv.h"
>
>  static DEFINE_XARRAY_FLAGS(page_pools, XA_FLAGS_ALLOC1);
> +/* Protects: page_pools, netdevice->page_pools, pool->slow.netdev, pool->user.
> + * Ordering: inside rtnl_lock
> + */
>  static DEFINE_MUTEX(page_pools_lock);
>
> +/* Page pools are only reachable from user space (via netlink) if they are
> + * linked to a netdev at creation time. Following page pool "visibility"
> + * states are possible:
> + *  - normal
> + *    - user.list: linked to real netdev, netdev: real netdev
> + *  - orphaned - real netdev has disappeared
> + *    - user.list: linked to lo, netdev: lo
> + *  - invisible - either (a) created without netdev linking, (b) unlisted due
> + *      to error, or (c) the entire namespace which owned this pool disappeared
> + *    - user.list: unhashed, netdev: unknown
> + */
> +
>  int page_pool_list(struct page_pool *pool)
>  {
>         static u32 id_alloc_next;
> @@ -20,6 +37,10 @@ int page_pool_list(struct page_pool *pool)
>         if (err < 0)
>                 goto err_unlock;
>
> +       if (pool->slow.netdev)
> +               hlist_add_head(&pool->user.list,
> +                              &pool->slow.netdev->page_pools);
> +
>         mutex_unlock(&page_pools_lock);
>         return 0;
>
> @@ -32,5 +53,74 @@ void page_pool_unlist(struct page_pool *pool)
>  {
>         mutex_lock(&page_pools_lock);
>         xa_erase(&page_pools, pool->user.id);
> +       hlist_del(&pool->user.list);
>         mutex_unlock(&page_pools_lock);
>  }
> +
> +static void page_pool_unreg_netdev_wipe(struct net_device *netdev)
> +{
> +       struct page_pool *pool;
> +       struct hlist_node *n;
> +
> +       mutex_lock(&page_pools_lock);
> +       hlist_for_each_entry_safe(pool, n, &netdev->page_pools, user.list) {
> +               hlist_del_init(&pool->user.list);
> +               pool->slow.netdev = NET_PTR_POISON;
> +       }
> +       mutex_unlock(&page_pools_lock);
> +}
> +
> +static void page_pool_unreg_netdev(struct net_device *netdev)
> +{
> +       struct page_pool *pool, *last;
> +       struct net_device *lo;
> +
> +       lo = __dev_get_by_index(dev_net(netdev), 1);
> +       if (!lo) {
> +               netdev_err_once(netdev,
> +                               "can't get lo to store orphan page pools\n");
> +               page_pool_unreg_netdev_wipe(netdev);
> +               return;
> +       }
> +
> +       mutex_lock(&page_pools_lock);
> +       last = NULL;
> +       hlist_for_each_entry(pool, &netdev->page_pools, user.list) {
> +               pool->slow.netdev = lo;
> +               last = pool;
> +       }
> +       if (last)
> +               hlist_splice_init(&netdev->page_pools, &last->user.list,
> +                                 &lo->page_pools);
> +       mutex_unlock(&page_pools_lock);
> +}
> +
> +static int
> +page_pool_netdevice_event(struct notifier_block *nb,
> +                         unsigned long event, void *ptr)
> +{
> +       struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
> +
> +       if (event != NETDEV_UNREGISTER)
> +               return NOTIFY_DONE;
> +
> +       if (hlist_empty(&netdev->page_pools))
> +               return NOTIFY_OK;
> +
> +       if (netdev->ifindex != 1)

I'm guessing 1 is _always_ loopback?

> +               page_pool_unreg_netdev(netdev);
> +       else
> +               page_pool_unreg_netdev_wipe(netdev);
> +       return NOTIFY_OK;
> +}
> +
> +static struct notifier_block page_pool_netdevice_nb = {
> +       .notifier_call = page_pool_netdevice_event,
> +};
> +
> +static int __init page_pool_user_init(void)
> +{
> +       return register_netdevice_notifier(&page_pool_netdevice_nb);
> +}
> +
> +subsys_initcall(page_pool_user_init);
> --
> 2.41.0
>


--
Thanks,
Mina

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ