linux-kernel - Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAC_iWjKAEgUB8Z3WNNVgUK8omXD+nwt_VPSVyFn1i4EQzJadog@mail.gmail.com>
Date:   Tue, 24 Jan 2023 16:11:43 +0200
From:   Ilias Apalodimas <ilias.apalodimas@...aro.org>
To:     Felix Fietkau <nbd@....name>
Cc:     netdev@...r.kernel.org, Jesper Dangaard Brouer <hawk@...nel.org>,
        "David S. Miller" <davem@...emloft.net>,
        Eric Dumazet <edumazet@...gle.com>,
        Jakub Kicinski <kuba@...nel.org>,
        Paolo Abeni <pabeni@...hat.com>,
        Lorenzo Bianconi <lorenzo@...nel.org>,
        linux-kernel@...r.kernel.org,
        Alexander Duyck <alexander.duyck@...il.com>,
        Yunsheng Lin <linyunsheng@...wei.com>
Subject: Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation

Hi Felix,

++cc Alexander and Yunsheng.

Thanks for the report

On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@....name> wrote:
>
> While testing fragmented page_pool allocation in the mt76 driver, I was able
> to reliably trigger page refcount underflow issues, which did not occur with
> full-page page_pool allocation.
> It appears to me, that handling refcounting in two separate counters
> (page->pp_frag_count and page refcount) is racy when page refcount gets
> incremented by code dealing with skb fragments directly, and
> page_pool_return_skb_page is called multiple times for the same fragment.
>
> Dropping page->pp_frag_count and relying entirely on the page refcount makes
> these underflow issues and crashes go away.
>

This has been discussed here [1].  TL;DR changing this to page
refcount might blow up in other colorful ways.  Can we look closer and
figure out why the underflow happens?

[1] https://lore.kernel.org/netdev/1625903002-31619-4-git-send-email-linyunsheng@huawei.com/

Thanks
/Ilias

> Cc: Lorenzo Bianconi <lorenzo@...nel.org>
> Signed-off-by: Felix Fietkau <nbd@....name>
> ---
>  include/linux/mm_types.h | 17 +++++------------
>  include/net/page_pool.h  | 19 ++++---------------
>  net/core/page_pool.c     | 12 ++++--------
>  3 files changed, 13 insertions(+), 35 deletions(-)
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 9757067c3053..96ec3b19a86d 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -125,18 +125,11 @@ struct page {
>                         struct page_pool *pp;
>                         unsigned long _pp_mapping_pad;
>                         unsigned long dma_addr;
> -                       union {
> -                               /**
> -                                * dma_addr_upper: might require a 64-bit
> -                                * value on 32-bit architectures.
> -                                */
> -                               unsigned long dma_addr_upper;
> -                               /**
> -                                * For frag page support, not supported in
> -                                * 32-bit architectures with 64-bit DMA.
> -                                */
> -                               atomic_long_t pp_frag_count;
> -                       };
> +                       /**
> +                        * dma_addr_upper: might require a 64-bit
> +                        * value on 32-bit architectures.
> +                        */
> +                       unsigned long dma_addr_upper;
>                 };
>                 struct {        /* Tail pages of compound page */
>                         unsigned long compound_head;    /* Bit zero is set */
> diff --git a/include/net/page_pool.h b/include/net/page_pool.h
> index 813c93499f20..28e1fdbdcd53 100644
> --- a/include/net/page_pool.h
> +++ b/include/net/page_pool.h
> @@ -279,14 +279,14 @@ void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
>
>  static inline void page_pool_fragment_page(struct page *page, long nr)
>  {
> -       atomic_long_set(&page->pp_frag_count, nr);
> +       page_ref_add(page, nr);
>  }
>
>  static inline long page_pool_defrag_page(struct page *page, long nr)
>  {
>         long ret;
>
> -       /* If nr == pp_frag_count then we have cleared all remaining
> +       /* If nr == page_ref_count then we have cleared all remaining
>          * references to the page. No need to actually overwrite it, instead
>          * we can leave this to be overwritten by the calling function.
>          *
> @@ -295,22 +295,14 @@ static inline long page_pool_defrag_page(struct page *page, long nr)
>          * especially when dealing with a page that may be partitioned
>          * into only 2 or 3 pieces.
>          */
> -       if (atomic_long_read(&page->pp_frag_count) == nr)
> +       if (page_ref_count(page) == nr)
>                 return 0;
>
> -       ret = atomic_long_sub_return(nr, &page->pp_frag_count);
> +       ret = page_ref_sub_return(page, nr);
>         WARN_ON(ret < 0);
>         return ret;
>  }
>
> -static inline bool page_pool_is_last_frag(struct page_pool *pool,
> -                                         struct page *page)
> -{
> -       /* If fragments aren't enabled or count is 0 we were the last user */
> -       return !(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
> -              (page_pool_defrag_page(page, 1) == 0);
> -}
> -
>  static inline void page_pool_put_page(struct page_pool *pool,
>                                       struct page *page,
>                                       unsigned int dma_sync_size,
> @@ -320,9 +312,6 @@ static inline void page_pool_put_page(struct page_pool *pool,
>          * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
>          */
>  #ifdef CONFIG_PAGE_POOL
> -       if (!page_pool_is_last_frag(pool, page))
> -               return;
> -
>         page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct);
>  #endif
>  }
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 9b203d8660e4..0defcadae225 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -25,7 +25,7 @@
>  #define DEFER_TIME (msecs_to_jiffies(1000))
>  #define DEFER_WARN_INTERVAL (60 * HZ)
>
> -#define BIAS_MAX       LONG_MAX
> +#define BIAS_MAX(pool) (PAGE_SIZE << ((pool)->p.order))
>
>  #ifdef CONFIG_PAGE_POOL_STATS
>  /* alloc_stat_inc is intended to be used in softirq context */
> @@ -619,10 +619,6 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
>         for (i = 0; i < count; i++) {
>                 struct page *page = virt_to_head_page(data[i]);
>
> -               /* It is not the last user for the page frag case */
> -               if (!page_pool_is_last_frag(pool, page))
> -                       continue;
> -
>                 page = __page_pool_put_page(pool, page, -1, false);
>                 /* Approved for bulk recycling in ptr_ring cache */
>                 if (page)
> @@ -659,7 +655,7 @@ EXPORT_SYMBOL(page_pool_put_page_bulk);
>  static struct page *page_pool_drain_frag(struct page_pool *pool,
>                                          struct page *page)
>  {
> -       long drain_count = BIAS_MAX - pool->frag_users;
> +       long drain_count = BIAS_MAX(pool) - pool->frag_users;
>
>         /* Some user is still using the page frag */
>         if (likely(page_pool_defrag_page(page, drain_count)))
> @@ -678,7 +674,7 @@ static struct page *page_pool_drain_frag(struct page_pool *pool,
>
>  static void page_pool_free_frag(struct page_pool *pool)
>  {
> -       long drain_count = BIAS_MAX - pool->frag_users;
> +       long drain_count = BIAS_MAX(pool) - pool->frag_users;
>         struct page *page = pool->frag_page;
>
>         pool->frag_page = NULL;
> @@ -724,7 +720,7 @@ struct page *page_pool_alloc_frag(struct page_pool *pool,
>                 pool->frag_users = 1;
>                 *offset = 0;
>                 pool->frag_offset = size;
> -               page_pool_fragment_page(page, BIAS_MAX);
> +               page_pool_fragment_page(page, BIAS_MAX(pool));
>                 return page;
>         }
>
> --
> 2.39.0
>