Message-ID: <CAHS8izOMudLgV2HstD4O1HUL1A35UT58os3n-boMzs=rW4wJAg@mail.gmail.com>
Date: Fri, 29 Mar 2024 12:21:15 -0700
From: Mina Almasry <almasrymina@...gle.com>
To: Alexander Lobakin <aleksander.lobakin@...el.com>
Cc: "David S. Miller" <davem@...emloft.net>, Eric Dumazet <edumazet@...gle.com>, 
	Jakub Kicinski <kuba@...nel.org>, Paolo Abeni <pabeni@...hat.com>, 
	Lorenzo Bianconi <lorenzo@...nel.org>, Toke Høiland-Jørgensen <toke@...hat.com>, 
	nex.sw.ncis.osdt.itp.upstreaming@...el.com, netdev@...r.kernel.org, 
	linux-kernel@...r.kernel.org
Subject: Re: [PATCH net-next 1/2] page_pool: check for PP direct cache
 locality later

On Fri, Mar 29, 2024 at 9:56 AM Alexander Lobakin
<aleksander.lobakin@...el.com> wrote:
>
> Since we have pool->p.napi (Jakub) and pool->cpuid (Lorenzo) to check
> whether it's safe to use direct recycling, we can apply both checks
> globally for every page instead of relying solely on the @allow_direct
> argument.
> Let's assume that @allow_direct means "I'm sure it's local, don't waste
> time rechecking this"; when it's false, fall back to those two checks
> to still recycle the page directly. If neither holds, we lose a few
> CPU cycles, but such a path is surely not hot. On the other hand,
> paths where the direct cache can be used, but @allow_direct cannot
> safely be set, benefit from this change.
> The whole propagation of @napi_safe through a dozen skb freeing
> functions can now go away, which saves us some stack space.
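
To restate the new contract as I read it (illustrative calls only, not
taken from the patch):

	/* Driver hotpath, inside its own NAPI poll: locality is certain,
	 * so assert it and skip the rechecks.
	 */
	page_pool_put_full_page(pool, page, true);

	/* Generic or unknown context: pass false and let page_pool itself
	 * recheck pool->cpuid / pool->p.napi->list_owner before touching
	 * the direct cache.
	 */
	page_pool_put_full_page(pool, page, false);
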
>
> Signed-off-by: Alexander Lobakin <aleksander.lobakin@...el.com>
> ---
>  include/linux/skbuff.h | 12 ++++----
>  net/core/page_pool.c   | 31 +++++++++++++++++--
>  net/core/skbuff.c      | 70 +++++++++++++-----------------------------
>  net/ipv4/esp4.c        |  2 +-
>  net/ipv6/esp6.c        |  2 +-
>  5 files changed, 58 insertions(+), 59 deletions(-)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index dadd3f55d549..f7f6e42c6814 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -3515,25 +3515,25 @@ int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
>                     unsigned int headroom);
>  int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
>                          struct bpf_prog *prog);
> -bool napi_pp_put_page(struct page *page, bool napi_safe);
> +bool napi_pp_put_page(struct page *page);
>
>  static inline void
> -skb_page_unref(const struct sk_buff *skb, struct page *page, bool napi_safe)
> +skb_page_unref(const struct sk_buff *skb, struct page *page)
>  {
>  #ifdef CONFIG_PAGE_POOL
> -       if (skb->pp_recycle && napi_pp_put_page(page, napi_safe))
> +       if (skb->pp_recycle && napi_pp_put_page(page))
>                 return;
>  #endif
>         put_page(page);
>  }
>
>  static inline void
> -napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
> +napi_frag_unref(skb_frag_t *frag, bool recycle)
>  {
>         struct page *page = skb_frag_page(frag);
>
>  #ifdef CONFIG_PAGE_POOL
> -       if (recycle && napi_pp_put_page(page, napi_safe))
> +       if (recycle && napi_pp_put_page(page))
>                 return;
>  #endif
>         put_page(page);
> @@ -3549,7 +3549,7 @@ napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
>   */
>  static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
>  {
> -       napi_frag_unref(frag, recycle, false);
> +       napi_frag_unref(frag, recycle);
>  }
>
>  /**
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index dd364d738c00..9d56257e444b 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -690,8 +690,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
>                         page_pool_dma_sync_for_device(pool, page,
>                                                       dma_sync_size);
>
> -               if (allow_direct && in_softirq() &&
> -                   page_pool_recycle_in_cache(page, pool))
> +               if (allow_direct && page_pool_recycle_in_cache(page, pool))
>                         return NULL;
>
>                 /* Page found as candidate for recycling */
> @@ -716,9 +715,35 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
>         return NULL;
>  }
>
> +static bool page_pool_napi_local(const struct page_pool *pool)
> +{
> +       const struct napi_struct *napi;
> +       u32 cpuid;
> +
> +       if (unlikely(!in_softirq()))
> +               return false;
> +
> +       /* Allow direct recycle if we have reasons to believe that we are
> +        * in the same context as the consumer would run, so there's
> +        * no possible race.
> +        * __page_pool_put_page() makes sure we're not in hardirq context
> +        * and interrupts are enabled prior to accessing the cache.
> +        */
> +       cpuid = smp_processor_id();
> +       if (READ_ONCE(pool->cpuid) == cpuid)
> +               return true;
> +
> +       napi = READ_ONCE(pool->p.napi);
> +
> +       return napi && READ_ONCE(napi->list_owner) == cpuid;
> +}
> +
>  void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
>                                 unsigned int dma_sync_size, bool allow_direct)
>  {
> +       if (!allow_direct)
> +               allow_direct = page_pool_napi_local(pool);
> +
>         page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
>         if (page && !page_pool_recycle_in_ring(pool, page)) {
>                 /* Cache full, fallback to free pages */
> @@ -969,7 +994,7 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
>  static void page_pool_disable_direct_recycling(struct page_pool *pool)
>  {
>         /* Disable direct recycling based on pool->cpuid.
> -        * Paired with READ_ONCE() in napi_pp_put_page().
> +        * Paired with READ_ONCE() in page_pool_napi_local().
>          */
>         WRITE_ONCE(pool->cpuid, -1);
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index e0e172638c0a..e01e2a618621 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -1005,11 +1005,8 @@ int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
>  EXPORT_SYMBOL(skb_cow_data_for_xdp);
>
>  #if IS_ENABLED(CONFIG_PAGE_POOL)
> -bool napi_pp_put_page(struct page *page, bool napi_safe)
> +bool napi_pp_put_page(struct page *page)
>  {
> -       bool allow_direct = false;
> -       struct page_pool *pp;
> -
>         page = compound_head(page);
>
>         /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
> @@ -1022,39 +1019,18 @@ bool napi_pp_put_page(struct page *page, bool napi_safe)
>         if (unlikely(!is_pp_page(page)))
>                 return false;
>
> -       pp = page->pp;
> -
> -       /* Allow direct recycle if we have reasons to believe that we are
> -        * in the same context as the consumer would run, so there's
> -        * no possible race.
> -        * __page_pool_put_page() makes sure we're not in hardirq context
> -        * and interrupts are enabled prior to accessing the cache.
> -        */
> -       if (napi_safe || in_softirq()) {
> -               const struct napi_struct *napi = READ_ONCE(pp->p.napi);
> -               unsigned int cpuid = smp_processor_id();
> -
> -               allow_direct = napi && READ_ONCE(napi->list_owner) == cpuid;
> -               allow_direct |= READ_ONCE(pp->cpuid) == cpuid;
> -       }
> -
> -       /* Driver set this to memory recycling info. Reset it on recycle.
> -        * This will *not* work for NIC using a split-page memory model.
> -        * The page will be returned to the pool here regardless of the
> -        * 'flipped' fragment being in use or not.
> -        */
> -       page_pool_put_full_page(pp, page, allow_direct);
> +       page_pool_put_full_page(page->pp, page, false);
>
>         return true;
>  }
>  EXPORT_SYMBOL(napi_pp_put_page);
>  #endif
>
> -static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
> +static bool skb_pp_recycle(struct sk_buff *skb, void *data)
>  {
>         if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
>                 return false;
> -       return napi_pp_put_page(virt_to_page(data), napi_safe);
> +       return napi_pp_put_page(virt_to_page(data));
>  }
>
>  /**
> @@ -1096,12 +1072,12 @@ static void skb_kfree_head(void *head, unsigned int end_offset)
>                 kfree(head);
>  }
>
> -static void skb_free_head(struct sk_buff *skb, bool napi_safe)
> +static void skb_free_head(struct sk_buff *skb)
>  {
>         unsigned char *head = skb->head;
>
>         if (skb->head_frag) {
> -               if (skb_pp_recycle(skb, head, napi_safe))
> +               if (skb_pp_recycle(skb, head))
>                         return;
>                 skb_free_frag(head);
>         } else {
> @@ -1109,8 +1085,7 @@ static void skb_free_head(struct sk_buff *skb, bool napi_safe)
>         }
>  }
>
> -static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
> -                            bool napi_safe)
> +static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
>  {
>         struct skb_shared_info *shinfo = skb_shinfo(skb);
>         int i;
> @@ -1127,13 +1102,13 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,

skb_release_data() has a napi_safe argument. Do you want to remove that
as well? To my eye it looks like dead code now that the value is no
longer passed to napi_frag_unref() or skb_free_head().
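
If it can go, the end state I'd expect is roughly (untested sketch,
with the unrelated parts of the function elided):

	static void skb_release_data(struct sk_buff *skb,
				     enum skb_drop_reason reason)
	{
		struct skb_shared_info *shinfo = skb_shinfo(skb);
		int i;
		...
		for (i = 0; i < shinfo->nr_frags; i++)
			napi_frag_unref(&shinfo->frags[i], skb->pp_recycle);
		...
		skb_free_head(skb);
		...
	}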

After this change, __napi_kfree_skb(), which previously set napi_safe
to true, can no longer do so. Is my understanding correct that this
should still work because we now check pool->p.napi (and pool->cpuid)
in the page_pool code?
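
For my own understanding, the locality check after this patch reduces
to roughly the following (my reading of the hunks above, not tested;
direct_ok() is just an illustrative name, the real code splits this
between page_pool_put_unrefed_page() and page_pool_napi_local()):

	static bool direct_ok(const struct page_pool *pool, bool allow_direct)
	{
		const struct napi_struct *napi;
		u32 cpuid;

		/* Caller asserted locality: trust it. */
		if (allow_direct)
			return true;

		/* Outside softirq the lockless cache is off limits. */
		if (!in_softirq())
			return false;

		/* Pool bound to this CPU via pool->cpuid. */
		cpuid = smp_processor_id();
		if (READ_ONCE(pool->cpuid) == cpuid)
			return true;

		/* Or the pool's NAPI instance is running on this CPU. */
		napi = READ_ONCE(pool->p.napi);
		return napi && READ_ONCE(napi->list_owner) == cpuid;
	}

If that's right, __napi_kfree_skb() always runs in softirq on the CPU
that owns the NAPI instance, so it should still take the direct path
via the napi->list_owner check even without napi_safe.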

>         }
>
>         for (i = 0; i < shinfo->nr_frags; i++)
> -               napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);
> +               napi_frag_unref(&shinfo->frags[i], skb->pp_recycle);
>
>  free_head:
>         if (shinfo->frag_list)
>                 kfree_skb_list_reason(shinfo->frag_list, reason);
>
> -       skb_free_head(skb, napi_safe);
> +       skb_free_head(skb);
>  exit:
>         /* When we clone an SKB we copy the recycling bit. The pp_recycle
>          * bit is only set on the head though, so in order to avoid races
> @@ -1194,12 +1169,11 @@ void skb_release_head_state(struct sk_buff *skb)
>  }
>
>  /* Free everything but the sk_buff shell. */
> -static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
> -                           bool napi_safe)
> +static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
>  {
>         skb_release_head_state(skb);
>         if (likely(skb->head))
> -               skb_release_data(skb, reason, napi_safe);
> +               skb_release_data(skb, reason);
>  }
>
>  /**
> @@ -1213,7 +1187,7 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
>
>  void __kfree_skb(struct sk_buff *skb)
>  {
> -       skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false);
> +       skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
>         kfree_skbmem(skb);
>  }
>  EXPORT_SYMBOL(__kfree_skb);
> @@ -1270,7 +1244,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb,
>                 return;
>         }
>
> -       skb_release_all(skb, reason, false);
> +       skb_release_all(skb, reason);
>         sa->skb_array[sa->skb_count++] = skb;
>
>         if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
> @@ -1444,7 +1418,7 @@ EXPORT_SYMBOL(consume_skb);
>  void __consume_stateless_skb(struct sk_buff *skb)
>  {
>         trace_consume_skb(skb, __builtin_return_address(0));
> -       skb_release_data(skb, SKB_CONSUMED, false);
> +       skb_release_data(skb, SKB_CONSUMED);
>         kfree_skbmem(skb);
>  }
>
> @@ -1471,7 +1445,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)
>
>  void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)
>  {
> -       skb_release_all(skb, reason, true);
> +       skb_release_all(skb, reason);
>         napi_skb_cache_put(skb);
>  }
>
> @@ -1509,7 +1483,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
>                 return;
>         }
>
> -       skb_release_all(skb, SKB_CONSUMED, !!budget);
> +       skb_release_all(skb, SKB_CONSUMED);
>         napi_skb_cache_put(skb);
>  }
>  EXPORT_SYMBOL(napi_consume_skb);
> @@ -1640,7 +1614,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
>   */
>  struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
>  {
> -       skb_release_all(dst, SKB_CONSUMED, false);
> +       skb_release_all(dst, SKB_CONSUMED);
>         return __skb_clone(dst, src);
>  }
>  EXPORT_SYMBOL_GPL(skb_morph);
> @@ -2272,9 +2246,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
>                 if (skb_has_frag_list(skb))
>                         skb_clone_fraglist(skb);
>
> -               skb_release_data(skb, SKB_CONSUMED, false);
> +               skb_release_data(skb, SKB_CONSUMED);
>         } else {
> -               skb_free_head(skb, false);
> +               skb_free_head(skb);
>         }
>         off = (data + nhead) - skb->head;
>
> @@ -6575,12 +6549,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
>                         skb_frag_ref(skb, i);
>                 if (skb_has_frag_list(skb))
>                         skb_clone_fraglist(skb);
> -               skb_release_data(skb, SKB_CONSUMED, false);
> +               skb_release_data(skb, SKB_CONSUMED);
>         } else {
>                 /* we can reuse existing refcount - all we did was
>                  * relocate values
>                  */
> -               skb_free_head(skb, false);
> +               skb_free_head(skb);
>         }
>
>         skb->head = data;
> @@ -6715,7 +6689,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
>                 skb_kfree_head(data, size);
>                 return -ENOMEM;
>         }
> -       skb_release_data(skb, SKB_CONSUMED, false);
> +       skb_release_data(skb, SKB_CONSUMED);
>
>         skb->head = data;
>         skb->head_frag = 0;
> diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
> index d33d12421814..3d647c9a7a21 100644
> --- a/net/ipv4/esp4.c
> +++ b/net/ipv4/esp4.c
> @@ -114,7 +114,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
>          */
>         if (req->src != req->dst)
>                 for (sg = sg_next(req->src); sg; sg = sg_next(sg))
> -                       skb_page_unref(skb, sg_page(sg), false);
> +                       skb_page_unref(skb, sg_page(sg));
>  }
>
>  #ifdef CONFIG_INET_ESPINTCP
> diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
> index 7371886d4f9f..fe8d53f5a5ee 100644
> --- a/net/ipv6/esp6.c
> +++ b/net/ipv6/esp6.c
> @@ -131,7 +131,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
>          */
>         if (req->src != req->dst)
>                 for (sg = sg_next(req->src); sg; sg = sg_next(sg))
> -                       skb_page_unref(skb, sg_page(sg), false);
> +                       skb_page_unref(skb, sg_page(sg));
>  }
>
>  #ifdef CONFIG_INET6_ESPINTCP
> --
> 2.44.0
>
>


-- 
Thanks,
Mina
