lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAEA6p_AyVbAevnUFXpxo4OgNqmHe01qYs9x8jGDgVJwUbAAg4w@mail.gmail.com>
Date: Mon, 19 May 2025 17:34:03 -0700
From: Wei Wang <weiwan@...gle.com>
To: Samiullah Khawaja <skhawaja@...gle.com>
Cc: Jakub Kicinski <kuba@...nel.org>, "David S . Miller" <davem@...emloft.net>, 
	Eric Dumazet <edumazet@...gle.com>, Paolo Abeni <pabeni@...hat.com>, almasrymina@...gle.com, 
	willemb@...gle.com, jdamato@...tly.com, mkarsten@...terloo.ca, 
	netdev@...r.kernel.org
Subject: Re: [PATCH net-next v2] net: stop napi kthreads when THREADED napi is disabled

On Mon, May 19, 2025 at 3:43 PM Samiullah Khawaja <skhawaja@...gle.com> wrote:
>
> Once the THREADED napi is disabled, the napi kthread should also be
> stopped. Keeping the kthread intact after disabling THREADED napi makes
> the PID of this kthread show up in the output of netlink 'napi-get' and
> ps -ef output.
>
> The is discussed in the patch below:
> https://lore.kernel.org/all/20250502191548.559cc416@kernel.org
>
> The napi THREADED state should be unset before stopping the thread. This
> makes sure that this napi will not be scheduled again for threaded
> polling (SCHED_THREADED). In the napi_thread_wait function we need to
> make sure that the SCHED_THREADED is not set before stopping. Once the
> SCHED_THREADED is unset it means that the napi kthread doesn't own this
> napi and it is safe to stop it.
>
> ____napi_schedule for threaded napi is also modified to make sure that
> the STATE_THREADED is not unset while we are setting SCHED_THREADED and
> waking up the napi kthread. If STATE_THREADED is unset while the
> threaded napi is being scheduled, the schedule code will recheck the
> STATE_THREADED to prevent wakeup of a stopped thread. Use try_cmpxchg to
> make sure the value of napi->state is not changed.
>
> Add a new test in nl_netdev to verify this behaviour.
>
> Tested:
>  ./tools/testing/selftests/net/nl_netdev.py
>  TAP version 13
>  1..6
>  ok 1 nl_netdev.empty_check
>  ok 2 nl_netdev.lo_check
>  ok 3 nl_netdev.page_pool_check
>  ok 4 nl_netdev.napi_list_check
>  ok 5 nl_netdev.dev_set_threaded
>  ok 6 nl_netdev.nsim_rxq_reset_down
>  # Totals: pass:6 fail:0 xfail:0 xpass:0 skip:0 error:0
>
> Signed-off-by: Samiullah Khawaja <skhawaja@...gle.com>
> ---
>
> v2:
> - Handle the race where the STATE_THREADED is disabled and kthread is
>   stopped while the ____napi_schedule code has already checked the
>   STATE_THREADED and tries to wakeup napi kthread that is stopped.
>
>  net/core/dev.c                           | 77 +++++++++++++++++++-----
>  tools/testing/selftests/net/nl_netdev.py | 38 +++++++++++-
>  2 files changed, 98 insertions(+), 17 deletions(-)
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index d0563ddff6ca..52d173f5206c 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -4754,15 +4754,18 @@ int weight_p __read_mostly = 64;           /* old backlog weight */
>  int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
>  int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
>
> -/* Called with irq disabled */
> -static inline void ____napi_schedule(struct softnet_data *sd,
> -                                    struct napi_struct *napi)
> +static inline bool ____try_napi_schedule_threaded(struct softnet_data *sd,
> +                                                 struct napi_struct *napi)
>  {
>         struct task_struct *thread;
> +       unsigned long new, val;
>
> -       lockdep_assert_irqs_disabled();
> +       do {
> +               val = READ_ONCE(napi->state);
> +
> +               if (!(val & NAPIF_STATE_THREADED))
> +                       return false;
>
> -       if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
>                 /* Paired with smp_mb__before_atomic() in
>                  * napi_enable()/dev_set_threaded().
>                  * Use READ_ONCE() to guarantee a complete
> @@ -4770,17 +4773,30 @@ static inline void ____napi_schedule(struct softnet_data *sd,
>                  * wake_up_process() when it's not NULL.
>                  */
>                 thread = READ_ONCE(napi->thread);
> -               if (thread) {
> -                       if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi))
> -                               goto use_local_napi;
> +               if (!thread)
> +                       return false;
>
> -                       set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
> -                       wake_up_process(thread);
> -                       return;
> -               }
> -       }
> +               if (use_backlog_threads() &&
> +                   thread == raw_cpu_read(backlog_napi))
> +                       return false;
> +
> +               new = val | NAPIF_STATE_SCHED_THREADED;
> +       } while (!try_cmpxchg(&napi->state, &val, new));
> +
> +       wake_up_process(thread);
> +       return true;
> +}
> +
> +/* Called with irq disabled */
> +static inline void ____napi_schedule(struct softnet_data *sd,
> +                                    struct napi_struct *napi)
> +{
> +       lockdep_assert_irqs_disabled();
> +
> +       /* try to schedule threaded napi if enabled */
> +       if (____try_napi_schedule_threaded(sd, napi))
> +               return;
>
> -use_local_napi:
>         list_add_tail(&napi->poll_list, &sd->poll_list);
>         WRITE_ONCE(napi->list_owner, smp_processor_id());
>         /* If not called from net_rx_action()
> @@ -6888,6 +6904,18 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
>         return HRTIMER_NORESTART;
>  }
>
> +static void dev_stop_napi_threads(struct net_device *dev)
> +{
> +       struct napi_struct *napi;
> +
> +       list_for_each_entry(napi, &dev->napi_list, dev_list) {
> +               if (napi->thread) {
> +                       kthread_stop(napi->thread);
> +                       napi->thread = NULL;
> +               }
> +       }
> +}
> +
>  int dev_set_threaded(struct net_device *dev, bool threaded)
>  {
>         struct napi_struct *napi;
> @@ -6926,6 +6954,15 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
>         list_for_each_entry(napi, &dev->napi_list, dev_list)
>                 assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
>
> +       /* Calling kthread_stop on napi threads should be safe now as the
> +        * threaded state is disabled.
> +        */
> +       if (!threaded) {
> +               /* Make sure the state is set before stopping threads.*/
> +               smp_mb__before_atomic();

This should be smp_mb__after_atomic() right?

> +               dev_stop_napi_threads(dev);
> +       }
> +
>         return err;
>  }
>  EXPORT_SYMBOL(dev_set_threaded);
> @@ -7451,7 +7488,8 @@ static int napi_thread_wait(struct napi_struct *napi)
>  {
>         set_current_state(TASK_INTERRUPTIBLE);
>
> -       while (!kthread_should_stop()) {
> +       /* Wait until we are scheduled or asked to stop. */
> +       while (true) {
>                 /* Testing SCHED_THREADED bit here to make sure the current
>                  * kthread owns this napi and could poll on this napi.
>                  * Testing SCHED bit is not enough because SCHED bit might be
> @@ -7463,6 +7501,15 @@ static int napi_thread_wait(struct napi_struct *napi)
>                         return 0;
>                 }
>
> +               /* Since the SCHED_THREADED is not set so this napi kthread does
> +                * not own this napi and it is safe to stop here. Checking the
> +                * SCHED_THREADED before stopping here makes sure that this napi
> +                * was not scheduled again while napi threaded was being
> +                * disabled.
> +                */
> +               if (kthread_should_stop())
> +                       break;
> +
>                 schedule();
>                 set_current_state(TASK_INTERRUPTIBLE);
>         }
> diff --git a/tools/testing/selftests/net/nl_netdev.py b/tools/testing/selftests/net/nl_netdev.py
> index beaee5e4e2aa..c9109627a741 100755
> --- a/tools/testing/selftests/net/nl_netdev.py
> +++ b/tools/testing/selftests/net/nl_netdev.py
> @@ -2,8 +2,9 @@
>  # SPDX-License-Identifier: GPL-2.0
>
>  import time
> +from os import system
>  from lib.py import ksft_run, ksft_exit, ksft_pr
> -from lib.py import ksft_eq, ksft_ge, ksft_busy_wait
> +from lib.py import ksft_eq, ksft_ge, ksft_ne, ksft_busy_wait
>  from lib.py import NetdevFamily, NetdevSimDev, ip
>
>
> @@ -34,6 +35,39 @@ def napi_list_check(nf) -> None:
>                  ksft_eq(len(napis), 100,
>                          comment=f"queue count after reset queue {q} mode {i}")
>
> +def dev_set_threaded(nf) -> None:
> +    """
> +    Test that verifies various cases of napi threaded
> +    set and unset at device level using sysfs.
> +    """
> +    with NetdevSimDev(queue_count=2) as nsimdev:
> +        nsim = nsimdev.nsims[0]
> +
> +        ip(f"link set dev {nsim.ifname} up")
> +
> +        napis = nf.napi_get({'ifindex': nsim.ifindex}, dump=True)
> +        ksft_eq(len(napis), 2)
> +
> +        napi0_id = napis[0]['id']
> +        napi1_id = napis[1]['id']
> +
> +        # set threaded
> +        system(f"echo 1 > /sys/class/net/{nsim.ifname}/threaded")
> +
> +        # check napi threaded is set for both napis
> +        napi0 = nf.napi_get({'id': napi0_id})
> +        ksft_ne(napi0.get('pid'), None)
> +        napi1 = nf.napi_get({'id': napi1_id})
> +        ksft_ne(napi1.get('pid'), None)
> +
> +        # unset threaded
> +        system(f"echo 0 > /sys/class/net/{nsim.ifname}/threaded")
> +
> +        # check napi threaded is unset for both napis
> +        napi0 = nf.napi_get({'id': napi0_id})
> +        ksft_eq(napi0.get('pid'), None)
> +        napi1 = nf.napi_get({'id': napi1_id})
> +        ksft_eq(napi1.get('pid'), None)
>
>  def nsim_rxq_reset_down(nf) -> None:
>      """
> @@ -122,7 +156,7 @@ def page_pool_check(nf) -> None:
>  def main() -> None:
>      nf = NetdevFamily()
>      ksft_run([empty_check, lo_check, page_pool_check, napi_list_check,
> -              nsim_rxq_reset_down],
> +              dev_set_threaded, nsim_rxq_reset_down],
>               args=(nf, ))
>      ksft_exit()
>
>
> base-commit: b65999e7238e6f2a48dc77c8c2109c48318ff41b
> --
> 2.49.0.1101.gccaa498523-goog
>

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ