netdev - Re: [PATCH v2 net-next 6/8] net: rps: change input_queue_tail_incr

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAL+tcoBa1g1Ps5V_P1TqVtGWD482AvSy=wgvvUMT3RCHH+x2=Q@mail.gmail.com>
Date: Sat, 30 Mar 2024 22:46:55 +0800
From: Jason Xing <kerneljasonxing@...il.com>
To: Eric Dumazet <edumazet@...gle.com>
Cc: "David S . Miller" <davem@...emloft.net>, Jakub Kicinski <kuba@...nel.org>, 
	Paolo Abeni <pabeni@...hat.com>, netdev@...r.kernel.org, eric.dumazet@...il.com
Subject: Re: [PATCH v2 net-next 6/8] net: rps: change input_queue_tail_incr_save()

Hello Eric,

On Fri, Mar 29, 2024 at 11:43 PM Eric Dumazet <edumazet@...gle.com> wrote:
>
> input_queue_tail_incr_save() is incrementing the sd queue_tail
> and save it in the flow last_qtail.
>
> Two issues here :
>
> - no lock protects the write on last_qtail, we should use appropriate
>   annotations.
>
> - We can perform this write after releasing the per-cpu backlog lock,
>   to decrease this lock hold duration (move away the cache line miss)
>
> Also move input_queue_head_incr() and rps helpers to include/net/rps.h,
> while adding rps_ prefix to better reflect their role.
>
> v2: Fixed a build issue (Jakub and kernel build bots)
>
> Signed-off-by: Eric Dumazet <edumazet@...gle.com>
> ---
>  include/linux/netdevice.h | 15 ---------------
>  include/net/rps.h         | 23 +++++++++++++++++++++++
>  net/core/dev.c            | 20 ++++++++++++--------
>  3 files changed, 35 insertions(+), 23 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 1c31cd2691d32064613836141fbdeeebc831b21f..14f19cc2616452d7e6afbbaa52f8ad3e61a419e9 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -3249,21 +3249,6 @@ struct softnet_data {
>         call_single_data_t      defer_csd;
>  };
>
> -static inline void input_queue_head_incr(struct softnet_data *sd)
> -{
> -#ifdef CONFIG_RPS
> -       sd->input_queue_head++;
> -#endif
> -}
> -
> -static inline void input_queue_tail_incr_save(struct softnet_data *sd,
> -                                             unsigned int *qtail)
> -{
> -#ifdef CONFIG_RPS
> -       *qtail = ++sd->input_queue_tail;
> -#endif
> -}
> -
>  DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
>
>  static inline int dev_recursion_level(void)
> diff --git a/include/net/rps.h b/include/net/rps.h
> index 7660243e905b92651a41292e04caf72c5f12f26e..10ca25731c1ef766715fe7ee415ad0b71ec643a8 100644
> --- a/include/net/rps.h
> +++ b/include/net/rps.h
> @@ -122,4 +122,27 @@ static inline void sock_rps_record_flow(const struct sock *sk)
>  #endif
>  }
>
> +static inline u32 rps_input_queue_tail_incr(struct softnet_data *sd)
> +{
> +#ifdef CONFIG_RPS
> +       return ++sd->input_queue_tail;
> +#else
> +       return 0;
> +#endif
> +}
> +
> +static inline void rps_input_queue_tail_save(u32 *dest, u32 tail)
> +{
> +#ifdef CONFIG_RPS
> +       WRITE_ONCE(*dest, tail);
> +#endif
> +}

I wonder if we should also call this new helper to WRITE_ONCE
last_qtail in the set_rps_cpu()?

Thanks,
Jason

> +
> +static inline void rps_input_queue_head_incr(struct softnet_data *sd)
> +{
> +#ifdef CONFIG_RPS
> +       sd->input_queue_head++;
> +#endif
> +}
> +
>  #endif /* _NET_RPS_H */
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 0a8ccb0451c30a39f8f8b45d26b7e5548b8bfba4..79073bbc9a644049cacf8433310f4641745049e9 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -4611,7 +4611,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
>                 if (unlikely(tcpu != next_cpu) &&
>                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
>                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
> -                     rflow->last_qtail)) >= 0)) {
> +                     READ_ONCE(rflow->last_qtail))) >= 0)) {
>                         tcpu = next_cpu;
>                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
>                 }
> @@ -4666,7 +4666,7 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
>                 cpu = READ_ONCE(rflow->cpu);
>                 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
>                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
> -                          rflow->last_qtail) <
> +                          READ_ONCE(rflow->last_qtail)) <
>                      (int)(10 * flow_table->mask)))
>                         expire = false;
>         }
> @@ -4801,6 +4801,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
>         unsigned long flags;
>         unsigned int qlen;
>         int max_backlog;
> +       u32 tail;
>
>         reason = SKB_DROP_REASON_DEV_READY;
>         if (!netif_running(skb->dev))
> @@ -4825,8 +4826,11 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
>                                 napi_schedule_rps(sd);
>                 }
>                 __skb_queue_tail(&sd->input_pkt_queue, skb);
> -               input_queue_tail_incr_save(sd, qtail);
> +               tail = rps_input_queue_tail_incr(sd);
>                 backlog_unlock_irq_restore(sd, &flags);
> +
> +               /* save the tail outside of the critical section */
> +               rps_input_queue_tail_save(qtail, tail);
>                 return NET_RX_SUCCESS;
>         }
>
> @@ -5904,7 +5908,7 @@ static void flush_backlog(struct work_struct *work)
>                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
>                         __skb_unlink(skb, &sd->input_pkt_queue);
>                         dev_kfree_skb_irq(skb);
> -                       input_queue_head_incr(sd);
> +                       rps_input_queue_head_incr(sd);
>                 }
>         }
>         backlog_unlock_irq_enable(sd);
> @@ -5913,7 +5917,7 @@ static void flush_backlog(struct work_struct *work)
>                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
>                         __skb_unlink(skb, &sd->process_queue);
>                         kfree_skb(skb);
> -                       input_queue_head_incr(sd);
> +                       rps_input_queue_head_incr(sd);
>                 }
>         }
>         local_bh_enable();
> @@ -6041,7 +6045,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
>                         rcu_read_lock();
>                         __netif_receive_skb(skb);
>                         rcu_read_unlock();
> -                       input_queue_head_incr(sd);
> +                       rps_input_queue_head_incr(sd);
>                         if (++work >= quota)
>                                 return work;
>
> @@ -11455,11 +11459,11 @@ static int dev_cpu_dead(unsigned int oldcpu)
>         /* Process offline CPU's input_pkt_queue */
>         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
>                 netif_rx(skb);
> -               input_queue_head_incr(oldsd);
> +               rps_input_queue_head_incr(oldsd);
>         }
>         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
>                 netif_rx(skb);
> -               input_queue_head_incr(oldsd);
> +               rps_input_queue_head_incr(oldsd);
>         }
>
>         return 0;
> --
> 2.44.0.478.gd926399ef9-goog
>
>