Message-ID: <20240624102018.WYAKspD9@linutronix.de>
Date: Mon, 24 Jun 2024 12:20:18 +0200
From: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
To: Jakub Kicinski <kuba@...nel.org>
Cc: linux-kernel@...r.kernel.org, netdev@...r.kernel.org,
"David S. Miller" <davem@...emloft.net>,
Daniel Bristot de Oliveira <bristot@...nel.org>,
Boqun Feng <boqun.feng@...il.com>,
Daniel Borkmann <daniel@...earbox.net>,
Eric Dumazet <edumazet@...gle.com>,
Frederic Weisbecker <frederic@...nel.org>,
Ingo Molnar <mingo@...hat.com>, Paolo Abeni <pabeni@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Thomas Gleixner <tglx@...utronix.de>,
Waiman Long <longman@...hat.com>, Will Deacon <will@...nel.org>,
Ben Segall <bsegall@...gle.com>,
Daniel Bristot de Oliveira <bristot@...hat.com>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Juri Lelli <juri.lelli@...hat.com>, Mel Gorman <mgorman@...e.de>,
Steven Rostedt <rostedt@...dmis.org>,
Valentin Schneider <vschneid@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>
Subject: Re: [PATCH v9 net-next 08/15] net: softnet_data: Make xmit per task.
On 2024-06-21 19:12:45 [-0700], Jakub Kicinski wrote:
> On Thu, 20 Jun 2024 15:21:58 +0200 Sebastian Andrzej Siewior wrote:
> > +static inline void netdev_xmit_set_more(bool more)
> > +{
> > + current->net_xmit.more = more;
> > +}
> > +
> > +static inline bool netdev_xmit_more(void)
> > +{
> > + return current->net_xmit.more;
> > +}
> > +#endif
> > +
> > +static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
> > + struct sk_buff *skb, struct net_device *dev,
> > + bool more)
> > +{
> > + netdev_xmit_set_more(more);
> > + return ops->ndo_start_xmit(skb, dev);
> > +}
>
> The series looks clean, I'm happy for it to be applied as is.
>
> But I'm curious whether similar helper organization as with the BPF
> code would work. By which I mean - instead of read / write helpers
> for each member can we not have one helper which returns the struct?
> It would be a per-CPU struct on !RT and pointer from current on RT.
> Does it change the generated code? Or stripping the __percpu annotation
> is a PITA?
You are asking for
| #ifndef CONFIG_PREEMPT_RT
| static inline struct netdev_xmit *netdev_get_xmit(void)
| {
| return this_cpu_ptr(&softnet_data.xmit);
| }
| #else
| static inline struct netdev_xmit *netdev_get_xmit(void)
| {
| return &current->net_xmit;
| }
| #endif
on one side so that we can then have
| static inline void dev_xmit_recursion_inc(void)
| {
| netdev_get_xmit()->recursion++;
| }
|
| static inline void dev_xmit_recursion_dec(void)
| {
| netdev_get_xmit()->recursion--;
| }
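The remaining accessors would then follow the same pattern, something
along these lines (just a sketch, not compile tested):
| /* built on top of netdev_get_xmit() from above */
| static inline void netdev_xmit_set_more(bool more)
| {
| netdev_get_xmit()->more = more;
| }
|
| static inline bool netdev_xmit_more(void)
| {
| return netdev_get_xmit()->more;
| }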
This changes the generated code slightly. The inc increases from one
opcode to two; here is the relevant __dev_direct_xmit() snippet:
| addl $512, %gs:pcpu_hot+8(%rip) #, *_45
local_bh_disable();
| incw %gs:softnet_data+120(%rip) # *_44
dev_xmit_recursion_inc();
| testb $16, 185(%rbx) #, dev_24->features
| je .L3310 #,
| movl $16, %r13d #, <retval>
| testb $5, 208(%r12) #, MEM[(const struct netdev_queue *)_54].state
| je .L3290 #,
| movl $512, %esi #,
^ part of local_bh_enable();
| decw %gs:softnet_data+120(%rip) # *_44
dev_xmit_recursion_dec();
| lea 0(%rip), %rdi # __here
| call __local_bh_enable_ip #
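For reference, the single incw/decw in this version comes from the !RT
helpers poking the per-CPU variable directly, i.e. something along the
lines of:
| /* !RT: softnet_data is per-CPU, this folds into a single incw/decw */
| static inline void dev_xmit_recursion_inc(void)
| {
| __this_cpu_inc(softnet_data.xmit.recursion);
| }
|
| static inline void dev_xmit_recursion_dec(void)
| {
| __this_cpu_dec(softnet_data.xmit.recursion);
| }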
With the change mentioned above we get:
| addl $512, %gs:pcpu_hot+8(%rip) #, *_51
local_bh_disable();
| movq %gs:this_cpu_off(%rip), %rax # *_44, tcp_ptr__
| addw $1, softnet_data+120(%rax) #, _48->recursion
two opcodes for dev_xmit_recursion_inc()
| testb $16, 185(%rbx) #, dev_24->features
| je .L3310 #,
| movl $16, %r13d #, <retval>
| testb $5, 208(%r12) #, MEM[(const struct netdev_queue *)_60].state
| je .L3290 #,
| movq %gs:this_cpu_off(%rip), %rax # *_44, tcp_ptr__
first opcode from dev_xmit_recursion_dec()
| movl $512, %esi #,
part of local_bh_enable()
| lea 0(%rip), %rdi # __here
| subw $1, softnet_data+120(%rax) #, _68->recursion
second opcode from dev_xmit_recursion_dec()
| call __local_bh_enable_ip #
So we end up with one additional opcode per usage and I can't tell how
bad that really is. The second invocation (the dec) got interleaved
with the surrounding code, so it might use otherwise idle cycles.
Instead of one optimized per-CPU operation we get two instructions and
the pointer can't be cached between the helpers.
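If caching the pointer is what we are after, the call site would have
to fetch it once itself, roughly like this (hypothetical; on !RT it
relies on BH being disabled so we stay on the CPU, on RT the task
pointer is stable anyway):
| struct netdev_xmit *xmit;
|
| local_bh_disable();
| /* fetch once, reuse for inc and dec */
| xmit = netdev_get_xmit();
| xmit->recursion++;
| /* hand the skb to ndo_start_xmit() etc. */
| xmit->recursion--;
| local_bh_enable();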
And in case you ask, the task based (PREEMPT_RT) version looks like this:
| addl $512, %gs:pcpu_hot+8(%rip) #, *_47
local_bh_disable()
| movq %gs:const_pcpu_hot(%rip), %r14 # const_pcpu_hot.D.2663.D.2661.current_task, _44
| movzwl 2426(%r14), %eax # MEM[(struct netdev_xmit *)_44 + 2426B].recursion, _45
| leal 1(%rax), %edx #, tmp140
| movw %dx, 2426(%r14) # tmp140, MEM[(struct netdev_xmit *)_44 + 2426B].recursion
four opcodes for the inc.
| testb $16, 185(%rbx) #, dev_24->features
| je .L3311 #,
| movl $16, %r13d #, <retval>
| testb $5, 208(%r12) #, MEM[(const struct netdev_queue *)_56].state
| je .L3291 #,
| movw %ax, 2426(%r14) # _45, MEM[(struct netdev_xmit *)_44 + 2426B].recursion
but then gcc recycles the initial value and simply stores it back for
the dec. It only reloads the value and decrements it on the paths where
a function gets called in between.
| movl $512, %esi #,
| lea 0(%rip), %rdi # __here
| call __local_bh_enable_ip #
|
So, do you want the series updated along those lines, or is it fine as is?
Sebastian