Message-ID: <CANn89iKesUZB=R04Xqp4HL82v3oka6JMDLmZdenv-N_m+sXVvw@mail.gmail.com>
Date: Fri, 30 Sep 2022 14:36:37 -0700
From: Eric Dumazet <edumazet@...gle.com>
To: Paolo Abeni <pabeni@...hat.com>
Cc: Eric Dumazet <eric.dumazet@...il.com>,
"David S . Miller" <davem@...emloft.net>,
Jakub Kicinski <kuba@...nel.org>,
netdev <netdev@...r.kernel.org>, Coco Li <lixiaoyan@...gle.com>
Subject: Re: [PATCH net-next] gro: add support of (hw)gro packets to gro stack
On Fri, Sep 30, 2022 at 2:02 PM Paolo Abeni <pabeni@...hat.com> wrote:
>
> On Fri, 2022-09-30 at 13:19 -0700, Eric Dumazet wrote:
> > On Fri, Sep 30, 2022 at 1:00 PM Eric Dumazet <edumazet@...gle.com> wrote:
> > >
> > > On Fri, Sep 30, 2022 at 12:53 PM Eric Dumazet <edumazet@...gle.com> wrote:
> > > >
> > > > On Fri, Sep 30, 2022 at 1:45 AM Paolo Abeni <pabeni@...hat.com> wrote:
> > > > >
> > > > > On Thu, 2022-09-29 at 18:44 -0700, Eric Dumazet wrote:
> > > > > > From: Coco Li <lixiaoyan@...gle.com>
> > > > > >
> > > > > > The current GRO stack only supports incoming packets containing
> > > > > > a single frame/MSS.
> > > > > >
> > > > > > This patch changes GRO to accept packets that are already GRO.
> > > > > >
> > > > > > HW-GRO (aka RSC for some vendors) is very often limited in the presence
> > > > > > of interleaved packets. The Linux SW GRO stack can complete the job
> > > > > > and provide larger GRO packets, thus reducing the rate of ACK packets
> > > > > > and cpu overhead.
> > > > > >
> > > > > > This also means BIG TCP can be used, even if HW-GRO/RSC was
> > > > > > able to cook ~64 KB GRO packets.
> > > > > >
> > > > > > Co-developed-by: Eric Dumazet <edumazet@...gle.com>
> > > > > > Signed-off-by: Eric Dumazet <edumazet@...gle.com>
> > > > > > Signed-off-by: Coco Li <lixiaoyan@...gle.com>
> > > > > > ---
> > > > > > net/core/gro.c | 13 +++++++++----
> > > > > > net/ipv4/tcp_offload.c | 7 ++++++-
> > > > > > 2 files changed, 15 insertions(+), 5 deletions(-)
> > > > > >
> > > > > > diff --git a/net/core/gro.c b/net/core/gro.c
> > > > > > index b4190eb084672fb4f2be8b437eccb4e8507ff63f..d8e159c4bdf553508cd123bee4f5251908ede9fe 100644
> > > > > > --- a/net/core/gro.c
> > > > > > +++ b/net/core/gro.c
> > > > > > @@ -160,6 +160,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
> > > > > > unsigned int gro_max_size;
> > > > > > unsigned int new_truesize;
> > > > > > struct sk_buff *lp;
> > > > > > + int segs;
> > > > > >
> > > > > > /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
> > > > > > gro_max_size = READ_ONCE(p->dev->gro_max_size);
> > > > > > @@ -175,6 +176,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
> > > > > > return -E2BIG;
> > > > > > }
> > > > > >
> > > > > > + segs = NAPI_GRO_CB(skb)->count;
> > > > > > lp = NAPI_GRO_CB(p)->last;
> > > > > > pinfo = skb_shinfo(lp);
> > > > > >
> > > > > > @@ -265,7 +267,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
> > > > > > lp = p;
> > > > > >
> > > > > > done:
> > > > > > - NAPI_GRO_CB(p)->count++;
> > > > > > + NAPI_GRO_CB(p)->count += segs;
> > > > > > p->data_len += len;
> > > > > > p->truesize += delta_truesize;
> > > > > > p->len += len;
> > > > > > @@ -496,8 +498,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> > > > > > BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed),
> > > > > > sizeof(u32))); /* Avoid slow unaligned acc */
> > > > > > *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
> > > > > > - NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
> > > > > > + NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb);
> > > > > > NAPI_GRO_CB(skb)->is_atomic = 1;
> > > > > > + NAPI_GRO_CB(skb)->count = max_t(u16, 1,
> > > > > > + skb_shinfo(skb)->gso_segs);
> > > > > >
> > > > > > /* Setup for GRO checksum validation */
> > > > > > switch (skb->ip_summed) {
> > > > > > @@ -545,10 +549,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> > > > > > else
> > > > > > gro_list->count++;
> > > > > >
> > > > > > - NAPI_GRO_CB(skb)->count = 1;
> > > > > > NAPI_GRO_CB(skb)->age = jiffies;
> > > > > > NAPI_GRO_CB(skb)->last = skb;
> > > > > > - skb_shinfo(skb)->gso_size = skb_gro_len(skb);
> > > > > > + if (!skb_is_gso(skb))
> > > > > > + skb_shinfo(skb)->gso_size = skb_gro_len(skb);
> > > > > > list_add(&skb->list, &gro_list->list);
> > > > > > ret = GRO_HELD;
> > > > > >
> > > > > > @@ -660,6 +664,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
> > > > > >
> > > > > > skb->encapsulation = 0;
> > > > > > skb_shinfo(skb)->gso_type = 0;
> > > > > > + skb_shinfo(skb)->gso_size = 0;
> > > > > > if (unlikely(skb->slow_gro)) {
> > > > > > skb_orphan(skb);
> > > > > > skb_ext_reset(skb);
> > > > > > diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
> > > > > > index a844a0d38482d916251f3aca4555c75c9770820c..0223bbfe9568064b47bc6227d342a4d25c9edfa7 100644
> > > > > > --- a/net/ipv4/tcp_offload.c
> > > > > > +++ b/net/ipv4/tcp_offload.c
> > > > > > @@ -255,7 +255,12 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
> > > > > >
> > > > > > mss = skb_shinfo(p)->gso_size;
> > > > > >
> > > > > > - flush |= (len - 1) >= mss;
> > > > > > + if (skb_is_gso(skb)) {
> > > > > > + flush |= (mss != skb_shinfo(skb)->gso_size);
> > > > > > + flush |= ((skb_gro_len(p) % mss) != 0);
> > > > >
> > > > > If I read correctly, the '(skb_gro_len(p) % mss) != 0' condition can be
> > > > > true only if 'p' was an HW GRO packet (or at least a gso packet before
> > > > > entering the GRO engine), am I correct? In that case 'p' would sit in
> > > > > the GRO hash until the next packet (skb) arrives, just to be flushed.
> > > > >
> > > > > Should the above condition be instead:
> > > > >
> > > > > flush |= ((skb_gro_len(skb) % mss) != 0);
> > > >
> > > > Yes, probably a typo.
> > > >
> > > > > ?
> > > > >
> > > > > And possibly use that condition while initializing
> > > > > NAPI_GRO_CB(skb)->flush in dev_gro_receive() ?
> > > >
> > > > Not sure; this would add an extra test in dev_gro_receive().
> > > >
> > > > It seems better to leave the test here, because the prior condition
> > > > needs to stay here.
> > > >
> > > > if (skb_is_gso(skb)) {
> > > > flush |= (mss != skb_shinfo(skb)->gso_size);
> > > >
> > >
> > > Oh well, I think Coco missed the fact that the ((skb_gro_len(skb) % mss) != 0)
> > > condition needs to be placed after the out_check_final label.
> > >
> > > For example, if MSS==1000 and p holds 4 segments, we still want to
> > > aggregate skb into p if skb's payload is not a multiple of MSS.
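> > > (e.g. merge a 2500-byte skb into p first, then let out_check_final
> > > flush the aggregate, since 2500 != 3 * 1000)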
> > >
> >
> > Relative diff would be:
> >
> > diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
> > index 0223bbfe9568064b47bc6227d342a4d25c9edfa7..79996b007bd64635aea27e3fddf291abe10ceca1 100644
> > --- a/net/ipv4/tcp_offload.c
> > +++ b/net/ipv4/tcp_offload.c
> > @@ -255,26 +255,27 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
> >
> > mss = skb_shinfo(p)->gso_size;
> >
> > - if (skb_is_gso(skb)) {
> > + if (unlikely(skb_is_gso(skb)))
> > flush |= (mss != skb_shinfo(skb)->gso_size);
> > - flush |= ((skb_gro_len(p) % mss) != 0);
> > - } else {
> > + else
> > flush |= (len - 1) >= mss;
> > - }
> > +
> > flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
> > #ifdef CONFIG_TLS_DEVICE
> > flush |= p->decrypted ^ skb->decrypted;
> > #endif
> >
> > if (flush || skb_gro_receive(p, skb)) {
> > - mss = 1;
> > - goto out_check_final;
> > + flush = 0;
> > + goto check_flags;
> > }
> >
> > tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
> >
> > out_check_final:
> > - flush = len < mss;
> > + flush = len != NAPI_GRO_CB(skb)->count * mss;
>
> Not sure if it's worth it, but perhaps mss can be updated under the
> unlikely(skb_is_gso(skb)) branch a few lines above:
>
> mss *= NAPI_GRO_CB(skb)->count;
>
> so that here we can avoid the additional multiplication for the non-gso
> case? - just:
> flush = len != mss;
>
Yes, but we still need a check even when the code above was not run
(because of an earlier goto out_check_final).

I am thinking of:
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index a844a0d38482d916251f3aca4555c75c9770820c..ba8e6cfb3852fc609afe3022efa10f7a06fb4c12 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -255,7 +255,11 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
mss = skb_shinfo(p)->gso_size;
- flush |= (len - 1) >= mss;
+ if (unlikely(skb_is_gso(skb)))
+ flush |= (mss != skb_shinfo(skb)->gso_size);
+ else
+ flush |= (len - 1) >= mss;
+
flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
#ifdef CONFIG_TLS_DEVICE
flush |= p->decrypted ^ skb->decrypted;
@@ -269,7 +273,11 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
out_check_final:
- flush = len < mss;
+ if (unlikely(skb_is_gso(skb)))
+ flush = len != NAPI_GRO_CB(skb)->count * skb_shinfo(skb)->gso_size;
+ else
+ flush = len < mss;
+
flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
TCP_FLAG_RST | TCP_FLAG_SYN |
TCP_FLAG_FIN));
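
To make the out_check_final change concrete with the MSS==1000 example
from earlier in the thread, here is a minimal userspace sketch (a
standalone model, not kernel code; flush_at_check_final, gro_count and
gso_size are made-up stand-ins for the real NAPI_GRO_CB(skb)->count and
skb_shinfo(skb)->gso_size fields):

/* Standalone model of the out_check_final test above (not kernel code). */
#include <stdbool.h>
#include <stdio.h>

static bool flush_at_check_final(bool is_gso, unsigned int len,
				 unsigned int gro_count,
				 unsigned int gso_size,
				 unsigned int mss)
{
	if (is_gso)
		/* flush unless every carried segment is full sized */
		return len != gro_count * gso_size;
	/* legacy single-segment case: a short segment ends the aggregate */
	return len < mss;
}

int main(void)
{
	/* four full 1000-byte segments: keep aggregating (prints 0) */
	printf("%d\n", flush_at_check_final(true, 4000, 4, 1000, 1000));
	/* four segments with a 500-byte tail (len 3500): flush now (prints 1) */
	printf("%d\n", flush_at_check_final(true, 3500, 4, 1000, 1000));
	return 0;
}

A short tail mirrors the legacy 'len < mss' rule: appending more data
after a sub-MSS segment would mix segment sizes, so the aggregate is
better flushed right away.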