lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Mon, 22 Aug 2022 02:11:44 -0700
From:   Peilin Ye <yepeilin.cs@...il.com>
To:     "David S. Miller" <davem@...emloft.net>,
        Eric Dumazet <edumazet@...gle.com>,
        Jakub Kicinski <kuba@...nel.org>,
        Paolo Abeni <pabeni@...hat.com>,
        Jonathan Corbet <corbet@....net>,
        Hideaki YOSHIFUJI <yoshfuji@...ux-ipv6.org>,
        David Ahern <dsahern@...nel.org>,
        Jamal Hadi Salim <jhs@...atatu.com>,
        Cong Wang <xiyou.wangcong@...il.com>,
        Jiri Pirko <jiri@...nulli.us>
Cc:     Peilin Ye <peilin.ye@...edance.com>, netdev@...r.kernel.org,
        linux-doc@...r.kernel.org, linux-kernel@...r.kernel.org,
        Cong Wang <cong.wang@...edance.com>,
        Stephen Hemminger <stephen@...workplumber.org>,
        Dave Taht <dave.taht@...il.com>,
        Peilin Ye <yepeilin.cs@...il.com>
Subject: [PATCH RFC v2 net-next 1/5] net: Introduce Qdisc backpressure infrastructure

From: Peilin Ye <peilin.ye@...edance.com>

Currently sockets (especially UDP ones) can drop a lot of traffic at TC
egress when rate limited by shaper Qdiscs like HTB.  Improve this by
introducing a Qdisc backpressure infrastructure:

  a. A new 'struct sock' field, @sk_overlimits, which keeps track of the
     number of bytes in socket send buffer that are currently
     unavailable due to TC egress congestion.  The size of an overlimit
     socket's "effective" send buffer is represented by @sk_sndbuf minus
     @sk_overlimits, with a lower limit of SOCK_MIN_SNDBUF:

     max(@sk_sndbuf - @sk_overlimits, SOCK_MIN_SNDBUF)

  b. A new (*backpressure) 'struct proto' callback, which is the
     protocol's private algorithm for Qdisc backpressure.

Working together:

  1. When a shaper Qdisc (TBF, HTB, CBQ, etc.) drops a packet that
     belongs to a local socket, it calls qdisc_backpressure().

  2. qdisc_backpressure() eventually invokes the socket protocol's
     (*backpressure) callback, which should increase @sk_overlimits.

  3. The transport layer then sees a smaller "effective" send buffer and
     will send slower.

  4. It is the per-protocol (*backpressure) implementation's
     responsibility to decrease @sk_overlimits when TC egress becomes
     idle again, potentially by using a timer.

Suggested-by: Cong Wang <cong.wang@...edance.com>
Signed-off-by: Peilin Ye <peilin.ye@...edance.com>
---
 include/net/sch_generic.h | 11 +++++++++++
 include/net/sock.h        | 21 +++++++++++++++++++++
 net/core/sock.c           |  1 +
 3 files changed, 33 insertions(+)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index ec693fe7c553..afdf4bf64936 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -19,6 +19,7 @@
 #include <net/gen_stats.h>
 #include <net/rtnetlink.h>
 #include <net/flow_offload.h>
+#include <net/sock.h>
 
 struct Qdisc_ops;
 struct qdisc_walker;
@@ -1188,6 +1189,16 @@ static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch,
 	return NET_XMIT_DROP;
 }
 
+static inline void qdisc_backpressure(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+
+	if (!sk || !sk_fullsock(sk))
+		return;
+
+	sk_backpressure(sk);
+}
+
 /* Length to Time (L2T) lookup in a qdisc_rate_table, to determine how
    long it will take to send a packet given its size.
  */
diff --git a/include/net/sock.h b/include/net/sock.h
index 05a1bbdf5805..ef10ca66cf26 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -277,6 +277,7 @@ struct sk_filter;
   *	@sk_pacing_status: Pacing status (requested, handled by sch_fq)
   *	@sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE)
   *	@sk_sndbuf: size of send buffer in bytes
+  *	@sk_overlimits: size of temporarily unavailable send buffer in bytes
   *	@__sk_flags_offset: empty field used to determine location of bitfield
   *	@sk_padding: unused element for alignment
   *	@sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets
@@ -439,6 +440,7 @@ struct sock {
 	struct dst_entry __rcu	*sk_dst_cache;
 	atomic_t		sk_omem_alloc;
 	int			sk_sndbuf;
+	int			sk_overlimits;
 
 	/* ===== cache line for TX ===== */
 	int			sk_wmem_queued;
@@ -1264,6 +1266,7 @@ struct proto {
 
 	bool			(*stream_memory_free)(const struct sock *sk, int wake);
 	bool			(*sock_is_readable)(struct sock *sk);
+	void			(*backpressure)(struct sock *sk);
 	/* Memory pressure */
 	void			(*enter_memory_pressure)(struct sock *sk);
 	void			(*leave_memory_pressure)(struct sock *sk);
@@ -2499,6 +2502,24 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk)
 	WRITE_ONCE(sk->sk_sndbuf, max_t(u32, val, SOCK_MIN_SNDBUF));
 }
 
+static inline int sk_sndbuf_avail(struct sock *sk)
+{
+	int overlimits, sndbuf = READ_ONCE(sk->sk_sndbuf);
+
+	if (!sk->sk_prot->backpressure)
+		return sndbuf;
+
+	overlimits = READ_ONCE(sk->sk_overlimits);
+
+	return max_t(int, sndbuf - overlimits, SOCK_MIN_SNDBUF);
+}
+
+static inline void sk_backpressure(struct sock *sk)
+{
+	if (sk->sk_prot->backpressure)
+		sk->sk_prot->backpressure(sk);
+}
+
 /**
  * sk_page_frag - return an appropriate page_frag
  * @sk: socket
diff --git a/net/core/sock.c b/net/core/sock.c
index 4cb957d934a2..167d471b176f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2194,6 +2194,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 
 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
 	refcount_set(&newsk->sk_wmem_alloc, 1);
+	newsk->sk_overlimits	= 0;
 
 	atomic_set(&newsk->sk_omem_alloc, 0);
 	sk_init_common(newsk);
-- 
2.20.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ