lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20191022231051.30770-4-xiyou.wangcong@gmail.com>
Date:   Tue, 22 Oct 2019 16:10:51 -0700
From:   Cong Wang <xiyou.wangcong@...il.com>
To:     netdev@...r.kernel.org
Cc:     ycheng@...gle.com, ncardwell@...gle.com, edumazet@...gle.com,
        Cong Wang <xiyou.wangcong@...il.com>
Subject: [Patch net-next 3/3] tcp: decouple TLP timer from RTO timer

Currently RTO, TLP and PROBE0 all share a same timer instance
in kernel and use icsk->icsk_pending to dispatch the work.
This causes spinlock contention when resetting the timer is
too frequent, as clearly shown in the perf report:

   61.72%    61.71%  swapper          [kernel.kallsyms]                        [k] queued_spin_lock_slowpath
   ...
    - 58.83% tcp_v4_rcv
      - 58.80% tcp_v4_do_rcv
         - 58.80% tcp_rcv_established
            - 52.88% __tcp_push_pending_frames
               - 52.88% tcp_write_xmit
                  - 28.16% tcp_event_new_data_sent
                     - 28.15% sk_reset_timer
                        + mod_timer
                  - 24.68% tcp_schedule_loss_probe
                     - 24.68% sk_reset_timer
                        + 24.68% mod_timer

This patch decouples TLP timer from RTO timer by adding a new
timer instance but still uses icsk->icsk_pending to dispatch,
in order to minimize the risk of this patch.

After this patch, the CPU time spent in tcp_write_xmit() reduced
down to 10.92%.

Cc: Eric Dumazet <edumazet@...gle.com>
Signed-off-by: Cong Wang <xiyou.wangcong@...il.com>
---
 include/net/inet_connection_sock.h |  9 +++++--
 include/net/tcp.h                  |  1 +
 net/dccp/timer.c                   |  2 +-
 net/ipv4/inet_connection_sock.c    |  5 +++-
 net/ipv4/inet_diag.c               |  8 ++++--
 net/ipv4/tcp_input.c               |  8 ++++--
 net/ipv4/tcp_ipv4.c                |  6 +++--
 net/ipv4/tcp_output.c              |  1 +
 net/ipv4/tcp_timer.c               | 43 +++++++++++++++++++++++++++---
 net/ipv6/tcp_ipv6.c                |  6 +++--
 10 files changed, 73 insertions(+), 16 deletions(-)

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index e46958460739..2a129fc6b522 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -121,6 +121,7 @@ struct inet_connection_sock {
 		__u16		  last_seg_size; /* Size of last incoming segment	   */
 		__u16		  rcv_mss;	 /* MSS used for delayed ACK decisions	   */
 	} icsk_ack;
+	struct timer_list	  icsk_tlp_timer;
 	struct {
 		int		  enabled;
 
@@ -170,7 +171,8 @@ enum inet_csk_ack_state_t {
 void inet_csk_init_xmit_timers(struct sock *sk,
 			       void (*retransmit_handler)(struct timer_list *),
 			       void (*delack_handler)(struct timer_list *),
-			       void (*keepalive_handler)(struct timer_list *));
+			       void (*keepalive_handler)(struct timer_list *),
+			       void (*tlp_handler)(struct timer_list *));
 void inet_csk_clear_xmit_timers(struct sock *sk);
 
 static inline void inet_csk_schedule_ack(struct sock *sk)
@@ -226,7 +228,7 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
 	}
 
 	if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 ||
-	    what == ICSK_TIME_LOSS_PROBE || what == ICSK_TIME_REO_TIMEOUT) {
+	    what == ICSK_TIME_REO_TIMEOUT) {
 		icsk->icsk_pending = what;
 		icsk->icsk_timeout = jiffies + when;
 		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
@@ -234,6 +236,9 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
 		icsk->icsk_ack.pending |= ICSK_ACK_TIMER;
 		icsk->icsk_ack.timeout = jiffies + when;
 		sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
+	} else if (what == ICSK_TIME_LOSS_PROBE) {
+		icsk->icsk_pending = what;
+		sk_reset_timer(sk, &icsk->icsk_tlp_timer, jiffies + when);
 	} else {
 		pr_debug("inet_csk BUG: unknown timer value\n");
 	}
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0ee5400e751c..3319d2b6b1c4 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -331,6 +331,7 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 void tcp_release_cb(struct sock *sk);
 void tcp_wfree(struct sk_buff *skb);
 void tcp_write_timer_handler(struct sock *sk);
+void tcp_tail_loss_probe_handler(struct sock *sk);
 void tcp_delack_timer_handler(struct sock *sk);
 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index c0b3672637c4..897a0469e8f1 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -246,7 +246,7 @@ void dccp_init_xmit_timers(struct sock *sk)
 	tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk);
 	timer_setup(&dp->dccps_xmit_timer, dccp_write_xmit_timer, 0);
 	inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
-				  &dccp_keepalive_timer);
+				  &dccp_keepalive_timer, NULL);
 }
 
 static ktime_t dccp_timestamp_seed;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index eb30fc1770de..4b279a86308e 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -503,12 +503,14 @@ EXPORT_SYMBOL(inet_csk_accept);
 void inet_csk_init_xmit_timers(struct sock *sk,
 			       void (*retransmit_handler)(struct timer_list *t),
 			       void (*delack_handler)(struct timer_list *t),
-			       void (*keepalive_handler)(struct timer_list *t))
+			       void (*keepalive_handler)(struct timer_list *t),
+			       void (*tlp_handler)(struct timer_list *t))
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0);
 	timer_setup(&icsk->icsk_delack_timer, delack_handler, 0);
+	timer_setup(&icsk->icsk_tlp_timer, tlp_handler, 0);
 	timer_setup(&sk->sk_timer, keepalive_handler, 0);
 	icsk->icsk_pending = icsk->icsk_ack.pending = 0;
 }
@@ -522,6 +524,7 @@ void inet_csk_clear_xmit_timers(struct sock *sk)
 
 	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
 	sk_stop_timer(sk, &icsk->icsk_delack_timer);
+	sk_stop_timer(sk, &icsk->icsk_tlp_timer);
 	sk_stop_timer(sk, &sk->sk_timer);
 }
 EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 7dc79b973e6e..e87fe87571a1 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -221,8 +221,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	}
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
-	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT) {
 		r->idiag_timer = 1;
 		r->idiag_retrans = icsk->icsk_retransmits;
 		r->idiag_expires =
@@ -232,6 +231,11 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 		r->idiag_retrans = icsk->icsk_probes_out;
 		r->idiag_expires =
 			jiffies_to_msecs(icsk->icsk_timeout - jiffies);
+	} else if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+		r->idiag_timer = 1;
+		r->idiag_retrans = icsk->icsk_retransmits;
+		r->idiag_expires =
+			jiffies_to_msecs(icsk->icsk_tlp_timer.expires - jiffies);
 	} else if (timer_pending(&sk->sk_timer)) {
 		r->idiag_timer = 2;
 		r->idiag_retrans = icsk->icsk_probes_out;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a2e52ad7cdab..71cbb486ef85 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3008,8 +3008,12 @@ void tcp_rearm_rto(struct sock *sk)
 			 */
 			rto = usecs_to_jiffies(max_t(int, delta_us, 1));
 		}
-		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
-				     TCP_RTO_MAX, tcp_rtx_queue_head(sk));
+		if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
+			tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, rto,
+					     TCP_RTO_MAX, tcp_rtx_queue_head(sk));
+		else
+			tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
+					     TCP_RTO_MAX, tcp_rtx_queue_head(sk));
 	}
 }
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c616f0ad1fa0..f5e34fe7b2e6 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2434,13 +2434,15 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
 	int state;
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
-	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT) {
 		timer_active	= 1;
 		timer_expires	= icsk->icsk_timeout;
 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
 		timer_active	= 4;
 		timer_expires	= icsk->icsk_timeout;
+	} else if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+		timer_active	= 1;
+		timer_expires	= icsk->icsk_tlp_timer.expires;
 	} else if (timer_pending(&sk->sk_timer)) {
 		timer_active	= 2;
 		timer_expires	= sk->sk_timer.expires;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9822820edca4..9038d7d61d0f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -882,6 +882,7 @@ void tcp_release_cb(struct sock *sk)
 
 	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
 		tcp_write_timer_handler(sk);
+		tcp_tail_loss_probe_handler(sk);
 		__sock_put(sk);
 	}
 	if (flags & TCPF_DELACK_TIMER_DEFERRED) {
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index dd5a6317a801..f112aa979e8c 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -591,9 +591,6 @@ void tcp_write_timer_handler(struct sock *sk)
 	case ICSK_TIME_REO_TIMEOUT:
 		tcp_rack_reo_timeout(sk);
 		break;
-	case ICSK_TIME_LOSS_PROBE:
-		tcp_send_loss_probe(sk);
-		break;
 	case ICSK_TIME_RETRANS:
 		icsk->icsk_pending = 0;
 		tcp_retransmit_timer(sk);
@@ -626,6 +623,42 @@ static void tcp_write_timer(struct timer_list *t)
 	sock_put(sk);
 }
 
+void tcp_tail_loss_probe_handler(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN) ||
+	    icsk->icsk_pending != ICSK_TIME_LOSS_PROBE)
+		goto out;
+
+	if (timer_pending(&icsk->icsk_tlp_timer))
+		goto out;
+
+	tcp_mstamp_refresh(tcp_sk(sk));
+	if (tcp_send_loss_probe(sk))
+		icsk->icsk_retransmits++;
+out:
+	sk_mem_reclaim(sk);
+}
+
+static void tcp_tail_loss_probe_timer(struct timer_list *t)
+{
+	struct inet_connection_sock *icsk =
+			from_timer(icsk, t, icsk_tlp_timer);
+	struct sock *sk = &icsk->icsk_inet.sk;
+
+	bh_lock_sock(sk);
+	if (!sock_owned_by_user(sk)) {
+		tcp_tail_loss_probe_handler(sk);
+	} else {
+		/* delegate our work to tcp_release_cb() */
+		if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags))
+			sock_hold(sk);
+	}
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
 void tcp_syn_ack_timeout(const struct request_sock *req)
 {
 	struct net *net = read_pnet(&inet_rsk(req)->ireq_net);
@@ -758,7 +791,9 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer)
 void tcp_init_xmit_timers(struct sock *sk)
 {
 	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
-				  &tcp_keepalive_timer);
+				  &tcp_keepalive_timer,
+				  &tcp_tail_loss_probe_timer);
+
 	hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC,
 		     HRTIMER_MODE_ABS_PINNED_SOFT);
 	tcp_sk(sk)->pacing_timer.function = tcp_pace_kick;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 4804b6dc5e65..7cc8dbe412af 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1874,13 +1874,15 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
 	srcp  = ntohs(inet->inet_sport);
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
-	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT) {
 		timer_active	= 1;
 		timer_expires	= icsk->icsk_timeout;
 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
 		timer_active	= 4;
 		timer_expires	= icsk->icsk_timeout;
+	} else if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+		timer_active	= 1;
+		timer_expires	= icsk->icsk_tlp_timer.expires;
 	} else if (timer_pending(&sp->sk_timer)) {
 		timer_active	= 2;
 		timer_expires	= sp->sk_timer.expires;
-- 
2.21.0

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ