Date:   Wed, 07 Feb 2018 16:19:10 +0900
From:   배석진 <soukjin.bae@...sung.com>
To:     Eric Dumazet <eric.dumazet@...il.com>,
        "netdev@...r.kernel.org" <netdev@...r.kernel.org>
CC:     유금환 <geumhwan.yu@...sung.com>,
        안치훈 <ch.ahn@...sung.com>
Subject: RE: Re: [Android][Kernel][TCP/IP] report of packet discarding
 during tcp handshaking

Dear Dumazet,


Even with your changes, the problem is still there.
own_req could not update lost_race as you intended.
Maybe it needs an additional locking method.

And I agree with your thought about the conditions:
low probability, and recovery by retransmission.
(except for DF; a fragmented packet is over the MTU)

But in a case like ours,
where the problem happens not rarely (2-3 times in 60 trials),
and the function behind the dropped packet is sensitive to delay,
I think it can be a real problem.

Because we had to find a solution in time,
we blocked the parallel processing of two packets for the same request socket.
It is a rough way, but it is working well. :p

We would like a more delicate method too,
but we do not yet perfectly understand the TCP stack and /kernel/net/...
In fact, this is why we contacted you. :)


best regards,
bae



< our patch >

--- /a/include/uapi/linux/snmp.h
+++ /b/include/uapi/linux/snmp.h

@@ -280,6 +280,8 @@
 	LINUX_MIB_TCPKEEPALIVE,			/* TCPKeepAlive */
 	LINUX_MIB_TCPMTUPFAIL,			/* TCPMTUPFail */
 	LINUX_MIB_TCPMTUPSUCCESS,		/* TCPMTUPSuccess */
+	LINUX_MIB_TCPRACECNDREQSK,		/* TCPRaceCondInReqsk */
+	LINUX_MIB_TCPRACECNDREQSKDROP,		/* TCPRaceCondInReqskDrop */
 	__LINUX_MIB_MAX
 };
 

--- /a/net/ipv4/proc.c
+++ /b/net/ipv4/proc.c

@@ -302,6 +302,8 @@
 	SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE),
 	SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL),
 	SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS),
+	SNMP_MIB_ITEM("TCPRaceCondInReqsk", LINUX_MIB_TCPRACECNDREQSK),
+	SNMP_MIB_ITEM("TCPRaceCondInReqskDrop", LINUX_MIB_TCPRACECNDREQSKDROP),
 	SNMP_MIB_SENTINEL
 };
 

--- /a/net/ipv4/tcp_ipv4.c
+++ /b/net/ipv4/tcp_ipv4.c

@@ -1673,6 +1673,7 @@
  *	From tcp_input.c
  */
 
+#define RC_RETRY_CNT 3
 int tcp_v4_rcv(struct sk_buff *skb)
 {
 	const struct iphdr *iph;
@@ -1683,6 +1684,7 @@
 #endif
 	int ret;
 	struct net *net = dev_net(skb->dev);
+	unsigned int retry_cnt = RC_RETRY_CNT;
 
 	if (skb->pkt_type != PACKET_HOST)
 		goto discard_it;
@@ -1749,6 +1751,33 @@
 	if (!sk)
 		goto no_tcp_socket;
 #endif
+	/*
+	 * FIXME: SEC patch
+	 * If the ACK packets of the three-way handshake arrive at almost the
+	 * same time on different cores, each core will look up the request
+	 * socket and try to create a new socket to establish the connection.
+	 * There is no synchronization scheme to avoid this race on the request
+	 * socket, so the second attempt to create the new socket fails and the
+	 * second ACK packet is discarded.
+	 *
+	 * If that second ACK carries data, this causes an unintended packet
+	 * drop, so the second core should wait here until the new socket has
+	 * been created by the first core.
+	 */
+	if (sk->sk_state == TCP_NEW_SYN_RECV) {
+		struct request_sock *req = inet_reqsk(sk);
+		if (atomic_read(&req->rsk_refcnt) > (2+1) && retry_cnt > 0) {
+			reqsk_put(req);
+			if (retry_cnt == RC_RETRY_CNT)
+				NET_INC_STATS_BH(net, LINUX_MIB_TCPRACECNDREQSK);
+			retry_cnt--;
+			udelay(500);
+
+			goto lookup;
+		}
+
+		if (!retry_cnt)
+			NET_INC_STATS_BH(net, LINUX_MIB_TCPRACECNDREQSKDROP);
+	}
+
 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
 		struct request_sock *req = inet_reqsk(sk);
 		struct sock *nsk;

--- /a/net/ipv6/tcp_ipv6.c
+++ /b/net/ipv6/tcp_ipv6.c

@@ -1539,6 +1539,7 @@
 		sizeof(struct inet6_skb_parm));
 }
 
+#define RC_RETRY_CNT 3
 static int tcp_v6_rcv(struct sk_buff *skb)
 {
 	const struct tcphdr *th;
@@ -1549,6 +1550,7 @@
 #endif
 	int ret;
 	struct net *net = dev_net(skb->dev);
+	unsigned int retry_cnt = RC_RETRY_CNT;
 
 	if (skb->pkt_type != PACKET_HOST)
 		goto discard_it;
@@ -1594,6 +1596,22 @@
 	if (!sk)
 		goto no_tcp_socket;
 #endif
+	/* FIXME: SEC patch -- same workaround as in tcp_v4_rcv() above */
+	if (sk->sk_state == TCP_NEW_SYN_RECV) {
+		struct request_sock *req = inet_reqsk(sk);
+		if (atomic_read(&req->rsk_refcnt) > (2+1) && retry_cnt > 0) {
+			reqsk_put(req);
+			if (retry_cnt == RC_RETRY_CNT)
+				NET_INC_STATS_BH(net, LINUX_MIB_TCPRACECNDREQSK);
+			retry_cnt--;
+			udelay(500);
+
+			goto lookup;
+		}
+
+		if (!retry_cnt)
+			NET_INC_STATS_BH(net, LINUX_MIB_TCPRACECNDREQSKDROP);
+	}
 
 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
 		struct request_sock *req = inet_reqsk(sk);





--------- Original Message ---------
Sender : Eric Dumazet <eric.dumazet@...il.com>
Date   : 2018-02-07 13:31 (GMT+9)
Title  : Re: [Android][Kernel][TCP/IP] report of packet discarding during tcp handshaking
 
On Wed, 2018-02-07 at 11:16 +0900, 배석진 wrote:
> Hello, 
> this is Bae, working at Samsung Electronics. 
> 
 
Hi Bae,  thanks for this detailed report and analysis.
 
> we have a problem where a packet is discarded during the TCP three-way handshake. 
> It looks like Mr. Dumazet already tried to fix a similar issue in this patch: https://android.googlesource.com/kernel/common/+/5e0724d027f0548511a2165a209572d48fe7a4c8 
 
This patch was fixing a more serious bug, since this was possibly
causing corruptions and kernel crashes.
 
What you describe is simply a packet being dropped, with a very low
probability.
 
Which is not a huge deal for TCP since this packet will eventually be
re-transmitted.
 
Also, most TCP stacks/flows set the DF bit so that packets are not
fragmented...
 
Anyway see my answer at the end of this mail.
 
 
> but we are still facing another corner case.
> 
> This problem needs some preconditions:
> (1) the last ACK of the three-way handshake and the next packet arrive at almost the same time 
> (2) the next packet, the first data packet, is fragmented 
> (3) RPS is enabled
> 
> 
> [tcp dump]
> No.     A-Time         Source     Destination  Len   Seq  Info 
>  1  08:35:18.115259  193.81.6.70  10.217.0.47  84     0   [SYN] Seq=0 Win=21504 Len=0 MSS=1460 
>  2  08:35:18.115888  10.217.0.47  193.81.6.70  84     0   6100 → 5063 [SYN, ACK] Seq=0 Ack=1 Win=29200 Len=0 MSS=1460 
>  3  08:35:18.142385  193.81.6.70  10.217.0.47  80     1   5063 → 6100 [ACK] Seq=1 Ack=1 Win=21504 Len=0 
>  4  08:35:18.142425  193.81.6.70  10.217.0.47  1516       Fragmented IP protocol (proto=Encap Security Payload 50, off=0, ID=6e24) [Reassembled in #5] 
>  5  08:35:18.142449  193.81.6.70  10.217.0.47  60     1   5063 → 6100 [ACK] Seq=1 Ack=1 Win=21504 Len=1460 [TCP segment of a reassembled PDU] 
>  6  08:35:21.227070  193.81.6.70  10.217.0.47  1516       Fragmented IP protocol (proto=Encap Security Payload 50, off=0, ID=71e9) [Reassembled in #7] 
>  7  08:35:21.227191  193.81.6.70  10.217.0.47  60     1   [TCP Retransmission] 5063 → 6100 [ACK] Seq=1 Ack=1 Win=21504 Len=1460 
>  8  08:35:21.228822  10.217.0.47  193.81.6.70  80     1   6100 → 5063 [ACK] Seq=1 Ack=1461 Win=32120 Len=0
> 
> - the last ACK of the handshake (No.3) and the next data packets (No.4, 5) arrived with just a 40 us gap.
> 
> 
> [kernel log]
> - stage 1 
> <3>[ 1037.669229] I[0:  system_server: 3778] get_rps_cpu: skb(64), check hash value:3412396090 
> <3>[ 1037.669261] I[0:  system_server: 3778] get_rps_cpu: skb(1500), check hash value:158575680 
> <3>[ 1037.669285] I[0:  system_server: 3778] get_rps_cpu: skb(44), check hash value:158575680 
> - stage 2 
> <3>[ 1037.669541] I[1: Binder:3778_13: 8391] tcp_v4_rcv: Enter! skb(seq:A93E087B, len:1480) 
> <3>[ 1037.669552] I[2:Jit thread pool:12990] tcp_v4_rcv: Enter! skb(seq:A93E087B, len:20) 
> <3>[ 1037.669564] I[2:Jit thread pool:12990] tcp_v4_rcv: check sk_state:12 skb(seq:A93E087B, len:20) 
> <3>[ 1037.669585] I[2:Jit thread pool:12990] tcp_check_req, Enter!: skb(seq:A93E087B, len:20) 
> <3>[ 1037.669612] I[1: Binder:3778_13: 8391] tcp_v4_rcv: check sk_state:12 skb(seq:A93E087B, len:1480) 
> <3>[ 1037.669625] I[1: Binder:3778_13: 8391] tcp_check_req, Enter!: skb(seq:A93E087B, len:1480) 
> <3>[ 1037.669653] I[2:Jit thread pool:12990] tcp_check_req, skb(seq:A93E087B, len:20), own_req:1 
> <3>[ 1037.669668] I[1: Binder:3778_13: 8391] tcp_check_req, skb(seq:A93E087B, len:1480), own_req:0 
> <3>[ 1037.669708] I[2:Jit thread pool:12990] tcp_rcv_state_process, Established: skb(seq:A93E087B, len:20) 
> <3>[ 1037.669724] I[1: Binder:3778_13: 8391] tcp_v4_rcv: discard_relse skb(seq:A93E087B, len:1480)
> 
> - stage 1 
> Because the data packet was fragmented (No.4 & 5), 
> RPS hashed it to a different core (cpu1) than the last ACK packet (cpu2). 
> So the last ACK and the data packet were handled on different cores almost simultaneously, in NEW_SYN_RECV state.
> 
> - stage 2, cpu2 
> One of them is handled in tcp_check_req() a little earlier; 
> it gets own_req == true from tcp_v4_syn_recv_sock(), and a valid nsk is returned. 
> Finally it goes to ESTABLISHED state.
> 
> - stage 2, cpu1 
> But the other, later one gets own_req == false, 
> and inet_csk_complete_hashdance() returns NULL for nsk because own_req is false. 
> So the earlier packet is handled successfully, but the later one is discarded.
> 
> Depending on scheduling, either the ACK or the data packet can be discarded at this point. (we have seen both) 
> If the ACK is discarded, that is fine: 
> the TCP state goes to ESTABLISHED by piggybacking on the data packet, and the payload is delivered to the upper layer. 
> But if the data packet is discarded, the client cannot receive the payload it should. 
> This is the problem we faced.
> 
> 
> The server retransmitted the dropped packet (No.6, 7), but that takes a few seconds. 
> Since this problem occurred during IMS call setup, it appears as call-connection delay. 
> That is a serious problem for a call service.
> 
> do you have any report about this or plan to fix it?
 
I guess the following patch should help, can you test it ?
 
Thanks !
 
 include/net/tcp.h        |    3 ++-
 net/ipv4/tcp_input.c     |    4 +++-
 net/ipv4/tcp_ipv4.c      |    9 ++++++++-
 net/ipv4/tcp_minisocks.c |    3 ++-
 net/ipv6/tcp_ipv6.c      |    9 ++++++++-
 5 files changed, 23 insertions(+), 5 deletions(-)
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 58278669cc5572ec397f028b5888f574764e298a..fe4616978ccf2b048ca1b7ad8976558563798785 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -374,7 +374,8 @@ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw,
                                               struct sk_buff *skb,
                                               const struct tcphdr *th);
 struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
-                           struct request_sock *req, bool fastopen);
+                           struct request_sock *req, bool fastopen,
+                           bool *lost_race);
 int tcp_child_process(struct sock *parent, struct sock *child,
                       struct sk_buff *skb);
 void tcp_enter_loss(struct sock *sk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index cfa51cfd2d999342018aee3511d5760a17b1493b..20c68a4765235daec01a020913d4779d6926d203 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5870,10 +5870,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
         tp->rx_opt.saw_tstamp = 0;
         req = tp->fastopen_rsk;
         if (req) {
+                bool lost_race;
+
                 WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
                     sk->sk_state != TCP_FIN_WAIT1);
 
-                if (!tcp_check_req(sk, skb, req, true))
+                if (!tcp_check_req(sk, skb, req, true, &lost_race))
                         goto discard;
         }
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 95738aa0d8a60697053d763ba7023be2c62d7a90..d72de3d12b4c960c3bd1ad4c45fc493ee919201c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1672,6 +1672,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
         if (sk->sk_state == TCP_NEW_SYN_RECV) {
                 struct request_sock *req = inet_reqsk(sk);
                 struct sock *nsk;
+                bool lost_race;
 
                 sk = req->rsk_listener;
                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
@@ -1688,15 +1689,21 @@ int tcp_v4_rcv(struct sk_buff *skb)
                  */
                 sock_hold(sk);
                 refcounted = true;
+                lost_race = false;
                 nsk = NULL;
                 if (!tcp_filter(sk, skb)) {
                         th = (const struct tcphdr *)skb->data;
                         iph = ip_hdr(skb);
                         tcp_v4_fill_cb(skb, iph, th);
-                        nsk = tcp_check_req(sk, skb, req, false);
+                        nsk = tcp_check_req(sk, skb, req, false, &lost_race);
                 }
                 if (!nsk) {
                         reqsk_put(req);
+                        if (lost_race) {
+                                tcp_v4_restore_cb(skb);
+                                sock_put(sk);
+                                goto lookup;
+                        }
                         goto discard_and_relse;
                 }
                 if (nsk == sk) {
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index a8384b0c11f8fa589e2ed5311899b62c80a269f8..7ebc222854f7d7215850f86456aeb6298659b6fa 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -578,7 +578,7 @@ EXPORT_SYMBOL(tcp_create_openreq_child);
 
 struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
                            struct request_sock *req,
-                           bool fastopen)
+                           bool fastopen, bool *lost_race)
 {
         struct tcp_options_received tmp_opt;
         struct sock *child;
@@ -785,6 +785,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 
         sock_rps_save_rxhash(child, skb);
         tcp_synack_rtt_meas(child, req);
+        *lost_race = !own_req;
         return inet_csk_complete_hashdance(sk, child, req, own_req);
 
 listen_overflow:
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index a1ab29e2ab3bb99e120920186be46d2dd4bbb71e..211570bd524ddc3eddadec0b58c587935ad9a42c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1451,6 +1451,7 @@ static int tcp_v6_rcv(struct sk_buff *skb)
         if (sk->sk_state == TCP_NEW_SYN_RECV) {
                 struct request_sock *req = inet_reqsk(sk);
                 struct sock *nsk;
+                bool lost_race;
 
                 sk = req->rsk_listener;
                 if (tcp_v6_inbound_md5_hash(sk, skb)) {
@@ -1464,15 +1465,21 @@ static int tcp_v6_rcv(struct sk_buff *skb)
                 }
                 sock_hold(sk);
                 refcounted = true;
+                lost_race = false;
                 nsk = NULL;
                 if (!tcp_filter(sk, skb)) {
                         th = (const struct tcphdr *)skb->data;
                         hdr = ipv6_hdr(skb);
                         tcp_v6_fill_cb(skb, hdr, th);
-                        nsk = tcp_check_req(sk, skb, req, false);
+                        nsk = tcp_check_req(sk, skb, req, false, &lost_race);
                 }
                 if (!nsk) {
                         reqsk_put(req);
+                        if (lost_race) {
+                                tcp_v6_restore_cb(skb);
+                                sock_put(sk);
+                                goto lookup;
+                        }
                         goto discard_and_relse;
                 }
                 if (nsk == sk) {
 
 
