Message-Id: <20180518190141.899-2-dbanerje@akamai.com>
Date: Fri, 18 May 2018 15:01:41 -0400
From: Debabrata Banerjee <dbanerje@...mai.com>
To: "David S . Miller" <davem@...emloft.net>, netdev@...r.kernel.org
Cc: Alexey Kuznetsov <kuznet@....inr.ac.ru>,
Hideaki YOSHIFUJI <yoshfuji@...ux-ipv6.org>,
dbanerje@...mai.com
Subject: [PATCH RFC net-next 1/1] tcp: close socket without reset on incoming data
When TCP_CLOSE_NORST is set before a close(), sinking of unwanted
incoming data is offloaded to the kernel at low resource cost, bounded
by the TCP_LINGER2 timeout. The socket transitions to FIN_WAIT1 and
then FIN_WAIT2, where it acks data until either the timeout is hit or
a RST or FIN is received.
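Expected usage from an application looks roughly like the sketch below;
close_without_rst() is just an illustrative helper, the setsockopt()
value comes from this patch, and the 30 second FIN_WAIT2 timeout is
only an example:

	/* Sink-and-close sketch, assuming this patch is applied. */
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <unistd.h>

	#ifndef TCP_CLOSE_NORST
	#define TCP_CLOSE_NORST 37	/* from include/uapi/linux/tcp.h in this patch */
	#endif

	static void close_without_rst(int fd)
	{
		int on = 1;
		int fin_wait2 = 30;	/* seconds; bounds how long the kernel sinks data */

		/* Bound the FIN_WAIT2 sink via TCP_LINGER2. */
		setsockopt(fd, IPPROTO_TCP, TCP_LINGER2, &fin_wait2, sizeof(fin_wait2));
		/* Ask the kernel to ack and discard data instead of sending a RST. */
		setsockopt(fd, IPPROTO_TCP, TCP_CLOSE_NORST, &on, sizeof(on));
		close(fd);
	}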
Signed-off-by: Debabrata Banerjee <dbanerje@...mai.com>
---
include/linux/tcp.h | 4 +++-
include/uapi/linux/tcp.h | 2 +-
net/ipv4/tcp.c | 23 +++++++++++++++++++++--
net/ipv4/tcp_input.c | 16 ++++++++++++----
net/ipv4/tcp_minisocks.c | 15 +++++++++++++++
5 files changed, 52 insertions(+), 8 deletions(-)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 72705eaf4b84..bd44bc99b480 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -226,7 +226,8 @@ struct tcp_sock {
fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
is_sack_reneg:1, /* in recovery from loss with SACK reneg? */
- unused:2;
+ norst:1, /* Don't send RST on shutdown() socket */
+ unused:1;
u8 nonagle : 4,/* Disable Nagle algorithm? */
thin_lto : 1,/* Use linear timeouts for thin streams */
recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */
@@ -429,6 +430,7 @@ struct tcp_timewait_sock {
#ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *tw_md5_key;
#endif
+ int tw_norst;
};
static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 29eb659aa77a..369f3402b669 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -124,8 +124,8 @@ enum {
#define TCP_FASTOPEN_NO_COOKIE 34 /* Enable TFO without a TFO cookie */
#define TCP_ZEROCOPY_RECEIVE 35
#define TCP_INQ 36 /* Notify bytes available to read as a cmsg on read */
-
#define TCP_CM_INQ TCP_INQ
+#define TCP_CLOSE_NORST 37 /* Don't send RST on close()'d socket */
struct tcp_repair_opt {
__u32 opt_code;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0a2ea0bbf867..29fe763002e5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2318,8 +2318,10 @@ void tcp_close(struct sock *sk, long timeout)
struct sk_buff *skb;
int data_was_unread = 0;
int state;
+ struct tcp_sock *tp;
lock_sock(sk);
+ tp = tcp_sk(sk);
sk->sk_shutdown = SHUTDOWN_MASK;
if (sk->sk_state == TCP_LISTEN) {
@@ -2362,8 +2364,19 @@ void tcp_close(struct sock *sk, long timeout)
} else if (data_was_unread) {
/* Unread data was tossed, zap the connection. */
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
- tcp_set_state(sk, TCP_CLOSE);
- tcp_send_active_reset(sk, sk->sk_allocation);
+
+ if (unlikely(tp->norst)) {
+ if (tcp_close_state(sk)) {
+ /* We will discard all new incoming data,
+ * so set window to max of current or init.
+ */
+ tp->rcv_wnd = max(tp->rcv_wnd, MAX_TCP_WINDOW);
+ tcp_send_fin(sk);
+ }
+ } else {
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_send_active_reset(sk, sk->sk_allocation);
+ }
} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
/* Check zero linger _after_ checking for unread data. */
sk->sk_prot->disconnect(sk, 0);
@@ -3040,6 +3053,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
else
tp->recvmsg_inq = val;
break;
+ case TCP_CLOSE_NORST:
+ tp->norst = !!val;
+ break;
default:
err = -ENOPROTOOPT;
break;
@@ -3523,6 +3539,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return err;
}
#endif
+ case TCP_CLOSE_NORST:
+ val = tp->norst;
+ break;
default:
return -ENOPROTOOPT;
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index aebb29ab2fdf..e0aa6e126700 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6054,7 +6054,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
break;
}
- if (tp->linger2 < 0) {
+ if (likely(!tp->norst) && tp->linger2 < 0) {
tcp_done(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
return 1;
@@ -6064,9 +6064,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
/* Receive out of order FIN after close() */
if (tp->syn_fastopen && th->fin)
tcp_fastopen_active_disable(sk);
- tcp_done(sk);
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
- return 1;
+
+ if (likely(!tp->norst)) {
+ tcp_done(sk);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ return 1;
+ }
}
tmo = tcp_fin_time(sk);
@@ -6123,6 +6126,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (sk->sk_shutdown & RCV_SHUTDOWN) {
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
+ if (unlikely(tp->norst)) {
+ tcp_send_ack(sk);
+ goto discard;
+ }
+
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
tcp_reset(sk);
return 1;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f867658b4b30..48a9d5351478 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -133,6 +133,20 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
return TCP_TW_SUCCESS;
}
+ if (tcptw->tw_norst) {
+ /* ack and discard new data */
+ tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ if (tmp_opt.saw_tstamp) {
+ tcptw->tw_ts_recent_stamp = get_seconds();
+ tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
+ }
+
+ if (th->fin) /* active remote close, we can die now */
+ inet_twsk_deschedule_put(tw);
+
+ return TCP_TW_ACK;
+ }
+
/* New data or FIN. If new data arrive after half-duplex close,
* reset.
*/
@@ -272,6 +286,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
tcptw->tw_ts_offset = tp->tsoffset;
tcptw->tw_last_oow_ack_time = 0;
+ tcptw->tw_norst = tp->norst;
#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == PF_INET6) {
--
2.17.0