[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-id: <20180201000716.69301-7-cpaasch@apple.com>
Date: Wed, 31 Jan 2018 16:07:08 -0800
From: Christoph Paasch <cpaasch@...le.com>
To: netdev@...r.kernel.org
Cc: Eric Dumazet <edumazet@...gle.com>,
Mat Martineau <mathew.j.martineau@...ux.intel.com>,
Ursula Braun <ubraun@...ux.vnet.ibm.com>
Subject: [RFC v2 06/14] tcp_smc: Make SMC use TCP extra-option framework
Adopt the extra-option framework for SMC.
This allows us to remove the SMC code from the TCP stack entirely.
The static key is gone, as this is now covered by the static key of the
extra-option framework.
We allocate state (struct tcp_smc_opt) that records whether SMC was
successfully negotiated, and check this state in the relevant
functions.
Cc: Ursula Braun <ubraun@...ux.vnet.ibm.com>
Signed-off-by: Christoph Paasch <cpaasch@...le.com>
Reviewed-by: Mat Martineau <mathew.j.martineau@...ux.intel.com>
---
include/linux/tcp.h | 3 +-
include/net/inet_sock.h | 3 +-
include/net/tcp.h | 4 -
net/ipv4/tcp.c | 5 --
net/ipv4/tcp_input.c | 36 ---------
net/ipv4/tcp_minisocks.c | 18 -----
net/ipv4/tcp_output.c | 54 --------------
net/smc/af_smc.c | 190 +++++++++++++++++++++++++++++++++++++++++++++--
8 files changed, 186 insertions(+), 127 deletions(-)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 6e1f0f29bf24..0958b3760cfc 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -257,8 +257,7 @@ struct tcp_sock {
syn_fastopen_ch:1, /* Active TFO re-enabling probe */
syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
save_syn:1, /* Save headers of SYN packet */
- is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
- syn_smc:1; /* SYN includes SMC */
+ is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */
/* RTT measurement */
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 0a671c32d6b9..4efa6cb14705 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -90,8 +90,7 @@ struct inet_request_sock {
wscale_ok : 1,
ecn_ok : 1,
acked : 1,
- no_srccheck: 1,
- smc_ok : 1;
+ no_srccheck: 1;
u32 ir_mark;
union {
struct ip_options_rcu __rcu *ireq_opt;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index be6709e380a6..2a565883e2ef 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2093,10 +2093,6 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1);
}
-#if IS_ENABLED(CONFIG_SMC)
-extern struct static_key_false tcp_have_smc;
-#endif
-
struct tcp_extopt_store;
struct tcp_extopt_ops {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ffb5f4fbd935..f08542d91e1c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -292,11 +292,6 @@ EXPORT_SYMBOL(sysctl_tcp_mem);
atomic_long_t tcp_memory_allocated; /* Current allocated memory. */
EXPORT_SYMBOL(tcp_memory_allocated);
-#if IS_ENABLED(CONFIG_SMC)
-DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
-EXPORT_SYMBOL(tcp_have_smc);
-#endif
-
/*
* Current number of TCP sockets.
*/
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 187e3fa761c8..fd2693baee4a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3691,24 +3691,6 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
foc->exp = exp_opt;
}
-static int smc_parse_options(const struct tcphdr *th,
- struct tcp_options_received *opt_rx,
- const unsigned char *ptr,
- int opsize)
-{
-#if IS_ENABLED(CONFIG_SMC)
- if (static_branch_unlikely(&tcp_have_smc)) {
- if (th->syn && !(opsize & 1) &&
- opsize >= TCPOLEN_EXP_SMC_BASE &&
- get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) {
- opt_rx->smc_ok = 1;
- return 1;
- }
- }
-#endif
- return 0;
-}
-
/* Look for tcp options. Normally only called on SYN and SYNACK packets.
* But, this can also be called on packets in the established flow when
* the fast version below fails.
@@ -3816,9 +3798,6 @@ void tcp_parse_options(const struct net *net,
tcp_parse_fastopen_option(opsize -
TCPOLEN_EXP_FASTOPEN_BASE,
ptr + 2, th->syn, foc, true);
- else if (smc_parse_options(th, opt_rx, ptr,
- opsize))
- break;
else if (opsize >= TCPOLEN_EXP_BASE)
tcp_extopt_parse(get_unaligned_be32(ptr),
opsize, ptr, skb,
@@ -5595,16 +5574,6 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
return false;
}
-static void smc_check_reset_syn(struct tcp_sock *tp)
-{
-#if IS_ENABLED(CONFIG_SMC)
- if (static_branch_unlikely(&tcp_have_smc)) {
- if (tp->syn_smc && !tp->rx_opt.smc_ok)
- tp->syn_smc = 0;
- }
-#endif
-}
-
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th)
{
@@ -5715,8 +5684,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* is initialized. */
tp->copied_seq = tp->rcv_nxt;
- smc_check_reset_syn(tp);
-
smp_mb();
tcp_finish_connect(sk, skb);
@@ -6173,9 +6140,6 @@ static void tcp_openreq_init(struct request_sock *req,
ireq->ir_rmt_port = tcp_hdr(skb)->source;
ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
ireq->ir_mark = inet_request_mark(sk, skb);
-#if IS_ENABLED(CONFIG_SMC)
- ireq->smc_ok = rx_opt->smc_ok;
-#endif
}
struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 46eb5a33aec1..5e08dce49a00 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -435,21 +435,6 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
}
EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
-static void smc_check_reset_syn_req(struct tcp_sock *oldtp,
- struct request_sock *req,
- struct tcp_sock *newtp)
-{
-#if IS_ENABLED(CONFIG_SMC)
- struct inet_request_sock *ireq;
-
- if (static_branch_unlikely(&tcp_have_smc)) {
- ireq = inet_rsk(req);
- if (oldtp->syn_smc && !ireq->smc_ok)
- newtp->syn_smc = 0;
- }
-#endif
-}
-
/* This is not only more efficient than what we used to do, it eliminates
* a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
*
@@ -467,9 +452,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
struct tcp_request_sock *treq = tcp_rsk(req);
struct inet_connection_sock *newicsk = inet_csk(newsk);
struct tcp_sock *newtp = tcp_sk(newsk);
- struct tcp_sock *oldtp = tcp_sk(sk);
-
- smc_check_reset_syn_req(oldtp, req, newtp);
/* Now setup tcp_sock */
newtp->pred_flags = 0;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 6d418ce06b59..549e33a30b41 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -398,21 +398,6 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
return tp->snd_una != tp->snd_up;
}
-static void smc_options_write(__be32 *ptr, u16 *options)
-{
-#if IS_ENABLED(CONFIG_SMC)
- if (static_branch_unlikely(&tcp_have_smc)) {
- if (unlikely(OPTION_SMC & *options)) {
- *ptr++ = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
- (TCPOPT_EXP << 8) |
- (TCPOLEN_EXP_SMC_BASE));
- *ptr++ = htonl(TCPOPT_SMC_MAGIC);
- }
- }
-#endif
-}
-
/* Write previously computed TCP options to the packet.
*
* Beware: Something in the Internet is very sensitive to the ordering of
@@ -527,45 +512,10 @@ static void tcp_options_write(__be32 *ptr, struct sk_buff *skb, struct sock *sk,
ptr += (len + 3) >> 2;
}
- smc_options_write(ptr, &options);
-
if (unlikely(!hlist_empty(extopt_list)))
tcp_extopt_write(ptr, skb, opts, sk);
}
-static void smc_set_option(const struct tcp_sock *tp,
- struct tcp_out_options *opts,
- unsigned int *remaining)
-{
-#if IS_ENABLED(CONFIG_SMC)
- if (static_branch_unlikely(&tcp_have_smc)) {
- if (tp->syn_smc) {
- if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
- opts->options |= OPTION_SMC;
- *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
- }
- }
- }
-#endif
-}
-
-static void smc_set_option_cond(const struct tcp_sock *tp,
- const struct inet_request_sock *ireq,
- struct tcp_out_options *opts,
- unsigned int *remaining)
-{
-#if IS_ENABLED(CONFIG_SMC)
- if (static_branch_unlikely(&tcp_have_smc)) {
- if (tp->syn_smc && ireq->smc_ok) {
- if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
- opts->options |= OPTION_SMC;
- *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
- }
- }
- }
-#endif
-}
-
/* Compute TCP options for SYN packets. This is not the final
* network wire format yet.
*/
@@ -631,8 +581,6 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
}
}
- smc_set_option(tp, opts, &remaining);
-
if (unlikely(!hlist_empty(&tp->tcp_option_list)))
remaining -= tcp_extopt_prepare(skb, TCPHDR_SYN, remaining,
opts, tcp_to_sk(tp));
@@ -698,8 +646,6 @@ static unsigned int tcp_synack_options(const struct sock *sk,
}
}
- smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
-
if (unlikely(!hlist_empty(&tcp_rsk(req)->tcp_option_list)))
remaining -= tcp_extopt_prepare(skb, TCPHDR_SYN | TCPHDR_ACK,
remaining, opts,
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 267e68379110..1b942a73609e 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -44,6 +44,149 @@
#include "smc_rx.h"
#include "smc_close.h"
+/* Callbacks wired into tcp_smc_extra_ops. They are declared up front because
+ * the ops table is defined ahead of the function bodies.
+ */
+static unsigned int tcp_smc_opt_prepare(struct sk_buff *skb, u8 flags,
+ unsigned int remaining,
+ struct tcp_out_options *opts,
+ const struct sock *sk,
+ struct tcp_extopt_store *store);
+static __be32 *tcp_smc_opt_write(__be32 *ptr, struct sk_buff *skb,
+ struct tcp_out_options *opts,
+ struct sock *sk,
+ struct tcp_extopt_store *store);
+static void tcp_smc_opt_parse(int opsize, const unsigned char *opptr,
+ const struct sk_buff *skb,
+ struct tcp_options_received *opt_rx,
+ struct sock *sk,
+ struct tcp_extopt_store *store);
+static void tcp_smc_opt_post_process(struct sock *sk,
+ struct tcp_options_received *opt,
+ struct tcp_extopt_store *store);
+static struct tcp_extopt_store *tcp_smc_opt_copy(struct sock *listener,
+ struct request_sock *req,
+ struct tcp_options_received *opt,
+ struct tcp_extopt_store *store);
+static void tcp_smc_opt_destroy(struct tcp_extopt_store *store);
+
+/* SMC negotiation state attached to a TCP socket through the extra-option
+ * framework.
+ *
+ * @store:  framework handle; mapped back to this struct with container_of()
+ * @smc_ok: SMC supported on this connection
+ * @rcu:    RCU head handed to kfree_rcu() when the state is destroyed
+ */
+struct tcp_smc_opt {
+	struct tcp_extopt_store	store;
+	/* Must be unsigned: a signed one-bit field can only represent 0 and
+	 * -1, so storing 1 into it is implementation-defined.
+	 */
+	unsigned int		smc_ok:1;
+	struct rcu_head		rcu;
+};
+
+/* Extra-option callbacks for the SMC experimental TCP option, keyed by
+ * TCPOPT_SMC_MAGIC.
+ */
+static const struct tcp_extopt_ops tcp_smc_extra_ops = {
+	.owner		= THIS_MODULE,
+	.option_kind	= TCPOPT_SMC_MAGIC,
+
+	/* receive side */
+	.parse		= tcp_smc_opt_parse,
+	.post_process	= tcp_smc_opt_post_process,
+
+	/* transmit side */
+	.prepare	= tcp_smc_opt_prepare,
+	.write		= tcp_smc_opt_write,
+
+	/* state lifetime */
+	.copy		= tcp_smc_opt_copy,
+	.destroy	= tcp_smc_opt_destroy,
+};
+
+/* Map a framework store handle back to the tcp_smc_opt that embeds it. */
+static struct tcp_smc_opt *tcp_extopt_to_smc(struct tcp_extopt_store *store)
+{
+	struct tcp_smc_opt *smc_opt;
+
+	smc_opt = container_of(store, struct tcp_smc_opt, store);
+
+	return smc_opt;
+}
+
+/* Look up the SMC extra-option state registered on @sk.
+ *
+ * Returns the tcp_smc_opt found under TCPOPT_SMC_MAGIC, or NULL when the
+ * lookup yields no store. Callers test the result against NULL, so the
+ * "not found" case is handled here explicitly rather than relying on
+ * container_of() mapping NULL to NULL, which only holds while ->store is the
+ * first member of struct tcp_smc_opt.
+ */
+static struct tcp_smc_opt *tcp_smc_opt_find(struct sock *sk)
+{
+	struct tcp_extopt_store *ext_opt;
+
+	ext_opt = tcp_extopt_find_kind(TCPOPT_SMC_MAGIC, sk);
+	if (!ext_opt)
+		return NULL;
+
+	return tcp_extopt_to_smc(ext_opt);
+}
+
+/* Account for the SMC option while options of an outgoing segment are sized.
+ *
+ * The option is only requested on segments carrying the SYN flag, and only
+ * when @remaining can hold TCPOLEN_EXP_SMC_BASE_ALIGNED bytes. In that case
+ * OPTION_SMC is set in @opts.
+ *
+ * Returns the number of option bytes claimed, or 0 if the option is not sent.
+ */
+static unsigned int tcp_smc_opt_prepare(struct sk_buff *skb, u8 flags,
+					unsigned int remaining,
+					struct tcp_out_options *opts,
+					const struct sock *sk,
+					struct tcp_extopt_store *store)
+{
+	if (!(flags & TCPHDR_SYN) || remaining < TCPOLEN_EXP_SMC_BASE_ALIGNED)
+		return 0;
+
+	opts->options |= OPTION_SMC;
+
+	return TCPOLEN_EXP_SMC_BASE_ALIGNED;
+}
+
+/* Emit the SMC experimental option into the TCP header at @ptr.
+ *
+ * When OPTION_SMC is flagged in @opts, two 32-bit words are written: the
+ * first holds NOP, NOP, TCPOPT_EXP and the option length, the second holds
+ * TCPOPT_SMC_MAGIC.
+ *
+ * Returns the write position following whatever was emitted.
+ */
+static __be32 *tcp_smc_opt_write(__be32 *ptr, struct sk_buff *skb,
+				 struct tcp_out_options *opts,
+				 struct sock *sk,
+				 struct tcp_extopt_store *store)
+{
+	/* Nothing to emit unless OPTION_SMC was flagged for this segment. */
+	if (likely(!(opts->options & OPTION_SMC)))
+		return ptr;
+
+	ptr[0] = htonl((TCPOPT_NOP << 24) |
+		       (TCPOPT_NOP << 16) |
+		       (TCPOPT_EXP << 8) |
+		       (TCPOLEN_EXP_SMC_BASE));
+	ptr[1] = htonl(TCPOPT_SMC_MAGIC);
+
+	return ptr + 2;
+}
+
+/* Receive-side hook for the SMC option.
+ *
+ * Sets @opt_rx->smc_ok when the segment carries SYN and @opsize is even and
+ * at least TCPOLEN_EXP_SMC_BASE. Anything else leaves @opt_rx untouched.
+ */
+static void tcp_smc_opt_parse(int opsize, const unsigned char *opptr,
+			      const struct sk_buff *skb,
+			      struct tcp_options_received *opt_rx,
+			      struct sock *sk,
+			      struct tcp_extopt_store *store)
+{
+	if (!tcp_hdr(skb)->syn)
+		return;
+
+	/* Reject odd or too-short option lengths. */
+	if ((opsize & 1) || opsize < TCPOLEN_EXP_SMC_BASE)
+		return;
+
+	opt_rx->smc_ok = 1;
+}
+
+/* Record the negotiation result once all received options have been parsed.
+ *
+ * Only acts while @sk is in TCP_SYN_SENT: the per-socket state then mirrors
+ * whether the peer's segment carried the SMC option (@opt->smc_ok).
+ */
+static void tcp_smc_opt_post_process(struct sock *sk,
+				     struct tcp_options_received *opt,
+				     struct tcp_extopt_store *store)
+{
+	struct tcp_smc_opt *state;
+
+	if (sk->sk_state != TCP_SYN_SENT)
+		return;
+
+	state = tcp_extopt_to_smc(store);
+	state->smc_ok = opt->smc_ok ? 1 : 0;
+}
+
+/* Create SMC option state for a connection whose peer offered SMC.
+ *
+ * Returns a newly allocated store with smc_ok set, or NULL when the peer did
+ * not send the SMC option (@opt->smc_ok clear) or the allocation fails.
+ */
+static struct tcp_extopt_store *tcp_smc_opt_copy(struct sock *listener,
+						 struct request_sock *req,
+						 struct tcp_options_received *opt,
+						 struct tcp_extopt_store *store)
+{
+	struct tcp_smc_opt *fresh;
+
+	/* Without the peer's SMC option there is no state to carry over. */
+	if (!opt->smc_ok)
+		return NULL;
+
+	fresh = kzalloc(sizeof(*fresh), GFP_ATOMIC);
+	if (!fresh)
+		return NULL;
+
+	fresh->smc_ok = 1;
+	fresh->store.ops = &tcp_smc_extra_ops;
+
+	return &fresh->store;
+}
+
+/* Release SMC option state; the memory is freed through kfree_rcu() using
+ * the embedded rcu head.
+ */
+static void tcp_smc_opt_destroy(struct tcp_extopt_store *store)
+{
+	struct tcp_smc_opt *victim;
+
+	victim = container_of(store, struct tcp_smc_opt, store);
+	kfree_rcu(victim, rcu);
+}
+
static DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group
* creation
*/
@@ -389,6 +532,7 @@ static int smc_connect_rdma(struct smc_sock *smc)
struct smc_clc_msg_accept_confirm aclc;
int local_contact = SMC_FIRST_CONTACT;
struct smc_ib_device *smcibdev;
+ struct tcp_smc_opt *smc_opt;
struct smc_link *link;
u8 srv_first_contact;
int reason_code = 0;
@@ -397,7 +541,8 @@ static int smc_connect_rdma(struct smc_sock *smc)
sock_hold(&smc->sk); /* sock put in passive closing */
- if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
+ smc_opt = tcp_smc_opt_find(smc->clcsock->sk);
+ if (!smc_opt || !smc_opt->smc_ok) {
/* peer has not signalled SMC-capability */
smc->use_fallback = true;
goto out_connected;
@@ -548,6 +693,7 @@ static int smc_connect_rdma(struct smc_sock *smc)
static int smc_connect(struct socket *sock, struct sockaddr *addr,
int alen, int flags)
{
+ struct tcp_smc_opt *smc_opt;
struct sock *sk = sock->sk;
struct smc_sock *smc;
int rc = -EINVAL;
@@ -561,9 +707,17 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
goto out_err;
smc->addr = addr; /* needed for nonblocking connect */
+ smc_opt = kzalloc(sizeof(*smc_opt), GFP_KERNEL);
+ if (!smc_opt) {
+ rc = -ENOMEM;
+ goto out_err;
+ }
+ smc_opt->store.ops = &tcp_smc_extra_ops;
+
lock_sock(sk);
switch (sk->sk_state) {
default:
+ rc = -EINVAL;
goto out;
case SMC_ACTIVE:
rc = -EISCONN;
@@ -573,8 +727,15 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
break;
}
+ /* We are the only owner of smc->clcsock->sk, so we can be lockless */
+ rc = tcp_register_extopt(&smc_opt->store, smc->clcsock->sk);
+ if (rc) {
+ release_sock(smc->clcsock->sk);
+ kfree(smc_opt);
+ goto out_err;
+ }
+
smc_copy_sock_settings_to_clc(smc);
- tcp_sk(smc->clcsock->sk)->syn_smc = 1;
rc = kernel_connect(smc->clcsock, addr, alen, flags);
if (rc)
goto out;
@@ -768,6 +929,7 @@ static void smc_listen_work(struct work_struct *work)
struct smc_clc_msg_proposal *pclc;
struct smc_ib_device *smcibdev;
struct sockaddr_in peeraddr;
+ struct tcp_smc_opt *smc_opt;
u8 buf[SMC_CLC_MAX_LEN];
struct smc_link *link;
int reason_code = 0;
@@ -777,7 +939,8 @@ static void smc_listen_work(struct work_struct *work)
u8 ibport;
/* check if peer is smc capable */
- if (!tcp_sk(newclcsock->sk)->syn_smc) {
+ smc_opt = tcp_smc_opt_find(newclcsock->sk);
+ if (!smc_opt || !smc_opt->smc_ok) {
new_smc->use_fallback = true;
goto out_connected;
}
@@ -987,10 +1150,18 @@ static void smc_tcp_listen_work(struct work_struct *work)
static int smc_listen(struct socket *sock, int backlog)
{
+ struct tcp_smc_opt *smc_opt;
struct sock *sk = sock->sk;
struct smc_sock *smc;
int rc;
+ smc_opt = kzalloc(sizeof(*smc_opt), GFP_KERNEL);
+ if (!smc_opt) {
+ rc = -ENOMEM;
+ goto out_err;
+ }
+ smc_opt->store.ops = &tcp_smc_extra_ops;
+
smc = smc_sk(sk);
lock_sock(sk);
@@ -1003,11 +1174,19 @@ static int smc_listen(struct socket *sock, int backlog)
sk->sk_max_ack_backlog = backlog;
goto out;
}
+
+ /* We are the only owner of smc->clcsock->sk, so we can be lockless */
+ rc = tcp_register_extopt(&smc_opt->store, smc->clcsock->sk);
+ if (rc) {
+ release_sock(smc->clcsock->sk);
+ kfree(smc_opt);
+ goto out_err;
+ }
+
/* some socket options are handled in core, so we could not apply
* them to the clc socket -- copy smc socket options to clc socket
*/
smc_copy_sock_settings_to_clc(smc);
- tcp_sk(smc->clcsock->sk)->syn_smc = 1;
rc = kernel_listen(smc->clcsock, backlog);
if (rc)
@@ -1022,6 +1201,7 @@ static int smc_listen(struct socket *sock, int backlog)
out:
release_sock(sk);
+out_err:
return rc;
}
@@ -1460,7 +1640,6 @@ static int __init smc_init(void)
goto out_sock;
}
- static_branch_enable(&tcp_have_smc);
return 0;
out_sock:
@@ -1485,7 +1664,6 @@ static void __exit smc_exit(void)
list_del_init(&lgr->list);
smc_lgr_free(lgr); /* free link group */
}
- static_branch_disable(&tcp_have_smc);
smc_ib_unregister_client();
sock_unregister(PF_SMC);
proto_unregister(&smc_proto);
--
2.16.1
Powered by blists - more mailing lists