Message-ID: <2972ad09-291b-0c34-fa35-b7852038b32f@linux.ibm.com>
Date: Mon, 27 Feb 2023 08:58:13 +0100
From: Wenjia Zhang <wenjia@...ux.ibm.com>
To: "D. Wythe" <alibuda@...ux.alibaba.com>, kgraul@...ux.ibm.com,
	jaka@...ux.ibm.com, ast@...nel.org, daniel@...earbox.net,
	andrii@...nel.org
Cc: kuba@...nel.org, davem@...emloft.net, netdev@...r.kernel.org,
	linux-s390@...r.kernel.org, linux-rdma@...r.kernel.org,
	bpf@...r.kernel.org
Subject: Re: [PATCH bpf-next v2 1/2] net/smc: Introduce BPF injection capability for SMC

On 21.02.23 13:18, D. Wythe wrote:
> From: "D. Wythe" <alibuda@...ux.alibaba.com>
>
> This patch attempts to introduce BPF injection capability for SMC.
> As we all know, the SMC protocol is not suitable for all scenarios,
> especially for short-lived connections. However, most applications
> cannot guarantee that there are no such scenarios at all. Therefore,
> apps may need some specific strategies to decide whether to use SMC
> or not; for example, apps can limit the scope of SMC to a specific
> IP address or port.
>
> Based on the consideration of transparent replacement, we hope that apps
> can remain transparent even if they need to formulate some specific
> strategies for using SMC. That is, they do not need to recompile their
> code.
>
> On the other hand, we need to ensure the scalability of strategy
> implementation. Although it would be simple to use socket options or
> sysctl, that would bring more complexity to subsequent expansion.
>
> Fortunately, BPF can address these concerns very well: users can write
> their own strategies in eBPF to choose whether to use SMC or not,
> and it's quite easy for them to modify their strategies in the future.
>
> This patch implements injection capability for SMC via struct_ops.
> In that way, we can add new injection scenarios in the future.
>
> Signed-off-by: D. Wythe <alibuda@...ux.alibaba.com>
> ---
>  include/linux/btf_ids.h           |  15 +++
>  include/net/smc.h                 | 254 ++++++++++++++++++++++++++++++++++++++
>  kernel/bpf/bpf_struct_ops_types.h |   4 +
>  net/Makefile                      |   5 +
>  net/smc/af_smc.c                  |  10 +-
>  net/smc/bpf_smc_struct_ops.c      | 146 ++++++++++++++++++++++
>  net/smc/smc.h                     | 220 ---------------------------------
>  7 files changed, 433 insertions(+), 221 deletions(-)
>  create mode 100644 net/smc/bpf_smc_struct_ops.c
>
> diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
> index 3a4f7cd..25eab1e 100644
> --- a/include/linux/btf_ids.h
> +++ b/include/linux/btf_ids.h
> @@ -264,6 +264,21 @@ enum {
>  	MAX_BTF_TRACING_TYPE,
>  };
>
> +#if IS_ENABLED(CONFIG_SMC)
> +#define BTF_SMC_TYPE_xxx	\
> +	BTF_SMC_TYPE(BTF_SMC_TYPE_SOCK, smc_sock)	\
> +	BTF_SMC_TYPE(BTF_SMC_TYPE_CONNECTION, smc_connection)	\
> +	BTF_SMC_TYPE(BTF_SMC_TYPE_HOST_CURSOR, smc_host_cursor)
> +
> +enum {
> +#define BTF_SMC_TYPE(name, type) name,
> +BTF_SMC_TYPE_xxx
> +#undef BTF_SMC_TYPE
> +MAX_BTF_SMC_TYPE,
> +};
> +extern u32 btf_smc_ids[];
> +#endif
> +
>  extern u32 btf_tracing_ids[];
>  extern u32 bpf_cgroup_btf_id[];
>  extern u32 bpf_local_storage_map_btf_id[];
> diff --git a/include/net/smc.h b/include/net/smc.h
> index 597cb93..912c269 100644
> --- a/include/net/smc.h
> +++ b/include/net/smc.h
> @@ -11,13 +11,16 @@
>  #ifndef _SMC_H
>  #define _SMC_H
>
> +#include <net/inet_connection_sock.h>
>  #include <linux/device.h>
>  #include <linux/spinlock.h>
>  #include <linux/types.h>
>  #include <linux/wait.h>
> +#include <linux/bpf.h>
>  #include "linux/ism.h"
>
>  struct sock;
> +struct smc_diag_conninfo;
>
>  #define SMC_MAX_PNETID_LEN	16	/* Max. length of PNET id */
>
> @@ -90,4 +93,255 @@ struct smcd_dev {
>  	u8 going_away : 1;
>  };
>
> +#if IS_ENABLED(CONFIG_SMC)
> +
> +struct smc_wr_rx_hdr {	/* common prefix part of LLC and CDC to demultiplex */
> +	union {
> +		u8 type;
> +#if defined(__BIG_ENDIAN_BITFIELD)
> +		struct {
> +			u8 llc_version:4,
> +			   llc_type:4;
> +		};
> +#elif defined(__LITTLE_ENDIAN_BITFIELD)
> +		struct {
> +			u8 llc_type:4,
> +			   llc_version:4;
> +		};
> +#endif
> +	};
> +} __aligned(1);
> +
> +struct smc_cdc_conn_state_flags {
> +#if defined(__BIG_ENDIAN_BITFIELD)
> +	u8 peer_done_writing : 1;	/* Sending done indicator */
> +	u8 peer_conn_closed : 1;	/* Peer connection closed indicator */
> +	u8 peer_conn_abort : 1;	/* Abnormal close indicator */
> +	u8 reserved : 5;
> +#elif defined(__LITTLE_ENDIAN_BITFIELD)
> +	u8 reserved : 5;
> +	u8 peer_conn_abort : 1;
> +	u8 peer_conn_closed : 1;
> +	u8 peer_done_writing : 1;
> +#endif
> +};
> +
> +struct smc_cdc_producer_flags {
> +#if defined(__BIG_ENDIAN_BITFIELD)
> +	u8 write_blocked : 1;	/* Writing Blocked, no rx buf space */
> +	u8 urg_data_pending : 1;	/* Urgent Data Pending */
> +	u8 urg_data_present : 1;	/* Urgent Data Present */
> +	u8 cons_curs_upd_req : 1;	/* cursor update requested */
> +	u8 failover_validation : 1;/* message replay due to failover */
> +	u8 reserved : 3;
> +#elif defined(__LITTLE_ENDIAN_BITFIELD)
> +	u8 reserved : 3;
> +	u8 failover_validation : 1;
> +	u8 cons_curs_upd_req : 1;
> +	u8 urg_data_present : 1;
> +	u8 urg_data_pending : 1;
> +	u8 write_blocked : 1;
> +#endif
> +};
> +
> +enum smc_urg_state {
> +	SMC_URG_VALID	= 1,	/* data present */
> +	SMC_URG_NOTYET	= 2,	/* data pending */
> +	SMC_URG_READ	= 3,	/* data was already read */
> +};
> +
> +/* in host byte order */
> +union smc_host_cursor {	/* SMC cursor - an offset in an RMBE */
> +	struct {
> +		u16 reserved;
> +		u16 wrap;	/* window wrap sequence number */
> +		u32 count;	/* cursor (= offset) part */
> +	};
> +#ifdef ATOMIC64_INIT
> +	atomic64_t acurs;	/* for atomic processing */
> +#else
> +	u64 acurs;	/* for atomic processing */
> +#endif
> +} __aligned(8);
> +
> +/* in host byte order, except for flag bitfields in network byte order */
> +struct smc_host_cdc_msg {	/* Connection Data Control message */
> +	struct smc_wr_rx_hdr common;	/* .type = 0xFE */
> +	u8 len;	/* length = 44 */
> +	u16 seqno;	/* connection seq # */
> +	u32 token;	/* alert_token */
> +	union smc_host_cursor prod;	/* producer cursor */
> +	union smc_host_cursor cons;	/* consumer cursor,
> +					 * piggy backed "ack"
> +					 */
> +	struct smc_cdc_producer_flags prod_flags;	/* conn. tx/rx status */
> +	struct smc_cdc_conn_state_flags conn_state_flags;	/* peer conn. status*/
> +	u8 reserved[18];
> +} __aligned(8);
> +
> +struct smc_connection {
> +	struct rb_node alert_node;
> +	struct smc_link_group *lgr;	/* link group of connection */
> +	struct smc_link *lnk;	/* assigned SMC-R link */
> +	u32 alert_token_local;	/* unique conn. id */
> +	u8 peer_rmbe_idx;	/* from tcp handshake */
> +	int peer_rmbe_size;	/* size of peer rx buffer */
> +	atomic_t peer_rmbe_space;/* remaining free bytes in peer
> +				  * rmbe
> +				  */
> +	int rtoken_idx;	/* idx to peer RMB rkey/addr */
> +
> +	struct smc_buf_desc *sndbuf_desc;	/* send buffer descriptor */
> +	struct smc_buf_desc *rmb_desc;	/* RMBE descriptor */
> +	int rmbe_size_short;/* compressed notation */
> +	int rmbe_update_limit;
> +			/* lower limit for consumer
> +			 * cursor update
> +			 */
> +
> +	struct smc_host_cdc_msg local_tx_ctrl;	/* host byte order staging
> +						 * buffer for CDC msg send
> +						 * .prod cf. TCP snd_nxt
> +						 * .cons cf. TCP sends ack
> +						 */
> +	union smc_host_cursor local_tx_ctrl_fin;
> +						/* prod crsr - confirmed by peer
> +						 */
> +	union smc_host_cursor tx_curs_prep;	/* tx - prepared data
> +						 * snd_max..wmem_alloc
> +						 */
> +	union smc_host_cursor tx_curs_sent;	/* tx - sent data
> +						 * snd_nxt ?
> +						 */
> +	union smc_host_cursor tx_curs_fin;	/* tx - confirmed by peer
> +						 * snd-wnd-begin ?
> +						 */
> +	atomic_t sndbuf_space;	/* remaining space in sndbuf */
> +	u16 tx_cdc_seq;	/* sequence # for CDC send */
> +	u16 tx_cdc_seq_fin;	/* sequence # - tx completed */
> +	spinlock_t send_lock;	/* protect wr_sends */
> +	atomic_t cdc_pend_tx_wr; /* number of pending tx CDC wqe
> +				  * - inc when post wqe,
> +				  * - dec on polled tx cqe
> +				  */
> +	wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/
> +	atomic_t tx_pushing;	/* nr_threads trying tx push */
> +	struct delayed_work tx_work;	/* retry of smc_cdc_msg_send */
> +	u32 tx_off;	/* base offset in peer rmb */
> +
> +	struct smc_host_cdc_msg local_rx_ctrl;	/* filled during event_handl.
> +						 * .prod cf. TCP rcv_nxt
> +						 * .cons cf. TCP snd_una
> +						 */
> +	union smc_host_cursor rx_curs_confirmed; /* confirmed to peer
> +						  * source of snd_una ?
> +						  */
> +	union smc_host_cursor urg_curs;	/* points at urgent byte */
> +	enum smc_urg_state urg_state;
> +	bool urg_tx_pend;	/* urgent data staged */
> +	bool urg_rx_skip_pend;
> +			/* indicate urgent oob data
> +			 * read, but previous regular
> +			 * data still pending
> +			 */
> +	char urg_rx_byte;	/* urgent byte */
> +	bool tx_in_release_sock;
> +			/* flush pending tx data in
> +			 * sock release_cb()
> +			 */
> +	atomic_t bytes_to_rcv;	/* arrived data,
> +				 * not yet received
> +				 */
> +	atomic_t splice_pending; /* number of spliced bytes
> +				  * pending processing
> +				  */
> +#ifndef KERNEL_HAS_ATOMIC64
> +	spinlock_t acurs_lock;	/* protect cursors */
> +#endif
> +	struct work_struct close_work;	/* peer sent some closing */
> +	struct work_struct abort_work;	/* abort the connection */
> +	struct tasklet_struct rx_tsklet;	/* Receiver tasklet for SMC-D */
> +	u8 rx_off;	/* receive offset:
> +			 * 0 for SMC-R, 32 for SMC-D
> +			 */
> +	u64 peer_token;	/* SMC-D token of peer */
> +	u8 killed : 1;	/* abnormal termination */
> +	u8 freed : 1;	/* normal termiation */
> +	u8 out_of_sync : 1;	/* out of sync with peer */
> +};
> +
> +struct smc_sock {	/* smc sock container */
> +	struct sock sk;
> +	struct socket *clcsock;	/* internal tcp socket */
> +	void (*clcsk_state_change)(struct sock *sk);
> +			/* original stat_change fct. */
> +	void (*clcsk_data_ready)(struct sock *sk);
> +			/* original data_ready fct. */
> +	void (*clcsk_write_space)(struct sock *sk);
> +			/* original write_space fct. */
> +	void (*clcsk_error_report)(struct sock *sk);
> +			/* original error_report fct. */
> +	struct smc_connection conn;	/* smc connection */
> +	struct smc_sock *listen_smc;	/* listen parent */
> +	struct work_struct connect_work;	/* handle non-blocking connect*/
> +	struct work_struct tcp_listen_work;/* handle tcp socket accepts */
> +	struct work_struct smc_listen_work;/* prepare new accept socket */
> +	struct list_head accept_q;	/* sockets to be accepted */
> +	spinlock_t accept_q_lock;	/* protects accept_q */
> +	bool limit_smc_hs;	/* put constraint on handshake */
> +	bool use_fallback;	/* fallback to tcp */
> +	int fallback_rsn;	/* reason for fallback */
> +	u32 peer_diagnosis;	/* decline reason from peer */
> +	atomic_t queued_smc_hs;	/* queued smc handshakes */
> +	struct inet_connection_sock_af_ops af_ops;
> +	const struct inet_connection_sock_af_ops *ori_af_ops;
> +			/* original af ops */
> +	int sockopt_defer_accept;
> +			/* sockopt TCP_DEFER_ACCEPT
> +			 * value
> +			 */
> +	u8 wait_close_tx_prepared : 1;
> +			/* shutdown wr or close
> +			 * started, waiting for unsent
> +			 * data to be sent
> +			 */
> +	u8 connect_nonblock : 1;
> +			/* non-blocking connect in
> +			 * flight
> +			 */
> +	struct mutex clcsock_release_lock;
> +			/* protects clcsock of a listen
> +			 * socket
> +			 */
> +};
> +
> +#define SMC_SOCK_CLOSED_TIMING	(0)
> +
> +/* BPF struct ops for smc protocol negotiator */
> +struct smc_sock_negotiator_ops {
> +	/* ret for negotiate */
> +	int (*negotiate)(struct smc_sock *sk);
> +
> +	/* info gathering timing */
> +	void (*collect_info)(struct smc_sock *sk, int timing);
> +};
> +
> +/* Query if current sock should go with SMC protocol
> + * SK_PASS for yes, otherwise for no.
> + */
> +int smc_sock_should_select_smc(const struct smc_sock *smc);
> +
> +/* At some specific points in time,
> + * let negotiator can perform info gathering
> + * on target sock.
> + */
> +void smc_sock_perform_collecting_info(const struct smc_sock *smc, int timing);
> +
> +#else
> +struct smc_sock {};
> +struct smc_connection {};
> +struct smc_sock_negotiator_ops {};
> +union smc_host_cursor {};
> +#endif /* CONFIG_SMC */
> +
>  #endif /* _SMC_H */
> diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h
> index 5678a9d..35cdd15 100644
> --- a/kernel/bpf/bpf_struct_ops_types.h
> +++ b/kernel/bpf/bpf_struct_ops_types.h
> @@ -9,4 +9,8 @@
>  #include <net/tcp.h>
>  BPF_STRUCT_OPS_TYPE(tcp_congestion_ops)
>  #endif
> +#if IS_ENABLED(CONFIG_SMC)
> +#include <net/smc.h>
> +BPF_STRUCT_OPS_TYPE(smc_sock_negotiator_ops)
> +#endif
>  #endif
> diff --git a/net/Makefile b/net/Makefile
> index 0914bea..47a4c00 100644
> --- a/net/Makefile
> +++ b/net/Makefile
> @@ -52,6 +52,11 @@ obj-$(CONFIG_TIPC)	+= tipc/
>  obj-$(CONFIG_NETLABEL)	+= netlabel/
>  obj-$(CONFIG_IUCV)	+= iucv/
>  obj-$(CONFIG_SMC)	+= smc/
> +ifneq ($(CONFIG_SMC),)
> +ifeq ($(CONFIG_BPF_SYSCALL),y)
> +obj-y			+= smc/bpf_smc_struct_ops.o
> +endif
> +endif
>  obj-$(CONFIG_RFKILL)	+= rfkill/
>  obj-$(CONFIG_NET_9P)	+= 9p/
>  obj-$(CONFIG_CAIF)	+= caif/
> diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
> index d7a7420..98651b85 100644
> --- a/net/smc/af_smc.c
> +++ b/net/smc/af_smc.c
> @@ -166,6 +166,9 @@ static bool smc_hs_congested(const struct sock *sk)
>  	if (workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq))
>  		return true;
>
> +	if (!smc_sock_should_select_smc(smc))
> +		return true;
> +
>  	return false;
>  }
>
> @@ -320,6 +323,9 @@ static int smc_release(struct socket *sock)
>  	sock_hold(sk); /* sock_put below */
>  	smc = smc_sk(sk);
>
> +	/* trigger info gathering if needed.*/
> +	smc_sock_perform_collecting_info(smc, SMC_SOCK_CLOSED_TIMING);
> +
>  	old_state = sk->sk_state;
>
>  	/* cleanup for a dangling non-blocking connect */
> @@ -1627,7 +1633,9 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
>  	}
>
>  	smc_copy_sock_settings_to_clc(smc);
> -	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
> +	tcp_sk(smc->clcsock->sk)->syn_smc = (smc_sock_should_select_smc(smc) == SK_PASS) ?
> +					    1 : 0;
> +
>  	if (smc->connect_nonblock) {
>  		rc = -EALREADY;
>  		goto out;
> diff --git a/net/smc/bpf_smc_struct_ops.c b/net/smc/bpf_smc_struct_ops.c
> new file mode 100644
> index 0000000..a5989b6
> --- /dev/null
> +++ b/net/smc/bpf_smc_struct_ops.c
> @@ -0,0 +1,146 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/kernel.h>
> +#include <linux/bpf_verifier.h>
> +#include <linux/btf_ids.h>
> +#include <linux/bpf.h>
> +#include <linux/btf.h>
> +#include <net/sock.h>
> +#include <net/smc.h>
> +
> +extern struct bpf_struct_ops smc_sock_negotiator_ops;
> +
> +DEFINE_RWLOCK(smc_sock_negotiator_ops_rwlock);
> +struct smc_sock_negotiator_ops *negotiator;
> +
> +/* convert sk to smc_sock */
> +static inline struct smc_sock *smc_sk(const struct sock *sk)
> +{
> +	return (struct smc_sock *)sk;
> +}
> +
> +/* register ops */
> +static inline void smc_reg_passive_sk_ops(struct smc_sock_negotiator_ops *ops)
> +{
> +	write_lock_bh(&smc_sock_negotiator_ops_rwlock);
> +	negotiator = ops;
> +	write_unlock_bh(&smc_sock_negotiator_ops_rwlock);
> +}
> +
> +/* unregister ops */
> +static inline void smc_unreg_passive_sk_ops(struct smc_sock_negotiator_ops *ops)
> +{
> +	write_lock_bh(&smc_sock_negotiator_ops_rwlock);
> +	if (negotiator == ops)
> +		negotiator = NULL;
> +	write_unlock_bh(&smc_sock_negotiator_ops_rwlock);
> +}
> +
> +int smc_sock_should_select_smc(const struct smc_sock *smc)
> +{
> +	int ret = SK_PASS;
> +
> +	read_lock_bh(&smc_sock_negotiator_ops_rwlock);
> +	if (negotiator && negotiator->negotiate)
> +		ret = negotiator->negotiate((struct smc_sock *)smc);
> +	read_unlock_bh(&smc_sock_negotiator_ops_rwlock);
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(smc_sock_should_select_smc);
> +
> +void smc_sock_perform_collecting_info(const struct smc_sock *smc, int timing)
> +{
> +	read_lock_bh(&smc_sock_negotiator_ops_rwlock);
> +	if (negotiator && negotiator->collect_info)
> +		negotiator->collect_info((struct smc_sock *)smc, timing);
> +	read_unlock_bh(&smc_sock_negotiator_ops_rwlock);
> +}
> +EXPORT_SYMBOL_GPL(smc_sock_perform_collecting_info);
> +
> +/* define global smc ID for smc_struct_ops */
> +BTF_ID_LIST_GLOBAL(btf_smc_ids, MAX_BTF_SMC_TYPE)
> +#define BTF_SMC_TYPE(name, type) BTF_ID(struct, type)
> +BTF_SMC_TYPE_xxx
> +#undef BTF_SMC_TYPE
> +
> +static int bpf_smc_passive_sk_init(struct btf *btf)
> +{
> +	return 0;
> +}
> +
> +/* register ops by BPF */
> +static int bpf_smc_passive_sk_ops_reg(void *kdata)
> +{
> +	struct smc_sock_negotiator_ops *ops = kdata;
> +
> +	/* at least one ops need implement */
> +	if (!ops->negotiate || !ops->collect_info) {
> +		pr_err("At least one ops need implement.\n");
> +		return -EINVAL;
> +	}
> +
> +	smc_reg_passive_sk_ops(ops);
> +	/* always success now */
> +	return 0;
> +}
> +
> +/* unregister ops by BPF */
> +static void bpf_smc_passive_sk_ops_unreg(void *kdata)
> +{
> +	smc_unreg_passive_sk_ops(kdata);
> +}
> +
> +static int bpf_smc_passive_sk_ops_check_member(const struct btf_type *t,
> +					       const struct btf_member *member,
> +					       const struct bpf_prog *prog)
> +{
> +	return 0;
> +}

Please check the right pointer type of check_member:
int (*check_member)(const struct btf_type *t,
		    const struct btf_member *member);

> +
> +static int bpf_smc_passive_sk_ops_init_member(const struct btf_type *t,
> +					      const struct btf_member *member,
> +					      void *kdata, const void *udata)
> +{
> +	return 0;
> +}
> +
> +static const struct bpf_func_proto *
> +smc_passive_sk_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> +{
> +	return bpf_base_func_proto(func_id);
> +}
> +
> +static bool smc_passive_sk_ops_prog_is_valid_access(int off, int size, enum bpf_access_type type,
> +						    const struct bpf_prog *prog,
> +						    struct bpf_insn_access_aux *info)
> +{
> +	return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
> +}
> +
> +static int smc_passive_sk_ops_prog_struct_access(struct bpf_verifier_log *log,
> +						 const struct bpf_reg_state *reg,
> +						 int off, int size, enum bpf_access_type atype,
> +						 u32 *next_btf_id, enum bpf_type_flag *flag)
> +{
> +	/* only allow read now*/
> +	if (atype == BPF_READ)
> +		return btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
> +
> +	return -EACCES;
> +}
> +
> +static const struct bpf_verifier_ops bpf_smc_passive_sk_verifier_ops = {
> +	.get_func_proto = smc_passive_sk_prog_func_proto,
> +	.is_valid_access = smc_passive_sk_ops_prog_is_valid_access,
> +	.btf_struct_access = smc_passive_sk_ops_prog_struct_access
> +};
> +
> +struct bpf_struct_ops bpf_smc_sock_negotiator_ops = {
> +	.verifier_ops = &bpf_smc_passive_sk_verifier_ops,
> +	.init = bpf_smc_passive_sk_init,
> +	.check_member = bpf_smc_passive_sk_ops_check_member,
> +	.init_member = bpf_smc_passive_sk_ops_init_member,
> +	.reg = bpf_smc_passive_sk_ops_reg,
> +	.unreg = bpf_smc_passive_sk_ops_unreg,
> +	.name = "smc_sock_negotiator_ops",
> +};
> diff --git a/net/smc/smc.h b/net/smc/smc.h
> index 5ed765e..349b193 100644
> --- a/net/smc/smc.h
> +++ b/net/smc/smc.h
> @@ -57,232 +57,12 @@ enum smc_state {	/* possible states of an SMC socket */
>
>  struct smc_link_group;
>
> -struct smc_wr_rx_hdr {	/* common prefix part of LLC and CDC to demultiplex */
> -	union {
> -		u8 type;
> -#if defined(__BIG_ENDIAN_BITFIELD)
> -		struct {
> -			u8 llc_version:4,
> -			   llc_type:4;
> -		};
> -#elif defined(__LITTLE_ENDIAN_BITFIELD)
> -		struct {
> -			u8 llc_type:4,
> -			   llc_version:4;
> -		};
> -#endif
> -	};
> -} __aligned(1);
> -
> -struct smc_cdc_conn_state_flags {
> -#if defined(__BIG_ENDIAN_BITFIELD)
> -	u8 peer_done_writing : 1;	/* Sending done indicator */
> -	u8 peer_conn_closed : 1;	/* Peer connection closed indicator */
> -	u8 peer_conn_abort : 1;	/* Abnormal close indicator */
> -	u8 reserved : 5;
> -#elif defined(__LITTLE_ENDIAN_BITFIELD)
> -	u8 reserved : 5;
> -	u8 peer_conn_abort : 1;
> -	u8 peer_conn_closed : 1;
> -	u8 peer_done_writing : 1;
> -#endif
> -};
> -
> -struct smc_cdc_producer_flags {
> -#if defined(__BIG_ENDIAN_BITFIELD)
> -	u8 write_blocked : 1;	/* Writing Blocked, no rx buf space */
> -	u8 urg_data_pending : 1;	/* Urgent Data Pending */
> -	u8 urg_data_present : 1;	/* Urgent Data Present */
> -	u8 cons_curs_upd_req : 1;	/* cursor update requested */
> -	u8 failover_validation : 1;/* message replay due to failover */
> -	u8 reserved : 3;
> -#elif defined(__LITTLE_ENDIAN_BITFIELD)
> -	u8 reserved : 3;
> -	u8 failover_validation : 1;
> -	u8 cons_curs_upd_req : 1;
> -	u8 urg_data_present : 1;
> -	u8 urg_data_pending : 1;
> -	u8 write_blocked : 1;
> -#endif
> -};
> -
> -/* in host byte order */
> -union smc_host_cursor {	/* SMC cursor - an offset in an RMBE */
> -	struct {
> -		u16 reserved;
> -		u16 wrap;	/* window wrap sequence number */
> -		u32 count;	/* cursor (= offset) part */
> -	};
> -#ifdef KERNEL_HAS_ATOMIC64
> -	atomic64_t acurs;	/* for atomic processing */
> -#else
> -	u64 acurs;	/* for atomic processing */
> -#endif
> -} __aligned(8);
> -
> -/* in host byte order, except for flag bitfields in network byte order */
> -struct smc_host_cdc_msg {	/* Connection Data Control message */
> -	struct smc_wr_rx_hdr common;	/* .type = 0xFE */
> -	u8 len;	/* length = 44 */
> -	u16 seqno;	/* connection seq # */
> -	u32 token;	/* alert_token */
> -	union smc_host_cursor prod;	/* producer cursor */
> -	union smc_host_cursor cons;	/* consumer cursor,
> -					 * piggy backed "ack"
> -					 */
> -	struct smc_cdc_producer_flags prod_flags;	/* conn. tx/rx status */
> -	struct smc_cdc_conn_state_flags conn_state_flags;	/* peer conn. status*/
> -	u8 reserved[18];
> -} __aligned(8);
> -
> -enum smc_urg_state {
> -	SMC_URG_VALID	= 1,	/* data present */
> -	SMC_URG_NOTYET	= 2,	/* data pending */
> -	SMC_URG_READ	= 3,	/* data was already read */
> -};
> -
>  struct smc_mark_woken {
>  	bool woken;
>  	void *key;
>  	wait_queue_entry_t wait_entry;
>  };
>
> -struct smc_connection {
> -	struct rb_node alert_node;
> -	struct smc_link_group *lgr;	/* link group of connection */
> -	struct smc_link *lnk;	/* assigned SMC-R link */
> -	u32 alert_token_local;	/* unique conn. id */
> -	u8 peer_rmbe_idx;	/* from tcp handshake */
> -	int peer_rmbe_size;	/* size of peer rx buffer */
> -	atomic_t peer_rmbe_space;/* remaining free bytes in peer
> -				  * rmbe
> -				  */
> -	int rtoken_idx;	/* idx to peer RMB rkey/addr */
> -
> -	struct smc_buf_desc *sndbuf_desc;	/* send buffer descriptor */
> -	struct smc_buf_desc *rmb_desc;	/* RMBE descriptor */
> -	int rmbe_size_short;/* compressed notation */
> -	int rmbe_update_limit;
> -			/* lower limit for consumer
> -			 * cursor update
> -			 */
> -
> -	struct smc_host_cdc_msg local_tx_ctrl;	/* host byte order staging
> -						 * buffer for CDC msg send
> -						 * .prod cf. TCP snd_nxt
> -						 * .cons cf. TCP sends ack
> -						 */
> -	union smc_host_cursor local_tx_ctrl_fin;
> -						/* prod crsr - confirmed by peer
> -						 */
> -	union smc_host_cursor tx_curs_prep;	/* tx - prepared data
> -						 * snd_max..wmem_alloc
> -						 */
> -	union smc_host_cursor tx_curs_sent;	/* tx - sent data
> -						 * snd_nxt ?
> -						 */
> -	union smc_host_cursor tx_curs_fin;	/* tx - confirmed by peer
> -						 * snd-wnd-begin ?
> -						 */
> -	atomic_t sndbuf_space;	/* remaining space in sndbuf */
> -	u16 tx_cdc_seq;	/* sequence # for CDC send */
> -	u16 tx_cdc_seq_fin;	/* sequence # - tx completed */
> -	spinlock_t send_lock;	/* protect wr_sends */
> -	atomic_t cdc_pend_tx_wr; /* number of pending tx CDC wqe
> -				  * - inc when post wqe,
> -				  * - dec on polled tx cqe
> -				  */
> -	wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/
> -	atomic_t tx_pushing;	/* nr_threads trying tx push */
> -	struct delayed_work tx_work;	/* retry of smc_cdc_msg_send */
> -	u32 tx_off;	/* base offset in peer rmb */
> -
> -	struct smc_host_cdc_msg local_rx_ctrl;	/* filled during event_handl.
> -						 * .prod cf. TCP rcv_nxt
> -						 * .cons cf. TCP snd_una
> -						 */
> -	union smc_host_cursor rx_curs_confirmed; /* confirmed to peer
> -						  * source of snd_una ?
> -						  */
> -	union smc_host_cursor urg_curs;	/* points at urgent byte */
> -	enum smc_urg_state urg_state;
> -	bool urg_tx_pend;	/* urgent data staged */
> -	bool urg_rx_skip_pend;
> -			/* indicate urgent oob data
> -			 * read, but previous regular
> -			 * data still pending
> -			 */
> -	char urg_rx_byte;	/* urgent byte */
> -	bool tx_in_release_sock;
> -			/* flush pending tx data in
> -			 * sock release_cb()
> -			 */
> -	atomic_t bytes_to_rcv;	/* arrived data,
> -				 * not yet received
> -				 */
> -	atomic_t splice_pending; /* number of spliced bytes
> -				  * pending processing
> -				  */
> -#ifndef KERNEL_HAS_ATOMIC64
> -	spinlock_t acurs_lock;	/* protect cursors */
> -#endif
> -	struct work_struct close_work;	/* peer sent some closing */
> -	struct work_struct abort_work;	/* abort the connection */
> -	struct tasklet_struct rx_tsklet;	/* Receiver tasklet for SMC-D */
> -	u8 rx_off;	/* receive offset:
> -			 * 0 for SMC-R, 32 for SMC-D
> -			 */
> -	u64 peer_token;	/* SMC-D token of peer */
> -	u8 killed : 1;	/* abnormal termination */
> -	u8 freed : 1;	/* normal termiation */
> -	u8 out_of_sync : 1;	/* out of sync with peer */
> -};
> -
> -struct smc_sock {	/* smc sock container */
> -	struct sock sk;
> -	struct socket *clcsock;	/* internal tcp socket */
> -	void (*clcsk_state_change)(struct sock *sk);
> -			/* original stat_change fct. */
> -	void (*clcsk_data_ready)(struct sock *sk);
> -			/* original data_ready fct. */
> -	void (*clcsk_write_space)(struct sock *sk);
> -			/* original write_space fct. */
> -	void (*clcsk_error_report)(struct sock *sk);
> -			/* original error_report fct. */
> -	struct smc_connection conn;	/* smc connection */
> -	struct smc_sock *listen_smc;	/* listen parent */
> -	struct work_struct connect_work;	/* handle non-blocking connect*/
> -	struct work_struct tcp_listen_work;/* handle tcp socket accepts */
> -	struct work_struct smc_listen_work;/* prepare new accept socket */
> -	struct list_head accept_q;	/* sockets to be accepted */
> -	spinlock_t accept_q_lock;	/* protects accept_q */
> -	bool limit_smc_hs;	/* put constraint on handshake */
> -	bool use_fallback;	/* fallback to tcp */
> -	int fallback_rsn;	/* reason for fallback */
> -	u32 peer_diagnosis;	/* decline reason from peer */
> -	atomic_t queued_smc_hs;	/* queued smc handshakes */
> -	struct inet_connection_sock_af_ops af_ops;
> -	const struct inet_connection_sock_af_ops *ori_af_ops;
> -			/* original af ops */
> -	int sockopt_defer_accept;
> -			/* sockopt TCP_DEFER_ACCEPT
> -			 * value
> -			 */
> -	u8 wait_close_tx_prepared : 1;
> -			/* shutdown wr or close
> -			 * started, waiting for unsent
> -			 * data to be sent
> -			 */
> -	u8 connect_nonblock : 1;
> -			/* non-blocking connect in
> -			 * flight
> -			 */
> -	struct mutex clcsock_release_lock;
> -			/* protects clcsock of a listen
> -			 * socket
> -			 * */
> -};
> -
>  static inline struct smc_sock *smc_sk(const struct sock *sk)
>  {
>  	return (struct smc_sock *)sk;
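To make the intended usage concrete: with this series, a user-defined strategy would be written as an eBPF struct_ops program, in the same way tcp_congestion_ops implementations are. The following is a minimal, illustrative sketch only, not part of the patch: the SEC() names follow the usual libbpf struct_ops conventions, the "sample_ops" map name and the port-5001 policy are invented for the example, and it assumes a vmlinux.h generated from a kernel carrying this series (so that struct smc_sock and enum sk_action are present in BTF).

// SPDX-License-Identifier: GPL-2.0
/* smc_negotiator.bpf.c - hypothetical SMC negotiator example */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_endian.h>

/* Only select SMC for connections whose peer port is 5001;
 * everything else falls back to plain TCP. Field reads go through
 * the BTF-checked ctx access set up by the verifier_ops above.
 */
SEC("struct_ops/negotiate")
int BPF_PROG(negotiate, struct smc_sock *smc)
{
	struct sock *clcsk;

	if (!smc->clcsock)
		return SK_DROP;
	clcsk = smc->clcsock->sk;
	if (!clcsk)
		return SK_DROP;
	if (bpf_ntohs(clcsk->__sk_common.skc_dport) == 5001)
		return SK_PASS;
	return SK_DROP;
}

SEC("struct_ops/collect_info")
void BPF_PROG(collect_info, struct smc_sock *smc, int timing)
{
	/* invoked at SMC_SOCK_CLOSED_TIMING from smc_release();
	 * a real negotiator could update maps with per-flow stats here
	 */
}

SEC(".struct_ops")
struct smc_sock_negotiator_ops sample_ops = {
	.negotiate	= (void *)negotiate,
	.collect_info	= (void *)collect_info,
};

char _license[] SEC("license") = "GPL";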
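Loading and attaching such an object would follow the standard libbpf struct_ops flow; a rough userspace sketch, again using the hypothetical file and map names from above:

// SPDX-License-Identifier: GPL-2.0
/* load_negotiator.c - keep the sample negotiator registered until killed */
#include <stdio.h>
#include <unistd.h>
#include <bpf/libbpf.h>

int main(void)
{
	struct bpf_object *obj;
	struct bpf_link *link;
	struct bpf_map *map;

	obj = bpf_object__open_file("smc_negotiator.bpf.o", NULL);
	if (!obj || bpf_object__load(obj)) {
		fprintf(stderr, "failed to open/load BPF object\n");
		return 1;
	}

	/* the map name matches the SEC(".struct_ops") variable */
	map = bpf_object__find_map_by_name(obj, "sample_ops");
	link = map ? bpf_map__attach_struct_ops(map) : NULL;
	if (!link) {
		fprintf(stderr, "failed to attach struct_ops\n");
		bpf_object__close(obj);
		return 1;
	}

	pause();	/* negotiator stays registered while the link is held */

	bpf_link__destroy(link);
	bpf_object__close(obj);
	return 0;
}

Destroying the link invokes the unreg callback (bpf_smc_passive_sk_ops_unreg above), after which smc_sock_should_select_smc() falls back to its SK_PASS default.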