[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20231017014716.3944813-6-lixiaoyan@google.com>
Date: Tue, 17 Oct 2023 01:47:16 +0000
From: Coco Li <lixiaoyan@...gle.com>
To: Jakub Kicinski <kuba@...nel.org>, Eric Dumazet <edumazet@...gle.com>,
Neal Cardwell <ncardwell@...gle.com>, Mubashir Adnan Qureshi <mubashirq@...gle.com>,
Paolo Abeni <pabeni@...hat.com>
Cc: netdev@...r.kernel.org, Chao Wu <wwchao@...gle.com>, Wei Wang <weiwan@...gle.com>,
Coco Li <lixiaoyan@...gle.com>, David Ahern <dsahern@...nel.org>
Subject: [PATCH v2 net-next 5/5] tcp: reorganize tcp_sock fast path variables
The variables are organized according in the following way:
- TX read-mostly hotpath cache lines
- TXRX read-mostly hotpath cache lines
- RX read-mostly hotpath cache lines
- TX read-write hotpath cache line
- TXRX read-write hotpath cache line
- RX read-write hotpath cache line
Fastpath cachelines end after rcvq_space.
Cache line boundaries are enfored only between read-mostly and
read-write. That is, if read-mostly tx cachelines bleed into
read-mostly txrx cachelines, we do not care. We care about the
boundaries between read and write cachelines because we want
to prevent false sharing.
Fast path variables span cache lines before change: 12
Fast path variables span cache lines after change: 8
Signed-off-by: Coco Li <lixiaoyan@...gle.com>
Suggested-by: Eric Dumazet <edumazet@...gle.com>
Reviewed-by: Wei Wang <weiwan@...gle.com>
Reviewed-by: David Ahern <dsahern@...nel.org>
---
include/linux/tcp.h | 238 +++++++++++++++++++++++---------------------
1 file changed, 124 insertions(+), 114 deletions(-)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index e15452df9804f..5195a0657997a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -175,23 +175,110 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
#define TCP_RMEM_TO_WIN_SCALE 8
struct tcp_sock {
+ /* Caacheline organization can be found documented in
+ * Documentation/networking/net_cachelines/tcp_sock.rst.
+ * Please update the document when adding new fields.
+ */
+
/* inet_connection_sock has to be the first member of tcp_sock */
struct inet_connection_sock inet_conn;
- u16 tcp_header_len; /* Bytes of tcp header to send */
+
+ /* TX read-mostly hotpath cache lines */
+ /* timestamp of last sent data packet (for restart window) */
+ u32 max_window; /* Maximal window ever seen from peer */
+ u32 rcv_ssthresh; /* Current window clamp */
+ u32 reordering; /* Packet reordering metric. */
+ u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */
u16 gso_segs; /* Max number of segs per GSO packet */
+ /* from STCP, retrans queue hinting */
+ struct sk_buff *lost_skb_hint;
+ struct sk_buff *retransmit_skb_hint;
+
+ /* TXRX read-mostly hotpath cache lines */
+ u32 tsoffset; /* timestamp offset */
+ u32 snd_wnd; /* The window we expect to receive */
+ u32 mss_cache; /* Cached effective mss, not including SACKS */
+ u32 snd_cwnd; /* Sending congestion window */
+ u32 prr_out; /* Total number of pkts sent during Recovery. */
+ u32 lost_out; /* Lost packets */
+ u32 sacked_out; /* SACK'd packets */
+ u16 tcp_header_len; /* Bytes of tcp header to send */
+ u8 chrono_type : 2, /* current chronograph type */
+ repair : 1,
+ is_sack_reneg:1, /* in recovery from loss with SACK reneg? */
+ is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
+
+ /* RX read-mostly hotpath cache lines */
+ u32 copied_seq; /* Head of yet unread data */
+ u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */
+ u32 snd_wl1; /* Sequence for window update */
+ u32 tlp_high_seq; /* snd_nxt at the time of TLP */
+ u32 rttvar_us; /* smoothed mdev_max */
+ u32 retrans_out; /* Retransmitted packets out */
+ u16 advmss; /* Advertised MSS */
+ u16 urg_data; /* Saved octet of OOB data and control flags */
+ u32 lost; /* Total data packets lost incl. rexmits */
+ struct minmax rtt_min;
+ /* OOO segments go in this rbtree. Socket lock must be held. */
+ struct rb_root out_of_order_queue;
+ u32 snd_ssthresh; /* Slow start size threshold */
+ /* TX read-write hotpath cache lines */
+ u32 segs_out ____cacheline_aligned; /* RFC4898 tcpEStatsPerfSegsOut
+ * The total number of segments sent.
+ */
+ u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut
+ * total number of data segments sent.
+ */
+ u64 bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut
+ * total number of data bytes sent.
+ */
+ u32 snd_sml; /* Last byte of the most recently transmitted small packet */
+ u32 chrono_start; /* Start time in jiffies of a TCP chrono */
+ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */
+ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */
+ u32 pushed_seq; /* Last pushed seq, required to talk to windows */
+ u32 lsndtime;
+ u32 mdev_us; /* medium deviation */
+ u64 tcp_wstamp_ns; /* departure time for next sent data packet */
+ u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
+ u64 tcp_mstamp; /* most recent packet received/sent */
+ u32 rtt_seq; /* sequence number to update rttvar */
+ struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
+ struct sk_buff *highest_sack; /* skb just after the highest
+ * skb with SACKed bit set
+ * (validity guaranteed only if
+ * sacked_out > 0)
+ */
+ u8 ecn_flags; /* ECN status bits. */
+
+ /* TXRX read-write hotpath cache lines */
/*
* Header prediction flags
* 0x5?10 << 16 + snd_wnd in net byte order
*/
__be32 pred_flags;
-
+ u32 rcv_nxt; /* What we want to receive next */
+ u32 snd_nxt; /* Next sequence we send */
+ u32 snd_una; /* First byte we want an ack for */
+ u32 window_clamp; /* Maximal window to advertise */
+ u32 srtt_us; /* smoothed round trip time << 3 in usecs */
+ u32 packets_out; /* Packets which are "in flight" */
+ u32 snd_up; /* Urgent pointer */
+ u32 delivered; /* Total data packets delivered incl. rexmits */
+ u32 delivered_ce; /* Like the above but only ECE marked packets */
+ u32 app_limited; /* limited until "delivered" reaches this val */
+ u32 rcv_wnd; /* Current receiver window */
/*
- * RFC793 variables by their proper names. This means you can
- * read the code and the spec side by side (and laugh ...)
- * See RFC793 and RFC1122. The RFC writes these in capitals.
+ * Options received (usually on last packet, some only on SYN packets).
*/
- u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived
+ struct tcp_options_received rx_opt;
+ u8 nonagle : 4,/* Disable Nagle algorithm? */
+ rate_app_limited:1; /* rate_{delivered,interval_us} limited? */
+
+ /* RX read-write hotpath cache lines */
+ u64 bytes_received;
+ /* RFC4898 tcpEStatsAppHCThruOctetsReceived
* sum(delta(rcv_nxt)), or how many bytes
* were acked.
*/
@@ -201,45 +288,44 @@ struct tcp_sock {
u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn
* total number of data segments in.
*/
- u32 rcv_nxt; /* What we want to receive next */
- u32 copied_seq; /* Head of yet unread data */
u32 rcv_wup; /* rcv_nxt on last window update sent */
- u32 snd_nxt; /* Next sequence we send */
- u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut
- * The total number of segments sent.
- */
- u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut
- * total number of data segments sent.
- */
- u64 bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut
- * total number of data bytes sent.
- */
+ u32 max_packets_out; /* max packets_out in last window */
+ u32 cwnd_usage_seq; /* right edge of cwnd usage tracking flight */
+ u32 rate_delivered; /* saved rate sample: packets delivered */
+ u32 rate_interval_us; /* saved rate sample: time elapsed */
+ u32 rcv_rtt_last_tsecr;
+ u64 first_tx_mstamp; /* start of window send phase */
+ u64 delivered_mstamp; /* time we reached "delivered" */
u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked
* sum(delta(snd_una)), or how many bytes
* were acked.
*/
+ struct {
+ u32 rtt_us;
+ u32 seq;
+ u64 time;
+ } rcv_rtt_est;
+/* Receiver queue space */
+ struct {
+ u32 space;
+ u32 seq;
+ u64 time;
+ } rcvq_space;
+
+ /* End of Hot Path */
+
+/*
+ * RFC793 variables by their proper names. This means you can
+ * read the code and the spec side by side (and laugh ...)
+ * See RFC793 and RFC1122. The RFC writes these in capitals.
+ */
u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups
* total number of DSACK blocks received
*/
- u32 snd_una; /* First byte we want an ack for */
- u32 snd_sml; /* Last byte of the most recently transmitted small packet */
- u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */
- u32 lsndtime; /* timestamp of last sent data packet (for restart window) */
u32 last_oow_ack_time; /* timestamp of last out-of-window ACK */
u32 compressed_ack_rcv_nxt;
-
- u32 tsoffset; /* timestamp offset */
-
struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
- struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
-
- u32 snd_wl1; /* Sequence for window update */
- u32 snd_wnd; /* The window we expect to receive */
- u32 max_window; /* Maximal window ever seen from peer */
- u32 mss_cache; /* Cached effective mss, not including SACKS */
- u32 window_clamp; /* Maximal window to advertise */
- u32 rcv_ssthresh; /* Current window clamp */
u8 scaling_ratio; /* see tcp_win_from_space() */
/* Information of the most recently (s)acked skb */
struct tcp_rack {
@@ -253,23 +339,16 @@ struct tcp_sock {
dsack_seen:1, /* Whether DSACK seen after last adj */
advanced:1; /* mstamp advanced since last lost marking */
} rack;
- u16 advmss; /* Advertised MSS */
u8 compressed_ack;
u8 dup_ack_counter:2,
tlp_retrans:1, /* TLP is a retransmission */
unused:5;
- u32 chrono_start; /* Start time in jiffies of a TCP chrono */
- u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */
- u8 chrono_type:2, /* current chronograph type */
- rate_app_limited:1, /* rate_{delivered,interval_us} limited? */
+
+ u8 thin_lto : 1,/* Use linear timeouts for thin streams */
+ recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */
fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
- is_sack_reneg:1, /* in recovery from loss with SACK reneg? */
- fastopen_client_fail:2; /* reason why fastopen failed */
- u8 nonagle : 4,/* Disable Nagle algorithm? */
- thin_lto : 1,/* Use linear timeouts for thin streams */
- recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */
- repair : 1,
+ fastopen_client_fail:2, /* reason why fastopen failed */
frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */
u8 repair_queue;
u8 save_syn:2, /* Save headers of SYN packet */
@@ -277,45 +356,19 @@ struct tcp_sock {
syn_fastopen:1, /* SYN includes Fast Open option */
syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
syn_fastopen_ch:1, /* Active TFO re-enabling probe */
- syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
- is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
- u32 tlp_high_seq; /* snd_nxt at the time of TLP */
+ syn_data_acked:1;/* data in SYN is acked by SYN-ACK */
u32 tcp_tx_delay; /* delay (in usec) added to TX packets */
- u64 tcp_wstamp_ns; /* departure time for next sent data packet */
- u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
/* RTT measurement */
- u64 tcp_mstamp; /* most recent packet received/sent */
- u32 srtt_us; /* smoothed round trip time << 3 in usecs */
- u32 mdev_us; /* medium deviation */
u32 mdev_max_us; /* maximal mdev for the last rtt period */
- u32 rttvar_us; /* smoothed mdev_max */
- u32 rtt_seq; /* sequence number to update rttvar */
- struct minmax rtt_min;
- u32 packets_out; /* Packets which are "in flight" */
- u32 retrans_out; /* Retransmitted packets out */
- u32 max_packets_out; /* max packets_out in last window */
- u32 cwnd_usage_seq; /* right edge of cwnd usage tracking flight */
-
- u16 urg_data; /* Saved octet of OOB data and control flags */
- u8 ecn_flags; /* ECN status bits. */
u8 keepalive_probes; /* num of allowed keep alive probes */
- u32 reordering; /* Packet reordering metric. */
u32 reord_seen; /* number of data packet reordering events */
- u32 snd_up; /* Urgent pointer */
-
-/*
- * Options received (usually on last packet, some only on SYN packets).
- */
- struct tcp_options_received rx_opt;
/*
* Slow start and congestion control (see also Nagle, and Karn & Partridge)
*/
- u32 snd_ssthresh; /* Slow start size threshold */
- u32 snd_cwnd; /* Sending congestion window */
u32 snd_cwnd_cnt; /* Linear increase counter */
u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
u32 snd_cwnd_used;
@@ -323,32 +376,10 @@ struct tcp_sock {
u32 prior_cwnd; /* cwnd right before starting loss recovery */
u32 prr_delivered; /* Number of newly delivered packets to
* receiver in Recovery. */
- u32 prr_out; /* Total number of pkts sent during Recovery. */
- u32 delivered; /* Total data packets delivered incl. rexmits */
- u32 delivered_ce; /* Like the above but only ECE marked packets */
- u32 lost; /* Total data packets lost incl. rexmits */
- u32 app_limited; /* limited until "delivered" reaches this val */
- u64 first_tx_mstamp; /* start of window send phase */
- u64 delivered_mstamp; /* time we reached "delivered" */
- u32 rate_delivered; /* saved rate sample: packets delivered */
- u32 rate_interval_us; /* saved rate sample: time elapsed */
-
- u32 rcv_wnd; /* Current receiver window */
- u32 write_seq; /* Tail(+1) of data held in tcp send buffer */
- u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */
- u32 pushed_seq; /* Last pushed seq, required to talk to windows */
- u32 lost_out; /* Lost packets */
- u32 sacked_out; /* SACK'd packets */
struct hrtimer pacing_timer;
struct hrtimer compressed_ack_timer;
- /* from STCP, retrans queue hinting */
- struct sk_buff* lost_skb_hint;
- struct sk_buff *retransmit_skb_hint;
-
- /* OOO segments go in this rbtree. Socket lock must be held. */
- struct rb_root out_of_order_queue;
struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */
/* SACKs data, these 2 need to be together (see tcp_options_write) */
@@ -357,12 +388,6 @@ struct tcp_sock {
struct tcp_sack_block recv_sack_cache[4];
- struct sk_buff *highest_sack; /* skb just after the highest
- * skb with SACKed bit set
- * (validity guaranteed only if
- * sacked_out > 0)
- */
-
int lost_cnt_hint;
u32 prior_ssthresh; /* ssthresh saved at recovery start */
@@ -413,21 +438,6 @@ struct tcp_sock {
u32 rcv_ooopack; /* Received out-of-order packets, for tcpinfo */
-/* Receiver side RTT estimation */
- u32 rcv_rtt_last_tsecr;
- struct {
- u32 rtt_us;
- u32 seq;
- u64 time;
- } rcv_rtt_est;
-
-/* Receiver queue space */
- struct {
- u32 space;
- u32 seq;
- u64 time;
- } rcvq_space;
-
/* TCP-specific MTU probe information. */
struct {
u32 probe_seq_start;
--
2.42.0.655.g421f12c284-goog
Powered by blists - more mailing lists