Message-Id: <1584524289-24187-12-git-send-email-ilpo.jarvinen@helsinki.fi>
Date: Wed, 18 Mar 2020 11:37:53 +0200
From: Ilpo Järvinen <ilpo.jarvinen@...sinki.fi>
To: netdev@...r.kernel.org
Cc: Yuchung Cheng <ycheng@...gle.com>,
Neal Cardwell <ncardwell@...gle.com>,
Eric Dumazet <eric.dumazet@...il.com>,
Olivier Tilmans <olivier.tilmans@...ia-bell-labs.com>
Subject: [RFC PATCH 12/28] tcp: AccECN core
From: Ilpo Järvinen <ilpo.jarvinen@...helsinki.fi>
Accurate ECN (AccECN) core, without negotiation and without the AccECN option.
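The ACE field (AE, CWR, ECE) carries the receiver's count of CE-marked
packets modulo 8, offset by a non-zero initial value so that middlebox
mangling can be detected. As a minimal user-space sketch of the decode
step the data sender performs (illustrative names only, not kernel code):

	#include <stdint.h>
	#include <stdio.h>

	#define ACE_MASK	0x7	/* ACE is a 3-bit counter */
	#define ACE_INIT_OFFSET	5	/* counters do not start at zero */

	/* Reconstruct the CE delta from a received ACE value and the
	 * local delivered_ce count; wraps modulo 8 like the wire field.
	 */
	static uint32_t ace_delta(uint8_t ace, uint32_t delivered_ce)
	{
		uint32_t corrected = ace - ACE_INIT_OFFSET;

		return (corrected - delivered_ce) & ACE_MASK;
	}

	int main(void)
	{
		/* delivered_ce = 14; two new CE marks make the wire ACE
		 * (14 + 2 + 5) % 8 == 5, which decodes back to delta 2.
		 */
		printf("delta = %u\n", ace_delta(5, 14));
		return 0;
	}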
Signed-off-by: Olivier Tilmans <olivier.tilmans@...ia-bell-labs.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@...helsinki.fi>
---
include/linux/tcp.h | 2 +
include/net/tcp.h | 29 ++++++++++++
net/ipv4/tcp.c | 1 +
net/ipv4/tcp_input.c | 108 +++++++++++++++++++++++++++++++++++-------
net/ipv4/tcp_output.c | 22 ++++++++-
5 files changed, 144 insertions(+), 18 deletions(-)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 3dc964010fef..9bdf67dd0d1d 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -315,6 +315,8 @@ struct tcp_sock {
u32 prr_out; /* Total number of pkts sent during Recovery. */
u32 delivered; /* Total data packets delivered incl. rexmits */
u32 delivered_ce; /* Like the above but only ECE marked packets */
+ u32 received_ce; /* Like the above but for received CE marked packets */
+ u32 received_ce_tx; /* Highest received_ce value sent in the ACE field */
u32 lost; /* Total data packets lost incl. rexmits */
u32 app_limited; /* limited until "delivered" reaches this val */
u64 first_tx_mstamp; /* start of window send phase */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f4ac4c029215..ee938066fd8c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -404,6 +404,16 @@ static inline void tcp_ecn_mode_set(struct tcp_sock *tp, u8 mode)
tp->ecn_flags |= mode;
}
+static inline u8 tcp_accecn_ace(const struct tcphdr *th)
+{
+ return (th->ae << 2) | (th->cwr << 1) | th->ece;
+}
+
+static inline u32 tcp_accecn_ace_deficit(const struct tcp_sock *tp)
+{
+ return tp->received_ce - tp->received_ce_tx;
+}
+
enum tcp_tw_status {
TCP_TW_SUCCESS = 0,
TCP_TW_RST = 1,
@@ -833,6 +843,20 @@ static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
#define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)
#define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR)
+#define TCP_ACCECN_CEP_ACE_MASK 0x7
+#define TCP_ACCECN_ACE_MAX_DELTA 6
+
+/* To avoid/detect middlebox interference, not all counters start at 0.
+ * See draft-ietf-tcpm-accurate-ecn for the latest values.
+ */
+#define TCP_ACCECN_CEP_INIT_OFFSET 5
+
+static inline void tcp_accecn_init_counters(struct tcp_sock *tp)
+{
+ tp->received_ce = 0;
+ tp->received_ce_tx = 0;
+}
+
/* This is what the send packet queuing engine uses to pass
* TCP per-packet control information to the transmission code.
* We also store the host-order sequence numbers in here too.
@@ -1528,7 +1552,12 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt,
static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
{
+ u32 ace = tcp_ecn_mode_accecn(tp) ?
+ ((tp->delivered_ce + TCP_ACCECN_CEP_INIT_OFFSET) &
+ TCP_ACCECN_CEP_ACE_MASK) : 0;
+
tp->pred_flags = htonl((tp->tcp_header_len << 26) |
+ (ace << 22) |
ntohl(TCP_FLAG_ACK) |
snd_wnd);
}
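[Annotation, not part of the patch: pred_flags mirrors the flags word
of the TCP header (data offset, flags, window), so in AccECN mode the
expected incoming ACE value must be folded in for header prediction to
keep hitting the fast path. A rough map of the word built above:

	bits 31..28  data offset (tcp_header_len << 26; bytes -> 32-bit words)
	bit  24      AE  \
	bit  23      CWR  } expected ACE = (delivered_ce + 5) & 7, via ace << 22
	bit  22      ECE /
	bit  20      ACK (TCP_FLAG_ACK)
	bits 15..0   expected receive window	]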
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index fbf365dd51e4..2ee1e4794c7d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2624,6 +2624,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->window_clamp = 0;
tp->delivered = 0;
tp->delivered_ce = 0;
+ tcp_accecn_init_counters(tp);
tcp_set_ca_state(sk, TCP_CA_Open);
tp->is_sack_reneg = 0;
tcp_clear_retrans(tp);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 59078fa2240d..65bbfadbee67 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -249,6 +249,7 @@ static bool tcp_in_quickack_mode(struct sock *sk)
static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
{
+ /* Do not set CWR if in AccECN mode! */
if (tcp_ecn_mode_rfc3168(tp))
tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}
@@ -257,7 +258,7 @@ static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tcp_hdr(skb)->cwr) {
+ if (tcp_ecn_mode_rfc3168(tp) && tcp_hdr(skb)->cwr) {
tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
/* If the sender is telling us it has entered CWR, then its
@@ -293,17 +294,16 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb)
if (tcp_ca_needs_ecn(sk))
tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
- if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
+ if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR) &&
+ tcp_ecn_mode_rfc3168(tp)) {
/* Better not delay acks, sender can have a very low cwnd */
tcp_enter_quickack_mode(sk, 2);
tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
}
- tp->ecn_flags |= TCP_ECN_SEEN;
break;
default:
if (tcp_ca_needs_ecn(sk))
tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
- tp->ecn_flags |= TCP_ECN_SEEN;
break;
}
}
@@ -320,11 +320,43 @@ static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
}
-static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
+static u32 tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
{
if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp))
- return true;
- return false;
+ return 1;
+ return 0;
+}
+
+/* Returns the ECN CE delta */
+static u32 tcp_accecn_process(struct tcp_sock *tp, const struct sk_buff *skb,
+ u32 delivered_pkts, int flag)
+{
+ u32 delta, safe_delta;
+ u32 corrected_ace;
+
+ /* Reordered ACK? (...or too uncertain: no forward progress via data or timestamps) */
+ if (!(flag & (FLAG_FORWARD_PROGRESS|FLAG_TS_PROGRESS)))
+ return 0;
+
+ if (!(flag & FLAG_SLOWPATH)) {
+ /* Fast path: the ACE field matched pred_flags, so the delta is
+ * zero unless the 3-bit counter wrapped on a large cumulative ACK
+ */
+ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK)
+ return 0;
+ }
+
+ /* ACE field is not available during handshake */
+ if (flag & FLAG_SYN_ACKED)
+ return 0;
+
+ corrected_ace = tcp_accecn_ace(tcp_hdr(skb)) - TCP_ACCECN_CEP_INIT_OFFSET;
+ delta = (corrected_ace - tp->delivered_ce) & TCP_ACCECN_CEP_ACE_MASK;
+ if (delivered_pkts < TCP_ACCECN_CEP_ACE_MASK)
+ return delta;
+
+ safe_delta = delivered_pkts -
+ ((delivered_pkts - delta) & TCP_ACCECN_CEP_ACE_MASK);
+
+ return safe_delta;
}
/* Buffer size and advertised window tuning.
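[Worked example for tcp_accecn_process() above, with made-up numbers:
suppose an ACK newly acknowledges delivered_pkts = 12 segments and the
ACE field decodes to delta = 3. The true CE delta is then 3 or 11 due
to the mod-8 ambiguity, and safe_delta resolves it to the largest
candidate that still fits into delivered_pkts:

	safe_delta = 12 - ((12 - 3) & 7) = 12 - 1 = 11	]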
@@ -3590,7 +3622,8 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit)
}
/* Returns the number of packets newly acked or sacked by the current ACK */
-static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
+static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered,
+ u32 ecn_count)
{
const struct net *net = sock_net(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -3598,10 +3631,18 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
delivered = tp->delivered - prior_delivered;
NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
- if (flag & FLAG_ECE) {
- tp->delivered_ce += delivered;
- NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
+
+ if (ecn_count) {
+ if (tcp_ecn_mode_rfc3168(tp))
+ ecn_count = delivered;
+
+ tp->delivered_ce += ecn_count;
+ if (tcp_ecn_mode_accecn(tp) &&
+ tcp_accecn_ace_deficit(tp) >= TCP_ACCECN_ACE_MAX_DELTA)
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+ NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, ecn_count);
}
+
return delivered;
}
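[As I read the deficit check above: once received_ce runs
TCP_ACCECN_ACE_MAX_DELTA (6) ahead of what was last echoed in the ACE
field, an immediate ACK is forced so that the 3-bit counter cannot
wrap unseen (a deficit of 8 would look identical to a deficit of 0 on
the wire).]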
@@ -3621,6 +3662,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
u32 delivered = tp->delivered;
u32 lost = tp->lost;
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
+ u32 ecn_count = 0; /* CE count reported via ECE or the AccECN ACE field */
u32 prior_fack;
sack_state.first_sackt = 0;
@@ -3690,8 +3732,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (TCP_SKB_CB(skb)->sacked)
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_state);
-
- if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb)))
+ ecn_count = tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb));
+ if (ecn_count > 0)
flag |= FLAG_ECE;
}
@@ -3709,6 +3751,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_rack_update_reo_wnd(sk, &rs);
+ if (tcp_ecn_mode_accecn(tp)) {
+ ecn_count = tcp_accecn_process(tp, skb,
+ tp->delivered - delivered, flag);
+ if (ecn_count > 0)
+ flag |= FLAG_ECE;
+ }
+
tcp_in_ack_event(sk, flag);
if (tp->tlp_high_seq)
@@ -3731,7 +3780,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
sk_dst_confirm(sk);
- delivered = tcp_newly_delivered(sk, delivered, flag);
+ delivered = tcp_newly_delivered(sk, delivered, ecn_count);
+
lost = tp->lost - lost; /* freshly marked lost */
rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
@@ -3740,12 +3790,18 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
return 1;
no_queue:
+ if (tcp_ecn_mode_accecn(tp)) {
+ ecn_count = tcp_accecn_process(tp, skb,
+ tp->delivered - delivered, flag);
+ if (ecn_count > 0)
+ flag |= FLAG_ECE;
+ }
tcp_in_ack_event(sk, flag);
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK) {
tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
- tcp_newly_delivered(sk, delivered, flag);
+ tcp_newly_delivered(sk, delivered, ecn_count);
}
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
@@ -3766,7 +3822,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
&sack_state);
tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
- tcp_newly_delivered(sk, delivered, flag);
+ tcp_newly_delivered(sk, delivered, ecn_count);
tcp_xmit_recovery(sk, rexmit);
}
@@ -5425,6 +5481,21 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
}
}
+/* Updates Accurate ECN received counters from the received IP ECN field */
+static void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK;
+ u8 is_ce = INET_ECN_is_ce(ecnfield);
+
+ if (!INET_ECN_is_not_ect(ecnfield)) {
+ tp->ecn_flags |= TCP_ECN_SEEN;
+
+ /* ACE counter tracks *all* segments including pure acks */
+ tp->received_ce += is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+ }
+}
+
/* Accept RST for rcv_nxt - 1 after a FIN.
* When tcp connections are abruptly terminated from Mac OSX (via ^C), a
* FIN is sent followed by a RST packet. The RST is sent with the same
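[Note on tcp_ecn_received_counters() above: the ACE counter counts
CE-marked packets, so a GRO-aggregated skb must bump received_ce by
its full segment count. A self-contained sketch of that accounting
(local constants mirror the kernel's INET_ECN_* values):

	#include <stdint.h>

	#define ECN_MASK	0x3	/* low two bits of the IP DS field */
	#define ECN_CE		0x3	/* Congestion Experienced */

	static uint32_t ce_increment(uint8_t dsfield, uint16_t gso_segs)
	{
		if ((dsfield & ECN_MASK) != ECN_CE)
			return 0;
		/* non-GSO skbs have gso_segs == 0 but carry one segment */
		return gso_segs ? gso_segs : 1;
	}
]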
@@ -5656,6 +5727,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
tp->rcv_nxt == tp->rcv_wup)
flag |= __tcp_replace_ts_recent(tp, tstamp_delta);
+ tcp_ecn_received_counters(sk, skb);
+
/* We know that such packets are checksummed
* on entry.
*/
@@ -5697,6 +5770,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
/* Bulk data transfer: receiver */
__skb_pull(skb, tcp_header_len);
+ tcp_ecn_received_counters(sk, skb);
eaten = tcp_queue_rcv(sk, skb, &fragstolen);
tcp_event_data_recv(sk, skb);
@@ -5733,6 +5807,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
return;
step5:
+ tcp_ecn_received_counters(sk, skb);
+
if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
goto discard;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 71a96983987d..a1414d1a8ef1 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -360,6 +360,18 @@ tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
th->ece = 1;
}
+static void tcp_accecn_set_ace(struct tcphdr *th, struct tcp_sock *tp)
+{
+ u32 wire_ace;
+
+ tp->received_ce_tx += min_t(u32, tcp_accecn_ace_deficit(tp),
+ TCP_ACCECN_ACE_MAX_DELTA);
+ wire_ace = tp->received_ce_tx + TCP_ACCECN_CEP_INIT_OFFSET;
+ th->ece = !!(wire_ace & 0x1);
+ th->cwr = !!(wire_ace & 0x2);
+ th->ae = !!(wire_ace & 0x4);
+}
+
/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
* be sent.
*/
@@ -368,11 +380,17 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tcp_ecn_mode_rfc3168(tp)) {
+ if (!tcp_ecn_mode_any(tp))
+ return;
+
+ INET_ECN_xmit(sk);
+ if (tcp_ecn_mode_accecn(tp)) {
+ tcp_accecn_set_ace(th, tp);
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN;
+ } else {
/* Not-retransmitted data segment: set ECT and inject CWR. */
if (skb->len != tcp_header_len &&
!before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
- INET_ECN_xmit(sk);
if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
th->cwr = 1;
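[Worked example for tcp_accecn_set_ace() above, with made-up numbers:
with received_ce = 14 and received_ce_tx = 11, the deficit is 3 (at
most TCP_ACCECN_ACE_MAX_DELTA), so received_ce_tx advances to 14 and
wire_ace = 14 + 5 = 19. Only the low three bits go on the wire,
ae:cwr:ece = 0:1:1, i.e. 19 & 7 == 3, which the peer decodes back to
the same counter value modulo 8.]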
--
2.20.1