lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20241204074710.990092-10-dhowells@redhat.com>
Date: Wed,  4 Dec 2024 07:46:37 +0000
From: David Howells <dhowells@...hat.com>
To: netdev@...r.kernel.org
Cc: David Howells <dhowells@...hat.com>,
	Marc Dionne <marc.dionne@...istor.com>,
	Yunsheng Lin <linyunsheng@...wei.com>,
	"David S. Miller" <davem@...emloft.net>,
	Eric Dumazet <edumazet@...gle.com>,
	Jakub Kicinski <kuba@...nel.org>,
	Paolo Abeni <pabeni@...hat.com>,
	linux-afs@...ts.infradead.org,
	linux-kernel@...r.kernel.org
Subject: [PATCH net-next v2 09/39] rxrpc: Implement path-MTU probing using padded PING ACKs (RFC8899)

Implement path-MTU probing (along the lines of RFC8899) by padding some of
the PING ACKs we send.  PING ACKs get their own individual responses quite
apart from the acking of data (though, as ACKs, they fulfil that role
also).

The probing concentrates on packet sizes that correspond how many
subpackets can be stuffed inside a jumbo packet as jumbo DATA packets are
just aggregations of individual DATA packets and can be split easily for
retransmission purposes.

If we want to perform probing, we advertise this by setting the maximum
number of jumbo subpackets to 0 in the ack trailer when we send an ACK and
see if the peer is also advertising the service.  This is interpreted by
non-supporting Rx stacks as an indication that jumbo packets aren't
supported.

The MTU sizes advertised in the ACK trailer AF_RXRPC transmits are pegged
at a maximum of 1444 unless pmtud is supported by both sides.

Signed-off-by: David Howells <dhowells@...hat.com>
cc: Marc Dionne <marc.dionne@...istor.com>
cc: "David S. Miller" <davem@...emloft.net>
cc: Eric Dumazet <edumazet@...gle.com>
cc: Jakub Kicinski <kuba@...nel.org>
cc: Paolo Abeni <pabeni@...hat.com>
cc: linux-afs@...ts.infradead.org
cc: netdev@...r.kernel.org
---
 include/trace/events/rxrpc.h | 124 +++++++++++++++++++++++++++++++++++
 net/rxrpc/ar-internal.h      |  25 +++++--
 net/rxrpc/call_event.c       |   5 ++
 net/rxrpc/conn_event.c       |  17 +++--
 net/rxrpc/conn_object.c      |   6 ++
 net/rxrpc/input.c            |  26 +++++---
 net/rxrpc/io_thread.c        |   6 ++
 net/rxrpc/misc.c             |   4 +-
 net/rxrpc/output.c           |  67 +++++++++++++++----
 net/rxrpc/peer_event.c       | 104 +++++++++++++++++++++++++++--
 net/rxrpc/peer_object.c      |  24 +++++--
 net/rxrpc/proc.c             |   9 +--
 net/rxrpc/protocol.h         |  13 ++--
 net/rxrpc/sysctl.c           |   6 +-
 net/rxrpc/txbuf.c            |   3 +-
 15 files changed, 382 insertions(+), 57 deletions(-)

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index d86b5f07d292..9dcadad88e76 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -364,6 +364,7 @@
 	EM(rxrpc_propose_ack_ping_for_lost_ack,	"LostAck") \
 	EM(rxrpc_propose_ack_ping_for_lost_reply, "LostRpl") \
 	EM(rxrpc_propose_ack_ping_for_0_retrans, "0-Retrn") \
+	EM(rxrpc_propose_ack_ping_for_mtu_probe, "MTUProb") \
 	EM(rxrpc_propose_ack_ping_for_old_rtt,	"OldRtt ") \
 	EM(rxrpc_propose_ack_ping_for_params,	"Params ") \
 	EM(rxrpc_propose_ack_ping_for_rtt,	"Rtt    ") \
@@ -478,6 +479,11 @@
 	EM(rxrpc_txbuf_see_send_more,		"SEE SEND+  ")	\
 	E_(rxrpc_txbuf_see_unacked,		"SEE UNACKED")
 
+#define rxrpc_pmtud_reduce_traces \
+	EM(rxrpc_pmtud_reduce_ack,		"Ack  ")	\
+	EM(rxrpc_pmtud_reduce_icmp,		"Icmp ")	\
+	E_(rxrpc_pmtud_reduce_route,		"Route")
+
 /*
  * Generate enums for tracing information.
  */
@@ -498,6 +504,7 @@ enum rxrpc_congest_change	{ rxrpc_congest_changes } __mode(byte);
 enum rxrpc_conn_trace		{ rxrpc_conn_traces } __mode(byte);
 enum rxrpc_local_trace		{ rxrpc_local_traces } __mode(byte);
 enum rxrpc_peer_trace		{ rxrpc_peer_traces } __mode(byte);
+enum rxrpc_pmtud_reduce_trace	{ rxrpc_pmtud_reduce_traces } __mode(byte);
 enum rxrpc_propose_ack_outcome	{ rxrpc_propose_ack_outcomes } __mode(byte);
 enum rxrpc_propose_ack_trace	{ rxrpc_propose_ack_traces } __mode(byte);
 enum rxrpc_receive_trace	{ rxrpc_receive_traces } __mode(byte);
@@ -534,6 +541,7 @@ rxrpc_congest_changes;
 rxrpc_congest_modes;
 rxrpc_conn_traces;
 rxrpc_local_traces;
+rxrpc_pmtud_reduce_traces;
 rxrpc_propose_ack_traces;
 rxrpc_receive_traces;
 rxrpc_recvmsg_traces;
@@ -2040,6 +2048,122 @@ TRACE_EVENT(rxrpc_sack,
 		      __entry->sack)
 	    );
 
+TRACE_EVENT(rxrpc_pmtud_tx,
+	    TP_PROTO(struct rxrpc_call *call),
+
+	    TP_ARGS(call),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,	peer_debug_id)
+		    __field(unsigned int,	call_debug_id)
+		    __field(rxrpc_serial_t,	ping_serial)
+		    __field(unsigned short,	pmtud_trial)
+		    __field(unsigned short,	pmtud_good)
+		    __field(unsigned short,	pmtud_bad)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->peer_debug_id = call->peer->debug_id;
+		    __entry->call_debug_id = call->debug_id;
+		    __entry->ping_serial = call->conn->pmtud_probe;
+		    __entry->pmtud_trial = call->peer->pmtud_trial;
+		    __entry->pmtud_good = call->peer->pmtud_good;
+		    __entry->pmtud_bad = call->peer->pmtud_bad;
+			   ),
+
+	    TP_printk("P=%08x c=%08x pr=%08x %u-%u-%u",
+		      __entry->peer_debug_id,
+		      __entry->call_debug_id,
+		      __entry->ping_serial,
+		      __entry->pmtud_good,
+		      __entry->pmtud_trial,
+		      __entry->pmtud_bad)
+	    );
+
+TRACE_EVENT(rxrpc_pmtud_rx,
+	    TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t resp_serial),
+
+	    TP_ARGS(conn, resp_serial),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,	peer_debug_id)
+		    __field(unsigned int,	call_debug_id)
+		    __field(rxrpc_serial_t,	ping_serial)
+		    __field(rxrpc_serial_t,	resp_serial)
+		    __field(unsigned short,	max_data)
+		    __field(u8,			jumbo_max)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->peer_debug_id = conn->peer->debug_id;
+		    __entry->call_debug_id = conn->pmtud_call;
+		    __entry->ping_serial = conn->pmtud_probe;
+		    __entry->resp_serial = resp_serial;
+		    __entry->max_data = conn->peer->max_data;
+		    __entry->jumbo_max = conn->peer->pmtud_jumbo;
+			   ),
+
+	    TP_printk("P=%08x c=%08x pr=%08x rr=%08x max=%u jm=%u",
+		      __entry->peer_debug_id,
+		      __entry->call_debug_id,
+		      __entry->ping_serial,
+		      __entry->resp_serial,
+		      __entry->max_data,
+		      __entry->jumbo_max)
+	    );
+
+TRACE_EVENT(rxrpc_pmtud_lost,
+	    TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t resp_serial),
+
+	    TP_ARGS(conn, resp_serial),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,	peer_debug_id)
+		    __field(unsigned int,	call_debug_id)
+		    __field(rxrpc_serial_t,	ping_serial)
+		    __field(rxrpc_serial_t,	resp_serial)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->peer_debug_id = conn->peer->debug_id;
+		    __entry->call_debug_id = conn->pmtud_call;
+		    __entry->ping_serial = conn->pmtud_probe;
+		    __entry->resp_serial = resp_serial;
+			   ),
+
+	    TP_printk("P=%08x c=%08x pr=%08x rr=%08x",
+		      __entry->peer_debug_id,
+		      __entry->call_debug_id,
+		      __entry->ping_serial,
+		      __entry->resp_serial)
+	    );
+
+TRACE_EVENT(rxrpc_pmtud_reduce,
+	    TP_PROTO(struct rxrpc_peer *peer, rxrpc_serial_t serial,
+		     unsigned int max_data, enum rxrpc_pmtud_reduce_trace reason),
+
+	    TP_ARGS(peer, serial, max_data, reason),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,	peer_debug_id)
+		    __field(rxrpc_serial_t,	serial)
+		    __field(unsigned int,	max_data)
+		    __field(enum rxrpc_pmtud_reduce_trace, reason)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->peer_debug_id = peer->debug_id;
+		    __entry->serial = serial;
+		    __entry->max_data = max_data;
+		    __entry->reason = reason;
+			   ),
+
+	    TP_printk("P=%08x %s r=%08x m=%u",
+		      __entry->peer_debug_id,
+		      __print_symbolic(__entry->reason, rxrpc_pmtud_reduce_traces),
+		      __entry->serial, __entry->max_data)
+	    );
+
 #undef EM
 #undef E_
 
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index ab8e565cb20b..69e6f4b20bad 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -344,13 +344,25 @@ struct rxrpc_peer {
 	time64_t		last_tx_at;	/* Last time packet sent here */
 	seqlock_t		service_conn_lock;
 	spinlock_t		lock;		/* access lock */
-	unsigned int		if_mtu;		/* interface MTU for this peer */
-	unsigned int		mtu;		/* network MTU for this peer */
-	unsigned int		maxdata;	/* data size (MTU - hdrsize) */
-	unsigned short		hdrsize;	/* header size (IP + UDP + RxRPC) */
 	int			debug_id;	/* debug ID for printks */
 	struct sockaddr_rxrpc	srx;		/* remote address */
 
+	/* Path MTU discovery [RFC8899] */
+	unsigned int		pmtud_trial;	/* Current MTU probe size */
+	unsigned int		pmtud_good;	/* Largest working MTU probe we've tried */
+	unsigned int		pmtud_bad;	/* Smallest non-working MTU probe we've tried */
+	bool			pmtud_lost;	/* T if MTU probe was lost */
+	bool			pmtud_probing;	/* T if we have an active probe outstanding */
+	bool			pmtud_pending;	/* T if a call to this peer should send a probe */
+	u8			pmtud_jumbo;	/* Max jumbo packets for the MTU */
+	bool			ackr_adv_pmtud;	/* T if the peer advertises path-MTU */
+	unsigned int		ackr_max_data;	/* Maximum data advertised by peer */
+	seqcount_t		mtu_lock;	/* Lockless MTU access management */
+	unsigned int		if_mtu;		/* Local interface MTU (- hdrsize) for this peer */
+	unsigned int		max_data;	/* Maximum packet data capacity for this peer */
+	unsigned short		hdrsize;	/* header size (IP + UDP + RxRPC) */
+	unsigned short		tx_seg_max;	/* Maximum number of transmissable segments */
+
 	/* calculated RTT cache */
 #define RXRPC_RTT_CACHE_SIZE 32
 	spinlock_t		rtt_input_lock;	/* RTT lock for input routine */
@@ -531,6 +543,8 @@ struct rxrpc_connection {
 	int			debug_id;	/* debug ID for printks */
 	rxrpc_serial_t		tx_serial;	/* Outgoing packet serial number counter */
 	unsigned int		hi_serial;	/* highest serial number received */
+	rxrpc_serial_t		pmtud_probe;	/* Serial of MTU probe (or 0) */
+	unsigned int		pmtud_call;	/* ID of call used for probe */
 	u32			service_id;	/* Service ID, possibly upgraded */
 	u32			security_level;	/* Security level selected */
 	u8			security_ix;	/* security type */
@@ -1155,6 +1169,7 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net)
  */
 void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason,
 		    rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why);
+void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call);
 int rxrpc_send_abort_packet(struct rxrpc_call *);
 void rxrpc_send_conn_abort(struct rxrpc_connection *conn);
 void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb);
@@ -1166,6 +1181,8 @@ void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb);
  */
 void rxrpc_input_error(struct rxrpc_local *, struct sk_buff *);
 void rxrpc_peer_keepalive_worker(struct work_struct *);
+void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial,
+				 bool sendmsg_fail);
 
 /*
  * peer_object.c
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index c4754cc9b8d4..1d889b6f0366 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -483,6 +483,11 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
 			rxrpc_disconnect_call(call);
 		if (call->security)
 			call->security->free_call_crypto(call);
+	} else {
+		if (skb &&
+		    call->peer->ackr_adv_pmtud &&
+		    call->peer->pmtud_pending)
+			rxrpc_send_probe_for_pmtud(call);
 	}
 	if (call->acks_hard_ack != call->tx_bottom)
 		rxrpc_shrink_call_tx_buffer(call);
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 2a1396cd892f..f6c02cc44d98 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -92,7 +92,7 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
 	struct rxrpc_acktrailer trailer;
 	size_t len;
 	int ret, ioc;
-	u32 serial, mtu, call_id, padding;
+	u32 serial, max_mtu, if_mtu, call_id, padding;
 
 	_enter("%d", conn->debug_id);
 
@@ -150,8 +150,13 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
 		break;
 
 	case RXRPC_PACKET_TYPE_ACK:
-		mtu = conn->peer->if_mtu;
-		mtu -= conn->peer->hdrsize;
+		if_mtu = conn->peer->if_mtu - conn->peer->hdrsize;
+		if (conn->peer->ackr_adv_pmtud) {
+			max_mtu = umax(conn->peer->max_data, rxrpc_rx_mtu);
+		} else {
+			if_mtu = umin(1444, if_mtu);
+			max_mtu = if_mtu;
+		}
 		pkt.ack.bufferSpace	= 0;
 		pkt.ack.maxSkew		= htons(skb ? skb->priority : 0);
 		pkt.ack.firstPacket	= htonl(chan->last_seq + 1);
@@ -159,10 +164,10 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
 		pkt.ack.serial		= htonl(skb ? sp->hdr.serial : 0);
 		pkt.ack.reason		= skb ? RXRPC_ACK_DUPLICATE : RXRPC_ACK_IDLE;
 		pkt.ack.nAcks		= 0;
-		trailer.maxMTU		= htonl(rxrpc_rx_mtu);
-		trailer.ifMTU		= htonl(mtu);
+		trailer.maxMTU		= htonl(max_mtu);
+		trailer.ifMTU		= htonl(if_mtu);
 		trailer.rwind		= htonl(rxrpc_rx_window_size);
-		trailer.jumbo_max	= htonl(rxrpc_rx_jumbo_max);
+		trailer.jumbo_max	= 0;
 		pkt.whdr.flags		|= RXRPC_SLOW_START_OK;
 		padding			= 0;
 		iov[0].iov_len += sizeof(pkt.ack);
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 694c4df7a1a3..b0627398311b 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -321,6 +321,12 @@ static void rxrpc_clean_up_connection(struct work_struct *work)
 	list_del_init(&conn->proc_link);
 	write_unlock(&rxnet->conn_lock);
 
+	if (conn->pmtud_probe) {
+		trace_rxrpc_pmtud_lost(conn, 0);
+		conn->peer->pmtud_probing = false;
+		conn->peer->pmtud_pending = true;
+	}
+
 	rxrpc_purge_queue(&conn->rx_queue);
 
 	rxrpc_kill_client_conn(conn);
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 49e35be7dc13..fd08d813ef29 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -692,8 +692,8 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb
 				    struct rxrpc_acktrailer *trailer)
 {
 	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
-	struct rxrpc_peer *peer;
-	unsigned int mtu;
+	struct rxrpc_peer *peer = call->peer;
+	unsigned int max_data;
 	bool wake = false;
 	u32 rwind = ntohl(trailer->rwind);
 
@@ -706,14 +706,22 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb
 		call->tx_winsize = rwind;
 	}
 
-	mtu = umin(ntohl(trailer->maxMTU), ntohl(trailer->ifMTU));
+	if (trailer->jumbo_max == 0) {
+		/* The peer says it supports pmtu discovery */
+		peer->ackr_adv_pmtud = true;
+	} else {
+		peer->ackr_adv_pmtud = false;
+	}
+
+	max_data = ntohl(trailer->maxMTU);
+	peer->ackr_max_data = max_data;
 
-	peer = call->peer;
-	if (mtu < peer->maxdata) {
-		spin_lock(&peer->lock);
-		peer->maxdata = mtu;
-		peer->mtu = mtu + peer->hdrsize;
-		spin_unlock(&peer->lock);
+	if (max_data < peer->max_data) {
+		trace_rxrpc_pmtud_reduce(peer, sp->hdr.serial, max_data,
+					 rxrpc_pmtud_reduce_ack);
+		write_seqcount_begin(&peer->mtu_lock);
+		peer->max_data = max_data;
+		write_seqcount_end(&peer->mtu_lock);
 	}
 
 	if (wake)
diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c
index 7af5adf53b25..bd6d4f5e97b4 100644
--- a/net/rxrpc/io_thread.c
+++ b/net/rxrpc/io_thread.c
@@ -364,6 +364,12 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn,
 	if (sp->hdr.callNumber == 0)
 		return rxrpc_input_conn_packet(conn, skb);
 
+	/* Deal with path MTU discovery probing. */
+	if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK &&
+	    conn->pmtud_probe &&
+	    after_eq(sp->ack.acked_serial, conn->pmtud_probe))
+		rxrpc_input_probe_for_pmtud(conn, sp->ack.acked_serial, false);
+
 	/* Call-bound packets are routed by connection channel. */
 	channel = sp->hdr.cid & RXRPC_CHANNELMASK;
 	chan = &conn->channels[channel];
diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c
index 657cf35089a6..8fcc8139d771 100644
--- a/net/rxrpc/misc.c
+++ b/net/rxrpc/misc.c
@@ -46,13 +46,13 @@ unsigned int rxrpc_rx_window_size = 255;
  * Maximum Rx MTU size.  This indicates to the sender the size of jumbo packet
  * made by gluing normal packets together that we're willing to handle.
  */
-unsigned int rxrpc_rx_mtu = 5692;
+unsigned int rxrpc_rx_mtu = RXRPC_JUMBO(46);
 
 /*
  * The maximum number of fragments in a received jumbo packet that we tell the
  * sender that we're willing to handle.
  */
-unsigned int rxrpc_rx_jumbo_max = 4;
+unsigned int rxrpc_rx_jumbo_max = 46;
 
 #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY
 /*
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index f8bb5250e849..a91be871ad96 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -82,10 +82,9 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call,
 	struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base;
 	struct rxrpc_acktrailer *trailer = txb->kvec[2].iov_base + 3;
 	struct rxrpc_ackpacket *ack = (struct rxrpc_ackpacket *)(whdr + 1);
-	unsigned int qsize, sack, wrap, to;
+	unsigned int qsize, sack, wrap, to, max_mtu, if_mtu;
 	rxrpc_seq_t window, wtop;
 	int rsize;
-	u32 mtu, jmax;
 	u8 *filler = txb->kvec[2].iov_base;
 	u8 *sackp = txb->kvec[1].iov_base;
 
@@ -132,16 +131,22 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call,
 		ack->reason = RXRPC_ACK_IDLE;
 	}
 
-	mtu = call->peer->if_mtu;
-	mtu -= call->peer->hdrsize;
-	jmax = rxrpc_rx_jumbo_max;
 	qsize = (window - 1) - call->rx_consumed;
 	rsize = max_t(int, call->rx_winsize - qsize, 0);
 	txb->ack_rwind = rsize;
-	trailer->maxMTU		= htonl(rxrpc_rx_mtu);
-	trailer->ifMTU		= htonl(mtu);
+
+	if_mtu = call->peer->if_mtu - call->peer->hdrsize;
+	if (call->peer->ackr_adv_pmtud) {
+		max_mtu = umax(call->peer->max_data, rxrpc_rx_mtu);
+	} else {
+		if_mtu = umin(if_mtu, 1444);
+		max_mtu = if_mtu;
+	}
+
+	trailer->maxMTU		= htonl(max_mtu);
+	trailer->ifMTU		= htonl(if_mtu);
 	trailer->rwind		= htonl(rsize);
-	trailer->jumbo_max	= htonl(jmax);
+	trailer->jumbo_max	= 0; /* Advertise pmtu discovery */
 }
 
 /*
@@ -176,7 +181,7 @@ static void rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial
  * Transmit an ACK packet.
  */
 static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb,
-				  int nr_kv)
+				  int nr_kv, enum rxrpc_propose_ack_trace why)
 {
 	struct kvec *kv = call->local->kvec;
 	struct rxrpc_wire_header *whdr = kv[0].iov_base;
@@ -209,13 +214,16 @@ static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t
 	rxrpc_inc_stat(call->rxnet, stat_tx_ack_send);
 
 	iov_iter_kvec(&msg.msg_iter, WRITE, kv, nr_kv, txb->len);
-	rxrpc_local_dont_fragment(conn->local, false);
+	rxrpc_local_dont_fragment(conn->local, why == rxrpc_propose_ack_ping_for_mtu_probe);
 
 	ret = do_udp_sendmsg(conn->local->socket, &msg, txb->len);
 	call->peer->last_tx_at = ktime_get_seconds();
 	if (ret < 0) {
 		trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret,
 				    rxrpc_tx_point_call_ack);
+		if (why == rxrpc_propose_ack_ping_for_mtu_probe &&
+		    ret == -EMSGSIZE)
+			rxrpc_input_probe_for_pmtud(conn, txb->serial, true);
 	} else {
 		trace_rxrpc_tx_packet(call->debug_id, whdr,
 				      rxrpc_tx_point_call_ack);
@@ -225,6 +233,13 @@ static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t
 		if (txb->flags & RXRPC_REQUEST_ACK)
 			call->peer->rtt_last_req = now;
 		rxrpc_set_keepalive(call, now);
+		if (why == rxrpc_propose_ack_ping_for_mtu_probe) {
+			call->peer->pmtud_pending = false;
+			call->peer->pmtud_probing = true;
+			call->conn->pmtud_probe = txb->serial;
+			call->conn->pmtud_call = call->debug_id;
+			trace_rxrpc_pmtud_tx(call);
+		}
 	}
 	rxrpc_tx_backoff(call, ret);
 }
@@ -254,21 +269,45 @@ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason,
 
 	rxrpc_fill_out_ack(call, txb, ack_reason, serial);
 
+	/* Extend a path MTU probe ACK. */
 	nr_kv = txb->nr_kvec;
 	kv[0] = txb->kvec[0];
 	kv[1] = txb->kvec[1];
 	kv[2] = txb->kvec[2];
-	// TODO: Extend a path MTU probe ACK
+	if (why == rxrpc_propose_ack_ping_for_mtu_probe) {
+		size_t probe_mtu = call->peer->pmtud_trial + sizeof(struct rxrpc_wire_header);
+
+		if (txb->len > probe_mtu)
+			goto skip;
+		while (txb->len < probe_mtu) {
+			size_t part = umin(probe_mtu - txb->len, PAGE_SIZE);
+
+			kv[nr_kv].iov_base = page_address(ZERO_PAGE(0));
+			kv[nr_kv].iov_len = part;
+			txb->len += part;
+			nr_kv++;
+		}
+	}
 
 	call->ackr_nr_unacked = 0;
 	atomic_set(&call->ackr_nr_consumed, 0);
 	clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags);
 
 	trace_rxrpc_send_ack(call, why, ack_reason, serial);
-	rxrpc_send_ack_packet(call, txb, nr_kv);
+	rxrpc_send_ack_packet(call, txb, nr_kv, why);
+skip:
 	rxrpc_put_txbuf(txb, rxrpc_txbuf_put_ack_tx);
 }
 
+/*
+ * Send an ACK probe for path MTU discovery.
+ */
+void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call)
+{
+	rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
+		       rxrpc_propose_ack_ping_for_mtu_probe);
+}
+
 /*
  * Send an ABORT call packet.
  */
@@ -501,7 +540,7 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t
 
 	/* send the packet with the don't fragment bit set if we currently
 	 * think it's small enough */
-	if (len >= sizeof(struct rxrpc_wire_header) + call->peer->maxdata) {
+	if (len >= sizeof(struct rxrpc_wire_header) + call->peer->max_data) {
 		rxrpc_local_dont_fragment(conn->local, false);
 		frag = rxrpc_tx_point_call_data_frag;
 	} else {
@@ -548,7 +587,7 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t
 						  RX_USER_ABORT, ret);
 	}
 
-	_leave(" = %d [%u]", ret, call->peer->maxdata);
+	_leave(" = %d [%u]", ret, call->peer->max_data);
 	return ret;
 }
 
diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c
index 552ba84a255c..8fc9464a960c 100644
--- a/net/rxrpc/peer_event.c
+++ b/net/rxrpc/peer_event.c
@@ -102,6 +102,8 @@ static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local,
  */
 static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu)
 {
+	unsigned int max_data;
+
 	/* wind down the local interface MTU */
 	if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu)
 		peer->if_mtu = mtu;
@@ -120,11 +122,17 @@ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu)
 		}
 	}
 
-	if (mtu < peer->mtu) {
-		spin_lock(&peer->lock);
-		peer->mtu = mtu;
-		peer->maxdata = peer->mtu - peer->hdrsize;
-		spin_unlock(&peer->lock);
+	max_data = max_t(int, mtu - peer->hdrsize, 500);
+	if (max_data < peer->max_data) {
+		if (peer->pmtud_good > max_data)
+			peer->pmtud_good = max_data;
+		if (peer->pmtud_bad > max_data + 1)
+			peer->pmtud_bad = max_data + 1;
+
+		trace_rxrpc_pmtud_reduce(peer, 0, max_data, rxrpc_pmtud_reduce_icmp);
+		write_seqcount_begin(&peer->mtu_lock);
+		peer->max_data = max_data;
+		write_seqcount_end(&peer->mtu_lock);
 	}
 }
 
@@ -347,3 +355,89 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work)
 
 	_leave("");
 }
+
+/*
+ * Do path MTU probing.
+ */
+void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial,
+				 bool sendmsg_fail)
+{
+	struct rxrpc_peer *peer = conn->peer;
+	unsigned int max_data = peer->max_data;
+	int good, trial, bad, jumbo;
+
+	good  = peer->pmtud_good;
+	trial = peer->pmtud_trial;
+	bad   = peer->pmtud_bad;
+	if (good >= bad - 1) {
+		conn->pmtud_probe = 0;
+		peer->pmtud_lost = false;
+		return;
+	}
+
+	if (!peer->pmtud_probing)
+		goto send_probe;
+
+	if (sendmsg_fail || after(acked_serial, conn->pmtud_probe)) {
+		/* Retry a lost probe. */
+		if (!peer->pmtud_lost) {
+			trace_rxrpc_pmtud_lost(conn, acked_serial);
+			conn->pmtud_probe = 0;
+			peer->pmtud_lost = true;
+			goto send_probe;
+		}
+
+		/* The probed size didn't seem to get through. */
+		bad = trial;
+		peer->pmtud_bad = bad;
+		if (bad <= max_data)
+			max_data = bad - 1;
+	} else {
+		/* It did get through. */
+		good = trial;
+		peer->pmtud_good = good;
+		if (good > max_data)
+			max_data = good;
+	}
+
+	max_data = umin(max_data, peer->ackr_max_data);
+	if (max_data != peer->max_data) {
+		preempt_disable();
+		write_seqcount_begin(&peer->mtu_lock);
+		peer->max_data = max_data;
+		write_seqcount_end(&peer->mtu_lock);
+		preempt_enable();
+	}
+
+	jumbo = max_data + sizeof(struct rxrpc_jumbo_header);
+	jumbo /= RXRPC_JUMBO_SUBPKTLEN;
+	peer->pmtud_jumbo = jumbo;
+
+	trace_rxrpc_pmtud_rx(conn, acked_serial);
+	conn->pmtud_probe = 0;
+	peer->pmtud_lost = false;
+
+	if (good < RXRPC_JUMBO(2) && bad > RXRPC_JUMBO(2))
+		trial = RXRPC_JUMBO(2);
+	else if (good < RXRPC_JUMBO(4) && bad > RXRPC_JUMBO(4))
+		trial = RXRPC_JUMBO(4);
+	else if (good < RXRPC_JUMBO(3) && bad > RXRPC_JUMBO(3))
+		trial = RXRPC_JUMBO(3);
+	else if (good < RXRPC_JUMBO(6) && bad > RXRPC_JUMBO(6))
+		trial = RXRPC_JUMBO(6);
+	else if (good < RXRPC_JUMBO(5) && bad > RXRPC_JUMBO(5))
+		trial = RXRPC_JUMBO(5);
+	else if (good < RXRPC_JUMBO(8) && bad > RXRPC_JUMBO(8))
+		trial = RXRPC_JUMBO(8);
+	else if (good < RXRPC_JUMBO(7) && bad > RXRPC_JUMBO(7))
+		trial = RXRPC_JUMBO(7);
+	else
+		trial = (good + bad) / 2;
+	peer->pmtud_trial = trial;
+
+	if (good >= bad)
+		return;
+
+send_probe:
+	peer->pmtud_pending = true;
+}
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index 49dcda67a0d5..80ef6f06d512 100644
--- a/net/rxrpc/peer_object.c
+++ b/net/rxrpc/peer_object.c
@@ -162,6 +162,11 @@ static void rxrpc_assess_MTU_size(struct rxrpc_local *local,
 #endif
 
 	peer->if_mtu = 1500;
+	if (peer->max_data < peer->if_mtu - peer->hdrsize) {
+		trace_rxrpc_pmtud_reduce(peer, 0, peer->if_mtu - peer->hdrsize,
+					 rxrpc_pmtud_reduce_route);
+		peer->max_data = peer->if_mtu - peer->hdrsize;
+	}
 
 	memset(&fl, 0, sizeof(fl));
 	switch (peer->srx.transport.family) {
@@ -199,8 +204,16 @@ static void rxrpc_assess_MTU_size(struct rxrpc_local *local,
 	}
 
 	peer->if_mtu = dst_mtu(dst);
+	peer->hdrsize += dst->header_len + dst->trailer_len;
+	peer->tx_seg_max = dst->dev->gso_max_segs;
 	dst_release(dst);
 
+	peer->max_data		= umin(RXRPC_JUMBO(1), peer->if_mtu - peer->hdrsize);
+	peer->pmtud_good	= 500;
+	peer->pmtud_bad		= peer->if_mtu - peer->hdrsize + 1;
+	peer->pmtud_trial	= umin(peer->max_data, peer->pmtud_bad - 1);
+	peer->pmtud_pending	= true;
+
 	_leave(" [if_mtu %u]", peer->if_mtu);
 }
 
@@ -223,6 +236,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp,
 		seqlock_init(&peer->service_conn_lock);
 		spin_lock_init(&peer->lock);
 		spin_lock_init(&peer->rtt_input_lock);
+		seqcount_init(&peer->mtu_lock);
 		peer->debug_id = atomic_inc_return(&rxrpc_debug_id);
 
 		rxrpc_peer_init_rtt(peer);
@@ -242,9 +256,7 @@ static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer,
 			    unsigned long hash_key)
 {
 	peer->hash_key = hash_key;
-	rxrpc_assess_MTU_size(local, peer);
-	peer->mtu = peer->if_mtu;
-	peer->rtt_last_req = ktime_get_real();
+
 
 	switch (peer->srx.transport.family) {
 	case AF_INET:
@@ -268,7 +280,11 @@ static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer,
 	}
 
 	peer->hdrsize += sizeof(struct rxrpc_wire_header);
-	peer->maxdata = peer->mtu - peer->hdrsize;
+	peer->max_data = peer->if_mtu - peer->hdrsize;
+
+	rxrpc_assess_MTU_size(local, peer);
+
+	peer->rtt_last_req = ktime_get_real();
 }
 
 /*
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index ce4d48bdfbe9..44722c226064 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -283,9 +283,7 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v)
 
 	if (v == SEQ_START_TOKEN) {
 		seq_puts(seq,
-			 "Proto Local                                          "
-			 " Remote                                         "
-			 " Use SST   MTU LastUse      RTT      RTO\n"
+			 "Proto Local                                           Remote                                          Use SST   Maxd LastUse      RTT      RTO\n"
 			 );
 		return 0;
 	}
@@ -298,13 +296,12 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v)
 
 	now = ktime_get_seconds();
 	seq_printf(seq,
-		   "UDP   %-47.47s %-47.47s %3u"
-		   " %3u %5u %6llus %8u %8u\n",
+		   "UDP   %-47.47s %-47.47s %3u %4u %5u %6llus %8u %8u\n",
 		   lbuff,
 		   rbuff,
 		   refcount_read(&peer->ref),
 		   peer->cong_ssthresh,
-		   peer->mtu,
+		   peer->max_data,
 		   now - peer->last_tx_at,
 		   peer->srtt_us >> 3,
 		   peer->rto_us);
diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h
index 4fe6b4d20ada..42f70e4636f8 100644
--- a/net/rxrpc/protocol.h
+++ b/net/rxrpc/protocol.h
@@ -92,11 +92,16 @@ struct rxrpc_jumbo_header {
 /*
  * The maximum number of subpackets that can possibly fit in a UDP packet is:
  *
- *	((max_IP - IP_hdr - UDP_hdr) / RXRPC_JUMBO_SUBPKTLEN) + 1
- *	= ((65535 - 28 - 28) / 1416) + 1
- *	= 46 non-terminal packets and 1 terminal packet.
+ *	(max_UDP - wirehdr + jumbohdr) / (jumbohdr + 1412)
+ *	= ((65535 - 28 + 4) / 1416)
+ *	= 45 non-terminal packets and 1 terminal packet.
  */
-#define RXRPC_MAX_NR_JUMBO	47
+#define RXRPC_MAX_NR_JUMBO	46
+
+/* Size of a jumbo packet with N subpackets, excluding UDP+IP */
+#define RXRPC_JUMBO(N) ((int)sizeof(struct rxrpc_wire_header) + \
+			RXRPC_JUMBO_DATALEN +				\
+			((N) - 1) * RXRPC_JUMBO_SUBPKTLEN)
 
 /*****************************************************************************/
 /*
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index 9bf9a1f6e4cb..46a20cf4c402 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -11,6 +11,8 @@
 #include "ar-internal.h"
 
 static struct ctl_table_header *rxrpc_sysctl_reg_table;
+static const unsigned int rxrpc_rx_mtu_min = 500;
+static const unsigned int rxrpc_jumbo_max = RXRPC_MAX_NR_JUMBO;
 static const unsigned int four = 4;
 static const unsigned int max_backlog = RXRPC_BACKLOG_MAX - 1;
 static const unsigned int n_65535 = 65535;
@@ -115,7 +117,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= (void *)SYSCTL_ONE,
+		.extra1		= (void *)&rxrpc_rx_mtu_min,
 		.extra2		= (void *)&n_65535,
 	},
 	{
@@ -125,7 +127,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= (void *)SYSCTL_ONE,
-		.extra2		= (void *)&four,
+		.extra2		= (void *)&rxrpc_jumbo_max,
 	},
 };
 
diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c
index c3913d8a50d3..2a4291617d40 100644
--- a/net/rxrpc/txbuf.c
+++ b/net/rxrpc/txbuf.c
@@ -179,7 +179,8 @@ static void rxrpc_free_txbuf(struct rxrpc_txbuf *txb)
 	trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 0,
 			  rxrpc_txbuf_free);
 	for (i = 0; i < txb->nr_kvec; i++)
-		if (txb->kvec[i].iov_base)
+		if (txb->kvec[i].iov_base &&
+		    !is_zero_pfn(page_to_pfn(virt_to_page(txb->kvec[i].iov_base))))
 			page_frag_free(txb->kvec[i].iov_base);
 	kfree(txb);
 	atomic_dec(&rxrpc_nr_txbuf);


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ