linux-kernel - [PATCH] libceph: handle EADDRNOTAVAIL more gracefully

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20260208164033.457748-2-ionut.nechita@windriver.com>
Date: Sun,  8 Feb 2026 18:40:34 +0200
From: "Ionut Nechita (Wind River)" <ionut.nechita@...driver.com>
To: Ilya Dryomov <idryomov@...il.com>, Alex Markuze <amarkuze@...hat.com>,
        Viacheslav Dubeyko <slava@...eyko.com>
Cc: Sebastian Andrzej Siewior <bigeasy@...utronix.de>,
        Clark Williams <clrkwllms@...nel.org>,
        Steven Rostedt <rostedt@...dmis.org>, ceph-devel@...r.kernel.org,
        linux-kernel@...r.kernel.org, linux-rt-devel@...ts.linux.dev,
        Ionut Nechita <ionut_n2001@...oo.com>,
        Ionut Nechita <ionut.nechita@...driver.com>,
        Xiubo Li <xiubli@...hat.com>, Jeff Layton <jlayton@...nel.org>,
        Sage Weil <sage@...dream.net>, superm1@...nel.org, jkosina@...e.com
Subject: [PATCH] libceph: handle EADDRNOTAVAIL more gracefully

From: Ionut Nechita <ionut.nechita@...driver.com>

When connecting to Ceph monitors/OSDs, kernel_connect() may return
-EADDRNOTAVAIL if the source address is temporarily unavailable.
This commonly occurs during:
- IPv6 Duplicate Address Detection (DAD), which takes 1-2 seconds
- IPv4/IPv6 interface state changes (link up/down events)
- Address removal or reconfiguration on the interface
- Network namespace transitions in containerized environments
- CNI reconfigurations in Kubernetes

Currently, libceph treats EADDRNOTAVAIL like any other connection error
and enters exponential backoff (250ms, 500ms, 1s, 2s, 4s, ...), causing
delays of 15+ seconds before successful reconnection even after the
address becomes available.

This is particularly problematic in Kubernetes environments running Ceph
on real-time kernels, where:
- Storage pods undergo frequent rolling updates
- Network policies and CNI configurations change dynamically
- Low I/O latency is critical for RT workloads
- sync() calls can block for 120+ seconds waiting for reconnection

This patch improves the situation by:
1. Detecting EADDRNOTAVAIL on both IPv4 and IPv6 connections
2. Using a shorter retry interval (100ms) instead of exponential backoff
3. Logging a more informative rate-limited warning message
4. Supporting both msgr1 and msgr2 protocol versions
5. Clearing the addr_notavail flag on successful connection and when
   the connection is reopened

The fast retry approach is appropriate because:
- EADDRNOTAVAIL is typically transient (address becomes valid in 1-2s)
- Each retry attempt is inexpensive (kernel_connect fails immediately)
- Quick recovery is critical for maintaining storage availability
- The connection succeeds as soon as the address becomes valid

Real-world impact: on a production system whose logs showed 'task sync
blocked for more than 122 seconds' together with error -99
(EADDRNOTAVAIL), this patch reduces reconnection time from 120+ seconds
to 2-3 seconds.

Fixes: 60bf8bf8815e6 ("libceph: fix msgr backoff")
Signed-off-by: Ionut Nechita <ionut.nechita@...driver.com>
---
 include/linux/ceph/messenger.h | 11 +++++++
 net/ceph/messenger.c           | 55 ++++++++++++++++++++++++++++++++--
 2 files changed, 63 insertions(+), 3 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 6aa4c6478c9f6..ec08d02a9d4bd 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -321,6 +321,13 @@ struct ceph_msg {
 /* ceph connection fault delay defaults, for exponential backoff */
 #define BASE_DELAY_INTERVAL	(HZ / 4)
 #define MAX_DELAY_INTERVAL	(15 * HZ)
+/*
+ * Shorter retry delay for EADDRNOTAVAIL. This error typically indicates
+ * a transient condition (IPv6 DAD in progress, address reconfiguration,
+ * temporary route issue) that resolves in 1-2 seconds. Fast retries
+ * allow quick recovery without exponential backoff delays.
+ */
+#define ADDRNOTAVAIL_DELAY	(HZ / 10)
 
 struct ceph_connection_v1_info {
 	struct kvec out_kvec[8],         /* sending header/footer data */
@@ -361,6 +368,8 @@ struct ceph_connection_v1_info {
 	u32 connect_seq;      /* identify the most recent connection
 				 attempt for this session */
 	u32 peer_global_seq;  /* peer's global seq for this connection */
+
+	bool addr_notavail;  /* address not available (transient) */
 };
 
 #define CEPH_CRC_LEN			4
@@ -432,6 +441,8 @@ struct ceph_connection_v2_info {
 
 	int con_mode;  /* CEPH_CON_MODE_* */
 
+	bool addr_notavail;  /* address not available (transient) */
+
 	void *conn_bufs[16];
 	int conn_buf_cnt;
 	int data_len_remain;
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 70b25f4ecba67..d86efcfb7b87f 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -467,8 +467,22 @@ int ceph_tcp_connect(struct ceph_connection *con)
 		     ceph_pr_addr(&con->peer_addr),
 		     sock->sk->sk_state);
 	} else if (ret < 0) {
-		pr_err("connect %s error %d\n",
-		       ceph_pr_addr(&con->peer_addr), ret);
+		if (ret == -EADDRNOTAVAIL) {
+			/*
+			 * Address not yet available - could be IPv6 DAD in
+			 * progress, address reconfiguration, or temporary
+			 * route issue. Use shorter delay.
+			 */
+			pr_warn_ratelimited("connect %s: address not available (DAD/route issue?), will retry\n",
+					    ceph_pr_addr(&con->peer_addr));
+			if (ceph_msgr2(from_msgr(con->msgr)))
+				con->v2.addr_notavail = true;
+			else
+				con->v1.addr_notavail = true;
+		} else {
+			pr_err("connect %s error %d\n",
+			       ceph_pr_addr(&con->peer_addr), ret);
+		}
 		sock_release(sock);
 		return ret;
 	}
@@ -477,6 +491,13 @@ int ceph_tcp_connect(struct ceph_connection *con)
 		tcp_sock_set_nodelay(sock->sk);
 
 	con->sock = sock;
+
+	/* Clear addr_notavail flag on successful connection */
+	if (ceph_msgr2(from_msgr(con->msgr)))
+		con->v2.addr_notavail = false;
+	else
+		con->v1.addr_notavail = false;
+
 	return 0;
 }
 
@@ -610,6 +631,13 @@ void ceph_con_open(struct ceph_connection *con,
 
 	memcpy(&con->peer_addr, addr, sizeof(*addr));
 	con->delay = 0;      /* reset backoff memory */
+
+	/* Clear addr_notavail flag when opening/reopening connection */
+	if (ceph_msgr2(from_msgr(con->msgr)))
+		con->v2.addr_notavail = false;
+	else
+		con->v1.addr_notavail = false;
+
 	mutex_unlock(&con->mutex);
 	queue_con(con);
 }
@@ -1614,6 +1642,8 @@ static void ceph_con_workfn(struct work_struct *work)
  */
 static void con_fault(struct ceph_connection *con)
 {
+	bool addr_issue = false;
+
 	dout("fault %p state %d to peer %s\n",
 	     con, con->state, ceph_pr_addr(&con->peer_addr));
 
@@ -1621,6 +1651,19 @@ static void con_fault(struct ceph_connection *con)
 		ceph_pr_addr(&con->peer_addr), con->error_msg);
 	con->error_msg = NULL;
 
+	/* Check and reset addr_notavail flag if set */
+	if (ceph_msgr2(from_msgr(con->msgr))) {
+		if (con->v2.addr_notavail) {
+			addr_issue = true;
+			con->v2.addr_notavail = false;
+		}
+	} else {
+		if (con->v1.addr_notavail) {
+			addr_issue = true;
+			con->v1.addr_notavail = false;
+		}
+	}
+
 	WARN_ON(con->state == CEPH_CON_S_STANDBY ||
 		con->state == CEPH_CON_S_CLOSED);
 
@@ -1645,7 +1688,13 @@ static void con_fault(struct ceph_connection *con)
 	} else {
 		/* retry after a delay. */
 		con->state = CEPH_CON_S_PREOPEN;
-		if (!con->delay) {
+		if (addr_issue) {
+			/*
+			 * Address not available - use shorter delay as this
+			 * is often a transient condition.
+			 */
+			con->delay = ADDRNOTAVAIL_DELAY;
+		} else if (!con->delay) {
 			con->delay = BASE_DELAY_INTERVAL;
 		} else if (con->delay < MAX_DELAY_INTERVAL) {
 			con->delay *= 2;
-- 
2.52.0