[<prev] [next>] [day] [month] [year] [list]
Message-ID: <20260208131819.37276-2-ionut.nechita@windriver.com>
Date: Sun, 8 Feb 2026 15:18:20 +0200
From: "Ionut Nechita (Wind River)" <ionut.nechita@...driver.com>
To: Ilya Dryomov <idryomov@...il.com>, Alex Markuze <amarkuze@...hat.com>,
Viacheslav Dubeyko <slava@...eyko.com>
Cc: Sebastian Andrzej Siewior <bigeasy@...utronix.de>,
Clark Williams <clrkwllms@...nel.org>,
Steven Rostedt <rostedt@...dmis.org>, ceph-devel@...r.kernel.org,
linux-kernel@...r.kernel.org, linux-rt-devel@...ts.linux.dev,
Ionut Nechita <ionut_n2001@...oo.com>,
Ionut Nechita <ionut.nechita@...driver.com>,
Xiubo Li <xiubli@...hat.com>, Jeff Layton <jlayton@...nel.org>,
superm1@...nel.org, jkosina@...e.com
Subject: [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path
From: Ionut Nechita <ionut.nechita@...driver.com>
When Ceph MDS becomes unreachable (e.g., due to IPv6 EADDRNOTAVAIL
during DAD or network transitions), the sync syscall can block
indefinitely in ceph_mdsc_sync(). The hung_task detector fires
repeatedly (122s, 245s, 368s... up to 983+ seconds) with traces like:
INFO: task sync:12345 blocked for more than 122 seconds.
Call Trace:
ceph_mdsc_sync+0x4d6/0x5a0 [ceph]
ceph_sync_fs+0x31/0x130 [ceph]
iterate_supers+0x97/0x100
ksys_sync+0x32/0xb0
The MDS sync path has three problems:
1. wait_caps_flush() uses wait_event() with no timeout
2. flush_mdlog_and_wait_mdsc_unsafe_requests() uses
wait_for_completion() with no timeout
3. ceph_mdsc_sync() returns void, so errors cannot be propagated to callers
This is particularly problematic in Kubernetes environments with
PREEMPT_RT kernels where Ceph storage pods undergo rolling updates
and IPv6 network reconfigurations cause temporary MDS unavailability.
Fix this by adding mount_timeout-based timeouts (default 60s) to the
blocking waits, following the existing pattern used by wait_requests()
and ceph_mdsc_close_sessions() in the same file:
- wait_caps_flush(): use wait_event_timeout() with mount_timeout
- flush_mdlog_and_wait_mdsc_unsafe_requests(): use
wait_for_completion_timeout() with mount_timeout
- ceph_mdsc_sync(): change return type to int, propagate -ETIMEDOUT
- ceph_sync_fs(): propagate error from ceph_mdsc_sync() to VFS
On timeout, dirty caps and pending requests are NOT discarded - they
remain in memory and are re-synced when MDS reconnects. The timeout
simply unblocks the calling task. If mount_timeout is set to 0,
ceph_timeout_jiffies() returns MAX_SCHEDULE_TIMEOUT, preserving the
original infinite-wait behavior.
Real-world impact: in production logs showing 'task sync blocked for
more than 983 seconds', this patch limits the blocking time to
mount_timeout (60s by default) and returns -ETIMEDOUT to the VFS layer
instead of hanging indefinitely.
Fixes: 1b2ba3c5616e ("ceph: flush the mdlog for filesystem sync")
Signed-off-by: Ionut Nechita <ionut.nechita@...driver.com>
---
fs/ceph/mds_client.c | 50 ++++++++++++++++++++++++++++++++++----------
fs/ceph/mds_client.h | 2 +-
fs/ceph/super.c | 5 +++--
3 files changed, 43 insertions(+), 14 deletions(-)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 7e4eab824daef..4cd8f584147f4 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2290,17 +2290,26 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
*
* returns true if we've flushed through want_flush_tid
*/
-static void wait_caps_flush(struct ceph_mds_client *mdsc,
- u64 want_flush_tid)
+static int wait_caps_flush(struct ceph_mds_client *mdsc,
+ u64 want_flush_tid)
{
struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_options *opts = mdsc->fsc->client->options;
+ long ret;
doutc(cl, "want %llu\n", want_flush_tid);
- wait_event(mdsc->cap_flushing_wq,
- check_caps_flush(mdsc, want_flush_tid));
+ ret = wait_event_timeout(mdsc->cap_flushing_wq,
+ check_caps_flush(mdsc, want_flush_tid),
+ ceph_timeout_jiffies(opts->mount_timeout));
+ if (!ret) {
+ pr_warn_client(cl, "cap flush timeout waiting for tid %llu\n",
+ want_flush_tid);
+ return -ETIMEDOUT;
+ }
doutc(cl, "ok, flushed thru %llu\n", want_flush_tid);
+ return 0;
}
/*
@@ -5865,13 +5874,15 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
/*
* flush the mdlog and wait for all write mds requests to flush.
*/
-static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
- u64 want_tid)
+static int flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
+ u64 want_tid)
{
struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_options *opts = mdsc->fsc->client->options;
struct ceph_mds_request *req = NULL, *nextreq;
struct ceph_mds_session *last_session = NULL;
struct rb_node *n;
+ unsigned long left;
mutex_lock(&mdsc->mutex);
doutc(cl, "want %lld\n", want_tid);
@@ -5910,7 +5921,19 @@ static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *md
}
doutc(cl, "wait on %llu (want %llu)\n",
req->r_tid, want_tid);
- wait_for_completion(&req->r_safe_completion);
+ left = wait_for_completion_timeout(
+ &req->r_safe_completion,
+ ceph_timeout_jiffies(opts->mount_timeout));
+ if (!left) {
+ pr_warn_client(cl,
+ "flush mdlog request tid %llu timed out\n",
+ req->r_tid);
+ ceph_mdsc_put_request(req);
+ if (nextreq)
+ ceph_mdsc_put_request(nextreq);
+ ceph_put_mds_session(last_session);
+ return -ETIMEDOUT;
+ }
mutex_lock(&mdsc->mutex);
ceph_mdsc_put_request(req);
@@ -5928,15 +5951,17 @@ static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *md
mutex_unlock(&mdsc->mutex);
ceph_put_mds_session(last_session);
doutc(cl, "done\n");
+ return 0;
}
-void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
+int ceph_mdsc_sync(struct ceph_mds_client *mdsc)
{
struct ceph_client *cl = mdsc->fsc->client;
u64 want_tid, want_flush;
+ int ret;
if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN)
- return;
+ return -EIO;
doutc(cl, "sync\n");
mutex_lock(&mdsc->mutex);
@@ -5957,8 +5982,11 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
doutc(cl, "sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
- flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
- wait_caps_flush(mdsc, want_flush);
+ ret = flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
+ if (ret)
+ return ret;
+
+ return wait_caps_flush(mdsc, want_flush);
}
/*
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 0428a5eaf28c6..a8b72cb13de1f 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -569,7 +569,7 @@ extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
-extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
+extern int ceph_mdsc_sync(struct ceph_mds_client *mdsc);
extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 7c1c1dac320da..6b0ad7a455815 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -125,6 +125,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
{
struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
struct ceph_client *cl = fsc->client;
+ int ret;
if (!wait) {
doutc(cl, "(non-blocking)\n");
@@ -136,9 +137,9 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
doutc(cl, "(blocking)\n");
ceph_osdc_sync(&fsc->client->osdc);
- ceph_mdsc_sync(fsc->mdsc);
+ ret = ceph_mdsc_sync(fsc->mdsc);
doutc(cl, "(blocking) done\n");
- return 0;
+ return ret;
}
/*
--
2.52.0
Powered by blists - more mailing lists