[<prev] [next>] [day] [month] [year] [list]
Message-ID: <20260208131819.37276-2-ionut.nechita@windriver.com>
Date: Sun, 8 Feb 2026 15:18:20 +0200
From: "Ionut Nechita (Wind River)" <ionut.nechita@...driver.com>
To: Ilya Dryomov <idryomov@...il.com>, Alex Markuze <amarkuze@...hat.com>,
Viacheslav Dubeyko <slava@...eyko.com>
Cc: Sebastian Andrzej Siewior <bigeasy@...utronix.de>,
Clark Williams <clrkwllms@...nel.org>,
Steven Rostedt <rostedt@...dmis.org>, ceph-devel@...r.kernel.org,
linux-kernel@...r.kernel.org, linux-rt-devel@...ts.linux.dev,
Ionut Nechita <ionut_n2001@...oo.com>,
Ionut Nechita <ionut.nechita@...driver.com>,
Xiubo Li <xiubli@...hat.com>, Jeff Layton <jlayton@...nel.org>,
superm1@...nel.org, jkosina@...e.com
Subject: [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path
From: Ionut Nechita <ionut.nechita@...driver.com>
When Ceph MDS becomes unreachable (e.g., due to IPv6 EADDRNOTAVAIL
during DAD or network transitions), the sync syscall can block
indefinitely in ceph_mdsc_sync(). The hung_task detector fires
repeatedly (122s, 245s, 368s... up to 983+ seconds) with traces like:
INFO: task sync:12345 blocked for more than 122 seconds.
Call Trace:
ceph_mdsc_sync+0x4d6/0x5a0 [ceph]
ceph_sync_fs+0x31/0x130 [ceph]
iterate_supers+0x97/0x100
ksys_sync+0x32/0xb0
The MDS sync path has three problems:
1. wait_caps_flush() uses wait_event() with no timeout
2. flush_mdlog_and_wait_mdsc_unsafe_requests() uses
wait_for_completion() with no timeout
3. ceph_mdsc_sync() returns void, so errors cannot be propagated to callers
This is particularly problematic in Kubernetes environments with
PREEMPT_RT kernels where Ceph storage pods undergo rolling updates
and IPv6 network reconfigurations cause temporary MDS unavailability.
Fix this by adding mount_timeout-based timeouts (default 60s) to the
blocking waits, following the existing pattern used by wait_requests()
and ceph_mdsc_close_sessions() in the same file:
- wait_caps_flush(): use wait_event_timeout() with mount_timeout
- flush_mdlog_and_wait_mdsc_unsafe_requests(): use
wait_for_completion_timeout() with mount_timeout
- ceph_mdsc_sync(): change return type to int, propagate -ETIMEDOUT
- ceph_sync_fs(): propagate error from ceph_mdsc_sync() to VFS
On timeout, dirty caps and pending requests are NOT discarded - they
remain in memory and are re-synced when MDS reconnects. The timeout
simply unblocks the calling task. If mount_timeout is set to 0,
ceph_timeout_jiffies() returns MAX_SCHEDULE_TIMEOUT, preserving the
original infinite-wait behavior.
Real-world impact: in production logs showing 'task sync blocked for
more than 983 seconds', this patch limits the blocking time to
mount_timeout (60s by default) and returns -ETIMEDOUT to the VFS layer
instead of hanging indefinitely.
Fixes: 1b2ba3c5616e ("ceph: flush the mdlog for filesystem sync")
Signed-off-by: Ionut Nechita <ionut.nechita@...driver.com>
---
fs/ceph/mds_client.c | 50 ++++++++++++++++++++++++++++++++++----------
fs/ceph/mds_client.h | 2 +-
fs/ceph/super.c | 5 +++--
3 files changed, 43 insertions(+), 14 deletions(-)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 7e4eab824daef..4cd8f584147f4 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2290,17 +2290,26 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
*
* returns true if we've flushed through want_flush_tid
*/
-static void wait_caps_flush(struct ceph_mds_client *mdsc,
- u64 want_flush_tid)
+static int wait_caps_flush(struct ceph_mds_client *mdsc,
+ u64 want_flush_tid)
{
struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_options *opts = mdsc->fsc->client->options;
+ long ret;
doutc(cl, "want %llu\n", want_flush_tid);
- wait_event(mdsc->cap_flushing_wq,
- check_caps_flush(mdsc, want_flush_tid));
+ ret = wait_event_timeout(mdsc->cap_flushing_wq,
+ check_caps_flush(mdsc, want_flush_tid),
+ ceph_timeout_jiffies(opts->mount_timeout));
+ if (!ret) {
+ pr_warn_client(cl, "cap flush timeout waiting for tid %llu\n",
+ want_flush_tid);
+ return -ETIMEDOUT;
+ }
doutc(cl, "ok, flushed thru %llu\n", want_flush_tid);
+ return 0;
}
/*
@@ -5865,13 +5874,15 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
/*
* flush the mdlog and wait for all write mds requests to flush.
*/
-static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
- u64 want_tid)
+static int flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
+ u64 want_tid)
{
struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_options *opts = mdsc->fsc->client->options;
struct ceph_mds_request *req = NULL, *nextreq;
struct ceph_mds_session *last_session = NULL;
struct rb_node *n;
+ unsigned long left;
mutex_lock(&mdsc->mutex);
doutc(cl, "want %lld\n", want_tid);
@@ -5910,7 +5921,19 @@ static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *md
}
doutc(cl, "wait on %llu (want %llu)\n",
req->r_tid, want_tid);
- wait_for_completion(&req->r_safe_completion);
+ left = wait_for_completion_timeout(
+ &req->r_safe_completion,
+ ceph_timeout_jiffies(opts->mount_timeout));
+ if (!left) {
+ pr_warn_client(cl,
+ "flush mdlog request tid %llu timed out\n",
+ req->r_tid);
+ ceph_mdsc_put_request(req);
+ if (nextreq)
+ ceph_mdsc_put_request(nextreq);
+ ceph_put_mds_session(last_session);
+ return -ETIMEDOUT;
+ }
mutex_lock(&mdsc->mutex);
ceph_mdsc_put_request(req);
@@ -5928,15 +5951,17 @@ static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *md
mutex_unlock(&mdsc->mutex);
ceph_put_mds_session(last_session);
doutc(cl, "done\n");
+ return 0;
}
-void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
+int ceph_mdsc_sync(struct ceph_mds_client *mdsc)
{
struct ceph_client *cl = mdsc->fsc->client;
u64 want_tid, want_flush;
+ int ret;
if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN)
- return;
+ return -EIO;
doutc(cl, "sync\n");
mutex_lock(&mdsc->mutex);
@@ -5957,8 +5982,11 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
doutc(cl, "sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
- flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
- wait_caps_flush(mdsc, want_flush);
+ ret = flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
+ if (ret)
+ return ret;
+
+ return wait_caps_flush(mdsc, want_flush);
}
/*
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 0428a5eaf28c6..a8b72cb13de1f 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -569,7 +569,7 @@ extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
-extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
+extern int ceph_mdsc_sync(struct ceph_mds_client *mdsc);
extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 7c1c1dac320da..6b0ad7a455815 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -125,6 +125,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
{
struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
struct ceph_client *cl = fsc->client;
+ int ret;
if (!wait) {
doutc(cl, "(non-blocking)\n");
@@ -136,9 +137,9 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
doutc(cl, "(blocking)\n");
ceph_osdc_sync(&fsc->client->osdc);
- ceph_mdsc_sync(fsc->mdsc);
+ ret = ceph_mdsc_sync(fsc->mdsc);
doutc(cl, "(blocking) done\n");
- return 0;
+ return ret;
}
/*
--
2.52.0
Powered by blists - more mailing lists