[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <5aad730afbaf7865525ca9e16c999804146ce8c6.camel@ibm.com>
Date: Mon, 9 Feb 2026 23:03:47 +0000
From: Viacheslav Dubeyko <Slava.Dubeyko@....com>
To: "idryomov@...il.com" <idryomov@...il.com>,
Alex Markuze
<amarkuze@...hat.com>,
"slava@...eyko.com" <slava@...eyko.com>,
"ionut.nechita@...driver.com" <ionut.nechita@...driver.com>
CC: "ionut_n2001@...oo.com" <ionut_n2001@...oo.com>,
Xiubo Li
<xiubli@...hat.com>,
"linux-rt-devel@...ts.linux.dev"
<linux-rt-devel@...ts.linux.dev>,
"ceph-devel@...r.kernel.org"
<ceph-devel@...r.kernel.org>,
"rostedt@...dmis.org" <rostedt@...dmis.org>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
"bigeasy@...utronix.de" <bigeasy@...utronix.de>,
"clrkwllms@...nel.org"
<clrkwllms@...nel.org>,
"superm1@...nel.org" <superm1@...nel.org>,
"jlayton@...nel.org" <jlayton@...nel.org>,
"jkosina@...e.com"
<jkosina@...e.com>
Subject: Re: [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path
On Sun, 2026-02-08 at 15:18 +0200, Ionut Nechita (Wind River) wrote:
> From: Ionut Nechita <ionut.nechita@...driver.com>
>
> When Ceph MDS becomes unreachable (e.g., due to IPv6 EADDRNOTAVAIL
> during DAD or network transitions), the sync syscall can block
> indefinitely in ceph_mdsc_sync(). The hung_task detector fires
> repeatedly (122s, 245s, 368s... up to 983+ seconds) with traces like:
Do you have a reproduction path for this? Which particular use case can trigger
the issue with higher probability?
Maybe we need to find the real root cause of the issue? It sounds to me like a
workaround for some underlying issue(s).
>
> INFO: task sync:12345 blocked for more than 122 seconds.
> Call Trace:
> ceph_mdsc_sync+0x4d6/0x5a0 [ceph]
> ceph_sync_fs+0x31/0x130 [ceph]
> iterate_supers+0x97/0x100
> ksys_sync+0x32/0xb0
>
> Three functions in the MDS sync path use indefinite waits:
>
> 1. wait_caps_flush() uses wait_event() with no timeout
> 2. flush_mdlog_and_wait_mdsc_unsafe_requests() uses
> wait_for_completion() with no timeout
> 3. ceph_mdsc_sync() returns void, cannot propagate errors
>
> This is particularly problematic in Kubernetes environments with
> PREEMPT_RT kernels where Ceph storage pods undergo rolling updates
> and IPv6 network reconfigurations cause temporary MDS unavailability.
If it is a temporary MDS unavailability, then I assume that libceph should manage
this situation. Am I wrong here? I expect that libceph should wake up the
waiting flushing threads. Do I oversimplify the whole workflow? :)
>
> Fix this by adding mount_timeout-based timeouts (default 60s) to the
> blocking waits, following the existing pattern used by wait_requests()
> and ceph_mdsc_close_sessions() in the same file:
Maybe you are right that we need to use the timeout-based approach. But what
was the reason that the flushing threads were not woken up?
Thanks,
Slava.
>
> - wait_caps_flush(): use wait_event_timeout() with mount_timeout
> - flush_mdlog_and_wait_mdsc_unsafe_requests(): use
> wait_for_completion_timeout() with mount_timeout
> - ceph_mdsc_sync(): change return type to int, propagate -ETIMEDOUT
> - ceph_sync_fs(): propagate error from ceph_mdsc_sync() to VFS
>
> On timeout, dirty caps and pending requests are NOT discarded - they
> remain in memory and are re-synced when MDS reconnects. The timeout
> simply unblocks the calling task. If mount_timeout is set to 0,
> ceph_timeout_jiffies() returns MAX_SCHEDULE_TIMEOUT, preserving the
> original infinite-wait behavior.
>
> Real-world impact: In production logs showing 'task sync blocked for
> more than 983 seconds', this patch limits the block to mount_timeout
> (60s default), returning -ETIMEDOUT to the VFS layer instead of
> hanging indefinitely.
>
> Fixes: 1b2ba3c5616e ("ceph: flush the mdlog for filesystem sync")
> Signed-off-by: Ionut Nechita <ionut.nechita@...driver.com>
> ---
> fs/ceph/mds_client.c | 50 ++++++++++++++++++++++++++++++++++----------
> fs/ceph/mds_client.h | 2 +-
> fs/ceph/super.c | 5 +++--
> 3 files changed, 43 insertions(+), 14 deletions(-)
>
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 7e4eab824daef..4cd8f584147f4 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -2290,17 +2290,26 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
> *
> * returns true if we've flushed through want_flush_tid
> */
> -static void wait_caps_flush(struct ceph_mds_client *mdsc,
> - u64 want_flush_tid)
> +static int wait_caps_flush(struct ceph_mds_client *mdsc,
> + u64 want_flush_tid)
> {
> struct ceph_client *cl = mdsc->fsc->client;
> + struct ceph_options *opts = mdsc->fsc->client->options;
> + long ret;
>
> doutc(cl, "want %llu\n", want_flush_tid);
>
> - wait_event(mdsc->cap_flushing_wq,
> - check_caps_flush(mdsc, want_flush_tid));
> + ret = wait_event_timeout(mdsc->cap_flushing_wq,
> + check_caps_flush(mdsc, want_flush_tid),
> + ceph_timeout_jiffies(opts->mount_timeout));
> + if (!ret) {
> + pr_warn_client(cl, "cap flush timeout waiting for tid %llu\n",
> + want_flush_tid);
> + return -ETIMEDOUT;
> + }
>
> doutc(cl, "ok, flushed thru %llu\n", want_flush_tid);
> + return 0;
> }
>
> /*
> @@ -5865,13 +5874,15 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
> /*
> * flush the mdlog and wait for all write mds requests to flush.
> */
> -static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
> - u64 want_tid)
> +static int flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
> + u64 want_tid)
> {
> struct ceph_client *cl = mdsc->fsc->client;
> + struct ceph_options *opts = mdsc->fsc->client->options;
> struct ceph_mds_request *req = NULL, *nextreq;
> struct ceph_mds_session *last_session = NULL;
> struct rb_node *n;
> + unsigned long left;
>
> mutex_lock(&mdsc->mutex);
> doutc(cl, "want %lld\n", want_tid);
> @@ -5910,7 +5921,19 @@ static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *md
> }
> doutc(cl, "wait on %llu (want %llu)\n",
> req->r_tid, want_tid);
> - wait_for_completion(&req->r_safe_completion);
> + left = wait_for_completion_timeout(
> + &req->r_safe_completion,
> + ceph_timeout_jiffies(opts->mount_timeout));
> + if (!left) {
> + pr_warn_client(cl,
> + "flush mdlog request tid %llu timed out\n",
> + req->r_tid);
> + ceph_mdsc_put_request(req);
> + if (nextreq)
> + ceph_mdsc_put_request(nextreq);
> + ceph_put_mds_session(last_session);
> + return -ETIMEDOUT;
> + }
>
> mutex_lock(&mdsc->mutex);
> ceph_mdsc_put_request(req);
> @@ -5928,15 +5951,17 @@ static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *md
> mutex_unlock(&mdsc->mutex);
> ceph_put_mds_session(last_session);
> doutc(cl, "done\n");
> + return 0;
> }
>
> -void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
> +int ceph_mdsc_sync(struct ceph_mds_client *mdsc)
> {
> struct ceph_client *cl = mdsc->fsc->client;
> u64 want_tid, want_flush;
> + int ret;
>
> if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN)
> - return;
> + return -EIO;
>
> doutc(cl, "sync\n");
> mutex_lock(&mdsc->mutex);
> @@ -5957,8 +5982,11 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
>
> doutc(cl, "sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
>
> - flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
> - wait_caps_flush(mdsc, want_flush);
> + ret = flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
> + if (ret)
> + return ret;
> +
> + return wait_caps_flush(mdsc, want_flush);
> }
>
> /*
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 0428a5eaf28c6..a8b72cb13de1f 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -569,7 +569,7 @@ extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
> extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
> extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
>
> -extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
> +extern int ceph_mdsc_sync(struct ceph_mds_client *mdsc);
>
> extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
> extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> index 7c1c1dac320da..6b0ad7a455815 100644
> --- a/fs/ceph/super.c
> +++ b/fs/ceph/super.c
> @@ -125,6 +125,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
> {
> struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
> struct ceph_client *cl = fsc->client;
> + int ret;
>
> if (!wait) {
> doutc(cl, "(non-blocking)\n");
> @@ -136,9 +137,9 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
>
> doutc(cl, "(blocking)\n");
> ceph_osdc_sync(&fsc->client->osdc);
> - ceph_mdsc_sync(fsc->mdsc);
> + ret = ceph_mdsc_sync(fsc->mdsc);
> doutc(cl, "(blocking) done\n");
> - return 0;
> + return ret;
> }
>
> /*
Powered by blists - more mailing lists