Message-ID: <87cyej5rid.ffs@tglx>
Date: Fri, 14 Mar 2025 20:41:46 +0100
From: Thomas Gleixner <tglx@...utronix.de>
To: Lei Chen <lei.chen@...rtx.com>, John Stultz <jstultz@...gle.com>,
 Stephen Boyd <sboyd@...nel.org>
Cc: linux-kernel@...r.kernel.org
Subject: Re: [PATCH] Fix rolling back of CLOCK_MONOTONIC_COARSE

On Mon, Mar 10 2025 at 11:00, Lei Chen wrote:
> To fix this issue, the patch accumulates offset into tk, and export
> N(P2) to real tk and vdso.
>
> tk.tkr_mono := N(P2) = N(P1) + offset * M1
>
> Then at P3, we calculate N(P3) based on N(P2) instead of N(P1):
> N(P3) := N(P2) + clock_delta * M2

Your analysis is mostly correct, but it is only correct with respect to
the coarse timekeeper.

Moving everything forward to P2 breaks the periodic accumulation and
therefore NTP. Monitoring NTP/chrony immediately shows that they behave
differently and do not really converge.

The real problem is that the introduction of the coarse time accessors
completely missed the fact that xtime_nsec is adjusted by multiplier
changes. This went unnoticed for about 15 years :)

So the obvious cure is to leave the accumulation logic alone and to
provide a separate coarse_nsec instance, which compensates for the
offset.

The offset contains the remainder of the periodic accumulation from the
point where timekeeping_advance() read the clocksource at T1.

At the point of readout T1, the nanoseconds were:

     T1nsec[OLD] = xtime_sec[OLD] * NSEC_PER_SEC +
                   (xtime_nsec[OLD] + offset * mult[OLD]) >> shift;

After the accumulation and eventual multiplier update that becomes:

     T1nsec[NEW] = xtime_sec[NEW] * NSEC_PER_SEC +
                   (xtime_nsec[NEW] + offset * mult[NEW]) >> shift;

If the unmodified accumulation math is correct then:

     T1nsec[OLD] == T1nsec[NEW]
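
For illustration, here is a minimal user-space sketch (plain C, with
made-up shift/mult/offset values, not actual kernel code) of the
compensation which timekeeping_apply_adjustment() applies on a
multiplier change. It prints the same T1nsec before and after the
update, while the naive coarse readout (xtime_nsec >> shift) jumps
backwards:

     #include <stdint.h>
     #include <stdio.h>

     int main(void)
     {
             /* Hypothetical clocksource parameters */
             const unsigned int shift = 8;
             const uint64_t mult_old = 1000, mult_new = 1001;

             /* Accumulated shifted nanoseconds and leftover cycles */
             uint64_t xtime_nsec = 500ULL << shift;
             const uint64_t offset = 300;

             uint64_t t1_old     = (xtime_nsec + offset * mult_old) >> shift;
             uint64_t coarse_old = xtime_nsec >> shift;

             /*
              * The multiplier update compensates xtime_nsec for the not
              * yet accumulated cycles, as timekeeping_apply_adjustment()
              * does, which can move xtime_nsec backwards.
              */
             xtime_nsec -= offset * (mult_new - mult_old);

             uint64_t t1_new     = (xtime_nsec + offset * mult_new) >> shift;
             uint64_t coarse_new = xtime_nsec >> shift;

             /* T1nsec: 1671 -> 1671, invariant */
             printf("T1nsec: %llu -> %llu\n",
                    (unsigned long long)t1_old,
                    (unsigned long long)t1_new);

             /* coarse: 500 -> 498, observable backwards jump */
             printf("coarse: %llu -> %llu\n",
                    (unsigned long long)coarse_old,
                    (unsigned long long)coarse_new);
             return 0;
     }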

The patch below implements exactly that and survives lightweight
testing, where the time jumps are no longer observable either in the
kernel or in userspace.

It needs quite some eyeballs, testing and validation.

Thanks,

        tglx
---
Subject: timekeeping: Decouple coarse time readout from xtime
From: Thomas Gleixner <tglx@...utronix.de>
Date: Fri, 14 Mar 2025 14:15:59 +0100

< Insert change log>

Fixes: da15cfdae033 ("time: Introduce CLOCK_REALTIME_COARSE")
Reported-by: Lei Chen <lei.chen@...rtx.com>
Not-Yet-Signed-off-by: Thomas Gleixner <tglx@...utronix.de>
---
 include/linux/timekeeper_internal.h |    8 +++--
 kernel/time/timekeeping.c           |   54 ++++++++++++++++++++++++++++++++----
 kernel/time/vsyscall.c              |    4 +-
 3 files changed, 55 insertions(+), 11 deletions(-)

--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -51,7 +51,7 @@ struct tk_read_base {
  * @offs_real:			Offset clock monotonic -> clock realtime
  * @offs_boot:			Offset clock monotonic -> clock boottime
  * @offs_tai:			Offset clock monotonic -> clock tai
- * @tai_offset:			The current UTC to TAI offset in seconds
+ * @coarse_nsec:		The nanoseconds part for coarse time getters
  * @tkr_raw:			The readout base structure for CLOCK_MONOTONIC_RAW
  * @raw_sec:			CLOCK_MONOTONIC_RAW  time in seconds
  * @clock_was_set_seq:		The sequence number of clock was set events
@@ -76,6 +76,7 @@ struct tk_read_base {
  *				ntp shifted nano seconds.
  * @ntp_err_mult:		Multiplication factor for scaled math conversion
  * @skip_second_overflow:	Flag used to avoid updating NTP twice with same second
+ * @tai_offset:			The current UTC to TAI offset in seconds
  *
  * Note: For timespec(64) based interfaces wall_to_monotonic is what
  * we need to add to xtime (or xtime corrected for sub jiffy times)
@@ -100,7 +101,7 @@ struct tk_read_base {
  * which results in the following cacheline layout:
  *
  * 0:	seqcount, tkr_mono
- * 1:	xtime_sec ... tai_offset
+ * 1:	xtime_sec ... coarse_nsec
  * 2:	tkr_raw, raw_sec
  * 3,4: Internal variables
  *
@@ -121,7 +122,7 @@ struct timekeeper {
 	ktime_t			offs_real;
 	ktime_t			offs_boot;
 	ktime_t			offs_tai;
-	s32			tai_offset;
+	u32			coarse_nsec;
 
 	/* Cacheline 2: */
 	struct tk_read_base	tkr_raw;
@@ -144,6 +145,7 @@ struct timekeeper {
 	u32			ntp_error_shift;
 	u32			ntp_err_mult;
 	u32			skip_second_overflow;
+	s32			tai_offset;
 };
 
 #ifdef CONFIG_GENERIC_TIME_VSYSCALL
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -164,10 +164,20 @@ static inline struct timespec64 tk_xtime
 	return ts;
 }
 
+static inline struct timespec64 tk_xtime_coarse(const struct timekeeper *tk)
+{
+	struct timespec64 ts;
+
+	ts.tv_sec = tk->xtime_sec;
+	ts.tv_nsec = tk->coarse_nsec;
+	return ts;
+}
+
 static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
 {
 	tk->xtime_sec = ts->tv_sec;
 	tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
+	tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
 }
 
 static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
@@ -175,6 +185,7 @@ static void tk_xtime_add(struct timekeep
 	tk->xtime_sec += ts->tv_sec;
 	tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
 	tk_normalize_xtime(tk);
+	tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
 }
 
 static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
@@ -708,6 +719,8 @@ static void timekeeping_forward_now(stru
 		tk_normalize_xtime(tk);
 		delta -= incr;
 	}
+
+	tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
 }
 
 /**
@@ -804,8 +817,8 @@ EXPORT_SYMBOL_GPL(ktime_get_with_offset)
 ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
-	unsigned int seq;
 	ktime_t base, *offset = offsets[offs];
+	unsigned int seq;
 	u64 nsecs;
 
 	WARN_ON(timekeeping_suspended);
@@ -813,7 +826,7 @@ ktime_t ktime_get_coarse_with_offset(enu
 	do {
 		seq = read_seqcount_begin(&tk_core.seq);
 		base = ktime_add(tk->tkr_mono.base, *offset);
-		nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+		nsecs = tk->coarse_nsec;
 
 	} while (read_seqcount_retry(&tk_core.seq, seq));
 
@@ -1926,6 +1939,33 @@ static int __init timekeeping_init_ops(v
 device_initcall(timekeeping_init_ops);
 
 /*
+ * Update the nanoseconds part for the coarse timekeepers. They can't rely
+ * on xtime_nsec because xtime_nsec is adjusted when the multiplication
+ * factor of the clock is adjusted. See timekeeping_apply_adjustment().
+ *
+ * This is required because tk_read_base::cycle_last must be advanced by
+ * timekeeper::cycle_interval so that the accumulation happens with a
+ * periodic reference.
+ *
+ * But that adjustment of xtime_nsec can make it go backward to compensate
+ * for a larger multiplier.
+ *
+ * @offset contains the leftover cycles which were not accumulated.
+ * Therefore the nanoseconds portion of the time when the clocksource was
+ * read in timekeeping_advance() is:
+ *
+ *	nsec = (xtime_nsec + offset * mult) >> shift;
+ *
+ * Calculate that value and store it in timekeeper::coarse_nsec, from where
+ * the coarse time getters consume it.
+ */
+static inline void timekeeping_update_coarse_nsecs(struct timekeeper *tk, u64 offset)
+{
+	offset *= tk->tkr_mono.mult;
+	tk->coarse_nsec = (tk->tkr_mono.xtime_nsec + offset) >> tk->tkr_mono.shift;
+}
+
+/*
  * Apply a multiplier adjustment to the timekeeper
  */
 static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
@@ -2205,6 +2245,8 @@ static bool timekeeping_advance(enum tim
 	 */
 	clock_set |= accumulate_nsecs_to_secs(tk);
 
+	timekeeping_update_coarse_nsecs(tk, offset);
+
 	timekeeping_update_from_shadow(&tk_core, clock_set);
 
 	return !!clock_set;
@@ -2248,7 +2290,7 @@ void ktime_get_coarse_real_ts64(struct t
 	do {
 		seq = read_seqcount_begin(&tk_core.seq);
 
-		*ts = tk_xtime(tk);
+		*ts = tk_xtime_coarse(tk);
 	} while (read_seqcount_retry(&tk_core.seq, seq));
 }
 EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
@@ -2271,7 +2313,7 @@ void ktime_get_coarse_real_ts64_mg(struc
 
 	do {
 		seq = read_seqcount_begin(&tk_core.seq);
-		*ts = tk_xtime(tk);
+		*ts = tk_xtime_coarse(tk);
 		offset = tk_core.timekeeper.offs_real;
 	} while (read_seqcount_retry(&tk_core.seq, seq));
 
@@ -2350,12 +2392,12 @@ void ktime_get_coarse_ts64(struct timesp
 	do {
 		seq = read_seqcount_begin(&tk_core.seq);
 
-		now = tk_xtime(tk);
+		now = tk_xtime_coarse(tk);
 		mono = tk->wall_to_monotonic;
 	} while (read_seqcount_retry(&tk_core.seq, seq));
 
 	set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec,
-				now.tv_nsec + mono.tv_nsec);
+				  now.tv_nsec + mono.tv_nsec);
 }
 EXPORT_SYMBOL(ktime_get_coarse_ts64);
 
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -97,12 +97,12 @@ void update_vsyscall(struct timekeeper *
 	/* CLOCK_REALTIME_COARSE */
 	vdso_ts		= &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
 	vdso_ts->sec	= tk->xtime_sec;
-	vdso_ts->nsec	= tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+	vdso_ts->nsec	= tk->coarse_nsec;
 
 	/* CLOCK_MONOTONIC_COARSE */
 	vdso_ts		= &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE];
 	vdso_ts->sec	= tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
-	nsec		= tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+	nsec		= tk->coarse_nsec;
 	nsec		= nsec + tk->wall_to_monotonic.tv_nsec;
 	vdso_ts->sec	+= __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec);
 



     

