Message-ID: <87r01qq7hp.ffs@tglx>
Date: Fri, 18 Apr 2025 09:00:34 +0200
From: Thomas Gleixner <tglx@...utronix.de>
To: John Stultz <jstultz@...gle.com>
Cc: Miroslav Lichvar <mlichvar@...hat.com>, LKML
<linux-kernel@...r.kernel.org>, Stephen Boyd <sboyd@...nel.org>,
Anna-Maria Behnsen <anna-maria@...utronix.de>, Frederic Weisbecker
<frederic@...nel.org>, Shuah Khan <shuah@...nel.org>,
linux-kselftest@...r.kernel.org, kernel-team@...roid.com, Lei Chen
<lei.chen@...rtx.com>
Subject: Re: [PATCH] timekeeping: Prevent coarse clocks going backwards
On Fri, Apr 18 2025 at 08:37, Thomas Gleixner wrote:
> On Thu, Apr 17 2025 at 17:46, John Stultz wrote:
>> Instead it seems like we should just do:
>> tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
>
> You end up with the same problem again because xtime_nsec can move
> backwards when the multiplier is updated, no?
Something like the below should work.
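To illustrate with made up numbers, here is a standalone toy model (not
kernel code; the adjustment step only mimics in spirit what
timekeeping_apply_adjustment() does to xtime_nsec and mult):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t shift = 10, mult = 1000, mult_adj = 1;
	uint64_t xtime_nsec = 500ULL << shift;	/* accumulated scaled nsecs */
	uint64_t offset = 300;			/* not yet accumulated cycles */

	/* Coarse readout based on xtime_nsec alone */
	printf("old before adjust: %llu\n",
	       (unsigned long long)(xtime_nsec >> shift));
	/* Readout based on the formula the patch switches to */
	printf("new before adjust: %llu\n",
	       (unsigned long long)((xtime_nsec + offset * mult) >> shift));

	/* Mimic a multiplier adjustment with cycles still unaccumulated */
	xtime_nsec -= offset * mult_adj;
	mult += mult_adj;

	/* Prints 499: the xtime_nsec based readout went backwards */
	printf("old after adjust:  %llu\n",
	       (unsigned long long)(xtime_nsec >> shift));
	/* Prints 792 again: invariant across the adjustment */
	printf("new after adjust:  %llu\n",
	       (unsigned long long)((xtime_nsec + offset * mult) >> shift));
	return 0;
}

The xtime_nsec based readout drops from 500 to 499 across the
adjustment, while (xtime_nsec + offset * mult) >> shift stays at 792 by
construction. That invariant is what the coarse time getters are
switched to below.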
Thanks,
tglx
---
 include/linux/timekeeper_internal.h |   10 ++++-
 kernel/time/timekeeping.c           |   62 ++++++++++++++++++++++++++++++++----
 kernel/time/vsyscall.c              |    4 +-
 3 files changed, 65 insertions(+), 11 deletions(-)
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -51,7 +51,7 @@ struct tk_read_base {
* @offs_real: Offset clock monotonic -> clock realtime
* @offs_boot: Offset clock monotonic -> clock boottime
* @offs_tai: Offset clock monotonic -> clock tai
- * @tai_offset: The current UTC to TAI offset in seconds
+ * @coarse_nsec: The nanoseconds part for coarse time getters
* @tkr_raw: The readout base structure for CLOCK_MONOTONIC_RAW
* @raw_sec: CLOCK_MONOTONIC_RAW time in seconds
* @clock_was_set_seq: The sequence number of clock was set events
@@ -76,6 +76,8 @@ struct tk_read_base {
* ntp shifted nano seconds.
* @ntp_err_mult: Multiplication factor for scaled math conversion
* @skip_second_overflow: Flag used to avoid updating NTP twice with same second
+ * @tai_offset: The current UTC to TAI offset in seconds
+ * @coarse_offset: The offset of the coarse timekeeper in clock cycles
*
* Note: For timespec(64) based interfaces wall_to_monotonic is what
* we need to add to xtime (or xtime corrected for sub jiffy times)
@@ -100,7 +102,7 @@ struct tk_read_base {
* which results in the following cacheline layout:
*
* 0: seqcount, tkr_mono
- * 1: xtime_sec ... tai_offset
+ * 1: xtime_sec ... coarse_nsec
* 2: tkr_raw, raw_sec
* 3,4: Internal variables
*
@@ -121,7 +123,7 @@ struct timekeeper {
ktime_t offs_real;
ktime_t offs_boot;
ktime_t offs_tai;
- s32 tai_offset;
+ u32 coarse_nsec;
/* Cacheline 2: */
struct tk_read_base tkr_raw;
@@ -144,6 +146,8 @@ struct timekeeper {
u32 ntp_error_shift;
u32 ntp_err_mult;
u32 skip_second_overflow;
+ s32 tai_offset;
+ u32 coarse_offset;
};
#ifdef CONFIG_GENERIC_TIME_VSYSCALL
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -164,6 +164,15 @@ static inline struct timespec64 tk_xtime
return ts;
}
+static inline struct timespec64 tk_xtime_coarse(const struct timekeeper *tk)
+{
+ struct timespec64 ts;
+
+ ts.tv_sec = tk->xtime_sec;
+ ts.tv_nsec = tk->coarse_nsec;
+ return ts;
+}
+
static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
{
tk->xtime_sec = ts->tv_sec;
@@ -252,6 +261,7 @@ static void tk_setup_internals(struct ti
tk->tkr_raw.clock = clock;
tk->tkr_raw.mask = clock->mask;
tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;
+ tk->coarse_offset = 0;
/* Do the ns -> cycle conversion first, using original mult */
tmp = NTP_INTERVAL_LENGTH;
@@ -636,6 +646,34 @@ static void timekeeping_restore_shadow(s
memcpy(&tkd->shadow_timekeeper, &tkd->timekeeper, sizeof(tkd->timekeeper));
}
+/*
+ * Update the nanoseconds part for the coarse time getters. They can't rely
+ * on xtime_nsec because xtime_nsec is adjusted when the multiplication
+ * factor of the clock is adjusted. See timekeeping_apply_adjustment().
+ *
+ * This is required because tk_read_base::cycle_last must be advanced by
+ * timekeeper::cycle_interval so that the accumulation happens with a
+ * periodic reference.
+ *
+ * But that adjustment of xtime_nsec can make it go backward to compensate
+ * for a larger multiplier.
+ *
+ * timekeeper::coarse_offset contains the leftover cycles which were not
+ * accumulated. Therefore the nanoseconds portion of the time when the
+ * clocksource was read in timekeeping_advance() is:
+ *
+ * nsec = (xtime_nsec + offset * mult) >> shift;
+ *
+ * Calculate that value and store it in timekeeper::coarse_nsec, from where
+ * the coarse time getters consume it.
+ */
+static inline void tk_update_coarse_nsecs(struct timekeeper *tk)
+{
+ u64 offset = (u64)tk->coarse_offset * tk->tkr_mono.mult;
+
+ tk->coarse_nsec = (tk->tkr_mono.xtime_nsec + offset) >> tk->tkr_mono.shift;
+}
+
static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action)
{
struct timekeeper *tk = &tk_core.shadow_timekeeper;
@@ -658,6 +696,7 @@ static void timekeeping_update_from_shad
tk_update_leap_state(tk);
tk_update_ktime_data(tk);
+ tk_update_coarse_nsecs(tk);
update_vsyscall(tk);
update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
@@ -708,6 +747,12 @@ static void timekeeping_forward_now(stru
tk_normalize_xtime(tk);
delta -= incr;
}
+
+ /*
+ * Clear the offset for the coarse time getters as the forwarding
+ * above brought the offset down to zero.
+ */
+ tk->coarse_offset = 0;
}
/**
@@ -804,8 +849,8 @@ EXPORT_SYMBOL_GPL(ktime_get_with_offset)
ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
{
struct timekeeper *tk = &tk_core.timekeeper;
- unsigned int seq;
ktime_t base, *offset = offsets[offs];
+ unsigned int seq;
u64 nsecs;
WARN_ON(timekeeping_suspended);
@@ -813,7 +858,7 @@ ktime_t ktime_get_coarse_with_offset(enu
do {
seq = read_seqcount_begin(&tk_core.seq);
base = ktime_add(tk->tkr_mono.base, *offset);
- nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ nsecs = tk->coarse_nsec;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -1831,6 +1876,8 @@ void timekeeping_resume(void)
/* Re-base the last cycle value */
tks->tkr_mono.cycle_last = cycle_now;
tks->tkr_raw.cycle_last = cycle_now;
+ /* Reset the offset for the coarse time getters */
+ tks->coarse_offset = 0;
tks->ntp_error = 0;
timekeeping_suspended = 0;
@@ -2205,6 +2252,9 @@ static bool timekeeping_advance(enum tim
*/
clock_set |= accumulate_nsecs_to_secs(tk);
+ /* Store the leftover cycles so the coarse time getters can compensate */
+ tk->coarse_offset = (u32)offset;
+
timekeeping_update_from_shadow(&tk_core, clock_set);
return !!clock_set;
@@ -2248,7 +2298,7 @@ void ktime_get_coarse_real_ts64(struct t
do {
seq = read_seqcount_begin(&tk_core.seq);
- *ts = tk_xtime(tk);
+ *ts = tk_xtime_coarse(tk);
} while (read_seqcount_retry(&tk_core.seq, seq));
}
EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
@@ -2271,7 +2321,7 @@ void ktime_get_coarse_real_ts64_mg(struc
do {
seq = read_seqcount_begin(&tk_core.seq);
- *ts = tk_xtime(tk);
+ *ts = tk_xtime_coarse(tk);
offset = tk_core.timekeeper.offs_real;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -2350,12 +2400,12 @@ void ktime_get_coarse_ts64(struct timesp
do {
seq = read_seqcount_begin(&tk_core.seq);
- now = tk_xtime(tk);
+ now = tk_xtime_coarse(tk);
mono = tk->wall_to_monotonic;
} while (read_seqcount_retry(&tk_core.seq, seq));
set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec,
- now.tv_nsec + mono.tv_nsec);
+ now.tv_nsec + mono.tv_nsec);
}
EXPORT_SYMBOL(ktime_get_coarse_ts64);
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -98,12 +98,12 @@ void update_vsyscall(struct timekeeper *
/* CLOCK_REALTIME_COARSE */
vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
vdso_ts->sec = tk->xtime_sec;
- vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ vdso_ts->nsec = tk->coarse_nsec;
/* CLOCK_MONOTONIC_COARSE */
vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE];
vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
- nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ nsec = tk->coarse_nsec;
nsec = nsec + tk->wall_to_monotonic.tv_nsec;
vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec);
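As a quick userspace smoke test for the regression, something like the
sketch below can be used (illustrative only, not the kselftest; the
original bug is only observable while NTP is actively steering the
clock multiplier):

#define _GNU_SOURCE	/* for CLOCK_MONOTONIC_COARSE on older setups */
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec prev, now;
	long i;

	clock_gettime(CLOCK_MONOTONIC_COARSE, &prev);
	for (i = 0; i < 100000000L; i++) {
		clock_gettime(CLOCK_MONOTONIC_COARSE, &now);
		/* A monotonic clock must never be observed going backwards */
		if (now.tv_sec < prev.tv_sec ||
		    (now.tv_sec == prev.tv_sec && now.tv_nsec < prev.tv_nsec)) {
			printf("backwards: %lld.%09ld -> %lld.%09ld\n",
			       (long long)prev.tv_sec, prev.tv_nsec,
			       (long long)now.tv_sec, now.tv_nsec);
			return 1;
		}
		prev = now;
	}
	return 0;
}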