Message-ID: <87iknnwxa4.ffs@tglx>
Date: Tue, 01 Apr 2025 20:29:23 +0200
From: Thomas Gleixner <tglx@...utronix.de>
To: Miroslav Lichvar <mlichvar@...hat.com>
Cc: John Stultz <jstultz@...gle.com>, LKML <linux-kernel@...r.kernel.org>,
Stephen Boyd <sboyd@...nel.org>, Anna-Maria Behnsen
<anna-maria@...utronix.de>, Frederic Weisbecker <frederic@...nel.org>,
Shuah Khan <shuah@...nel.org>, linux-kselftest@...r.kernel.org,
kernel-team@...roid.com, Lei Chen <lei.chen@...rtx.com>
Subject: Re: [PATCH v2 1/2] time/timekeeping: Fix possible inconsistencies
in _COARSE clockids
On Tue, Apr 01 2025 at 13:19, Miroslav Lichvar wrote:
> On Tue, Apr 01, 2025 at 08:34:23AM +0200, Thomas Gleixner wrote:
>> On Mon, Mar 31 2025 at 16:53, Miroslav Lichvar wrote:
>> > Mult reduction   Updates/sec   Skew before   Skew after
>> > 16               4             0.004         0.009
>> > 16               16            0.011         0.069
>> > 16               64            0.020         0.117
>> > 64               4             0.013         0.012
>> > 64               16            0.030         0.107
>> > 64               64            0.058         0.879
>>
>> Hrm.
>>
>> Can you try the delta patch below?
>
> It seems to improve the worst cases, but overall it's still
> a regression.
>
> Mult reduction   Updates/sec   Skew
> 16               4             0.012
> 16               16            0.014
> 16               64            0.033
> 64               4             0.012
> 64               16            0.105
> 64               64            0.138
That's weird, as it only delays the update to the next tick. My original
approach of maintaining separate state for the coarse timekeeper
preserves the existing NTP behaviour.
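For illustration (this is not part of the patch, and the numbers are made
up rather than taken from any real clocksource), a minimal userspace
sketch of the computation which tk_update_coarse_nsecs() below performs,
and of the xtime_nsec adjustment it compensates for:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t shift = 8;			/* hypothetical clocksource shift */
	uint32_t mult = 400;			/* hypothetical clocksource mult  */
	uint64_t xtime_nsec = 1000ULL << shift;	/* shifted nanoseconds */
	uint64_t offset = 50;			/* not yet accumulated cycles */

	/* Nanoseconds at the point where the clocksource was read */
	printf("coarse_nsec = %llu\n",
	       (unsigned long long)((xtime_nsec + offset * mult) >> shift));

	/*
	 * When NTP raises mult, timekeeping_apply_adjustment() subtracts
	 * roughly offset * (mult_new - mult_old) from xtime_nsec so that
	 * readouts based on (xtime_nsec + offset * mult) stay continuous.
	 * A coarse getter which consumes the raw xtime_nsec sees it jump
	 * backwards here; one based on the full sum does not.
	 */
	uint32_t mult_new = mult + 16;
	xtime_nsec -= offset * (mult_new - mult);
	printf("raw xtime_nsec >> shift = %llu (went backwards)\n",
	       (unsigned long long)(xtime_nsec >> shift));
	printf("coarse_nsec = %llu (unchanged)\n",
	       (unsigned long long)((xtime_nsec + offset * mult_new) >> shift));
	return 0;
}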
Patch applies after reverting 757b000f7b93 ("timekeeping: Fix possible
inconsistencies in _COARSE clockids").
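E.g. (the patch file name is just for illustration):

  $ git revert --no-edit 757b000f7b93
  $ git apply coarse.patch	# or: patch -p1 < coarse.patch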
Thanks,
tglx
---
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -51,7 +51,7 @@ struct tk_read_base {
* @offs_real: Offset clock monotonic -> clock realtime
* @offs_boot: Offset clock monotonic -> clock boottime
* @offs_tai: Offset clock monotonic -> clock tai
- * @tai_offset: The current UTC to TAI offset in seconds
+ * @coarse_nsec: The nanoseconds part for coarse time getters
* @tkr_raw: The readout base structure for CLOCK_MONOTONIC_RAW
* @raw_sec: CLOCK_MONOTONIC_RAW time in seconds
* @clock_was_set_seq: The sequence number of clock was set events
@@ -76,6 +76,7 @@ struct tk_read_base {
* ntp shifted nano seconds.
* @ntp_err_mult: Multiplication factor for scaled math conversion
* @skip_second_overflow: Flag used to avoid updating NTP twice with same second
+ * @tai_offset: The current UTC to TAI offset in seconds
*
* Note: For timespec(64) based interfaces wall_to_monotonic is what
* we need to add to xtime (or xtime corrected for sub jiffy times)
@@ -100,7 +101,7 @@ struct tk_read_base {
* which results in the following cacheline layout:
*
* 0: seqcount, tkr_mono
- * 1: xtime_sec ... tai_offset
+ * 1: xtime_sec ... coarse_nsec
* 2: tkr_raw, raw_sec
* 3,4: Internal variables
*
@@ -121,7 +122,7 @@ struct timekeeper {
ktime_t offs_real;
ktime_t offs_boot;
ktime_t offs_tai;
- s32 tai_offset;
+ u32 coarse_nsec;
/* Cacheline 2: */
struct tk_read_base tkr_raw;
@@ -144,6 +145,7 @@ struct timekeeper {
u32 ntp_error_shift;
u32 ntp_err_mult;
u32 skip_second_overflow;
+ s32 tai_offset;
};
#ifdef CONFIG_GENERIC_TIME_VSYSCALL
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -31,6 +31,7 @@
#define TK_CLEAR_NTP (1 << 0)
#define TK_CLOCK_WAS_SET (1 << 1)
+#define TK_RETAIN_COARSE (1 << 2)
#define TK_UPDATE_ALL (TK_CLEAR_NTP | TK_CLOCK_WAS_SET)
@@ -164,6 +165,15 @@ static inline struct timespec64 tk_xtime
return ts;
}
+static inline struct timespec64 tk_xtime_coarse(const struct timekeeper *tk)
+{
+ struct timespec64 ts;
+
+ ts.tv_sec = tk->xtime_sec;
+ ts.tv_nsec = tk->coarse_nsec;
+ return ts;
+}
+
static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
{
tk->xtime_sec = ts->tv_sec;
@@ -636,7 +646,34 @@ static void timekeeping_restore_shadow(s
memcpy(&tkd->shadow_timekeeper, &tkd->timekeeper, sizeof(tkd->timekeeper));
}
-static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action)
+/*
+ * Update the nanoseconds part for the coarse time keepers. They can't rely
+ * on xtime_nsec because xtime_nsec is adjusted when the multiplication
+ * factor of the clock is adjusted. See timekeeping_apply_adjustment().
+ *
+ * This is required because tk_read_base::cycle_last must be advanced by
+ * timekeeper::cycle_interval so that the accumulation happens with a
+ * periodic reference.
+ *
+ * But that adjustment of xtime_nsec can make it go backward to compensate
+ * for a larger multiplier.
+ *
+ * @offset contains the leftover cycles which were not accumulated.
+ * Therefore the nanoseconds portion of the time when the clocksource was
+ * read in timekeeping_advance() is:
+ *
+ * nsec = (xtime_nsec + offset * mult) >> shift;
+ *
+ * Calculate that value and store it in timekeeper::coarse_nsec, from where
+ * the coarse time getters consume it.
+ */
+static inline void tk_update_coarse_nsecs(struct timekeeper *tk, u64 offset)
+{
+ offset *= tk->tkr_mono.mult;
+ tk->coarse_nsec = (tk->tkr_mono.xtime_nsec + offset) >> tk->tkr_mono.shift;
+}
+
+static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action, u64 offset)
{
struct timekeeper *tk = &tk_core.shadow_timekeeper;
@@ -659,6 +696,9 @@ static void timekeeping_update_from_shad
tk_update_leap_state(tk);
tk_update_ktime_data(tk);
+ if (!(action & TK_RETAIN_COARSE))
+ tk_update_coarse_nsecs(tk, offset);
+
update_vsyscall(tk);
update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
@@ -804,8 +844,8 @@ EXPORT_SYMBOL_GPL(ktime_get_with_offset)
ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
{
struct timekeeper *tk = &tk_core.timekeeper;
- unsigned int seq;
ktime_t base, *offset = offsets[offs];
+ unsigned int seq;
u64 nsecs;
WARN_ON(timekeeping_suspended);
@@ -813,7 +853,7 @@ ktime_t ktime_get_coarse_with_offset(enu
do {
seq = read_seqcount_begin(&tk_core.seq);
base = ktime_add(tk->tkr_mono.base, *offset);
- nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ nsecs = tk->coarse_nsec;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -1374,7 +1414,7 @@ int do_settimeofday64(const struct times
tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, ts_delta));
tk_set_xtime(tks, ts);
- timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
+ timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL, 0);
}
/* Signal hrtimers about time change */
@@ -1413,7 +1453,7 @@ static int timekeeping_inject_offset(con
tk_xtime_add(tks, ts);
tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts));
- timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
+ timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL, 0);
}
/* Signal hrtimers about time change */
@@ -1493,7 +1533,7 @@ static int change_clocksource(void *data
timekeeping_forward_now(tks);
old = tks->tkr_mono.clock;
tk_setup_internals(tks, new);
- timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
+ timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL, 0);
}
if (old) {
@@ -1690,7 +1730,7 @@ void __init timekeeping_init(void)
tk_set_wall_to_mono(tks, wall_to_mono);
- timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET);
+ timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET, 0);
}
/* time in seconds when suspend began for persistent clock */
@@ -1774,7 +1814,7 @@ void timekeeping_inject_sleeptime64(cons
suspend_timing_needed = false;
timekeeping_forward_now(tks);
__timekeeping_inject_sleeptime(tks, delta);
- timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
+ timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL, 0);
}
/* Signal hrtimers about time change */
@@ -1834,7 +1874,7 @@ void timekeeping_resume(void)
tks->ntp_error = 0;
timekeeping_suspended = 0;
- timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET);
+ timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET, 0);
raw_spin_unlock_irqrestore(&tk_core.lock, flags);
touch_softlockup_watchdog();
@@ -1901,7 +1941,7 @@ int timekeeping_suspend(void)
}
}
- timekeeping_update_from_shadow(&tk_core, 0);
+ timekeeping_update_from_shadow(&tk_core, 0, 0);
halt_fast_timekeeper(tks);
raw_spin_unlock_irqrestore(&tk_core.lock, flags);
@@ -2205,7 +2245,7 @@ static bool timekeeping_advance(enum tim
*/
clock_set |= accumulate_nsecs_to_secs(tk);
- timekeeping_update_from_shadow(&tk_core, clock_set);
+ timekeeping_update_from_shadow(&tk_core, clock_set, offset);
return !!clock_set;
}
@@ -2248,7 +2288,7 @@ void ktime_get_coarse_real_ts64(struct t
do {
seq = read_seqcount_begin(&tk_core.seq);
- *ts = tk_xtime(tk);
+ *ts = tk_xtime_coarse(tk);
} while (read_seqcount_retry(&tk_core.seq, seq));
}
EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
@@ -2271,7 +2311,7 @@ void ktime_get_coarse_real_ts64_mg(struc
do {
seq = read_seqcount_begin(&tk_core.seq);
- *ts = tk_xtime(tk);
+ *ts = tk_xtime_coarse(tk);
offset = tk_core.timekeeper.offs_real;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -2350,12 +2390,12 @@ void ktime_get_coarse_ts64(struct timesp
do {
seq = read_seqcount_begin(&tk_core.seq);
- now = tk_xtime(tk);
+ now = tk_xtime_coarse(tk);
mono = tk->wall_to_monotonic;
} while (read_seqcount_retry(&tk_core.seq, seq));
set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec,
- now.tv_nsec + mono.tv_nsec);
+ now.tv_nsec + mono.tv_nsec);
}
EXPORT_SYMBOL(ktime_get_coarse_ts64);
@@ -2539,7 +2579,8 @@ int do_adjtimex(struct __kernel_timex *t
if (tai != orig_tai) {
__timekeeping_set_tai_offset(tks, tai);
- timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET);
+ timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET |
+ TK_RETAIN_COARSE, 0);
clock_set = true;
} else {
tk_update_leap_state_all(&tk_core);
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -98,12 +98,12 @@ void update_vsyscall(struct timekeeper *
/* CLOCK_REALTIME_COARSE */
vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
vdso_ts->sec = tk->xtime_sec;
- vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ vdso_ts->nsec = tk->coarse_nsec;
/* CLOCK_MONOTONIC_COARSE */
vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE];
vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
- nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ nsec = tk->coarse_nsec;
nsec = nsec + tk->wall_to_monotonic.tv_nsec;
vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec);
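
For reference, a rough userspace sketch (not Miroslav's actual test; the
sampling scheme is made up) of how skew between the coarse and the
NTP-adjusted fine clock can be observed: the offset between the two
should stay bounded by one tick instead of drifting over time.

#include <stdio.h>
#include <time.h>
#include <unistd.h>

static double ts_diff(const struct timespec *a, const struct timespec *b)
{
	return (a->tv_sec - b->tv_sec) + (a->tv_nsec - b->tv_nsec) / 1e9;
}

int main(void)
{
	for (int i = 0; i < 10; i++) {
		struct timespec fine, coarse;

		clock_gettime(CLOCK_REALTIME, &fine);
		clock_gettime(CLOCK_REALTIME_COARSE, &coarse);
		/* Coarse lags by up to one tick; watch the trend, not the value */
		printf("fine - coarse = %+.9f s\n", ts_diff(&fine, &coarse));
		sleep(1);
	}
	return 0;
}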