Message-ID: <87r0hxbvhc.fsf@somnus>
Date: Wed, 31 Jan 2024 12:19:59 +0100
From: Anna-Maria Behnsen <anna-maria@...utronix.de>
To: Frederic Weisbecker <frederic@...nel.org>
Cc: linux-kernel@...r.kernel.org, Peter Zijlstra <peterz@...radead.org>,
 John Stultz <jstultz@...gle.com>, Thomas Gleixner <tglx@...utronix.de>,
 Eric Dumazet <edumazet@...gle.com>, "Rafael J . Wysocki"
 <rafael.j.wysocki@...el.com>, Arjan van de Ven <arjan@...radead.org>,
 "Paul E . McKenney" <paulmck@...nel.org>, Rik van Riel <riel@...riel.com>,
 Steven Rostedt <rostedt@...dmis.org>, Sebastian Siewior
 <bigeasy@...utronix.de>, Giovanni Gherdovich <ggherdovich@...e.cz>, Lukasz
 Luba <lukasz.luba@....com>, "Gautham R . Shenoy" <gautham.shenoy@....com>,
 Srinivas Pandruvada <srinivas.pandruvada@...el.com>, K Prateek Nayak
 <kprateek.nayak@....com>
Subject: Re: [PATCH v10 18/20] timers: Implement the hierarchical pull model

Frederic Weisbecker <frederic@...nel.org> writes:

[...]

>
> But you know what, let's make it simpler. CPU-down hotplug is not a
> fast path and doesn't deserve so many optimizations. Just remove ->wakeup_recalc
> entirely, and if the offlining CPU detects that it is the last active CPU in the
> hierarchy, queue an empty work on the first online CPU. That will briefly
> force the CPU out of idle and trigger an activation. Then that CPU will either
> periodically check remote timers or go back to idle and notice the pending event.
>

I'll have a look at it and give it a try! Thanks!

> Something like this (untested):
>
> diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
> index de1905b0bae7..0f15215ef257 100644
> --- a/kernel/time/timer_migration.c
> +++ b/kernel/time/timer_migration.c
> @@ -548,7 +548,6 @@ static void __tmigr_cpu_activate(struct tmigr_cpu *tmc)
>  
>  	tmc->cpuevt.ignore = true;
>  	WRITE_ONCE(tmc->wakeup, KTIME_MAX);
> -	tmc->wakeup_recalc = false;
>  
>  	walk_groups(&tmigr_active_up, &data, tmc);
>  }
> @@ -1041,41 +1040,11 @@ int tmigr_requires_handle_remote(void)
>  	}
>  
>  	/*
> -	 * If the CPU is idle, check whether the recalculation of @tmc->wakeup
> -	 * is required. @tmc->wakeup_recalc is set, when the last active CPU
> -	 * went offline. The last active CPU delegated the handling of the timer
> -	 * migration hierarchy to another (this) CPU by updating this flag and
> -	 * sending a reschedule.
> -	 *
> -	 * Racy lockless check is valid:
> -	 * - @tmc->wakeup_recalc is set by the remote CPU before it issues
> -	 *   reschedule IPI.
> -	 * - As interrupts are disabled here this CPU will either observe
> -	 *   @tmc->wakeup_recalc set before the reschedule IPI can be handled or
> -	 *   it will observe it when this function is called again on return
> -	 *   from handling the reschedule IPI.
> -	 */
> -	if (tmc->wakeup_recalc) {
> -		__walk_groups(&tmigr_requires_handle_remote_up, &data, tmc);
> -
> -		if (data.firstexp != KTIME_MAX)
> -			ret = 1;
> -
> -		raw_spin_lock(&tmc->lock);
> -		WRITE_ONCE(tmc->wakeup, data.firstexp);
> -		tmc->wakeup_recalc = false;
> -		raw_spin_unlock(&tmc->lock);
> -
> -		return ret;
> -	}
> -
> -	/*
> -	 * When the CPU is idle and @tmc->wakeup is reliable as
> -	 * @tmc->wakeup_recalc is not set, compare it with @data.now. The lock
> -	 * is required on 32bit architectures to read the variable consistently
> -	 * with a concurrent writer. On 64bit the lock is not required because
> -	 * the read operation is not split and so it is always consistent.
> -
> +	 * When the CPU is idle and @tmc->wakeup is reliable, compare it with
> +	 * @data.now. The lock is required on 32bit architectures to read the
> +	 * variable consistently with a concurrent writer. On 64bit the lock
> +	 * is not required because the read operation is not split and so it is
> +	 * always consistent.
>  	 */
>  	if (IS_ENABLED(CONFIG_64BIT)) {
>  		if (data.now >= READ_ONCE(tmc->wakeup))
> @@ -1119,21 +1088,7 @@ u64 tmigr_cpu_new_timer(u64 nextexp)
>  		    tmc->cpuevt.ignore) {
>  			ret = tmigr_new_timer(tmc, nextexp);
>  		}
> -	} else if (tmc->wakeup_recalc) {
> -		struct tmigr_remote_data data;
> -
> -		data.now = KTIME_MAX;
> -		data.childmask = tmc->childmask;
> -		data.firstexp = KTIME_MAX;
> -		data.tmc_active = false;
> -		data.check = false;
> -
> -		__walk_groups(&tmigr_requires_handle_remote_up, &data, tmc);
> -
> -		ret = data.firstexp;
>  	}
> -	tmc->wakeup_recalc = false;
> -
>  	/*
>  	 * Make sure the reevaluation of timers in idle path will not miss an
>  	 * event.
> @@ -1212,36 +1167,7 @@ static bool tmigr_inactive_up(struct tmigr_group *group,
>  	 *   hierarchy) and
>  	 * - there is a pending event in the hierarchy
>  	 */
> -	if (data->firstexp != KTIME_MAX) {
> -		WARN_ON_ONCE(group->parent);
> -		/*
> -		 * Top level path: If this CPU is about going offline and was
> -		 * the last active CPU, wake up some random other CPU so it will
> -		 * take over the migrator duty and program its timer
> -		 * properly. Ideally wake the CPU with the closest expiry time,
> -		 * but that's overkill to figure out.
> -		 *
> -		 * Set wakeup_recalc of remote CPU, to make sure the complete
> -		 * idle hierarchy with enqueued timers is reevaluated.
> -		 */
> -		if (!(this_cpu_ptr(&tmigr_cpu)->online)) {
> -			struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
> -			unsigned int cpu = smp_processor_id();
> -			struct tmigr_cpu *tmc_resched;
> -
> -			cpu = cpumask_any_but(cpu_online_mask, cpu);
> -			tmc_resched = per_cpu_ptr(&tmigr_cpu, cpu);
> -
> -			raw_spin_unlock(&tmc->lock);
> -
> -			raw_spin_lock(&tmc_resched->lock);
> -			tmc_resched->wakeup_recalc = true;
> -			raw_spin_unlock(&tmc_resched->lock);
> -
> -			raw_spin_lock(&tmc->lock);
> -			smp_send_reschedule(cpu);
> -		}
> -	}
> +	WARN_ON_ONCE(data->firstexp != KTIME_MAX && group->parent);
>  
>  	return walk_done;
>  }
> @@ -1579,9 +1505,20 @@ static int tmigr_cpu_online(unsigned int cpu)
>  	return 0;
>  }
>  
> +long tmigr_trigger_active(void *unused)
> +{
> +	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
> +
> +	WARN_ON_ONCE(!tmc->online || tmc->idle);
> +
> +	return 0;
> +}
> +
>  static int tmigr_cpu_offline(unsigned int cpu)
>  {
>  	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
> +	int migrator;
> +	u64 firstexp;
>  
>  	raw_spin_lock_irq(&tmc->lock);
>  	tmc->online = false;
> @@ -1591,9 +1528,14 @@ static int tmigr_cpu_offline(unsigned int cpu)
>  	 * CPU has to handle the local events on his own, when on the way to
>  	 * offline; Therefore nextevt value is set to KTIME_MAX
>  	 */
> -	__tmigr_cpu_deactivate(tmc, KTIME_MAX);
> +	firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX);
>  	raw_spin_unlock_irq(&tmc->lock);
>  
> +	if (firstexp != KTIME_MAX) {
> +		migrator = cpumask_any_but(cpu_online_mask, cpu);
> +		work_on_cpu(migrator, tmigr_trigger_active, NULL);
> +	}
> +
>  	return 0;
>  }
>  
> diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h
> index c32947cf429b..c556d5824792 100644
> --- a/kernel/time/timer_migration.h
> +++ b/kernel/time/timer_migration.h
> @@ -78,18 +78,12 @@ struct tmigr_group {
>   * @idle:		Indicates whether the CPU is idle in the timer migration
>   *			hierarchy
>   * @remote:		Is set when timers of the CPU are expired remotely
> - * @wakeup_recalc:	Indicates, whether a recalculation of the @wakeup value
> - *			is required. @wakeup_recalc is only used by this CPU
> - *			when it is marked idle in the timer migration
> - *			hierarchy. It is set by a remote CPU which was the last
> - *			active CPU and is on the way to idle.
>   * @tmgroup:		Pointer to the parent group
>   * @childmask:		childmask of tmigr_cpu in the parent group
>   * @wakeup:		Stores the first timer when the timer migration
>   *			hierarchy is completely idle and remote expiry was done;
>   *			is returned to timer code in the idle path and is only
> - *			used in idle path; it is only valid, when @wakeup_recalc
> - *			is not set.
> + *			used in idle path.
>   * @cpuevt:		CPU event which could be enqueued into the parent group
>   */
>  struct tmigr_cpu {
> @@ -97,7 +91,6 @@ struct tmigr_cpu {
>  	bool			online;
>  	bool			idle;
>  	bool			remote;
> -	bool			wakeup_recalc;
>  	struct tmigr_group	*tmgroup;
>  	u8			childmask;
>  	u64			wakeup;
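
For readers skimming the thread: the offline-path change above boils down to a
synchronous work_on_cpu() kick of some other online CPU. A minimal sketch of
that pattern in isolation (kick_cpu_fn() and kick_some_other_cpu() are
illustrative names, not part of the patch):

#include <linux/cpumask.h>
#include <linux/workqueue.h>

/*
 * Runs in process context on the chosen CPU. Doing nothing is enough;
 * the point is merely to pull that CPU out of idle so it reevaluates
 * the timer migration hierarchy on its way back to idle.
 */
static long kick_cpu_fn(void *unused)
{
	return 0;
}

/*
 * Called from the CPU going down: pick any other online CPU and run
 * the empty work there synchronously.
 */
static void kick_some_other_cpu(unsigned int dying_cpu)
{
	unsigned int target = cpumask_any_but(cpu_online_mask, dying_cpu);

	if (target < nr_cpu_ids)
		work_on_cpu(target, kick_cpu_fn, NULL);
}

In the patch, tmigr_trigger_active() plays the role of kick_cpu_fn() above,
with an additional WARN_ON_ONCE() sanity check that the target CPU is online
and not idle in the hierarchy by the time the work runs.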
