[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <cf25be6e-bffe-486a-a02e-d828203bec3e@oracle.com>
Date: Tue, 1 Apr 2025 18:52:46 -0700
From: Libo Chen <libo.chen@...cle.com>
To: Peter Zijlstra <peterz@...radead.org>
Cc: kprateek.nayak@....com, mingo@...nel.org, gautham.shenoy@....com,
juri.lelli@...hat.com, vincent.guittot@...aro.org,
dietmar.eggemann@....com, rostedt@...dmis.org, bsegall@...gle.com,
mgorman@...e.de, vschneid@...hat.com, linux-kernel@...r.kernel.org,
yu.c.chen@...el.com, tim.c.chen@...ux.intel.com, tglx@...utronix.de
Subject: Re: [RFC][PATCH] sched: Cache aware load-balancing
On 3/25/25 05:09, Peter Zijlstra wrote:
> + for_each_cpu(cpu, cpus) {
> + /* XXX sched_cluster_active */
> + struct sched_domain *sd = per_cpu(sd_llc, cpu);
Hi Peter,
I understand that this targets the LLC, but I just want to point out that sd
here could be NULL on architectures without an LLC, which would then cause a
NULL pointer dereference in sched_domain_span(sd).
Libo
> + unsigned long occ, m_occ = 0, a_occ = 0;
> + int m_cpu = -1, nr = 0, i;
> +
> + for_each_cpu(i, sched_domain_span(sd)) {
> + occ = fraction_mm_sched(cpu_rq(i),
> + per_cpu_ptr(mm->pcpu_sched, i));
> + a_occ += occ;
> + if (occ > m_occ) {
> + m_occ = occ;
> + m_cpu = i;
> + }
> + nr++;
> + trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n",
> + per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr);
> + }
> +
> + a_occ /= nr;
> + if (a_occ > m_a_occ) {
> + m_a_occ = a_occ;
> + m_a_cpu = m_cpu;
> + }
> +
> + trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n",
> + per_cpu(sd_llc_id, cpu), a_occ, m_a_occ);
> +
> + for_each_cpu(i, sched_domain_span(sd)) {
> + /* XXX threshold ? */
> + per_cpu_ptr(mm->pcpu_sched, i)->occ = a_occ;
> + }
> +
> + cpumask_andnot(cpus, cpus, sched_domain_span(sd));
> + }
> + }
> +
> + /*
> + * If the max average cache occupancy is 'small' we don't care.
> + */
> + if (m_a_occ < (NICE_0_LOAD >> EPOCH_OLD))
> + m_a_cpu = -1;
> +
> + mm->mm_sched_cpu = m_a_cpu;
> +
> + free_cpumask_var(cpus);
> +}
> +
> +void init_sched_mm(struct task_struct *p)
> +{
> + struct callback_head *work = &p->cache_work;
> + init_task_work(work, task_cache_work);
> + work->next = work;
> +}
> +
> +#else
> +
> +static inline void account_mm_sched(struct rq *rq, struct task_struct *p,
> + s64 delta_exec) { }
> +
> +
> +void init_sched_mm(struct task_struct *p) { }
> +
> +static void task_tick_cache(struct rq *rq, struct task_struct *p) { }
> +
> +#endif
> +
> +static inline
> +void update_curr_task(struct rq *rq, struct task_struct *p, s64 delta_exec)
> {
> trace_sched_stat_runtime(p, delta_exec);
> account_group_exec_runtime(p, delta_exec);
> + account_mm_sched(rq, p, delta_exec);
> cgroup_account_cputime(p, delta_exec);
> }
>
> @@ -1215,7 +1434,7 @@ s64 update_curr_common(struct rq *rq)
>
> delta_exec = update_curr_se(rq, &donor->se);
> if (likely(delta_exec > 0))
> - update_curr_task(donor, delta_exec);
> + update_curr_task(rq, donor, delta_exec);
>
> return delta_exec;
> }
> @@ -1244,7 +1463,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
> if (entity_is_task(curr)) {
> struct task_struct *p = task_of(curr);
>
> - update_curr_task(p, delta_exec);
> + update_curr_task(rq, p, delta_exec);
>
> /*
> * If the fair_server is active, we need to account for the
> @@ -7850,7 +8069,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
> * per-cpu select_rq_mask usage
> */
> lockdep_assert_irqs_disabled();
> -
> +again:
> if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
> asym_fits_cpu(task_util, util_min, util_max, target))
> return target;
> @@ -7888,7 +8107,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
> /* Check a recently used CPU as a potential idle candidate: */
> recent_used_cpu = p->recent_used_cpu;
> p->recent_used_cpu = prev;
> - if (recent_used_cpu != prev &&
> + if (prev == p->wake_cpu &&
> + recent_used_cpu != prev &&
> recent_used_cpu != target &&
> cpus_share_cache(recent_used_cpu, target) &&
> (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
> @@ -7941,6 +8161,18 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
> if ((unsigned)i < nr_cpumask_bits)
> return i;
>
> + if (prev != p->wake_cpu && !cpus_share_cache(prev, p->wake_cpu)) {
> + /*
> + * Most likely select_cache_cpu() will have re-directed
> + * the wakeup, but getting here means the preferred cache is
> + * too busy, so re-try with the actual previous.
> + *
> + * XXX wake_affine is lost for this pass.
> + */
> + prev = target = p->wake_cpu;
> + goto again;
> + }
> +
> /*
> * For cluster machines which have lower sharing cache like L2 or
> * LLC Tag, we tend to find an idle CPU in the target's cluster
> @@ -8563,6 +8795,40 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> return target;
> }
>
> +#ifdef CONFIG_SCHED_CACHE
> +static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle);
> +
> +static int select_cache_cpu(struct task_struct *p, int prev_cpu)
> +{
> + struct mm_struct *mm = p->mm;
> + int cpu;
> +
> + if (!mm || p->nr_cpus_allowed == 1)
> + return prev_cpu;
> +
> + cpu = mm->mm_sched_cpu;
> + if (cpu < 0)
> + return prev_cpu;
> +
> +
> + if (static_branch_likely(&sched_numa_balancing) &&
> + __migrate_degrades_locality(p, prev_cpu, cpu, false) > 0) {
> + /*
> + * XXX look for max occupancy inside prev_cpu's node
> + */
> + return prev_cpu;
> + }
> +
> + return cpu;
> +}
> +#else
> +static int select_cache_cpu(struct task_struct *p, int prev_cpu)
> +{
> + return prev_cpu;
> +}
> +#endif
> +
> +
> /*
> * select_task_rq_fair: Select target runqueue for the waking task in domains
> * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
> @@ -8588,6 +8854,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
> * required for stable ->cpus_allowed
> */
> lockdep_assert_held(&p->pi_lock);
> + guard(rcu)();
> +
> if (wake_flags & WF_TTWU) {
> record_wakee(p);
>
> @@ -8595,6 +8863,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
> cpumask_test_cpu(cpu, p->cpus_ptr))
> return cpu;
>
> + new_cpu = prev_cpu = select_cache_cpu(p, prev_cpu);
> +
> if (!is_rd_overutilized(this_rq()->rd)) {
> new_cpu = find_energy_efficient_cpu(p, prev_cpu);
> if (new_cpu >= 0)
> @@ -8605,7 +8875,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
> want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
> }
>
> - rcu_read_lock();
> for_each_domain(cpu, tmp) {
> /*
> * If both 'cpu' and 'prev_cpu' are part of this domain,
> @@ -8638,7 +8907,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
> /* Fast path */
> new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
> }
> - rcu_read_unlock();
>
> return new_cpu;
> }
> @@ -9288,6 +9556,17 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
> if (sysctl_sched_migration_cost == 0)
> return 0;
>
> +#ifdef CONFIG_SCHED_CACHE
> + if (p->mm && p->mm->pcpu_sched) {
> + /*
> + * XXX things like Skylake have non-inclusive L3 and might not
> + * like this L3 centric view. What to do about L2 stickyness ?
> + */
> + return per_cpu_ptr(p->mm->pcpu_sched, env->src_cpu)->occ >
> + per_cpu_ptr(p->mm->pcpu_sched, env->dst_cpu)->occ;
> + }
> +#endif
> +
> delta = rq_clock_task(env->src_rq) - p->se.exec_start;
>
> return delta < (s64)sysctl_sched_migration_cost;
> @@ -9299,27 +9578,25 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
> * Returns 0, if task migration is not affected by locality.
> * Returns a negative value, if task migration improves locality i.e migration preferred.
> */
> -static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
> +static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle)
> {
> struct numa_group *numa_group = rcu_dereference(p->numa_group);
> unsigned long src_weight, dst_weight;
> int src_nid, dst_nid, dist;
>
> - if (!static_branch_likely(&sched_numa_balancing))
> - return 0;
> -
> - if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
> + if (!p->numa_faults)
> return 0;
>
> - src_nid = cpu_to_node(env->src_cpu);
> - dst_nid = cpu_to_node(env->dst_cpu);
> + src_nid = cpu_to_node(src_cpu);
> + dst_nid = cpu_to_node(dst_cpu);
>
> if (src_nid == dst_nid)
> return 0;
>
> /* Migrating away from the preferred node is always bad. */
> if (src_nid == p->numa_preferred_nid) {
> - if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
> + struct rq *src_rq = cpu_rq(src_cpu);
> + if (src_rq->nr_running > src_rq->nr_preferred_running)
> return 1;
> else
> return 0;
> @@ -9330,7 +9607,7 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
> return -1;
>
> /* Leaving a core idle is often worse than degrading locality. */
> - if (env->idle == CPU_IDLE)
> + if (idle)
> return 0;
>
> dist = node_distance(src_nid, dst_nid);
> @@ -9345,7 +9622,24 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
> return src_weight - dst_weight;
> }
>
> +static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
> +{
> + if (!static_branch_likely(&sched_numa_balancing))
> + return 0;
> +
> + if (!(env->sd->flags & SD_NUMA))
> + return 0;
> +
> + return __migrate_degrades_locality(p, env->src_cpu, env->dst_cpu,
> + env->idle == CPU_IDLE);
> +}
> +
> #else
> +static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle)
> +{
> + return 0;
> +}
> +
> static inline long migrate_degrades_locality(struct task_struct *p,
> struct lb_env *env)
> {
> @@ -13104,8 +13398,8 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
> */
> static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
> {
> - struct cfs_rq *cfs_rq;
> struct sched_entity *se = &curr->se;
> + struct cfs_rq *cfs_rq;
>
> for_each_sched_entity(se) {
> cfs_rq = cfs_rq_of(se);
> @@ -13115,6 +13409,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
> if (static_branch_unlikely(&sched_numa_balancing))
> task_tick_numa(rq, curr);
>
> + task_tick_cache(rq, curr);
> +
> update_misfit_status(curr, rq);
> check_update_overutilized_status(task_rq(curr));
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 47972f34ea70..d16ccd66ca07 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1171,6 +1171,12 @@ struct rq {
> u64 clock_pelt_idle_copy;
> u64 clock_idle_copy;
> #endif
> +#ifdef CONFIG_SCHED_CACHE
> + raw_spinlock_t cpu_epoch_lock;
> + u64 cpu_runtime;
> + unsigned long cpu_epoch;
> + unsigned long cpu_epoch_next;
> +#endif
>
> atomic_t nr_iowait;
>
> @@ -3861,6 +3867,8 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
> static inline void init_sched_mm_cid(struct task_struct *t) { }
> #endif /* !CONFIG_SCHED_MM_CID */
>
> +extern void init_sched_mm(struct task_struct *p);
> +
> extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
> extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
> #ifdef CONFIG_SMP
>
Powered by blists - more mailing lists