Message-ID: <df3ea0c02afb98a8dfc06f29d5ff56bbe8588dd8.camel@redhat.com>
Date: Tue, 20 May 2025 15:39:45 +0200
From: Gabriele Monaco <gmonaco@...hat.com>
To: Waiman Long <longman@...hat.com>
Cc: linux-kernel@...r.kernel.org, Frederic Weisbecker <frederic@...nel.org>,
Thomas Gleixner <tglx@...utronix.de>
Subject: Re: [PATCH v5 5/6] cgroup/cpuset: Fail if isolated and nohz_full
don't leave any housekeeping
On Thu, 2025-05-08 at 16:53 +0200, Gabriele Monaco wrote:
> Currently the user can set up isolated cpus via cpuset and nohz_full
> in such a way that leaves no housekeeping CPU (i.e. no CPU that is
> neither domain isolated nor nohz full). This can be a problem for
> other subsystems (e.g. the timer wheel migration).
>
> Prevent this configuration by blocking any assignment that would
> cause the union of domain isolated cpus and nohz_full to cover all
> CPUs.
>
> Signed-off-by: Gabriele Monaco <gmonaco@...hat.com>
> ---
Waiman, while testing this patch I ran into a few doubts about how
errors should be reported in cpusets and about the general behaviour
when cpuset is combined with boot-time isolation.
This is the behaviour introduced by the current patch:
* Assume we boot a 16-core machine with nohz_full=8-15
* We configure an isolated cgroup with 0-9 exclusive CPUs
  The partition file complains with:
    isolated invalid (partition config conflicts with housekeeping setup)
  nproc: 16 (ok)
* We now set the same cgroup with 0-6 isolated CPUs
  The partition is marked as isolated (expected)
  nproc: 9 (ok)
* We set the CPUs back to 0-9
  I'd expect an error somewhere, but the partition is still isolated
  nproc: 9 (ok?)
Checking with nproc shows 7-9 are not isolated (but this is not visible
in the effective exclusive CPUs, which still shows 0-9).
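
For reference, this is roughly how I drive the scenario above from
userspace. The cgroup path "/sys/fs/cgroup/test" and the exact write
sequence are my assumptions for illustration (cgroup v2 with the cpuset
controller enabled in the parent, machine booted with nohz_full=8-15),
not something taken from the patch:

/* Minimal reproduction sketch for the scenario above (assumed paths). */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void cg_write(const char *file, const char *val)
{
	char path[128];
	int fd;

	snprintf(path, sizeof(path), "/sys/fs/cgroup/test/%s", file);
	fd = open(path, O_WRONLY);
	if (fd < 0 || write(fd, val, strlen(val)) < 0)
		perror(path);
	if (fd >= 0)
		close(fd);
}

int main(void)
{
	/* 0-9 overlaps all nohz_full housekeeping CPUs (0-7): rejected,
	 * cpuset.cpus.partition reads "isolated invalid (...)" */
	cg_write("cpuset.cpus.exclusive", "0-9");
	cg_write("cpuset.cpus.partition", "isolated");

	/* 0-6 leaves CPU 7 for housekeeping: partition becomes valid */
	cg_write("cpuset.cpus.exclusive", "0-6");

	/* growing back to 0-9 is the step where I'd expect an error */
	cg_write("cpuset.cpus.exclusive", "0-9");
	return 0;
}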
Now this behaviour seems incorrect to me, but it is consistent with the
other flavour of PERR_HKEEPING (already upstream):
* Boot with isolcpus=8-15
  nproc: 8
* Set 5-9 as isolated
  isolated invalid (as above)
  nproc: 8
* Set 5-7 as isolated
  isolated
  nproc: 13 (?!)
* Set back 5-9
  still isolated
  nproc: 16 (?!)
Here nproc reports the isolcpus range as no longer isolated, which I
find even more confusing.
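
(For context on the nproc numbers above: nproc reports the number of
CPUs in the calling task's affinity mask, so its output tracks whatever
the boot-time isolation and the cpuset partitions leave to a freshly
started shell. A rough standalone equivalent of what it measures:)

/* Roughly what nproc measures here: the CPUs left in the calling
 * task's affinity mask after isolcpus/cpuset take their share. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	if (sched_getaffinity(0, sizeof(set), &set))
		return 1;
	printf("%d\n", CPU_COUNT(&set));
	return 0;
}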
Now my questions: is it alright not to report an error when we fail to
isolate some CPUs but can still allocate them as exclusive in the
cpuset? And can cpuset really undo some effects of isolcpus, or is that
a glitch?
Thanks,
Gabriele
>  kernel/cgroup/cpuset.c | 67 ++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 65 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index 95316d39c282..2f1df6f5b988 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -80,6 +80,12 @@ static cpumask_var_t subpartitions_cpus;
> */
> static cpumask_var_t isolated_cpus;
>
> +/*
> + * Housekeeping CPUs for both HK_TYPE_DOMAIN and HK_TYPE_KERNEL_NOISE
> + */
> +static cpumask_var_t full_hk_cpus;
> +static bool have_boot_nohz_full;
> +
> /*
> * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot
> */
> @@ -1253,10 +1259,26 @@ static void reset_partition_data(struct cpuset *cs)
> static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus)
> {
> WARN_ON_ONCE(old_prs == new_prs);
> - if (new_prs == PRS_ISOLATED)
> + if (new_prs == PRS_ISOLATED) {
> cpumask_or(isolated_cpus, isolated_cpus, xcpus);
> - else
> + cpumask_andnot(full_hk_cpus, full_hk_cpus, xcpus);
> + } else {
> cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
> + cpumask_or(full_hk_cpus, full_hk_cpus, xcpus);
> + }
> +}
> +
> +/*
> + * isolated_cpus_should_update - Returns if the isolated_cpus mask needs update
> + * @prs: new or old partition_root_state
> + * @parent: parent cpuset
> + * Return: true if isolated_cpus needs modification, false otherwise
> + */
> +static bool isolated_cpus_should_update(int prs, struct cpuset *parent)
> +{
> + if (!parent)
> + parent = &top_cpuset;
> + return prs != parent->partition_root_state;
> }
>
> /*
> @@ -1323,6 +1345,25 @@ static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
> return isolcpus_updated;
> }
>
> +/*
> + * isolcpus_nohz_conflict - check for isolated & nohz_full conflicts
> + * @new_cpus: cpu mask
> + * Return: true if there is conflict, false otherwise
> + *
> + * If nohz_full is enabled and we have isolated CPUs, their combination must
> + * still leave housekeeping CPUs.
> + */
> +static bool isolcpus_nohz_conflict(struct cpumask *new_cpus)
> +{
> + if (!have_boot_nohz_full)
> + return false;
> +
> + if (!cpumask_weight_andnot(full_hk_cpus, new_cpus))
> + return true;
> +
> + return false;
> +}
> +
> static void update_exclusion_cpumasks(bool isolcpus_updated)
> {
> int ret;
> @@ -1448,6 +1489,9 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
> cpumask_intersects(tmp->new_cpus, subpartitions_cpus) ||
> cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
> return PERR_INVCPUS;
> + if (isolated_cpus_should_update(new_prs, NULL) &&
> + isolcpus_nohz_conflict(tmp->new_cpus))
> + return PERR_HKEEPING;
>
> spin_lock_irq(&callback_lock);
> isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
> @@ -1546,6 +1590,9 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,
> else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
> cpumask_subset(top_cpuset.effective_cpus, tmp->addmask))
> cs->prs_err = PERR_NOCPUS;
> + else if (isolated_cpus_should_update(prs, NULL) &&
> + isolcpus_nohz_conflict(tmp->addmask))
> + cs->prs_err = PERR_HKEEPING;
> if (cs->prs_err)
> goto invalidate;
> }
> @@ -1877,6 +1924,12 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
> return err;
> }
>
> + if (deleting && isolated_cpus_should_update(new_prs, parent) &&
> + isolcpus_nohz_conflict(tmp->delmask)) {
> + cs->prs_err = PERR_HKEEPING;
> + return PERR_HKEEPING;
> + }
> +
> /*
> * Change the parent's effective_cpus & effective_xcpus (top cpuset
> * only).
> @@ -2897,6 +2950,8 @@ static int update_prstate(struct cpuset *cs, int new_prs)
> * Need to update isolated_cpus.
> */
> isolcpus_updated = true;
> + if (isolcpus_nohz_conflict(cs->effective_xcpus))
> + err = PERR_HKEEPING;
> } else {
> /*
> * Switching back to member is always allowed even if it
> @@ -3715,6 +3770,7 @@ int __init cpuset_init(void)
> BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));
> BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
> BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL));
> + BUG_ON(!alloc_cpumask_var(&full_hk_cpus, GFP_KERNEL));
>
> cpumask_setall(top_cpuset.cpus_allowed);
> nodes_setall(top_cpuset.mems_allowed);
> @@ -3722,17 +3778,24 @@ int __init cpuset_init(void)
> cpumask_setall(top_cpuset.effective_xcpus);
> cpumask_setall(top_cpuset.exclusive_cpus);
> nodes_setall(top_cpuset.effective_mems);
> + cpumask_copy(full_hk_cpus, cpu_present_mask);
>
> fmeter_init(&top_cpuset.fmeter);
> INIT_LIST_HEAD(&remote_children);
>
> BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
>
> + have_boot_nohz_full = housekeeping_enabled(HK_TYPE_KERNEL_NOISE);
> + if (have_boot_nohz_full)
> + cpumask_and(full_hk_cpus, cpu_possible_mask,
> + housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
> +
> have_boot_isolcpus = housekeeping_enabled(HK_TYPE_DOMAIN);
> if (have_boot_isolcpus) {
> BUG_ON(!alloc_cpumask_var(&boot_hk_cpus, GFP_KERNEL));
> cpumask_copy(boot_hk_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN));
> cpumask_andnot(isolated_cpus, cpu_possible_mask, boot_hk_cpus);
> + cpumask_and(full_hk_cpus, full_hk_cpus, boot_hk_cpus);
> }
>
> return 0;