Message-ID: <5f48073d-8b4e-4569-af4f-3a9b5586d7ad@redhat.com>
Date: Fri, 20 Sep 2024 03:05:57 -0400
From: Waiman Long <longman@...hat.com>
To: Pawan Gupta <pawan.kumar.gupta@...ux.intel.com>,
Andy Lutomirski <luto@...nel.org>, Thomas Gleixner <tglx@...utronix.de>,
Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>,
Dave Hansen <dave.hansen@...ux.intel.com>,
David Kaplan <David.Kaplan@....com>,
Daniel Sneddon <daniel.sneddon@...ux.intel.com>, x86@...nel.org,
"H. Peter Anvin" <hpa@...or.com>, Peter Zijlstra <peterz@...radead.org>,
Josh Poimboeuf <jpoimboe@...nel.org>, Steven Rostedt <rostedt@...dmis.org>
Cc: linux-kernel@...r.kernel.org, cgroups@...r.kernel.org
Subject: Re: [PATCH RFC 2/2] cpu/bugs: cgroup: Add a cgroup knob to bypass CPU
mitigations
On 9/19/24 17:52, Pawan Gupta wrote:
> For cases where an admin wants to bypass CPU mitigations for a specific
> workload that they trust, add a cgroup attribute "cpu.skip_mitigation" that
> can only be set by a privileged user. When set, the CPU mitigations are
> bypassed for all tasks in that cgroup.
>
> Before setting this knob, the admin should be aware of possible security
> risks like confused deputy attacks on trusted interpreters, JIT engines,
> etc.
>
> Signed-off-by: Pawan Gupta <pawan.kumar.gupta@...ux.intel.com>
> ---
> arch/x86/include/asm/switch_to.h | 10 ++++++++++
> arch/x86/kernel/cpu/bugs.c | 21 ++++++++++++++++++++
> include/linux/cgroup-defs.h | 3 +++
> kernel/cgroup/cgroup.c | 42 ++++++++++++++++++++++++++++++++++++++++
> kernel/sched/core.c | 2 +-
> 5 files changed, 77 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
> index c3bd0c0758c9..7f32fd139644 100644
> --- a/arch/x86/include/asm/switch_to.h
> +++ b/arch/x86/include/asm/switch_to.h
> @@ -46,6 +46,16 @@ struct fork_frame {
> struct pt_regs regs;
> };
>
> +extern inline void cpu_mitigation_skip(struct task_struct *prev, struct task_struct *next);
> +
> +#define prepare_arch_switch prepare_arch_switch
> +
> +static inline void prepare_arch_switch(struct task_struct *prev,
> + struct task_struct *next)
> +{
> + cpu_mitigation_skip(prev, next);
> +}
> +
> #define switch_to(prev, next, last) \
> do { \
> ((last) = __switch_to_asm((prev), (next))); \
> diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
> index 45675da354f3..77eb4f6dc5c9 100644
> --- a/arch/x86/kernel/cpu/bugs.c
> +++ b/arch/x86/kernel/cpu/bugs.c
> @@ -128,6 +128,27 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
> DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear);
> EXPORT_SYMBOL_GPL(mmio_stale_data_clear);
>
> +inline void cpu_mitigation_skip(struct task_struct *prev,
> + struct task_struct *next)
> +{
> + bool prev_skip = false, next_skip = false;
> +
> + if (prev->mm)
> + prev_skip = task_dfl_cgroup(prev)->cpu_skip_mitigation;
> + if (next->mm)
> + next_skip = task_dfl_cgroup(next)->cpu_skip_mitigation;
> +
> + if (!prev_skip && !next_skip)
> + return;
> + if (prev_skip == next_skip)
> + return;
I believe the first (!prev_skip && !next_skip) check is redundant.
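I.e. the (prev_skip == next_skip) test already covers the case where both
are false, so (untested, just to illustrate) the body could shrink to:

	if (prev->mm)
		prev_skip = task_dfl_cgroup(prev)->cpu_skip_mitigation;
	if (next->mm)
		next_skip = task_dfl_cgroup(next)->cpu_skip_mitigation;

	if (prev_skip == next_skip)
		return;

	wrmsrl(MSR_LSTAR, next_skip
			  ? (unsigned long)entry_SYSCALL_64_unmitigated
			  : (unsigned long)entry_SYSCALL_64_mitigated);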
> +
> + if (next_skip)
> + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64_unmitigated);
> + else
> + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64_mitigated);
> +}
> +
> void __init cpu_select_mitigations(void)
> {
> /*
> diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
> index ae04035b6cbe..6a131a62f43c 100644
> --- a/include/linux/cgroup-defs.h
> +++ b/include/linux/cgroup-defs.h
> @@ -546,6 +546,9 @@ struct cgroup {
> struct bpf_local_storage __rcu *bpf_cgrp_storage;
> #endif
>
> + /* Used to bypass the CPU mitigations for tasks in a cgroup */
> + bool cpu_skip_mitigation;
> +
> /* All ancestors including self */
> struct cgroup *ancestors[];
> };
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index c8e4b62b436a..b745dbcb153e 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -2045,6 +2045,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
> cgrp->dom_cgrp = cgrp;
> cgrp->max_descendants = INT_MAX;
> cgrp->max_depth = INT_MAX;
> + cgrp->cpu_skip_mitigation = 0;
> INIT_LIST_HEAD(&cgrp->rstat_css_list);
> prev_cputime_init(&cgrp->prev_cputime);
>
> @@ -3751,6 +3752,41 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
> return ret;
> }
>
> +static int cpu_skip_mitigation_show(struct seq_file *seq, void *v)
> +{
> + struct cgroup *cgrp = seq_css(seq)->cgroup;
> + int ret = 0;
> +
> + seq_printf(seq, "%d\n", cgrp->cpu_skip_mitigation);
> +
> + return ret;
> +}
> +
> +static ssize_t cgroup_skip_mitigation_write(struct kernfs_open_file *of,
> + char *buf, size_t nbytes,
> + loff_t off)
> +{
> + struct cgroup *cgrp = of->kn->parent->priv;
> + struct cgroup_file_ctx *ctx = of->priv;
> + u64 skip_mitigation;
> + int ret;
> +
> + /* Only privileged user in init namespace is allowed to set skip_mitigation */
> + if ((ctx->ns != &init_cgroup_ns) || !capable(CAP_SYS_ADMIN))
> + return -EPERM;
> +
> + ret = kstrtoull(buf, 0, &skip_mitigation);
> + if (ret)
> + return -EINVAL;
> +
> + if (skip_mitigation > 1)
> + return -EINVAL;
> +
> + cgrp->cpu_skip_mitigation = skip_mitigation;
> +
> + return nbytes;
> +}
> +
> static int cpu_local_stat_show(struct seq_file *seq, void *v)
> {
> struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
> @@ -5290,6 +5326,12 @@ static struct cftype cgroup_base_files[] = {
> .name = "cpu.stat.local",
> .seq_show = cpu_local_stat_show,
> },
> + {
> + .name = "cpu.skip_mitigation",
> + .flags = CFTYPE_NOT_ON_ROOT,
> + .seq_show = cpu_skip_mitigation_show,
> + .write = cgroup_skip_mitigation_write,
> + },
> { } /* terminate */
> };
Since this control knob is effective only on x86_64, should we enable
it only for that architecture? However, cgroup has never had an
architecture-specific control knob like that before; this would be the
first if we decide to do that. See the sketch below.
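If we do go that way, a minimal (untested) sketch would be to compile the
entry in conditionally:

	#ifdef CONFIG_X86_64
		{
			.name = "cpu.skip_mitigation",
			.flags = CFTYPE_NOT_ON_ROOT,
			.seq_show = cpu_skip_mitigation_show,
			.write = cgroup_skip_mitigation_write,
		},
	#endif

The show/write handlers and the cpu_skip_mitigation field would need the
same guard.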
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index f3951e4a55e5..4b4109afbf7c 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4994,7 +4994,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
> fire_sched_out_preempt_notifiers(prev, next);
> kmap_local_sched_out();
> prepare_task(next);
> - prepare_arch_switch(next);
> + prepare_arch_switch(prev, next);
> }
>
> /**
>
The sparc and m68k arches define their own prepare_arch_switch(), so
you will need to modify those places as well.
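Note that the generic no-op fallback (in kernel/sched/sched.h, IIRC)
also takes a single argument, so the prototype change has to be
propagated there too, e.g. (untested):

	#ifndef prepare_arch_switch
	# define prepare_arch_switch(prev, next)	do { } while (0)
	#endif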
Cheers,
Longman