[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAJHvVcg+r+mmdODWKH3YpA7VyRs+H-UfaB3W2JOsGj0V=2=yHg@mail.gmail.com>
Date: Tue, 14 Jun 2022 17:53:49 -0700
From: Axel Rasmussen <axelrasmussen@...gle.com>
To: Peter Xu <peterx@...hat.com>
Cc: Alexander Viro <viro@...iv.linux.org.uk>,
Andrew Morton <akpm@...ux-foundation.org>,
Charan Teja Reddy <charante@...eaurora.org>,
Dave Hansen <dave.hansen@...ux.intel.com>,
"Dmitry V . Levin" <ldv@...linux.org>,
Gleb Fotengauer-Malinovskiy <glebfm@...linux.org>,
Hugh Dickins <hughd@...gle.com>, Jan Kara <jack@...e.cz>,
Jonathan Corbet <corbet@....net>,
Mel Gorman <mgorman@...hsingularity.net>,
Mike Kravetz <mike.kravetz@...cle.com>,
Mike Rapoport <rppt@...nel.org>, Nadav Amit <namit@...are.com>,
Shuah Khan <shuah@...nel.org>,
Suren Baghdasaryan <surenb@...gle.com>,
Vlastimil Babka <vbabka@...e.cz>,
zhangyi <yi.zhang@...wei.com>, linux-doc@...r.kernel.org,
linux-fsdevel@...r.kernel.org, LKML <linux-kernel@...r.kernel.org>,
Linux MM <linux-mm@...ck.org>,
Linuxkselftest <linux-kselftest@...r.kernel.org>
Subject: Re: [PATCH v3 2/6] userfaultfd: add /dev/userfaultfd for fine grained
access control
On Tue, Jun 14, 2022 at 12:09 PM Peter Xu <peterx@...hat.com> wrote:
>
> On Wed, Jun 01, 2022 at 02:09:47PM -0700, Axel Rasmussen wrote:
> > Historically, it has been shown that intercepting kernel faults with
> > userfaultfd (thereby forcing the kernel to wait for an arbitrary amount
> > of time) can be exploited, or at least can make some kinds of exploits
> > easier. So, in 37cd0575b8 "userfaultfd: add UFFD_USER_MODE_ONLY" we
> > changed things so, in order for kernel faults to be handled by
> > userfaultfd, either the process needs CAP_SYS_PTRACE, or this sysctl
> > must be configured so that any unprivileged user can do it.
> >
> > In a typical implementation of a hypervisor with live migration (take
> > QEMU/KVM as one such example), we do indeed need to be able to handle
> > kernel faults. But, both options above are less than ideal:
> >
> > - Toggling the sysctl increases attack surface by allowing any
> > unprivileged user to do it.
> >
> > - Granting the live migration process CAP_SYS_PTRACE gives it this
> > ability, but *also* the ability to "observe and control the
> > execution of another process [...], and examine and change [its]
> > memory and registers" (from ptrace(2)). This isn't something we need
> > or want to be able to do, so granting this permission violates the
> > "principle of least privilege".
> >
> > This is all a long-winded way to say: we want a more fine-grained way to
> > grant access to userfaultfd, without granting other additional
> > permissions at the same time.
> >
> > To achieve this, add a /dev/userfaultfd misc device. This device
> > provides an alternative to the userfaultfd(2) syscall for the creation
> > of new userfaultfds. The idea is, any userfaultfds created this way will
> > be able to handle kernel faults, without the caller having any special
> > capabilities. Access to this mechanism is instead restricted using e.g.
> > standard filesystem permissions.
> >
> > Signed-off-by: Axel Rasmussen <axelrasmussen@...gle.com>
> > ---
> > fs/userfaultfd.c | 76 ++++++++++++++++++++++++++------
> > include/uapi/linux/userfaultfd.h | 4 ++
> > 2 files changed, 66 insertions(+), 14 deletions(-)
> >
> > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > index e943370107d0..8b92c1398169 100644
> > --- a/fs/userfaultfd.c
> > +++ b/fs/userfaultfd.c
> > @@ -30,6 +30,7 @@
> > #include <linux/security.h>
> > #include <linux/hugetlb.h>
> > #include <linux/swapops.h>
> > +#include <linux/miscdevice.h>
> >
> > int sysctl_unprivileged_userfaultfd __read_mostly;
> >
> > @@ -413,13 +414,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
> >
> > if (ctx->features & UFFD_FEATURE_SIGBUS)
> > goto out;
> > - if ((vmf->flags & FAULT_FLAG_USER) == 0 &&
> > - ctx->flags & UFFD_USER_MODE_ONLY) {
> > - printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
> > - "sysctl knob to 1 if kernel faults must be handled "
> > - "without obtaining CAP_SYS_PTRACE capability\n");
> > + if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
> > goto out;
> > - }
> >
> > /*
> > * If it's already released don't get it. This avoids to loop
> > @@ -2052,19 +2048,33 @@ static void init_once_userfaultfd_ctx(void *mem)
> > seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
> > }
> >
> > -SYSCALL_DEFINE1(userfaultfd, int, flags)
> > +static inline bool userfaultfd_allowed(bool is_syscall, int flags)
> > +{
> > + bool kernel_faults = !(flags & UFFD_USER_MODE_ONLY);
> > + bool allow_unprivileged = sysctl_unprivileged_userfaultfd;
> > +
> > + /* userfaultfd(2) access is controlled by sysctl + capability. */
> > + if (is_syscall && kernel_faults) {
> > + if (!allow_unprivileged && !capable(CAP_SYS_PTRACE))
> > + return false;
> > + }
> > +
> > + /*
> > + * For /dev/userfaultfd, access is to be controlled using e.g.
> > + * permissions on the device node. We assume this is correctly
> > + * configured by userspace, so we simply allow access here.
> > + */
> > +
> > + return true;
> > +}
>
> This helper reads a bit weird because it always returns "true" for the
> !syscall use case, but that is not at all obvious.
>
> Would it be cleaner to not pass in the bool at all? Something like (I also
> un-nested some of the condition checks, hopefully it'll be easier to read):
>
> bool userfaultfd_syscall_allowed(int flags)
> {
> /* Userspace-only page faults are always allowed */
> if (flags & UFFD_USER_MODE_ONLY)
> return true;
>
> /*
> * The user is requesting kernel fault capabilities. Privileged
> * users are always allowed even for kernel fault traps.
> */
> if (capable(CAP_SYS_PTRACE))
> return true;
>
> /* Otherwise, allow kernel faults only if the sysctl permits unprivileged users. */
> return sysctl_unprivileged_userfaultfd;
> }
>
> Then below...
>
> > +
> > +static int new_userfaultfd(bool is_syscall, int flags)
> > {
> > struct userfaultfd_ctx *ctx;
> > int fd;
> >
> > - if (!sysctl_unprivileged_userfaultfd &&
> > - (flags & UFFD_USER_MODE_ONLY) == 0 &&
> > - !capable(CAP_SYS_PTRACE)) {
> > - printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
> > - "sysctl knob to 1 if kernel faults must be handled "
> > - "without obtaining CAP_SYS_PTRACE capability\n");
> > + if (!userfaultfd_allowed(is_syscall, flags))
> > return -EPERM;
> > - }
>
> .. we could write it as:
>
> if (is_syscall && !userfaultfd_syscall_allowed(flags))
> return -EPERM;
>
> What do you think?
>
> >
> > BUG_ON(!current->mm);
> >
> > @@ -2083,6 +2093,10 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
> > refcount_set(&ctx->refcount, 1);
> > ctx->flags = flags;
> > ctx->features = 0;
> > + /*
> > + * If UFFD_USER_MODE_ONLY is not set, then userfaultfd_allowed() above
> > + * decided that kernel faults were allowed and should be handled.
> > + */
>
> Hmm.. why does this need to be added above "released=false"? Did you
> perhaps want to add this above "flags" instead?
>
> IMHO when people read the flags it'll already be clear how they were
> handled; the comment would probably be buried deep anyway, so I'd consider
> omitting it.
>
> The rest looks good to me, thanks.
Thanks for reviewing, Peter! Most of these comments look good to me,
I'll include them in a v4 after I get back to the office in about a
week.
>
> > ctx->released = false;
> > atomic_set(&ctx->mmap_changing, 0);
> > ctx->mm = current->mm;
> > @@ -2098,8 +2112,42 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
> > return fd;
> > }
> >
> > +SYSCALL_DEFINE1(userfaultfd, int, flags)
> > +{
> > + return new_userfaultfd(true, flags);
> > +}
> > +
> > +static int userfaultfd_dev_open(struct inode *inode, struct file *file)
> > +{
> > + return 0;
> > +}
> > +
> > +static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
> > +{
> > + if (cmd != USERFAULTFD_IOC_NEW)
> > + return -EINVAL;
> > +
> > + return new_userfaultfd(false, flags);
> > +}
> > +
> > +static const struct file_operations userfaultfd_dev_fops = {
> > + .open = userfaultfd_dev_open,
> > + .unlocked_ioctl = userfaultfd_dev_ioctl,
> > + .compat_ioctl = userfaultfd_dev_ioctl,
> > + .owner = THIS_MODULE,
> > + .llseek = noop_llseek,
> > +};
> > +
> > +static struct miscdevice userfaultfd_misc = {
> > + .minor = MISC_DYNAMIC_MINOR,
> > + .name = "userfaultfd",
> > + .fops = &userfaultfd_dev_fops
> > +};
> > +
> > static int __init userfaultfd_init(void)
> > {
> > + WARN_ON(misc_register(&userfaultfd_misc));
> > +
> > userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
> > sizeof(struct userfaultfd_ctx),
> > 0,
> > diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
> > index 7d32b1e797fb..005e5e306266 100644
> > --- a/include/uapi/linux/userfaultfd.h
> > +++ b/include/uapi/linux/userfaultfd.h
> > @@ -12,6 +12,10 @@
> >
> > #include <linux/types.h>
> >
> > +/* ioctls for /dev/userfaultfd */
> > +#define USERFAULTFD_IOC 0xAA
> > +#define USERFAULTFD_IOC_NEW _IO(USERFAULTFD_IOC, 0x00)
> > +
> > /*
> > * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and
> > * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR. In
> > --
> > 2.36.1.255.ge46751e96f-goog
> >
>
> --
> Peter Xu
>
Powered by blists - more mailing lists