[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <ZsMxAGsYskX0o51m@example.org>
Date: Mon, 19 Aug 2024 13:48:16 +0200
From: Alexey Gladkov <legion@...nel.org>
To: "Kirill A. Shutemov" <kirill.shutemov@...ux.intel.com>
Cc: linux-kernel@...r.kernel.org, linux-coco@...ts.linux.dev,
Thomas Gleixner <tglx@...utronix.de>,
Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>,
Dave Hansen <dave.hansen@...ux.intel.com>,
"H. Peter Anvin" <hpa@...or.com>,
Andrew Morton <akpm@...ux-foundation.org>,
Yuan Yao <yuan.yao@...el.com>,
Geert Uytterhoeven <geert@...ux-m68k.org>,
Yuntao Wang <ytcoode@...il.com>, Kai Huang <kai.huang@...el.com>,
Baoquan He <bhe@...hat.com>, Oleg Nesterov <oleg@...hat.com>,
cho@...rosoft.com, decui@...rosoft.com, John.Starks@...rosoft.com
Subject: Re: [PATCH v3 02/10] x86/tdx: Add validation of userspace MMIO
instructions
On Mon, Aug 19, 2024 at 01:39:17PM +0300, Kirill A. Shutemov wrote:
> On Fri, Aug 16, 2024 at 03:43:52PM +0200, Alexey Gladkov wrote:
> > From: "Alexey Gladkov (Intel)" <legion@...nel.org>
> >
> > Instructions from kernel space are considered trusted. If the MMIO
> > instruction is from userspace it must be checked.
> >
> > For userspace instructions, it is need to check that the INSN has not
> > changed at the time of #VE and before the execution of the instruction.
>
> Well, we cannot really check if the instruction changed under us. We can
> only check if the parsed instruction does an MMIO operation that is
> allowed for the process.
We also check that the memory access (read/write) type matches. Yes, we
can't check the instruction itself, but we check the arguments.
> >
> > Once the userspace instruction parsed is enforced that the address
> > points to mapped memory of current process and that address does not
> > point to private memory.
> >
> > After parsing the userspace instruction, it is necessary to ensure that:
> >
> > 1. the operation direction (read/write) corresponds to #VE info;
> > 2. the address still points to mapped memory of current process;
> > 3. the address does not point to private memory.
>
> I don't see where you check 3.
>
> I guess you can add pte_decrypted(pte) check to get_phys_addr().
>
> But I'm not sure it is strictly needed.
(ve->gpa != cc_mkdec(phys_addr)
The ve->gpa was checked in the virt_exception_user/kernel().
>
> > Signed-off-by: Alexey Gladkov (Intel) <legion@...nel.org>
> > ---
> > arch/x86/coco/tdx/tdx.c | 128 ++++++++++++++++++++++++++++++++++++----
> > 1 file changed, 115 insertions(+), 13 deletions(-)
> >
> > diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
> > index af0b6c1cacf7..86c22fec97fb 100644
> > --- a/arch/x86/coco/tdx/tdx.c
> > +++ b/arch/x86/coco/tdx/tdx.c
> > @@ -8,6 +8,7 @@
> > #include <linux/export.h>
> > #include <linux/io.h>
> > #include <linux/kexec.h>
> > +#include <linux/mm.h>
> > #include <asm/coco.h>
> > #include <asm/tdx.h>
> > #include <asm/vmx.h>
> > @@ -405,6 +406,84 @@ static bool mmio_write(int size, unsigned long addr, unsigned long val)
> > EPT_WRITE, addr, val);
> > }
> >
> > +static inline bool is_private_gpa(u64 gpa)
> > +{
> > + return gpa == cc_mkenc(gpa);
> > +}
> > +
> > +static int get_phys_addr(unsigned long addr, phys_addr_t *phys_addr, bool *writable)
> > +{
> > + unsigned int level;
> > + pgd_t *pgdp;
> > + pte_t *ptep;
> > +
> > + /*
> > + * Address validation only makes sense for a user process. The lock must
> > + * be obtained before validation can begin.
> > + */
> > + mmap_assert_locked(current->mm);
> > +
> > + pgdp = pgd_offset(current->mm, addr);
> > +
> > + if (!pgd_none(*pgdp)) {
> > + ptep = lookup_address_in_pgd(pgdp, addr, &level);
> > + if (ptep) {
> > + unsigned long offset;
> > +
> > + offset = addr & ~page_level_mask(level);
> > + *phys_addr = PFN_PHYS(pte_pfn(*ptep));
> > + *phys_addr |= offset;
> > +
> > + *writable = pte_write(*ptep);
> > +
> > + return 0;
> > + }
> > + }
> > +
> > + return -EFAULT;
> > +}
> > +
> > +static int valid_vaddr(struct ve_info *ve, enum insn_mmio_type mmio, int size,
> > + unsigned long vaddr)
> > +{
> > + phys_addr_t phys_addr;
> > + bool writable = false;
> > +
> > + /* It's not fatal. This can happen due to swap out or page migration. */
> > + if (get_phys_addr(vaddr, &phys_addr, &writable) || (ve->gpa != cc_mkdec(phys_addr)))
>
> Too long line?
All patches pass checkpatch without warnings.
>
> > + return -EAGAIN;
> > +
> > + /*
> > + * Re-check whether #VE info matches the instruction that was decoded.
> > + *
> > + * The ve->gpa was valid at the time ve_info was received. But this code
> > + * executed with interrupts enabled, allowing tlb shootdown and therefore
> > + * munmap() to be executed in the parallel thread.
> > + *
> > + * By the time MMIO emulation is performed, ve->gpa may be already
> > + * unmapped from the process, the device it belongs to removed from
> > + * system and something else could be plugged in its place.
> > + */
> > + switch (mmio) {
> > + case INSN_MMIO_WRITE:
> > + case INSN_MMIO_WRITE_IMM:
> > + if (!writable || !(ve->exit_qual & EPT_VIOLATION_ACC_WRITE))
> > + return -EFAULT;
> > + break;
> > + case INSN_MMIO_READ:
> > + case INSN_MMIO_READ_ZERO_EXTEND:
> > + case INSN_MMIO_READ_SIGN_EXTEND:
> > + if (!(ve->exit_qual & EPT_VIOLATION_ACC_READ))
> > + return -EFAULT;
> > + break;
> > + default:
> > + WARN_ONCE(1, "Unsupported mmio instruction: %d", mmio);
> > + return -EINVAL;
> > + }
> > +
> > + return 0;
> > +}
> > +
> > static int handle_mmio_write(struct insn *insn, enum insn_mmio_type mmio, int size,
> > struct pt_regs *regs, struct ve_info *ve)
> > {
> > @@ -489,7 +568,7 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
> > enum insn_mmio_type mmio;
> > struct insn insn = {};
> > unsigned long vaddr;
> > - int size;
> > + int size, ret;
> >
> > /* Only in-kernel MMIO is supported */
> > if (WARN_ON_ONCE(user_mode(regs)))
> > @@ -505,6 +584,17 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
> > if (WARN_ON_ONCE(mmio == INSN_MMIO_DECODE_FAILED))
> > return -EINVAL;
> >
> > + vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
> > +
> > + if (current->mm) {
>
> Hm. This path will be taken for any MMIO if it is done in context of a
> process, even in-kernel only. I don't think we want it. It is useless
> overhead.
The kthread do not have a current->mm. As an example:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/vfio/vfio_iommu_type1.c#n3053
Also documentation mention this as the way to check a user context:
(which makes more sense anyway - the test is basically one of "do
we have a user context", and is generally done by the page fault
handler and things like that).
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/mm/active_mm.rst#n80
> Use user_mode(regs) instead.
I can't use this. When nested exception happens in the handle_mmio_movs()
the regs will be not in the user mode.
I can make a flag that will be set either for user_mode or if we have a
nested exception.
> > + if (mmap_read_lock_killable(current->mm))
> > + return -EINTR;
> > +
> > + ret = valid_vaddr(ve, mmio, size, vaddr);
> > + if (ret)
> > + goto unlock;
> > + }
> > +
> > /*
> > * Reject EPT violation #VEs that split pages.
> > *
> > @@ -514,30 +604,39 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
> > *
> > * load_unaligned_zeropad() will recover using exception fixups.
> > */
> > - vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
> > - if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE)
> > - return -EFAULT;
> > + if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE) {
> > + ret = -EFAULT;
> > + goto unlock;
> > + }
> >
> > switch (mmio) {
> > case INSN_MMIO_WRITE:
> > case INSN_MMIO_WRITE_IMM:
> > case INSN_MMIO_MOVS:
> > - return handle_mmio_write(&insn, mmio, size, regs, ve);
> > + ret = handle_mmio_write(&insn, mmio, size, regs, ve);
> > + break;
> > case INSN_MMIO_READ:
> > case INSN_MMIO_READ_ZERO_EXTEND:
> > case INSN_MMIO_READ_SIGN_EXTEND:
> > - return handle_mmio_read(&insn, mmio, size, regs, ve);
> > + ret = handle_mmio_read(&insn, mmio, size, regs, ve);
> > + break;
> > case INSN_MMIO_DECODE_FAILED:
> > /*
> > * MMIO was accessed with an instruction that could not be
> > * decoded or handled properly. It was likely not using io.h
> > * helpers or accessed MMIO accidentally.
> > */
> > - return -EINVAL;
> > + ret = -EINVAL;
> > + break;
> > default:
> > WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?");
> > - return -EINVAL;
> > + ret = -EINVAL;
> > }
> > +unlock:
> > + if (current->mm)
> > + mmap_read_unlock(current->mm);
> > +
> > + return ret;
> > }
> >
> > static bool handle_in(struct pt_regs *regs, int size, int port)
> > @@ -681,11 +780,6 @@ static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
> > }
> > }
> >
> > -static inline bool is_private_gpa(u64 gpa)
> > -{
> > - return gpa == cc_mkenc(gpa);
> > -}
> > -
> > /*
> > * Handle the kernel #VE.
> > *
> > @@ -723,6 +817,14 @@ bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
> > insn_len = virt_exception_user(regs, ve);
> > else
> > insn_len = virt_exception_kernel(regs, ve);
> > +
> > + /*
> > + * A special case to return to userspace without increasing regs->ip
> > + * to repeat the instruction once again.
> > + */
> > + if (insn_len == -EAGAIN)
> > + return true;
> > +
> > if (insn_len < 0)
> > return false;
> >
> > --
> > 2.45.2
> >
>
> --
> Kiryl Shutsemau / Kirill A. Shutemov
>
--
Rgrds, legion
Powered by blists - more mailing lists