[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1cace497215b025ed8b5f7815bdeb23382ecad32.camel@intel.com>
Date: Fri, 1 Nov 2024 11:05:45 +0000
From: "Huang, Kai" <kai.huang@...el.com>
To: "seanjc@...gle.com" <seanjc@...gle.com>
CC: "Edgecombe, Rick P" <rick.p.edgecombe@...el.com>,
"binbin.wu@...ux.intel.com" <binbin.wu@...ux.intel.com>, "Li, Xiaoyao"
<xiaoyao.li@...el.com>, "linux-kernel@...r.kernel.org"
<linux-kernel@...r.kernel.org>, "yuan.yao@...ux.intel.com"
<yuan.yao@...ux.intel.com>, "kvm@...r.kernel.org" <kvm@...r.kernel.org>,
"Yamahata, Isaku" <isaku.yamahata@...el.com>, "pbonzini@...hat.com"
<pbonzini@...hat.com>
Subject: Re: [PATCH v3 1/2] KVM: x86: Check hypercall's exit to userspace
generically
On Thu, 2024-10-31 at 07:54 -0700, Sean Christopherson wrote:
> On Thu, Oct 31, 2024, Kai Huang wrote:
> > > @@ -10111,12 +10111,21 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
> > > cpl = kvm_x86_call(get_cpl)(vcpu);
> > >
> > > ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl);
> > > - if (nr == KVM_HC_MAP_GPA_RANGE && !ret)
> > > - /* MAP_GPA tosses the request to the user space. */
> > > + if (!ret)
> > > return 0;
> > >
> > > - if (!op_64_bit)
> > > + /*
> > > + * KVM's ABI with the guest is that '0' is success, and any other value
> > > + * is an error code. Internally, '0' == exit to userspace (see above)
> > > + * and '1' == success, as KVM's de facto standard return codes are that
> > > + * plus -errno == failure. Explicitly check for '1' as some PV error
> > > + * codes are positive values.
> > > + */
> > > + if (ret == 1)
> > > + ret = 0;
> > > + else if (!op_64_bit)
> > > ret = (u32)ret;
> > > +
> > > kvm_rax_write(vcpu, ret);
> > >
> > > return kvm_skip_emulated_instruction(vcpu);
> > >
> >
> > It appears below two cases are not covered correctly?
> >
> > #ifdef CONFIG_X86_64
> > case KVM_HC_CLOCK_PAIRING:
> > ret = kvm_pv_clock_pairing(vcpu, a0, a1);
> > break;
> > #endif
> > case KVM_HC_SEND_IPI:
> > if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI))
> > break;
> >
> > ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
> > break;
> >
> > Looking at the code, seems at least for KVM_HC_CLOCK_PAIRING,
> > kvm_pv_clock_pairing() returns 0 on success, and the upstream behaviour is not
> > routing to userspace but writing 0 to vcpu's RAX and returning to guest. With
> > the change above, it immediately returns to userspace w/o writing 0 to RAX.
> >
> > For KVM_HC_SEND_IPI, seems the upstream behaviour is the return value of
> > kvm_pv_send_ipi() is always written to vcpu's RAX reg, and we always just go
> > back to guest. With your change, the behaviour will be changed to:
> >
> > 1) when ret == 0, exit to userspace w/o writing 0 to vcpu's RAX,
> > 2) when ret == 1, it is converted to 0 and then written to RAX.
> >
> > This doesn't look like safe.
>
> Drat, I managed to space on the cases that didn't explicit set '0'. Hrm.
>
> My other idea was have an out-param to separate the return code intended for KVM
> from the return code intended for the guest. I generally dislike out-params, but
> trying to juggle a return value that multiplexes guest and host values seems like
> an even worse idea.
Yeah this looks fine to me, one comment below ...
[...]
> - ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl);
> - if (nr == KVM_HC_MAP_GPA_RANGE && !ret)
> - /* MAP_GPA tosses the request to the user space. */
> - return 0;
> + r = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl, &ret);
> + if (r <= r)
> + return r;
... should be:
if (r <= 0)
return r;
?
Another option might be we move "set hypercall return value" code inside
__kvm_emulate_hypercall(). So IIUC the reason to split
__kvm_emulate_hypercall() out is for TDX, and while non-TDX uses RAX to carry
the hypercall return value, TDX uses R10.
We can additionally pass a "kvm_hypercall_set_ret_func" function pointer to
__kvm_emulate_hypercall(), and invoke it inside. Then we can change
__kvm_emulate_hypercall() to return:
< 0 error,
==0 return to userspace,
> 0 go back to guest.
I made some attempt below, build test only:
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6d9f763a7bb9..c48feb62a5d7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2179,10 +2179,14 @@ static inline void kvm_clear_apicv_inhibit(struct kvm
*kvm,
kvm_set_or_clear_apicv_inhibit(kvm, reason, false);
}
-unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
- unsigned long a0, unsigned long a1,
- unsigned long a2, unsigned long a3,
- int op_64_bit, int cpl);
+typedef void (*kvm_hypercall_set_ret_func_t)(struct kvm_vcpu *vcpu,
+ unsigned long val);
+
+int __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
+ unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3,
+ int op_64_bit, int cpl,
+ kvm_hypercall_set_ret_func_t set_ret_func);
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 83fe0a78146f..01bdf01cfc79 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9998,10 +9998,11 @@ static int complete_hypercall_exit(struct kvm_vcpu
*vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
-unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
- unsigned long a0, unsigned long a1,
- unsigned long a2, unsigned long a3,
- int op_64_bit, int cpl)
+int __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
+ unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3,
+ int op_64_bit, int cpl,
+ kvm_hypercall_set_ret_func_t set_ret_func)
{
unsigned long ret;
@@ -10076,6 +10077,7 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu
*vcpu, unsigned long nr,
WARN_ON_ONCE(vcpu->run->hypercall.flags &
KVM_EXIT_HYPERCALL_MBZ);
vcpu->arch.complete_userspace_io = complete_hypercall_exit;
+ /* MAP_GPA tosses the request to the user space. */
/* stat is incremented on completion. */
return 0;
}
@@ -10085,16 +10087,26 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu
*vcpu, unsigned long nr,
}
out:
+ if (!op_64_bit)
+ ret = (u32)ret;
+ (*set_ret_func)(vcpu, ret);
+
++vcpu->stat.hypercalls;
- return ret;
+ return 1;
}
EXPORT_SYMBOL_GPL(__kvm_emulate_hypercall);
+static void kvm_hypercall_set_ret_default(struct kvm_vcpu *vcpu,
+ unsigned long val)
+{
+ kvm_rax_write(vcpu, val);
+}
+
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
- unsigned long nr, a0, a1, a2, a3, ret;
+ unsigned long nr, a0, a1, a2, a3;
int op_64_bit;
- int cpl;
+ int cpl, ret;
if (kvm_xen_hypercall_enabled(vcpu->kvm))
return kvm_xen_hypercall(vcpu);
@@ -10110,14 +10122,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
op_64_bit = is_64_bit_hypercall(vcpu);
cpl = kvm_x86_call(get_cpl)(vcpu);
- ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl);
- if (nr == KVM_HC_MAP_GPA_RANGE && !ret)
- /* MAP_GPA tosses the request to the user space. */
- return 0;
-
- if (!op_64_bit)
- ret = (u32)ret;
- kvm_rax_write(vcpu, ret);
+ ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl,
+ kvm_hypercall_set_ret_default);
+ if (ret <= 0)
+ return ret;
return kvm_skip_emulated_instruction(vcpu);
}
Powered by blists - more mailing lists