linux-kernel - Re: [PATCH v2 2/6] x86/kvm: add boot parameter for adding vcpu-id bits

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <874kb1n59j.fsf@vitty.brq.redhat.com>
Date:   Fri, 03 Sep 2021 15:43:52 +0200
From:   Vitaly Kuznetsov <vkuznets@...hat.com>
To:     Juergen Gross <jgross@...e.com>, kvm@...r.kernel.org,
        x86@...nel.org, linux-doc@...r.kernel.org,
        linux-kernel@...r.kernel.org
Cc:     maz@...nel.org, ehabkost@...hat.com,
        Juergen Gross <jgross@...e.com>,
        Jonathan Corbet <corbet@....net>,
        Paolo Bonzini <pbonzini@...hat.com>,
        Sean Christopherson <seanjc@...gle.com>,
        Wanpeng Li <wanpengli@...cent.com>,
        Jim Mattson <jmattson@...gle.com>,
        Joerg Roedel <joro@...tes.org>,
        Thomas Gleixner <tglx@...utronix.de>,
        Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>,
        "H. Peter Anvin" <hpa@...or.com>
Subject: Re: [PATCH v2 2/6] x86/kvm: add boot parameter for adding vcpu-id bits

Juergen Gross <jgross@...e.com> writes:

> Today the maximum vcpu-id of a kvm guest's vcpu on x86 systems is set
> via a #define in a header file.
>
> In order to support higher vcpu-ids without generally increasing the
> memory consumption of guests on the host (some guest structures contain
> arrays sized by KVM_MAX_VCPU_ID) add a boot parameter for adding some
> bits to the vcpu-id. Additional bits are needed as the vcpu-id is
> constructed via bit-wise concatenation of socket-id, core-id, etc.
> As those ids maximum values are not always a power of 2, the vcpu-ids
> are sparse.
>
> The additional number of bits needed is basically the number of
> topology levels with a non-power-of-2 maximum value, excluding the top
> most level.
>
> The default value of the new parameter will be to take the correct
> setting from the host's topology.
>
> Calculating the maximum vcpu-id dynamically requires to allocate the
> arrays using KVM_MAX_VCPU_ID as the size dynamically.
>
> Signed-of-by: Juergen Gross <jgross@...e.com>
> ---
> V2:
> - switch to specifying additional bits (based on comment by Vitaly
>   Kuznetsov)
>
> Signed-off-by: Juergen Gross <jgross@...e.com>
> ---
>  .../admin-guide/kernel-parameters.txt         | 18 ++++++++++++
>  arch/x86/include/asm/kvm_host.h               |  4 ++-
>  arch/x86/kvm/ioapic.c                         | 12 +++++++-
>  arch/x86/kvm/ioapic.h                         |  4 +--
>  arch/x86/kvm/x86.c                            | 29 +++++++++++++++++++
>  5 files changed, 63 insertions(+), 4 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 84dc5790741b..37e194299311 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -2435,6 +2435,24 @@
>  			feature (tagged TLBs) on capable Intel chips.
>  			Default is 1 (enabled)
>  
> +	kvm.vcpu_id_add_bits=
> +			[KVM,X86] The vcpu-ids of guests are sparse, as they
> +			are constructed by bit-wise concatenation of the ids of
> +			the different topology levels (sockets, cores, threads).
> +
> +			This parameter specifies how many additional bits the
> +			maximum vcpu-id needs compared to the maximum number of
> +			vcpus.
> +
> +			Normally this value is the number of topology levels
> +			without the threads level and without the highest
> +			level.
> +
> +			The special value -1 can be used to support guests
> +			with the same topology is the host.
> +
> +			Default: -1
> +
>  	l1d_flush=	[X86,INTEL]
>  			Control mitigation for L1D based snooping vulnerability.
>  
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index af6ce8d4c86a..3513edee8e22 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -39,7 +39,7 @@
>  
>  #define KVM_MAX_VCPUS 288
>  #define KVM_SOFT_MAX_VCPUS 240
> -#define KVM_MAX_VCPU_ID 1023
> +#define KVM_MAX_VCPU_ID kvm_max_vcpu_id()
>  /* memory slots that are not exposed to userspace */
>  #define KVM_PRIVATE_MEM_SLOTS 3
>  
> @@ -1588,6 +1588,8 @@ extern u64  kvm_max_tsc_scaling_ratio;
>  extern u64  kvm_default_tsc_scaling_ratio;
>  /* bus lock detection supported? */
>  extern bool kvm_has_bus_lock_exit;
> +/* maximum vcpu-id */
> +unsigned int kvm_max_vcpu_id(void);
>  
>  extern u64 kvm_mce_cap_supported;
>  
> diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
> index ff005fe738a4..52e8ea90c914 100644
> --- a/arch/x86/kvm/ioapic.c
> +++ b/arch/x86/kvm/ioapic.c
> @@ -685,11 +685,21 @@ static const struct kvm_io_device_ops ioapic_mmio_ops = {
>  int kvm_ioapic_init(struct kvm *kvm)
>  {
>  	struct kvm_ioapic *ioapic;
> +	size_t sz;
>  	int ret;
>  
> -	ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL_ACCOUNT);
> +	sz = sizeof(struct kvm_ioapic) +
> +	     sizeof(*ioapic->rtc_status.dest_map.map) *
> +		    BITS_TO_LONGS(KVM_MAX_VCPU_ID + 1) +
> +	     sizeof(*ioapic->rtc_status.dest_map.vectors) *
> +		    (KVM_MAX_VCPU_ID + 1);
> +	ioapic = kzalloc(sz, GFP_KERNEL_ACCOUNT);
>  	if (!ioapic)
>  		return -ENOMEM;
> +	ioapic->rtc_status.dest_map.map = (void *)(ioapic + 1);
> +	ioapic->rtc_status.dest_map.vectors =
> +		(void *)(ioapic->rtc_status.dest_map.map +
> +			 BITS_TO_LONGS(KVM_MAX_VCPU_ID + 1));
>  	spin_lock_init(&ioapic->lock);
>  	INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work);
>  	kvm->arch.vioapic = ioapic;
> diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
> index bbd4a5d18b5d..623a3c5afad7 100644
> --- a/arch/x86/kvm/ioapic.h
> +++ b/arch/x86/kvm/ioapic.h
> @@ -39,13 +39,13 @@ struct kvm_vcpu;
>  
>  struct dest_map {
>  	/* vcpu bitmap where IRQ has been sent */
> -	DECLARE_BITMAP(map, KVM_MAX_VCPU_ID + 1);
> +	unsigned long *map;
>  
>  	/*
>  	 * Vector sent to a given vcpu, only valid when
>  	 * the vcpu's bit in map is set
>  	 */
> -	u8 vectors[KVM_MAX_VCPU_ID + 1];
> +	u8 *vectors;
>  };
>  
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index e5d5c5ed7dd4..6b6f38f0b617 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -78,6 +78,7 @@
>  #include <asm/intel_pt.h>
>  #include <asm/emulate_prefix.h>
>  #include <asm/sgx.h>
> +#include <asm/topology.h>
>  #include <clocksource/hyperv_timer.h>
>  
>  #define CREATE_TRACE_POINTS
> @@ -184,6 +185,34 @@ module_param(force_emulation_prefix, bool, S_IRUGO);
>  int __read_mostly pi_inject_timer = -1;
>  module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
>  
> +static int __read_mostly vcpu_id_add_bits = -1;
> +module_param(vcpu_id_add_bits, int, S_IRUGO);
> +
> +unsigned int kvm_max_vcpu_id(void)
> +{
> +	int n_bits = fls(KVM_MAX_VCPUS - 1);
> +
> +	if (vcpu_id_add_bits < -1 || vcpu_id_add_bits > (32 - n_bits)) {
> +		pr_err("Invalid value of vcpu_id_add_bits=%d parameter!\n",
> +		       vcpu_id_add_bits);
> +		vcpu_id_add_bits = -1;
> +	}
> +
> +	if (vcpu_id_add_bits >= 0) {
> +		n_bits += vcpu_id_add_bits;
> +	} else {
> +		n_bits++;		/* One additional bit for core level. */
> +		if (topology_max_die_per_package() > 1)
> +			n_bits++;	/* One additional bit for die level. */

This assumes topology_max_die_per_package() can not be greater than 2,
or 1 additional bit may not suffice, right?

> +	}
> +
> +	if (!n_bits)
> +		n_bits = 1;

Nitpick: AFAIU n_bits can't be zero here as KVM_MAX_VCPUS is still
static. The last patch of the series, however, makes it possible when
max_vcpus = 1 and vcpu_id_add_bits = 0. With this, I'd suggest to move
the check to the last patch.

> +
> +	return (1U << n_bits) - 1;
> +}
> +EXPORT_SYMBOL_GPL(kvm_max_vcpu_id);
> +
>  /*
>   * Restoring the host value for MSRs that are only consumed when running in
>   * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU

-- 
Vitaly