linux-kernel - Re: [PART2 PATCH v5 12/12] svm: Implements update_pi

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20160813120325.GH8001@potion>
Date:	Sat, 13 Aug 2016 14:03:52 +0200
From:	Radim Krčmář <rkrcmar@...hat.com>
To:	Suravee Suthikulpanit <suravee.suthikulpanit@....com>
Cc:	joro@...tes.org, pbonzini@...hat.com, alex.williamson@...hat.com,
	kvm@...r.kernel.org, linux-kernel@...r.kernel.org,
	sherry.hurwitz@....com
Subject: Re: [PART2 PATCH v5 12/12] svm: Implements update_pi_irte hook to
 setup posted interrupt

2016-07-25 04:32-0500, Suravee Suthikulpanit:
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> @@ -1485,9 +1521,16 @@ static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
>  	WARN_ON(is_run == !!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK));
>  
>  	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
> -	if (is_run)
> +	if (is_run) {
>  		entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
> -	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
> +		WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
> +		avic_update_iommu(vcpu, h_physical_id,
> +				  page_to_phys(svm->avic_backing_page), 1);
> +	} else {
> +		avic_update_iommu(vcpu, h_physical_id,
> +				  page_to_phys(svm->avic_backing_page), 0);
> +		WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
> +	}

You need to do the same change twice ... I guess it is time to factor
the code. :)

Wouldn't the following be an improvement in the !is_run path too?

  static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
  {
  	svm->avic_is_running = is_run;
  
  	if (is_run)
  		avic_vcpu_load(vcpu, vcpu->cpu);
  	else
  		avic_vcpu_put(vcpu);
  }

> +static void svm_pi_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
> +{
> +	bool found = false;
> +	unsigned long flags;
> +	struct amd_iommu_pi_data *cur;
> +
> +	spin_lock_irqsave(&svm->pi_list_lock, flags);
> +	list_for_each_entry(cur, &svm->pi_list, node) {
> +		if (cur->ir_data != pi->ir_data)
> +			continue;
> +		found = true;

This optimization turned out to be ugly ... sorry.

Manipulation with pi_list is hard to understand, IMO, so a comment
explaining why we couldn't do that without traversing a list and
comparing pi->ir_data would be nice.

Maybe I was a bit confused by reusing amd_iommu_pi_data when all we care
about is a list of cur->ir_data -- can't we have a list of just ir_data?

Allocating the list node in svm_pi_list_add() would make it nicely
symmetrical with svm_pi_list_del() too.

> +		break;
> +	}
> +	if (!found)
> +		list_add(&pi->node, &svm->pi_list);
> +	spin_unlock_irqrestore(&svm->pi_list_lock, flags);
> +}
> +
> +static void svm_pi_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
> +{
> +	unsigned long flags;
> +	struct amd_iommu_pi_data *cur, *next;
> +
> +	spin_lock_irqsave(&svm->pi_list_lock, flags);
> +	list_for_each_entry_safe(cur, next, &svm->pi_list, node) {

No need to use _safe loop if you break on the first deletion.

> +		if (cur->ir_data != pi->ir_data)
> +			continue;
> +		list_del(&cur->node);
> +		kfree(cur);
> +		break;
> +	}
> +	spin_unlock_irqrestore(&svm->pi_list_lock, flags);
> +}
> +
> +/*
> + * svm_update_pi_irte - set IRTE for Posted-Interrupts
> + *
> + * @kvm: kvm
> + * @host_irq: host irq of the interrupt
> + * @guest_irq: gsi of the interrupt
> + * @set: set or unset PI
> + * returns 0 on success, < 0 on failure
> + */
> +static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
> +			      uint32_t guest_irq, bool set)
> +{
> +	struct kvm_kernel_irq_routing_entry *e;
> +	struct kvm_irq_routing_table *irq_rt;
> +	int idx, ret = -EINVAL;
> +
> +	if (!kvm_arch_has_assigned_device(kvm) ||
> +	    !irq_remapping_cap(IRQ_POSTING_CAP))
> +		return 0;
> +
> +	pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
> +		 __func__, host_irq, guest_irq, set);
> +
> +	idx = srcu_read_lock(&kvm->irq_srcu);
> +	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
> +	WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
> +
> +	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
> +		struct kvm_lapic_irq irq;
> +		struct vcpu_data vcpu_info;
> +		struct kvm_vcpu *vcpu = NULL;
> +		struct vcpu_svm *svm = NULL;
> +
> +		if (e->type != KVM_IRQ_ROUTING_MSI)
> +			continue;
> +
> +		/**
> +		 * Note:
> +		 * The HW cannot support posting multicast/broadcast
> +		 * interrupts to a vCPU. So, we still use interrupt
> +		 * remapping for these kind of interrupts.
> +		 *
> +		 * For lowest-priority interrupts, we only support
> +		 * those with single CPU as the destination, e.g. user
> +		 * configures the interrupts via /proc/irq or uses
> +		 * irqbalance to make the interrupts single-CPU.
> +		 */
> +		kvm_set_msi_irq(e, &irq);
> +		if (kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
> +			svm = to_svm(vcpu);
> +			vcpu_info.pi_desc_addr = page_to_phys(svm->avic_backing_page);
> +			vcpu_info.vector = irq.vector;
> +
> +			trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi,
> +						 vcpu_info.vector,
> +						 vcpu_info.pi_desc_addr, set);

Trace below the if/else, we might update something in both of them.

> +
> +			pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
> +				 irq.vector);
> +		} else {
> +			set = false;
> +
> +			pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
> +				 __func__, irq.vector);
> +		}
> +
> +		/**
> +		 * When AVIC is disabled, we fall-back to setup
> +		 * IRTE w/ legacy mode
> +		 */
> +		if (set && kvm_vcpu_apicv_active(&svm->vcpu)) {
> +			struct amd_iommu_pi_data *pi_data;
> +
> +			/**
> +			 * Allocating new amd_iommu_pi_data, which will get
> +			 * add to the per-vcpu pi_list.
> +			 */
> +			pi_data = kzalloc(sizeof(struct amd_iommu_pi_data),
> +					  GFP_KERNEL);
> +			if (!pi_data) {
> +				ret = -ENOMEM;
> +				goto out;
> +			}
> +
> +			/* Try to enable guest_mode in IRTE */
> +			pi_data->ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
> +						     vcpu->vcpu_id);
> +			pi_data->vcpu_data = &vcpu_info;
> +			pi_data->is_guest_mode = true;
> +			ret = irq_set_vcpu_affinity(host_irq, pi_data);
> +
> +			/**
> +			 * We save the pointer to pi_data in the struct
> +			 * vcpu_svm so that we can reference to them directly
> +			 * when we update vcpu scheduling information in IOMMU
> +			 * irte.
> +			 */
> +			if (!ret && pi_data->is_guest_mode)
> +				svm_pi_list_add(svm, pi_data);

pi_data leaks in the else case.

(Allocating the element in svm_pi_list_add() would solve this.)

> +		} else {
> +			/* Use legacy mode in IRTE */
> +			struct amd_iommu_pi_data pi;
> +
> +			/**
> +			 * Here, pi is used to:
> +			 * - Tell IOMMU to use legacy mode for this interrupt.
> +			 * - Retrieve ga_tag of prior interrupt remapping data.
> +			 */
> +			pi.is_guest_mode = false;
> +			ret = irq_set_vcpu_affinity(host_irq, &pi);
> +
> +			/**
> +			 * We need to check if the interrupt was previously
> +			 * setup with the guest_mode by checking if the ga_tag
> +			 * was cached. If so, we need to clean up the per-vcpu
> +			 * pi_list.
> +			 */
> +			if (!ret && pi.ga_tag) {
> +				struct kvm_vcpu *vcpu = kvm_get_vcpu_by_id(kvm,
> +						AVIC_GATAG_TO_VCPUID(pi.ga_tag));
> +
> +				if (vcpu)
> +					svm_pi_list_del(to_svm(vcpu), &pi);
> +			}
> +		}
> +
> +		if (ret < 0) {
> +			pr_err("%s: failed to update PI IRTE\n", __func__);
> +			goto out;
> +		}
> +	}
> +
> +	ret = 0;
> +out:
> +	srcu_read_unlock(&kvm->irq_srcu, idx);
> +	return ret;
> +}
> +
>  static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_svm *svm = to_svm(vcpu);