linux-kernel - Re: [PATCH 4/4] drivers:pci:hv: New paravirtual PCI front-end for Hyper-V VMs

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <alpine.DEB.2.11.1508020948470.3825@nanos>
Date:	Sun, 2 Aug 2015 10:47:25 +0200 (CEST)
From:	Thomas Gleixner <tglx@...utronix.de>
To:	Jake Oshins <jakeo@...rosoft.com>
cc:	gregkh@...uxfoundation.org, kys@...rosoft.com,
	LKML <linux-kernel@...r.kernel.org>,
	devel@...uxdriverproject.org, olaf@...fle.de, apw@...onical.com,
	vkuznets@...hat.com, linux-pci@...r.kernel.org,
	bhelgaas@...gle.com, x86@...nel.org,
	Jiang Liu <jiang.liu@...ux.intel.com>
Subject: Re: [PATCH 4/4] drivers:pci:hv: New paravirtual PCI front-end for
 Hyper-V VMs

On Sun, 2 Aug 2015, jakeo@...rosoft.com wrote:
> +#include <linux/kernel.h>
> +#include <linux/jiffies.h>
> +#include <linux/mman.h>
> +#include <linux/delay.h>
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/kthread.h>
> +#include <linux/completion.h>
> +#include <linux/notifier.h>
> +#include <linux/of_pci.h>
> +#include <linux/of_platform.h>
> +#include <linux/pci.h>
> +#include <linux/semaphore.h>
> +#include <linux/irqdomain.h>
> +#include <asm/irqdomain.h>
> +#include <linux/msi.h>

I seriously doubt, that you need all of these includes.

> +/* Interrupt management hooks */
> +
> +/**
> + * hv_msi_free() - Free the MSI.
> + * @domain:	The interrupt domain pointer
> + * @info:	Extra MSI-related context
> + * @virq:	Identifies the IRQ.
> + *
> + * The Hyper-V parent partition and hypervisor are tracking the
> + * messages that are in use, keeping the interrupt redirection
> + * table up to date.  This callback sends a message that frees
> + * the the IRT entry and related tracking nonsense.
> + */
> +void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info,
> +		 unsigned int virq)

static?

> +{
> +	struct pci_delete_interrupt *int_pkt;
> +	struct {
> +		struct pci_packet pkt;
> +		u8 buffer[sizeof(struct pci_delete_interrupt) -
> +			  sizeof(struct pci_message)];
> +	} ctxt;
> +	struct hv_pcibus_device *hbus;
> +	struct hv_pci_dev *hpdev;
> +	struct irq_desc *desc;
> +	struct msi_desc *msi;
> +	struct tran_int_desc *int_desc;
> +	struct irq_desc *irq_desc;
> +
> +	desc = irq_to_desc(virq);
> +	msi = irq_desc_get_msi_desc(desc);

Why do you need to lookup the irq descriptor if you want the msi descriptor?

> +	hbus = info->data;
> +	hpdev = lookup_hv_dev(hbus, devfn_to_wslot(msi->dev->devfn));
> +	if (!hpdev)
> +		return;
> +
> +	int_desc = irq_get_handler_data(virq);

I don't think this is a proper storage point. The data is domain/chip
specific, right? So, what's wrong with storing it in irq_data::chip_data?

> +	if (int_desc) {
> +		memset(&ctxt, 0, sizeof(ctxt));
> +		int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
> +		int_pkt->message_type.message_type =
> +			PCI_DELETE_INTERRUPT_MESSAGE;
> +		int_pkt->wslot.slot = hpdev->desc.win_slot.slot;
> +		int_pkt->int_desc = *int_desc;
> +		vmbus_sendpacket(hbus->hdev->channel, int_pkt, sizeof(*int_pkt),
> +				 (unsigned long)&ctxt.pkt, VM_PKT_DATA_INBAND,
> +				 0);
> +		kfree(int_desc);

Free before clearing the reference?

> +		irq_desc = irq_to_desc(virq);

Do you expect the descriptor to change between the lookup above and
this one?

> +		irq_desc->irq_data.handler_data = NULL;

You are not supposed to fiddle in irq_desc or irq_data internals. We
have helper functions for that.

> +	}
> +
> +	hv_pcichild_dec(hpdev, hv_pcidev_ref_by_slot);
> +}
> +
> +int hv_set_affinity(struct irq_data *data, const struct cpumask *dest,
> +		    bool force)

static, if at all.

> +{
> +	struct irq_data *parent = data->parent_data;
> +
> +	return parent->chip->irq_set_affinity(parent, dest, force);
> +}

irq_chip_set_affinity_parent ???

> +/**
> + * hv_compose_msi_msg() - Supplies a valid MSI address/data
> + * @data:	Everything about this MSI
> + * @msg:	Buffer that is filled in by this function
> + *
> + * This function unpacks the IRQ looking for target CPU set, IDT
> + * vector and mode and sends a message to the parent partition
> + * asking for a mapping for that tuple in this partition.  The
> + * response supplies a data value and address to which that data
> + * should be written to trigger that interrupt.
> + */
> +void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)

static ?

> +{
> +	struct irq_cfg *cfg = irqd_cfg(data);
> +	struct hv_pcibus_device *hbus;
> +	struct hv_pci_dev *hpdev;
> +	struct pci_bus *pbus;
> +	struct pci_create_interrupt *int_pkt;
> +	struct compose_comp_ctxt comp;
> +	struct tran_int_desc *int_desc;
> +	struct irq_desc *irq_desc;
> +	struct {
> +		struct pci_packet pkt;
> +		u8 buffer[sizeof(struct pci_create_interrupt) -
> +			  sizeof(struct pci_message)];
> +	} ctxt;
> +	int cpu;
> +	int ret;
> +
> +	pbus = data->msi_desc->dev->bus;
> +	hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
> +	hpdev = lookup_hv_dev(hbus, devfn_to_wslot(data->msi_desc->dev->devfn));
> +
> +	if (!hpdev)
> +		goto return_null_message;
> +
> +	int_desc = kzalloc(sizeof(*int_desc), GFP_KERNEL);
> +	if (!int_desc)
> +		goto return_null_message;
> +
> +	memset(&ctxt, 0, sizeof(ctxt));
> +	init_completion(&comp.comp_pkt.host_event);
> +	ctxt.pkt.completion_func = hv_pci_compose_compl;
> +	ctxt.pkt.compl_ctxt = &comp;
> +	int_pkt = (struct pci_create_interrupt *)&ctxt.pkt.message;
> +	int_pkt->message_type.message_type = PCI_CREATE_INTERRUPT_MESSAGE;
> +	int_pkt->wslot.slot = hpdev->desc.win_slot.slot;
> +	int_pkt->int_desc.vector = cfg->vector;
> +	int_pkt->int_desc.vector_count = 1;
> +	int_pkt->int_desc.delivery_mode =
> +		(apic->irq_delivery_mode == dest_LowestPrio) ? 1 : 0;
> +
> +	/*
> +	 * Cut down interrupt targets to a single processor. Hyper-V
> +	 * currently only supports picking the target of an MSI by
> +	 * sending a message to the parent partition and receiving a
> +	 * response. This can't really work in the context of
> +	 * irq_set_affinity, which sometimes needs to run in an environment
> +	 * that can't wait for a completion.  Until a future version of
> +	 * Hyper-V exists that can retarget an interrupt without waiting,
> +	 * this code picks a single virtual processor as the target.  This
> +	 * comment and the following lines should be deleted when that
> +	 * happens.
> +	 */
> +	cpu = cpumask_any_and(data->affinity, cpu_online_mask);

We have accessors for this for a reason. data->affinity will be gone
after 4.3-rc1.

> +	cpumask_clear(data->affinity);
> +	cpumask_set_cpu(cpu, data->affinity);

> +	/*
> +	 * This bit doesn't have to work on machines with more than 64
> +	 * processors because Hyper-V only supports 64 in a guest.
> +	 */
> +	for_each_cpu(cpu, data->affinity) {
> +		if (cpu_is_offline(cpu))
> +			continue;
> +		int_pkt->int_desc.cpu_mask |=
> +			(1 << vmbus_cpu_number_to_vp_number(cpu));
> +	}

ROTFL. What's the point of this loop? To find the single already known
cpu which is in the mask?

> +	ret = vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt,
> +			       sizeof(*int_pkt), (unsigned long)&ctxt.pkt,
> +			       VM_PKT_DATA_INBAND,
> +			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
> +	if (!ret)
> +		wait_for_completion(&comp.comp_pkt.host_event);
> +
> +	if (comp.comp_pkt.completion_status < 0) {
> +		pr_err("Request for interrupt failed: 0x%x",
> +		       comp.comp_pkt.completion_status);
> +		goto return_null_message;
> +	}
> +
> +	/*
> +	 * Record the assignment so that this can be unwound later. Using
> +	 * irq_set_handler_data() here would be appropriate, but the lock
> +	 * it takes is already held.
> +	 */
> +	*int_desc = comp.int_desc;
> +	irq_desc = irq_to_desc(data->irq);
> +	irq_desc->irq_data.handler_data = int_desc;

You're really a fan of redundant lookups. How is irq_desc->irq_data
going to be different from data?

> +	/* Pass up the result. */
> +	msg->address_hi = comp.int_desc.address >> 32;
> +	msg->address_lo = comp.int_desc.address & 0xffffffff;
> +	msg->data = comp.int_desc.data;
> +
> +	hv_pcichild_dec(hpdev, hv_pcidev_ref_by_slot);
> +	return;
> +
> +return_null_message:

Leaks int_desc and a refcount on hpdev.

> +	msg->address_hi = 0;
> +	msg->address_lo = 0;
> +	msg->data = 0;
> +}
> +
> +/* HW Interrupt Chip Descriptor */
> +static struct irq_chip hv_msi_irq_chip = {
> +	.name = "Hyper-V PCIe MSI",
> +	.irq_compose_msi_msg = hv_compose_msi_msg,
> +	.irq_set_affinity = hv_set_affinity,
> +	.irq_ack = irq_chip_ack_parent,

Please format this readable:

	.name			= "Hyper-V PCIe MSI",
	.irq_compose_msi_msg	= hv_compose_msi_msg,
	.irq_set_affinity	= hv_set_affinity,
	.irq_ack		= irq_chip_ack_parent,

Hmm?       

> +};
> +
> +static irq_hw_number_t hv_msi_domain_ops_get_hwirq(struct msi_domain_info *info,
> +						   msi_alloc_info_t *arg)
> +{
> +	return arg->msi_hwirq;
> +}
> +
> +static int hv_msi_domain_ops_prepare(struct irq_domain *domain,
> +				     struct device *dev, int nvec,
> +				     msi_alloc_info_t *arg)
> +{
> +	struct pci_dev *pdev = to_pci_dev(dev);
> +	struct msi_desc *desc = first_pci_msi_entry(pdev);
> +
> +	memset(arg, 0, sizeof(*arg));
> +	arg->msi_dev = pdev;
> +	if (desc->msi_attrib.is_msix) {
> +		arg->type = X86_IRQ_ALLOC_TYPE_MSIX;
> +	} else {
> +		arg->type = X86_IRQ_ALLOC_TYPE_MSI;
> +		arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
> +	}
> +
> +	return 0;
> +}

Pretty much a copy of the x86 code, right?

I wonder whether this MSI infrastructure code would be better
seperated out and moved to arch/x86/hyperv/msi.c or
arch/x86/kernel/apic/hvmsi.c. It's small enough to be built in. So all
you'd need to export is hv_pcie_init_irq_domain and
hv_pcie_free_irq_domain.

Thanks,

	tglx
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/