[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20120815140554.GD3068@redhat.com>
Date: Wed, 15 Aug 2012 17:05:54 +0300
From: "Michael S. Tsirkin" <mst@...hat.com>
To: Alex Williamson <alex.williamson@...hat.com>
Cc: avi@...hat.com, gleb@...hat.com, kvm@...r.kernel.org,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH v8 5/6] kvm: KVM_IRQ_ACKFD
On Fri, Aug 10, 2012 at 04:37:48PM -0600, Alex Williamson wrote:
> Enable a mechanism for IRQ ACKs to be exposed through an eventfd. The
> user can specify the GSI and optionally an IRQ source ID and have the
> provided eventfd trigger whenever the irqchip resamples its inputs,
> for instance on EOI.
>
> Signed-off-by: Alex Williamson <alex.williamson@...hat.com>
> ---
>
> Documentation/virtual/kvm/api.txt | 18 ++
> arch/x86/kvm/x86.c | 2
> include/linux/kvm.h | 16 ++
> include/linux/kvm_host.h | 13 ++
> virt/kvm/eventfd.c | 285 +++++++++++++++++++++++++++++++++++++
> virt/kvm/kvm_main.c | 10 +
> 6 files changed, 344 insertions(+)
>
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 17cd599..77b4837 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -2011,6 +2011,24 @@ of IRQ source IDs. These IDs are also shared with KVM internal users
> (ex. KVM assigned devices, PIT, shared user ID), therefore not all IDs
> may be allocated through this interface.
>
> +4.78 KVM_IRQ_ACKFD
> +
> +Capability: KVM_CAP_IRQ_ACKFD
> +Architectures: x86
> +Type: vm ioctl
> +Parameters: struct kvm_irq_ackfd (in)
> +Returns: 0 on success, -errno on error
> +
> +Allows userspace notification of IRQ ACKs, or resampling of irqchip
> +inputs, often associated with an EOI. User provided kvm_irq_ackfd.fd
> +and kvm_irq_ackfd.gsi are required and result in an eventfd trigger
> +whenever the GSI is acknowledged. When KVM_CAP_IRQ_ACKFD_IRQ_SOURCE_ID
> +is available, KVM_IRQ_ACKFD supports the KVM_IRQ_ACKFD_FLAG_IRQ_SOURCE_ID
> +flag which indicates kvm_irq_ackfd.irq_source_id is provided. With this,
> +the eventfd is only triggered when the specified IRQ source ID is
> +pending. On deassign, fd, gsi, and irq_source_id (if provided on assign)
> +must be provided.
> +
>
This seems to imply that ACKFD can be used with edge
GSIs, but this does not actually work: with source ID
because of the filtering, and without because of PV EOI.
It seems that what we are really tracking is
level change 1->0. For edge no level -> no notification?
Or does 'resampling of irqchip inputs' imply level somehow?
> 5. The kvm_run structure
> ------------------------
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 19680ed..3d98e59 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2176,6 +2176,8 @@ int kvm_dev_ioctl_check_extension(long ext)
> case KVM_CAP_KVMCLOCK_CTRL:
> case KVM_CAP_IRQFD_IRQ_SOURCE_ID:
> case KVM_CAP_IRQFD_ASSERT_ONLY:
> + case KVM_CAP_IRQ_ACKFD:
> + case KVM_CAP_IRQ_ACKFD_IRQ_SOURCE_ID:
> r = 1;
> break;
> case KVM_CAP_COALESCED_MMIO:
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index 19b1235..0f53bd5 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -621,6 +621,8 @@ struct kvm_ppc_smmu_info {
> #define KVM_CAP_NR_IRQ_SOURCE_ID 81
> #define KVM_CAP_IRQFD_IRQ_SOURCE_ID 82
> #define KVM_CAP_IRQFD_ASSERT_ONLY 83
> +#define KVM_CAP_IRQ_ACKFD 84
> +#define KVM_CAP_IRQ_ACKFD_IRQ_SOURCE_ID 85
>
> #ifdef KVM_CAP_IRQ_ROUTING
>
> @@ -707,6 +709,18 @@ struct kvm_irq_source_id {
> __u8 pad[24];
> };
>
> +#define KVM_IRQ_ACKFD_FLAG_DEASSIGN (1 << 0)
> +/* Available with KVM_CAP_IRQ_ACKFD_IRQ_SOURCE_ID */
> +#define KVM_IRQ_ACKFD_FLAG_IRQ_SOURCE_ID (1 << 1)
> +
> +struct kvm_irq_ackfd {
> + __u32 flags;
> + __u32 fd;
> + __u32 gsi;
> + __u32 irq_source_id;
> + __u8 pad[16];
> +};
> +
> struct kvm_clock_data {
> __u64 clock;
> __u32 flags;
> @@ -849,6 +863,8 @@ struct kvm_s390_ucas_mapping {
> #define KVM_PPC_ALLOCATE_HTAB _IOWR(KVMIO, 0xa7, __u32)
> /* Available with KVM_CAP_IRQ_SOURCE_ID */
> #define KVM_IRQ_SOURCE_ID _IOWR(KVMIO, 0xa8, struct kvm_irq_source_id)
> +/* Available with KVM_CAP_IRQ_ACKFD */
> +#define KVM_IRQ_ACKFD _IOW(KVMIO, 0xa9, struct kvm_irq_ackfd)
>
> /*
> * ioctls for vcpu fds
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index ea6d7a1..cdc55c2 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -285,6 +285,10 @@ struct kvm {
> struct list_head items;
> } irqfds;
> struct list_head ioeventfds;
> + struct {
> + spinlock_t lock;
> + struct list_head items;
> + } irq_ackfds;
> #endif
> struct kvm_vm_stat stat;
> struct kvm_arch arch;
> @@ -831,6 +835,8 @@ int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args);
> void kvm_irqfd_release(struct kvm *kvm);
> void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *);
> int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
> +int kvm_irq_ackfd(struct kvm *kvm, struct kvm_irq_ackfd *args);
> +void kvm_irq_ackfd_release(struct kvm *kvm);
>
> #else
>
> @@ -843,6 +849,13 @@ static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
>
> static inline void kvm_irqfd_release(struct kvm *kvm) {}
>
> +static inline int kvm_irq_ackfd(struct kvm *kvm, struct kvm_irq_ackfd *args)
> +{
> + return -EINVAL;
> +}
> +
> +static inline void kvm_irq_ackfd_release(struct kvm *kvm) {}
> +
> #ifdef CONFIG_HAVE_KVM_IRQCHIP
> static inline void kvm_irq_routing_update(struct kvm *kvm,
> struct kvm_irq_routing_table *irq_rt)
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index df41038..ff5c784 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -300,6 +300,8 @@ kvm_eventfd_init(struct kvm *kvm)
> spin_lock_init(&kvm->irqfds.lock);
> INIT_LIST_HEAD(&kvm->irqfds.items);
> INIT_LIST_HEAD(&kvm->ioeventfds);
> + spin_lock_init(&kvm->irq_ackfds.lock);
> + INIT_LIST_HEAD(&kvm->irq_ackfds.items);
> }
>
> /*
> @@ -668,3 +670,286 @@ kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>
> return kvm_assign_ioeventfd(kvm, args);
> }
> +
> +/*
> + * --------------------------------------------------------------------
> + * irq_ackfd: Expose IRQ ACKs to userspace via eventfd
> + *
> + * userspace can register to receive eventfd notification for ACKs of
> + * interrupt lines.
> + * --------------------------------------------------------------------
> + */
> +
> +struct _irq_ackfd {
> + struct kvm *kvm;
> + struct eventfd_ctx *eventfd; /* signaled on ack */
> + struct kvm_irq_ack_notifier notifier;
> + /* Setup/shutdown */
> + wait_queue_t wait;
> + poll_table pt;
> + struct work_struct shutdown;
> + struct list_head list;
> +};
> +
> +/* Called under irq_ackfds.lock */
> +static void irq_ackfd_shutdown(struct work_struct *work)
> +{
> + struct _irq_ackfd *ackfd = container_of(work, struct _irq_ackfd,
> + shutdown);
> + struct kvm *kvm = ackfd->kvm;
> + u64 cnt;
> +
> + /*
> + * Stop ACK signaling
> + */
> + kvm_unregister_irq_ack_notifier(kvm, &ackfd->notifier);
> +
> + /*
> + * Synchronize with the wait-queue and unhook ourselves to prevent
> + * further events.
> + */
> + eventfd_ctx_remove_wait_queue(ackfd->eventfd, &ackfd->wait, &cnt);
> +
> + /*
> + * Release resources
> + */
> + eventfd_ctx_put(ackfd->eventfd);
> + kfree(ackfd);
> +}
> +
> +/* assumes kvm->irq_ackfds.lock is held */
> +static bool irq_ackfd_is_active(struct _irq_ackfd *ackfd)
> +{
> + return list_empty(&ackfd->list) ? false : true;
> +}
> +
> +/*
> + * Mark the irq_ackfd as inactive and schedule it for removal
> + *
> + * assumes kvm->irq_ackfds.lock is held
> + */
> +static void irq_ackfd_deactivate(struct _irq_ackfd *ackfd)
> +{
> + BUG_ON(!irq_ackfd_is_active(ackfd));
> +
> + list_del_init(&ackfd->list);
> +
> + /* Borrow irqfd's workqueue, we don't need our own. */
> + queue_work(irqfd_cleanup_wq, &ackfd->shutdown);
> +}
> +
> +/*
> + * Called with wqh->lock held and interrupts disabled
> + */
> +static int irq_ackfd_wakeup(wait_queue_t *wait, unsigned mode,
> + int sync, void *key)
> +{
> + unsigned long flags = (unsigned long)key;
> +
> + if (unlikely(flags & POLLHUP)) {
> + /* The eventfd is closing, detach from KVM */
> + struct _irq_ackfd *ackfd;
> + unsigned long flags;
> +
> + ackfd = container_of(wait, struct _irq_ackfd, wait);
> +
> + spin_lock_irqsave(&ackfd->kvm->irq_ackfds.lock, flags);
> +
> + /*
> + * We must check if someone deactivated the irq_ackfd before
> + * we could acquire the irq_ackfds.lock since the item is
> + * deactivated from the KVM side before it is unhooked from
> + * the wait-queue. If it is already deactivated, we can
> + * simply return knowing the other side will cleanup for us.
> + * We cannot race against the irq_ackfd going away since the
> + * other side is required to acquire wqh->lock, which we hold
> + */
> + if (irq_ackfd_is_active(ackfd))
> + irq_ackfd_deactivate(ackfd);
> +
> + spin_unlock_irqrestore(&ackfd->kvm->irq_ackfds.lock, flags);
> + }
> +
> + return 0;
> +}
> +
> +static void irq_ackfd_ptable_queue_proc(struct file *file,
> + wait_queue_head_t *wqh,
> + poll_table *pt)
> +{
> + struct _irq_ackfd *ackfd = container_of(pt, struct _irq_ackfd, pt);
> + add_wait_queue(wqh, &ackfd->wait);
> +}
> +
> +/*
> + * This function is called as the kvm VM fd is being released. Shutdown all
> + * irq_ackfds that still remain open
> + */
> +void kvm_irq_ackfd_release(struct kvm *kvm)
> +{
> + struct _irq_ackfd *tmp, *ackfd;
> +
> + spin_lock_irq(&kvm->irq_ackfds.lock);
> +
> + list_for_each_entry_safe(ackfd, tmp, &kvm->irq_ackfds.items, list)
> + irq_ackfd_deactivate(ackfd);
> +
> + spin_unlock_irq(&kvm->irq_ackfds.lock);
> +
> + flush_workqueue(irqfd_cleanup_wq);
> +}
> +
> +static void irq_ackfd_acked(struct kvm_irq_ack_notifier *notifier)
> +{
> + struct _irq_ackfd *ackfd;
> +
> + ackfd = container_of(notifier, struct _irq_ackfd, notifier);
> +
> + eventfd_signal(ackfd->eventfd, 1);
> +}
> +
> +static int kvm_assign_irq_ackfd(struct kvm *kvm, struct kvm_irq_ackfd *args)
> +{
> + struct file *file = NULL;
> + struct eventfd_ctx *eventfd = NULL;
> + struct _irq_ackfd *ackfd = NULL, *tmp;
> + int ret;
> + u64 cnt;
> +
> + file = eventfd_fget(args->fd);
> + if (IS_ERR(file)) {
> + ret = PTR_ERR(file);
> + goto fail;
> + }
> +
> + eventfd = eventfd_ctx_fileget(file);
> + if (IS_ERR(eventfd)) {
> + ret = PTR_ERR(eventfd);
> + goto fail;
> + }
> +
> + ackfd = kzalloc(sizeof(*ackfd), GFP_KERNEL);
> + if (!ackfd) {
> + ret = -ENOMEM;
> + goto fail;
> + }
> +
> + INIT_LIST_HEAD(&ackfd->list);
> + INIT_WORK(&ackfd->shutdown, irq_ackfd_shutdown);
> + ackfd->kvm = kvm;
> + ackfd->eventfd = eventfd;
> + ackfd->notifier.gsi = args->gsi;
> + if (args->flags & KVM_IRQ_ACKFD_FLAG_IRQ_SOURCE_ID)
> + ackfd->notifier.irq_source_id = args->irq_source_id;
> + else
> + ackfd->notifier.irq_source_id = -1;
> + ackfd->notifier.irq_acked = irq_ackfd_acked;
> +
> + /*
> + * Install our own custom wake-up handling so we are notified via
> + * a callback whenever someone releases the underlying eventfd
> + */
> + init_waitqueue_func_entry(&ackfd->wait, irq_ackfd_wakeup);
> + init_poll_funcptr(&ackfd->pt, irq_ackfd_ptable_queue_proc);
> +
> + /*
> + * Install the wait queue function to allow cleanup when the
> + * eventfd is closed by the user. This grabs the wqh lock, so
> + * we do it out of spinlock, holding the file reference ensures
> + * we won't see a POLLHUP until setup is complete.
> + */
> + file->f_op->poll(file, &ackfd->pt);
> +
> + spin_lock_irq(&kvm->irq_ackfds.lock);
> +
> + /* Check for duplicates. */
> + list_for_each_entry(tmp, &kvm->irq_ackfds.items, list) {
> + if (ackfd->eventfd == tmp->eventfd &&
> + ackfd->notifier.gsi == tmp->notifier.gsi &&
> + ackfd->notifier.irq_source_id ==
> + tmp->notifier.irq_source_id) {
> + spin_unlock_irq(&kvm->irq_ackfds.lock);
> + ret = -EBUSY;
> + goto fail_unqueue;
> + }
> + }
> +
So the source ID is not validated at all; this means there are
4G ways for the same eventfd to be registered,
eating up unlimited kernel memory by means of the kzalloc above.
> + list_add_tail(&ackfd->list, &kvm->irq_ackfds.items);
> +
> + spin_unlock_irq(&kvm->irq_ackfds.lock);
> +
> + /* This can sleep, so register after spinlock. */
> + kvm_register_irq_ack_notifier(kvm, &ackfd->notifier);
> +
> + fput(file); /* Enable POLLHUP */
> +
> + return 0;
> +
> +fail_unqueue:
> + eventfd_ctx_remove_wait_queue(eventfd, &ackfd->wait, &cnt);
> +fail:
> + if (eventfd && !IS_ERR(eventfd))
> + eventfd_ctx_put(eventfd);
> +
> + if (file && !IS_ERR(file))
> + fput(file);
> +
> + kfree(ackfd);
> +
> + return ret;
> +}
> +
> +static int kvm_deassign_irq_ackfd(struct kvm *kvm, struct kvm_irq_ackfd *args)
> +{
> + struct eventfd_ctx *eventfd = NULL;
> + struct _irq_ackfd *ackfd;
> + int irq_source_id = -1, ret = -ENOENT;
> +
> + eventfd = eventfd_ctx_fdget(args->fd);
> + if (IS_ERR(eventfd)) {
> + ret = PTR_ERR(eventfd);
> + goto fail;
> + }
> +
> + if (args->flags & KVM_IRQ_ACKFD_FLAG_IRQ_SOURCE_ID)
> + irq_source_id = args->irq_source_id;
> +
> + spin_lock_irq(&kvm->irq_ackfds.lock);
> +
> + list_for_each_entry(ackfd, &kvm->irq_ackfds.items, list) {
> + if (ackfd->eventfd == eventfd &&
> + ackfd->notifier.gsi == args->gsi &&
> + ackfd->notifier.irq_source_id == irq_source_id) {
> + irq_ackfd_deactivate(ackfd);
> + ret = 0;
> + break;
> + }
> + }
> +
> + spin_unlock_irq(&kvm->irq_ackfds.lock);
> +
> +fail:
> + if (eventfd && !IS_ERR(eventfd))
> + eventfd_ctx_put(eventfd);
> +
> + /*
> + * Block until we know all outstanding shutdown jobs have completed
> + * so that we guarantee there will not be any more ACKs signaled on
> + * this eventfd once this deassign function returns.
> + */
> + flush_workqueue(irqfd_cleanup_wq);
> +
> + return ret;
> +}
> +
> +int kvm_irq_ackfd(struct kvm *kvm, struct kvm_irq_ackfd *args)
> +{
> + if (args->flags & ~(KVM_IRQ_ACKFD_FLAG_DEASSIGN |
> + KVM_IRQ_ACKFD_FLAG_IRQ_SOURCE_ID))
> + return -EINVAL;
> +
> + if (args->flags & KVM_IRQ_ACKFD_FLAG_DEASSIGN)
> + return kvm_deassign_irq_ackfd(kvm, args);
> +
> + return kvm_assign_irq_ackfd(kvm, args);
> +}
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index e120cb3..bbe3a20 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -619,6 +619,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
> struct kvm *kvm = filp->private_data;
>
> kvm_irqfd_release(kvm);
> + kvm_irq_ackfd_release(kvm);
>
> kvm_put_kvm(kvm);
> return 0;
> @@ -2109,6 +2110,15 @@ static long kvm_vm_ioctl(struct file *filp,
> break;
> }
> #endif
> + case KVM_IRQ_ACKFD: {
> + struct kvm_irq_ackfd data;
> +
> + r = -EFAULT;
> + if (copy_from_user(&data, argp, sizeof(data)))
> + goto out;
> + r = kvm_irq_ackfd(kvm, &data);
> + break;
> + }
> default:
> r = kvm_arch_vm_ioctl(filp, ioctl, arg);
> if (r == -ENOTTY)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists