[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <6cd949cd-c8a4-4d08-9cc7-b1204bcb23a3@suse.com>
Date: Tue, 22 Aug 2023 10:40:27 +0200
From: Juergen Gross <jgross@...e.com>
To: Viresh Kumar <viresh.kumar@...aro.org>,
Stefano Stabellini <sstabellini@...nel.org>,
Oleksandr Tyshchenko <oleksandr_tyshchenko@...m.com>
Cc: Vincent Guittot <vincent.guittot@...aro.org>,
Alex Bennée <alex.bennee@...aro.org>,
stratos-dev@...lists.linaro.org,
Erik Schilling <erik.schilling@...aro.org>,
Manos Pitsidianakis <manos.pitsidianakis@...aro.org>,
Mathieu Poirier <mathieu.poirier@...aro.org>,
xen-devel@...ts.xenproject.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH V4] xen: privcmd: Add support for irqfd
On 25.07.23 12:57, Viresh Kumar wrote:
> Xen provides support for injecting interrupts to the guests via the
> HYPERVISOR_dm_op() hypercall. The same is used by the Virtio based
> device backend implementations, in an inefficient manner currently.
>
> Generally, the Virtio backends are implemented to work with the Eventfd
> based mechanism. In order to make such backends work with Xen, another
> software layer needs to poll the Eventfds and raise an interrupt to the
> guest using the Xen based mechanism. This results in an extra context
> switch.
>
> This is not a new problem in Linux though. It is present with other
> hypervisors like KVM, etc. as well. The generic solution implemented in
> the kernel for them is to provide an IOCTL call to pass the interrupt
> details and eventfd, which lets the kernel take care of polling the
> eventfd and raising of the interrupt, instead of handling this in user
> space (which involves an extra context switch).
>
> This patch adds support to inject a specific interrupt to guest using
> the eventfd mechanism, by preventing the extra context switch.
>
> Inspired by existing implementations for KVM, etc..
>
> Signed-off-by: Viresh Kumar <viresh.kumar@...aro.org>
> ---
> V3->V4
> - Drop the imported definitions to hvm/dm_op.h.
> - Make the caller pass a pointer to pre-filled "struct xen_dm_op" instance and
> get rid of irq and level fields.
> - Enable the irqfd feature under a new Kconfig entry.
>
> V2->V3
> - Select EVENTFD from Kconfig
>
> V1->V2:
> - Improve error handling.
> - Remove the unnecessary usage of list_for_each_entry_safe().
> - Restrict the use of XEN_DMOP_set_irq_level to only ARM64.
> - Import definitions from Xen to hvm/dm_op.h.
>
> drivers/xen/Kconfig | 8 ++
> drivers/xen/privcmd.c | 286 ++++++++++++++++++++++++++++++++++++-
> include/uapi/xen/privcmd.h | 14 ++
> 3 files changed, 306 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
> index d5d7c402b651..c7fabfab4c20 100644
> --- a/drivers/xen/Kconfig
> +++ b/drivers/xen/Kconfig
> @@ -269,6 +269,14 @@ config XEN_PRIVCMD
> disaggregated Xen setups this driver might be needed for other
> domains, too.
>
> +config XEN_PRIVCMD_IRQFD
> + bool "Xen irqfd support"
> + depends on XEN_PRIVCMD && XEN_VIRTIO && EVENTFD
> + default m
"m"? I guess you meant "n"? In this case the "default" line should just
be dropped.
> + help
> + irqfd is a mechanism to inject a specific interrupt to a User VM using
> + a decoupled eventfd mechanism.
I think the help text should mention "virtio".
What about:
"Using the irqfd mechanism a virtio backend running in a daemon can speed
up interrupt injection into a guest."
> +
> config XEN_ACPI_PROCESSOR
> tristate "Xen ACPI processor"
> depends on XEN && XEN_PV_DOM0 && X86 && ACPI_PROCESSOR && CPU_FREQ
> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
> index e2f580e30a86..584c8de56c5e 100644
> --- a/drivers/xen/privcmd.c
> +++ b/drivers/xen/privcmd.c
> @@ -9,11 +9,16 @@
>
> #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
>
> +#include <linux/eventfd.h>
> +#include <linux/file.h>
> #include <linux/kernel.h>
> #include <linux/module.h>
> +#include <linux/mutex.h>
> +#include <linux/poll.h>
> #include <linux/sched.h>
> #include <linux/slab.h>
> #include <linux/string.h>
> +#include <linux/workqueue.h>
> #include <linux/errno.h>
> #include <linux/mm.h>
> #include <linux/mman.h>
> @@ -833,6 +838,267 @@ static long privcmd_ioctl_mmap_resource(struct file *file,
> return rc;
> }
>
> +#ifdef CONFIG_XEN_PRIVCMD_IRQFD
> +/* Irqfd support */
> +static struct workqueue_struct *irqfd_cleanup_wq;
> +static DEFINE_MUTEX(irqfds_lock);
> +static LIST_HEAD(irqfds_list);
> +
> +struct privcmd_kernel_irqfd {
> + struct xen_dm_op_buf xbufs;
> + domid_t dom;
> + bool error;
> + struct eventfd_ctx *eventfd;
> + struct work_struct shutdown;
> + wait_queue_entry_t wait;
> + struct list_head list;
> + poll_table pt;
> +};
> +
> +static void irqfd_deactivate(struct privcmd_kernel_irqfd *kirqfd)
> +{
> + lockdep_assert_held(&irqfds_lock);
> +
> + list_del_init(&kirqfd->list);
> + queue_work(irqfd_cleanup_wq, &kirqfd->shutdown);
> +}
> +
> +static void irqfd_shutdown(struct work_struct *work)
> +{
> + struct privcmd_kernel_irqfd *kirqfd =
> + container_of(work, struct privcmd_kernel_irqfd, shutdown);
> + u64 cnt;
> +
> + eventfd_ctx_remove_wait_queue(kirqfd->eventfd, &kirqfd->wait, &cnt);
> + eventfd_ctx_put(kirqfd->eventfd);
> + kfree(kirqfd);
> +}
> +
> +static void irqfd_inject(struct privcmd_kernel_irqfd *kirqfd)
> +{
> + u64 cnt;
> + long rc;
> +
> + eventfd_ctx_do_read(kirqfd->eventfd, &cnt);
> +
> + xen_preemptible_hcall_begin();
> + rc = HYPERVISOR_dm_op(kirqfd->dom, 1, &kirqfd->xbufs);
> + xen_preemptible_hcall_end();
> +
> + /* Don't repeat the error message for consecutive failures */
> + if (rc && !kirqfd->error) {
> + pr_err("Failed to configure irq for guest domain: %d\n",
> + kirqfd->dom);
> + }
> +
> + kirqfd->error = !!rc;
No need for the "!!".
> +}
> +
> +static int
> +irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key)
> +{
> + struct privcmd_kernel_irqfd *kirqfd =
> + container_of(wait, struct privcmd_kernel_irqfd, wait);
> + __poll_t flags = key_to_poll(key);
> +
> + if (flags & EPOLLIN)
> + irqfd_inject(kirqfd);
> +
> + if (flags & EPOLLHUP) {
> + mutex_lock(&irqfds_lock);
> + irqfd_deactivate(kirqfd);
> + mutex_unlock(&irqfds_lock);
> + }
> +
> + return 0;
> +}
> +
> +static void
> +irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
> +{
> + struct privcmd_kernel_irqfd *kirqfd =
> + container_of(pt, struct privcmd_kernel_irqfd, pt);
> +
> + add_wait_queue_priority(wqh, &kirqfd->wait);
> +}
> +
> +static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
> +{
> + struct privcmd_kernel_irqfd *kirqfd, *tmp;
> + struct eventfd_ctx *eventfd;
> + __poll_t events;
> + struct fd f;
> + void *dm_op;
> + int ret;
> +
> + kirqfd = kzalloc(sizeof(*kirqfd) + irqfd->size, GFP_KERNEL);
> + if (!kirqfd)
> + return -ENOMEM;
> + dm_op = kirqfd + 1;
> +
> + if (copy_from_user(dm_op, irqfd->dm_op, irqfd->size)) {
> + ret = -EFAULT;
> + goto error_kfree;
> + }
> +
> + kirqfd->xbufs.size = irqfd->size;
> + set_xen_guest_handle(kirqfd->xbufs.h, dm_op);
> + kirqfd->dom = irqfd->dom;
> + INIT_LIST_HEAD(&kirqfd->list);
I don't think INIT_LIST_HEAD() is needed here. list_add_tail() doesn't
require that.
> + INIT_WORK(&kirqfd->shutdown, irqfd_shutdown);
> +
> + f = fdget(irqfd->fd);
> + if (!f.file) {
> + ret = -EBADF;
> + goto error_kfree;
> + }
> +
> + eventfd = eventfd_ctx_fileget(f.file);
> + if (IS_ERR(eventfd)) {
> + ret = PTR_ERR(eventfd);
> + goto error_fd_put;
> + }
> +
> + kirqfd->eventfd = eventfd;
I don't see why you need the local eventfd variable.
> +
> + /*
> + * Install our own custom wake-up handling so we are notified via a
> + * callback whenever someone signals the underlying eventfd.
> + */
> + init_waitqueue_func_entry(&kirqfd->wait, irqfd_wakeup);
> + init_poll_funcptr(&kirqfd->pt, irqfd_poll_func);
> +
> + mutex_lock(&irqfds_lock);
> +
> + list_for_each_entry(tmp, &irqfds_list, list) {
> + if (kirqfd->eventfd == tmp->eventfd) {
> + ret = -EBUSY;
> + mutex_unlock(&irqfds_lock);
> + goto error_eventfd;
> + }
> + }
> +
> + list_add_tail(&kirqfd->list, &irqfds_list);
> + mutex_unlock(&irqfds_lock);
> +
> + /*
> + * Check if there was an event already pending on the eventfd before we
> + * registered, and trigger it as if we didn't miss it.
> + */
> + events = vfs_poll(f.file, &kirqfd->pt);
> + if (events & EPOLLIN)
> + irqfd_inject(kirqfd);
> +
> + /*
> + * Do not drop the file until the kirqfd is fully initialized, otherwise
> + * we might race against the EPOLLHUP.
> + */
> + fdput(f);
> + return 0;
> +
> +error_eventfd:
> + eventfd_ctx_put(eventfd);
> +
> +error_fd_put:
> + fdput(f);
> +
> +error_kfree:
> + kfree(kirqfd);
> + return ret;
> +}
Juergen
Download attachment "OpenPGP_0xB0DE9DD628BF132F.asc" of type "application/pgp-keys" (3099 bytes)
Download attachment "OpenPGP_signature.asc" of type "application/pgp-signature" (496 bytes)
Powered by blists - more mailing lists