[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <a5521a9d-72c6-4e03-0fbb-8a37418c32f2@suse.com>
Date: Thu, 13 Jul 2023 09:44:47 +0200
From: Juergen Gross <jgross@...e.com>
To: Viresh Kumar <viresh.kumar@...aro.org>,
Stefano Stabellini <sstabellini@...nel.org>,
Oleksandr Tyshchenko <oleksandr_tyshchenko@...m.com>
Cc: Vincent Guittot <vincent.guittot@...aro.org>,
Alex Bennée <alex.bennee@...aro.org>,
stratos-dev@...lists.linaro.org,
Erik Schilling <erik.schilling@...aro.org>,
Manos Pitsidianakis <manos.pitsidianakis@...aro.org>,
Mathieu Poirier <mathieu.poirier@...aro.org>,
xen-devel@...ts.xenproject.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH] xen: privcmd: Add support for irqfd
On 12.07.23 10:48, Viresh Kumar wrote:
> Xen provides support for injecting interrupts to the guests via the
> HYPERVISOR_dm_op() hypercall. The same is used by the Virtio based
> device backend implementations, in an inefficient manner currently.
>
> Generally, the Virtio backends are implemented to work with the Eventfd
> based mechanism. In order to make such backends work with Xen, another
> software layer needs to poll the Eventfds and raise an interrupt to the
> guest using the Xen based mechanism. This results in an extra context
> switch.
>
> This is not a new problem in Linux though. It is present with other
> hypervisors like KVM, etc. as well. The generic solution implemented in
> the kernel for them is to provide an IOCTL call to pass the interrupt
> details and eventfd, which lets the kernel take care of polling the
> eventfd and raising of the interrupt, instead of handling this in user
> space (which involves an extra context switch).
>
> This patch adds support to inject a specific interrupt to guest using
> the eventfd mechanism, by preventing the extra context switch.
>
> Inspired by existing implementations for KVM, etc..
>
> Signed-off-by: Viresh Kumar <viresh.kumar@...aro.org>
> ---
> drivers/xen/privcmd.c | 285 ++++++++++++++++++++++++++++++++++++-
> include/uapi/xen/privcmd.h | 14 ++
> 2 files changed, 297 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
> index e2f580e30a86..e8096b09c113 100644
> --- a/drivers/xen/privcmd.c
> +++ b/drivers/xen/privcmd.c
> @@ -9,11 +9,16 @@
>
> #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
>
> +#include <linux/eventfd.h>
> +#include <linux/file.h>
> #include <linux/kernel.h>
> #include <linux/module.h>
> +#include <linux/mutex.h>
> +#include <linux/poll.h>
> #include <linux/sched.h>
> #include <linux/slab.h>
> #include <linux/string.h>
> +#include <linux/workqueue.h>
> #include <linux/errno.h>
> #include <linux/mm.h>
> #include <linux/mman.h>
> @@ -833,6 +838,266 @@ static long privcmd_ioctl_mmap_resource(struct file *file,
> return rc;
> }
>
> +/* Irqfd support */
> +static struct workqueue_struct *irqfd_cleanup_wq;
> +static DEFINE_MUTEX(irqfds_lock);
> +static LIST_HEAD(irqfds_list);
> +
> +struct privcmd_kernel_irqfd {
> + domid_t dom;
> + u8 level;
> + u32 irq;
> + struct eventfd_ctx *eventfd;
> + struct work_struct shutdown;
> + wait_queue_entry_t wait;
> + struct list_head list;
> + poll_table pt;
> +};
> +
> +/* From xen/include/public/hvm/dm_op.h */
> +#define XEN_DMOP_set_irq_level 19
> +
> +struct xen_dm_op_set_irq_level {
> + u32 irq;
> + /* IN - Level: 0 -> deasserted, 1 -> asserted */
> + u8 level;
> + u8 pad[3];
> +};
> +
> +struct xen_dm_op {
> + u32 op;
> + u32 pad;
> + union {
> + /*
> + * There are more structures here, we won't be using them, so
> + * can skip adding them here.
> + */
> + struct xen_dm_op_set_irq_level set_irq_level;
> + } u;
> +};
Instead of copying definitions over from Xen into privcmd.c, please just update
the related linux header include/xen/interface/dm_op.h from the Xen public
header.
> +
> +static void irqfd_deactivate(struct privcmd_kernel_irqfd *kirqfd)
> +{
> + lockdep_assert_held(&irqfds_lock);
> +
> + list_del_init(&kirqfd->list);
> + queue_work(irqfd_cleanup_wq, &kirqfd->shutdown);
> +}
> +
> +static void irqfd_shutdown(struct work_struct *work)
> +{
> + struct privcmd_kernel_irqfd *kirqfd =
> + container_of(work, struct privcmd_kernel_irqfd, shutdown);
> + u64 cnt;
> +
> + eventfd_ctx_remove_wait_queue(kirqfd->eventfd, &kirqfd->wait, &cnt);
> + eventfd_ctx_put(kirqfd->eventfd);
> + kfree(kirqfd);
> +}
> +
> +static void irqfd_inject(struct privcmd_kernel_irqfd *kirqfd)
> +{
> + struct xen_dm_op dm_op = {
> + .op = XEN_DMOP_set_irq_level,
> + .u.set_irq_level.irq = kirqfd->irq,
> + .u.set_irq_level.level = kirqfd->level,
> + };
> + struct xen_dm_op_buf xbufs = {
> + .size = sizeof(dm_op),
> + };
> + u64 cnt;
> +
> + eventfd_ctx_do_read(kirqfd->eventfd, &cnt);
> + set_xen_guest_handle(xbufs.h, &dm_op);
> +
> + xen_preemptible_hcall_begin();
> + HYPERVISOR_dm_op(kirqfd->dom, 1, &xbufs);
Please add some error handling, e.g. by issuing a message in case this hypercall
was failing. Adding a bool "error" to struct privcmd_kernel_irqfd in order to
avoid multiple error messages for the same device might be a good idea.
> + xen_preemptible_hcall_end();
> +}
> +
> +static int
> +irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
> +{
> + struct privcmd_kernel_irqfd *kirqfd =
> + container_of(wait, struct privcmd_kernel_irqfd, wait);
> + __poll_t flags = key_to_poll(key);
> +
> + if (flags & EPOLLIN)
> + irqfd_inject(kirqfd);
> +
> + if (flags & EPOLLHUP) {
> + mutex_lock(&irqfds_lock);
> + irqfd_deactivate(kirqfd);
> + mutex_unlock(&irqfds_lock);
> + }
> +
> + return 0;
> +}
> +
> +static void
> +irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
> +{
> + struct privcmd_kernel_irqfd *kirqfd =
> + container_of(pt, struct privcmd_kernel_irqfd, pt);
> +
> + add_wait_queue_priority(wqh, &kirqfd->wait);
> +}
> +
> +static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
> +{
> + struct privcmd_kernel_irqfd *kirqfd, *tmp;
> + struct eventfd_ctx *eventfd;
> + __poll_t events;
> + struct fd f;
> + int ret;
> +
> + kirqfd = kzalloc(sizeof(*kirqfd), GFP_KERNEL);
> + if (!kirqfd)
> + return -ENOMEM;
> +
> + kirqfd->irq = irqfd->irq;
> + kirqfd->dom = irqfd->dom;
> + kirqfd->level = irqfd->level;
> + INIT_LIST_HEAD(&kirqfd->list);
> + INIT_WORK(&kirqfd->shutdown, irqfd_shutdown);
> +
> + f = fdget(irqfd->fd);
> + if (!f.file) {
> + ret = -EBADF;
> + goto error_kfree;
> + }
> +
> + eventfd = eventfd_ctx_fileget(f.file);
> + if (IS_ERR(eventfd)) {
> + ret = PTR_ERR(eventfd);
> + goto error_fd_put;
> + }
> +
> + kirqfd->eventfd = eventfd;
> +
> + /*
> + * Install our own custom wake-up handling so we are notified via a
> + * callback whenever someone signals the underlying eventfd.
> + */
> + init_waitqueue_func_entry(&kirqfd->wait, irqfd_wakeup);
> + init_poll_funcptr(&kirqfd->pt, irqfd_poll_func);
> +
> + mutex_lock(&irqfds_lock);
> +
> + list_for_each_entry(tmp, &irqfds_list, list) {
> + if (kirqfd->eventfd == tmp->eventfd) {
> + ret = -EBUSY;
> + mutex_unlock(&irqfds_lock);
> + goto error_eventfd;
> + }
> + }
> +
> + list_add_tail(&kirqfd->list, &irqfds_list);
> + mutex_unlock(&irqfds_lock);
> +
> + /*
> + * Check if there was an event already pending on the eventfd before we
> + * registered, and trigger it as if we didn't miss it.
> + */
> + events = vfs_poll(f.file, &kirqfd->pt);
> + if (events & EPOLLIN)
> + irqfd_inject(kirqfd);
> +
> + /*
> + * Do not drop the file until the kirqfd is fully initialized, otherwise
> + * we might race against the EPOLLHUP.
> + */
> + fdput(f);
> + return 0;
> +
> +error_eventfd:
> + eventfd_ctx_put(eventfd);
> +
> +error_fd_put:
> + fdput(f);
> +
> +error_kfree:
> + kfree(kirqfd);
> + return ret;
> +}
> +
> +static int privcmd_irqfd_deassign(struct privcmd_irqfd *irqfd)
> +{
> + struct privcmd_kernel_irqfd *kirqfd, *tmp;
> + struct eventfd_ctx *eventfd;
> +
> + eventfd = eventfd_ctx_fdget(irqfd->fd);
> + if (IS_ERR(eventfd))
> + return PTR_ERR(eventfd);
> +
> + mutex_lock(&irqfds_lock);
> +
> + list_for_each_entry_safe(kirqfd, tmp, &irqfds_list, list) {
I don't think you need the safe variant here, as you will stop the loop in
case you are removing an entry.
> + if (kirqfd->eventfd == eventfd) {
> + irqfd_deactivate(kirqfd);
> + break;
> + }
> + }
> +
> + mutex_unlock(&irqfds_lock);
> +
> + eventfd_ctx_put(eventfd);
> +
> + /*
> + * Block until we know all outstanding shutdown jobs have completed so
> + * that we guarantee there will not be any more interrupts once this
> + * deassign function returns.
> + */
> + flush_workqueue(irqfd_cleanup_wq);
> +
> + return 0;
> +}
> +
> +static long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
> +{
> + struct privcmd_data *data = file->private_data;
> + struct privcmd_irqfd irqfd;
> +
> + if (copy_from_user(&irqfd, udata, sizeof(irqfd)))
> + return -EFAULT;
> +
> + /* No other flags should be set */
> + if (irqfd.flags & ~PRIVCMD_IRQFD_FLAG_DEASSIGN)
> + return -EINVAL;
> +
> + /* If restriction is in place, check the domid matches */
> + if (data->domid != DOMID_INVALID && data->domid != irqfd.dom)
> + return -EPERM;
> +
> + if (irqfd.flags & PRIVCMD_IRQFD_FLAG_DEASSIGN)
> + return privcmd_irqfd_deassign(&irqfd);
> +
> + return privcmd_irqfd_assign(&irqfd);
> +}
> +
> +static int privcmd_irqfd_init(void)
> +{
> + irqfd_cleanup_wq = alloc_workqueue("privcmd-irqfd-cleanup", 0, 0);
> + if (!irqfd_cleanup_wq)
> + return -ENOMEM;
> +
> + return 0;
> +}
> +
> +static void privcmd_irqfd_exit(void)
> +{
> + struct privcmd_kernel_irqfd *kirqfd, *tmp;
> +
> + mutex_lock(&irqfds_lock);
> +
> + list_for_each_entry_safe(kirqfd, tmp, &irqfds_list, list)
> + irqfd_deactivate(kirqfd);
> +
> + mutex_unlock(&irqfds_lock);
> +
> + destroy_workqueue(irqfd_cleanup_wq);
> +}
> +
> static long privcmd_ioctl(struct file *file,
> unsigned int cmd, unsigned long data)
> {
> @@ -868,6 +1133,10 @@ static long privcmd_ioctl(struct file *file,
> ret = privcmd_ioctl_mmap_resource(file, udata);
> break;
>
> + case IOCTL_PRIVCMD_IRQFD:
> + ret = privcmd_ioctl_irqfd(file, udata);
> + break;
> +
> default:
> break;
> }
> @@ -992,15 +1261,27 @@ static int __init privcmd_init(void)
> err = misc_register(&xen_privcmdbuf_dev);
> if (err != 0) {
> pr_err("Could not register Xen hypercall-buf device\n");
> - misc_deregister(&privcmd_dev);
> - return err;
> + goto err_privcmdbuf;
> + }
> +
> + err = privcmd_irqfd_init();
> + if (err != 0) {
> + pr_err("irqfd init failed\n");
> + goto err_irqfd;
> }
>
> return 0;
> +
> +err_irqfd:
> + misc_deregister(&xen_privcmdbuf_dev);
> +err_privcmdbuf:
> + misc_deregister(&privcmd_dev);
> + return err;
> }
>
> static void __exit privcmd_exit(void)
> {
> + privcmd_irqfd_exit();
> misc_deregister(&privcmd_dev);
> misc_deregister(&xen_privcmdbuf_dev);
> }
> diff --git a/include/uapi/xen/privcmd.h b/include/uapi/xen/privcmd.h
> index d2029556083e..47334bb91a09 100644
> --- a/include/uapi/xen/privcmd.h
> +++ b/include/uapi/xen/privcmd.h
> @@ -98,6 +98,18 @@ struct privcmd_mmap_resource {
> __u64 addr;
> };
>
> +/* For privcmd_irqfd::flags */
> +#define PRIVCMD_IRQFD_FLAG_DEASSIGN (1 << 0)
> +
> +struct privcmd_irqfd {
> + __u32 fd;
> + __u32 flags;
> + __u32 irq;
> + domid_t dom;
> + __u8 level;
> + __u8 pad;
> +};
> +
> /*
> * @cmd: IOCTL_PRIVCMD_HYPERCALL
> * @arg: &privcmd_hypercall_t
> @@ -125,5 +137,7 @@ struct privcmd_mmap_resource {
> _IOC(_IOC_NONE, 'P', 6, sizeof(domid_t))
> #define IOCTL_PRIVCMD_MMAP_RESOURCE \
> _IOC(_IOC_NONE, 'P', 7, sizeof(struct privcmd_mmap_resource))
> +#define IOCTL_PRIVCMD_IRQFD \
> + _IOC(_IOC_NONE, 'P', 8, sizeof(struct privcmd_irqfd))
>
> #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
Juergen
Download attachment "OpenPGP_0xB0DE9DD628BF132F.asc" of type "application/pgp-keys" (3099 bytes)
Download attachment "OpenPGP_signature" of type "application/pgp-signature" (496 bytes)
Powered by blists - more mailing lists