lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <a5521a9d-72c6-4e03-0fbb-8a37418c32f2@suse.com>
Date:   Thu, 13 Jul 2023 09:44:47 +0200
From:   Juergen Gross <jgross@...e.com>
To:     Viresh Kumar <viresh.kumar@...aro.org>,
        Stefano Stabellini <sstabellini@...nel.org>,
        Oleksandr Tyshchenko <oleksandr_tyshchenko@...m.com>
Cc:     Vincent Guittot <vincent.guittot@...aro.org>,
        Alex Bennée <alex.bennee@...aro.org>,
        stratos-dev@...lists.linaro.org,
        Erik Schilling <erik.schilling@...aro.org>,
        Manos Pitsidianakis <manos.pitsidianakis@...aro.org>,
        Mathieu Poirier <mathieu.poirier@...aro.org>,
        xen-devel@...ts.xenproject.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH] xen: privcmd: Add support for irqfd

On 12.07.23 10:48, Viresh Kumar wrote:
> Xen provides support for injecting interrupts to the guests via the
> HYPERVISOR_dm_op() hypercall. The same is used by the Virtio based
> device backend implementations, in an inefficient manner currently.
> 
> Generally, the Virtio backends are implemented to work with the Eventfd
> based mechanism. In order to make such backends work with Xen, another
> software layer needs to poll the Eventfds and raise an interrupt to the
> guest using the Xen based mechanism. This results in an extra context
> switch.
> 
> This is not a new problem in Linux though. It is present with other
> hypervisors like KVM, etc. as well. The generic solution implemented in
> the kernel for them is to provide an IOCTL call to pass the interrupt
> details and eventfd, which lets the kernel take care of polling the
> eventfd and raising of the interrupt, instead of handling this in user
> space (which involves an extra context switch).
> 
> This patch adds support to inject a specific interrupt to guest using
> the eventfd mechanism, by preventing the extra context switch.
> 
> Inspired by existing implementations for KVM, etc..
> 
> Signed-off-by: Viresh Kumar <viresh.kumar@...aro.org>
> ---
>   drivers/xen/privcmd.c      | 285 ++++++++++++++++++++++++++++++++++++-
>   include/uapi/xen/privcmd.h |  14 ++
>   2 files changed, 297 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
> index e2f580e30a86..e8096b09c113 100644
> --- a/drivers/xen/privcmd.c
> +++ b/drivers/xen/privcmd.c
> @@ -9,11 +9,16 @@
>   
>   #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
>   
> +#include <linux/eventfd.h>
> +#include <linux/file.h>
>   #include <linux/kernel.h>
>   #include <linux/module.h>
> +#include <linux/mutex.h>
> +#include <linux/poll.h>
>   #include <linux/sched.h>
>   #include <linux/slab.h>
>   #include <linux/string.h>
> +#include <linux/workqueue.h>
>   #include <linux/errno.h>
>   #include <linux/mm.h>
>   #include <linux/mman.h>
> @@ -833,6 +838,266 @@ static long privcmd_ioctl_mmap_resource(struct file *file,
>   	return rc;
>   }
>   
> +/* Irqfd support */
> +static struct workqueue_struct *irqfd_cleanup_wq;
> +static DEFINE_MUTEX(irqfds_lock);
> +static LIST_HEAD(irqfds_list);
> +
> +struct privcmd_kernel_irqfd {
> +	domid_t dom;
> +	u8 level;
> +	u32 irq;
> +	struct eventfd_ctx *eventfd;
> +	struct work_struct shutdown;
> +	wait_queue_entry_t wait;
> +	struct list_head list;
> +	poll_table pt;
> +};
> +
> +/* From xen/include/public/hvm/dm_op.h */
> +#define XEN_DMOP_set_irq_level 19
> +
> +struct xen_dm_op_set_irq_level {
> +	u32 irq;
> +	/* IN - Level: 0 -> deasserted, 1 -> asserted */
> +	u8 level;
> +	u8 pad[3];
> +};
> +
> +struct xen_dm_op {
> +	u32 op;
> +	u32 pad;
> +	union {
> +		/*
> +		 * There are more structures here, we won't be using them, so
> +		 * can skip adding them here.
> +		 */
> +		struct xen_dm_op_set_irq_level set_irq_level;
> +	} u;
> +};

Instead of copying definitions over from Xen into privcmd.c, please just update
the related linux header include/xen/interface/dm_op.h from the Xen public
header.

> +
> +static void irqfd_deactivate(struct privcmd_kernel_irqfd *kirqfd)
> +{
> +	lockdep_assert_held(&irqfds_lock);
> +
> +	list_del_init(&kirqfd->list);
> +	queue_work(irqfd_cleanup_wq, &kirqfd->shutdown);
> +}
> +
> +static void irqfd_shutdown(struct work_struct *work)
> +{
> +	struct privcmd_kernel_irqfd *kirqfd =
> +		container_of(work, struct privcmd_kernel_irqfd, shutdown);
> +	u64 cnt;
> +
> +	eventfd_ctx_remove_wait_queue(kirqfd->eventfd, &kirqfd->wait, &cnt);
> +	eventfd_ctx_put(kirqfd->eventfd);
> +	kfree(kirqfd);
> +}
> +
> +static void irqfd_inject(struct privcmd_kernel_irqfd *kirqfd)
> +{
> +	struct xen_dm_op dm_op = {
> +		.op = XEN_DMOP_set_irq_level,
> +		.u.set_irq_level.irq = kirqfd->irq,
> +		.u.set_irq_level.level = kirqfd->level,
> +	};
> +	struct xen_dm_op_buf xbufs = {
> +		.size = sizeof(dm_op),
> +	};
> +	u64 cnt;
> +
> +	eventfd_ctx_do_read(kirqfd->eventfd, &cnt);
> +	set_xen_guest_handle(xbufs.h, &dm_op);
> +
> +	xen_preemptible_hcall_begin();
> +	HYPERVISOR_dm_op(kirqfd->dom, 1, &xbufs);

Please add some error handling, e.g. by issuing a message in case this hypercall
was failing. Adding a bool "error" to struct privcmd_kernel_irqfd in order to
avoid multiple error messages for the same device might be a good idea.

> +	xen_preemptible_hcall_end();
> +}
> +
> +static int
> +irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
> +{
> +	struct privcmd_kernel_irqfd *kirqfd =
> +		container_of(wait, struct privcmd_kernel_irqfd, wait);
> +	__poll_t flags = key_to_poll(key);
> +
> +	if (flags & EPOLLIN)
> +		irqfd_inject(kirqfd);
> +
> +	if (flags & EPOLLHUP) {
> +		mutex_lock(&irqfds_lock);
> +		irqfd_deactivate(kirqfd);
> +		mutex_unlock(&irqfds_lock);
> +	}
> +
> +	return 0;
> +}
> +
> +static void
> +irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
> +{
> +	struct privcmd_kernel_irqfd *kirqfd =
> +		container_of(pt, struct privcmd_kernel_irqfd, pt);
> +
> +	add_wait_queue_priority(wqh, &kirqfd->wait);
> +}
> +
> +static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
> +{
> +	struct privcmd_kernel_irqfd *kirqfd, *tmp;
> +	struct eventfd_ctx *eventfd;
> +	__poll_t events;
> +	struct fd f;
> +	int ret;
> +
> +	kirqfd = kzalloc(sizeof(*kirqfd), GFP_KERNEL);
> +	if (!kirqfd)
> +		return -ENOMEM;
> +
> +	kirqfd->irq = irqfd->irq;
> +	kirqfd->dom = irqfd->dom;
> +	kirqfd->level = irqfd->level;
> +	INIT_LIST_HEAD(&kirqfd->list);
> +	INIT_WORK(&kirqfd->shutdown, irqfd_shutdown);
> +
> +	f = fdget(irqfd->fd);
> +	if (!f.file) {
> +		ret = -EBADF;
> +		goto error_kfree;
> +	}
> +
> +	eventfd = eventfd_ctx_fileget(f.file);
> +	if (IS_ERR(eventfd)) {
> +		ret = PTR_ERR(eventfd);
> +		goto error_fd_put;
> +	}
> +
> +	kirqfd->eventfd = eventfd;
> +
> +	/*
> +	 * Install our own custom wake-up handling so we are notified via a
> +	 * callback whenever someone signals the underlying eventfd.
> +	 */
> +	init_waitqueue_func_entry(&kirqfd->wait, irqfd_wakeup);
> +	init_poll_funcptr(&kirqfd->pt, irqfd_poll_func);
> +
> +	mutex_lock(&irqfds_lock);
> +
> +	list_for_each_entry(tmp, &irqfds_list, list) {
> +		if (kirqfd->eventfd == tmp->eventfd) {
> +			ret = -EBUSY;
> +			mutex_unlock(&irqfds_lock);
> +			goto error_eventfd;
> +		}
> +	}
> +
> +	list_add_tail(&kirqfd->list, &irqfds_list);
> +	mutex_unlock(&irqfds_lock);
> +
> +	/*
> +	 * Check if there was an event already pending on the eventfd before we
> +	 * registered, and trigger it as if we didn't miss it.
> +	 */
> +	events = vfs_poll(f.file, &kirqfd->pt);
> +	if (events & EPOLLIN)
> +		irqfd_inject(kirqfd);
> +
> +	/*
> +	 * Do not drop the file until the kirqfd is fully initialized, otherwise
> +	 * we might race against the EPOLLHUP.
> +	 */
> +	fdput(f);
> +	return 0;
> +
> +error_eventfd:
> +	eventfd_ctx_put(eventfd);
> +
> +error_fd_put:
> +	fdput(f);
> +
> +error_kfree:
> +	kfree(kirqfd);
> +	return ret;
> +}
> +
> +static int privcmd_irqfd_deassign(struct privcmd_irqfd *irqfd)
> +{
> +	struct privcmd_kernel_irqfd *kirqfd, *tmp;
> +	struct eventfd_ctx *eventfd;
> +
> +	eventfd = eventfd_ctx_fdget(irqfd->fd);
> +	if (IS_ERR(eventfd))
> +		return PTR_ERR(eventfd);
> +
> +	mutex_lock(&irqfds_lock);
> +
> +	list_for_each_entry_safe(kirqfd, tmp, &irqfds_list, list) {

I don't think you need the safe variant here, as you will stop the loop in
case you are removing an entry.

> +		if (kirqfd->eventfd == eventfd) {
> +			irqfd_deactivate(kirqfd);
> +			break;
> +		}
> +	}
> +
> +	mutex_unlock(&irqfds_lock);
> +
> +	eventfd_ctx_put(eventfd);
> +
> +	/*
> +	 * Block until we know all outstanding shutdown jobs have completed so
> +	 * that we guarantee there will not be any more interrupts once this
> +	 * deassign function returns.
> +	 */
> +	flush_workqueue(irqfd_cleanup_wq);
> +
> +	return 0;
> +}
> +
> +static long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
> +{
> +	struct privcmd_data *data = file->private_data;
> +	struct privcmd_irqfd irqfd;
> +
> +	if (copy_from_user(&irqfd, udata, sizeof(irqfd)))
> +		return -EFAULT;
> +
> +	/* No other flags should be set */
> +	if (irqfd.flags & ~PRIVCMD_IRQFD_FLAG_DEASSIGN)
> +		return -EINVAL;
> +
> +	/* If restriction is in place, check the domid matches */
> +	if (data->domid != DOMID_INVALID && data->domid != irqfd.dom)
> +		return -EPERM;
> +
> +	if (irqfd.flags & PRIVCMD_IRQFD_FLAG_DEASSIGN)
> +		return privcmd_irqfd_deassign(&irqfd);
> +
> +	return privcmd_irqfd_assign(&irqfd);
> +}
> +
> +static int privcmd_irqfd_init(void)
> +{
> +	irqfd_cleanup_wq = alloc_workqueue("privcmd-irqfd-cleanup", 0, 0);
> +	if (!irqfd_cleanup_wq)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
> +static void privcmd_irqfd_exit(void)
> +{
> +	struct privcmd_kernel_irqfd *kirqfd, *tmp;
> +
> +	mutex_lock(&irqfds_lock);
> +
> +	list_for_each_entry_safe(kirqfd, tmp, &irqfds_list, list)
> +		irqfd_deactivate(kirqfd);
> +
> +	mutex_unlock(&irqfds_lock);
> +
> +	destroy_workqueue(irqfd_cleanup_wq);
> +}
> +
>   static long privcmd_ioctl(struct file *file,
>   			  unsigned int cmd, unsigned long data)
>   {
> @@ -868,6 +1133,10 @@ static long privcmd_ioctl(struct file *file,
>   		ret = privcmd_ioctl_mmap_resource(file, udata);
>   		break;
>   
> +	case IOCTL_PRIVCMD_IRQFD:
> +		ret = privcmd_ioctl_irqfd(file, udata);
> +		break;
> +
>   	default:
>   		break;
>   	}
> @@ -992,15 +1261,27 @@ static int __init privcmd_init(void)
>   	err = misc_register(&xen_privcmdbuf_dev);
>   	if (err != 0) {
>   		pr_err("Could not register Xen hypercall-buf device\n");
> -		misc_deregister(&privcmd_dev);
> -		return err;
> +		goto err_privcmdbuf;
> +	}
> +
> +	err = privcmd_irqfd_init();
> +	if (err != 0) {
> +		pr_err("irqfd init failed\n");
> +		goto err_irqfd;
>   	}
>   
>   	return 0;
> +
> +err_irqfd:
> +	misc_deregister(&xen_privcmdbuf_dev);
> +err_privcmdbuf:
> +	misc_deregister(&privcmd_dev);
> +	return err;
>   }
>   
>   static void __exit privcmd_exit(void)
>   {
> +	privcmd_irqfd_exit();
>   	misc_deregister(&privcmd_dev);
>   	misc_deregister(&xen_privcmdbuf_dev);
>   }
> diff --git a/include/uapi/xen/privcmd.h b/include/uapi/xen/privcmd.h
> index d2029556083e..47334bb91a09 100644
> --- a/include/uapi/xen/privcmd.h
> +++ b/include/uapi/xen/privcmd.h
> @@ -98,6 +98,18 @@ struct privcmd_mmap_resource {
>   	__u64 addr;
>   };
>   
> +/* For privcmd_irqfd::flags */
> +#define PRIVCMD_IRQFD_FLAG_DEASSIGN (1 << 0)
> +
> +struct privcmd_irqfd {
> +	__u32 fd;
> +	__u32 flags;
> +	__u32 irq;
> +	domid_t dom;
> +	__u8 level;
> +	__u8 pad;
> +};
> +
>   /*
>    * @cmd: IOCTL_PRIVCMD_HYPERCALL
>    * @arg: &privcmd_hypercall_t
> @@ -125,5 +137,7 @@ struct privcmd_mmap_resource {
>   	_IOC(_IOC_NONE, 'P', 6, sizeof(domid_t))
>   #define IOCTL_PRIVCMD_MMAP_RESOURCE				\
>   	_IOC(_IOC_NONE, 'P', 7, sizeof(struct privcmd_mmap_resource))
> +#define IOCTL_PRIVCMD_IRQFD					\
> +	_IOC(_IOC_NONE, 'P', 8, sizeof(struct privcmd_irqfd))
>   
>   #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */


Juergen

Download attachment "OpenPGP_0xB0DE9DD628BF132F.asc" of type "application/pgp-keys" (3099 bytes)

Download attachment "OpenPGP_signature" of type "application/pgp-signature" (496 bytes)

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ