lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Date:   Tue, 22 Aug 2023 10:40:27 +0200
From:   Juergen Gross <jgross@...e.com>
To:     Viresh Kumar <viresh.kumar@...aro.org>,
        Stefano Stabellini <sstabellini@...nel.org>,
        Oleksandr Tyshchenko <oleksandr_tyshchenko@...m.com>
Cc:     Vincent Guittot <vincent.guittot@...aro.org>,
        Alex Bennée <alex.bennee@...aro.org>,
        stratos-dev@...lists.linaro.org,
        Erik Schilling <erik.schilling@...aro.org>,
        Manos Pitsidianakis <manos.pitsidianakis@...aro.org>,
        Mathieu Poirier <mathieu.poirier@...aro.org>,
        xen-devel@...ts.xenproject.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH V4] xen: privcmd: Add support for irqfd

On 25.07.23 12:57, Viresh Kumar wrote:
> Xen provides support for injecting interrupts to the guests via the
> HYPERVISOR_dm_op() hypercall. The same is used by the Virtio based
> device backend implementations, in an inefficient manner currently.
> 
> Generally, the Virtio backends are implemented to work with the Eventfd
> based mechanism. In order to make such backends work with Xen, another
> software layer needs to poll the Eventfds and raise an interrupt to the
> guest using the Xen based mechanism. This results in an extra context
> switch.
> 
> This is not a new problem in Linux though. It is present with other
> hypervisors like KVM, etc. as well. The generic solution implemented in
> the kernel for them is to provide an IOCTL call to pass the interrupt
> details and eventfd, which lets the kernel take care of polling the
> eventfd and raising of the interrupt, instead of handling this in user
> space (which involves an extra context switch).
> 
> This patch adds support to inject a specific interrupt to guest using
> the eventfd mechanism, by preventing the extra context switch.
> 
> Inspired by existing implementations for KVM, etc..
> 
> Signed-off-by: Viresh Kumar <viresh.kumar@...aro.org>
> ---
> V3->V4
> - Drop the imported definitions to hvm/dm_op.h.
> - Make the caller pass a pointer to pre-filled "struct xen_dm_op" instance and
>    get rid of irq and level fields.
> - Enable the irqfd feature under a new Kconfig entry.
> 
> V2->V3
> - Select EVENTFD from Kconfig
> 
> V1->V2:
> - Improve error handling.
> - Remove the unnecessary usage of list_for_each_entry_safe().
> - Restrict the use of XEN_DMOP_set_irq_level to only ARM64.
> - Import definitions from Xen to hvm/dm_op.h.
> 
>   drivers/xen/Kconfig        |   8 ++
>   drivers/xen/privcmd.c      | 286 ++++++++++++++++++++++++++++++++++++-
>   include/uapi/xen/privcmd.h |  14 ++
>   3 files changed, 306 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
> index d5d7c402b651..c7fabfab4c20 100644
> --- a/drivers/xen/Kconfig
> +++ b/drivers/xen/Kconfig
> @@ -269,6 +269,14 @@ config XEN_PRIVCMD
>   	  disaggregated Xen setups this driver might be needed for other
>   	  domains, too.
>   
> +config XEN_PRIVCMD_IRQFD
> +	bool "Xen irqfd support"
> +	depends on XEN_PRIVCMD && XEN_VIRTIO && EVENTFD
> +	default m

"m"? I guess you meant "n"? In this case the "default" line should just
be dropped.

> +	help
> +	  irqfd is a mechanism to inject a specific interrupt to a User VM using
> +	  a decoupled eventfd mechanism.

I think the help text should mention "virtio".

What about:

"Using the irqfd mechanism a virtio backend running in a daemon can speed
  up interrupt injection into a guest."

> +
>   config XEN_ACPI_PROCESSOR
>   	tristate "Xen ACPI processor"
>   	depends on XEN && XEN_PV_DOM0 && X86 && ACPI_PROCESSOR && CPU_FREQ
> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
> index e2f580e30a86..584c8de56c5e 100644
> --- a/drivers/xen/privcmd.c
> +++ b/drivers/xen/privcmd.c
> @@ -9,11 +9,16 @@
>   
>   #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
>   
> +#include <linux/eventfd.h>
> +#include <linux/file.h>
>   #include <linux/kernel.h>
>   #include <linux/module.h>
> +#include <linux/mutex.h>
> +#include <linux/poll.h>
>   #include <linux/sched.h>
>   #include <linux/slab.h>
>   #include <linux/string.h>
> +#include <linux/workqueue.h>
>   #include <linux/errno.h>
>   #include <linux/mm.h>
>   #include <linux/mman.h>
> @@ -833,6 +838,267 @@ static long privcmd_ioctl_mmap_resource(struct file *file,
>   	return rc;
>   }
>   
> +#ifdef CONFIG_XEN_PRIVCMD_IRQFD
> +/* Irqfd support */
> +static struct workqueue_struct *irqfd_cleanup_wq;
> +static DEFINE_MUTEX(irqfds_lock);
> +static LIST_HEAD(irqfds_list);
> +
> +struct privcmd_kernel_irqfd {
> +	struct xen_dm_op_buf xbufs;
> +	domid_t dom;
> +	bool error;
> +	struct eventfd_ctx *eventfd;
> +	struct work_struct shutdown;
> +	wait_queue_entry_t wait;
> +	struct list_head list;
> +	poll_table pt;
> +};
> +
> +static void irqfd_deactivate(struct privcmd_kernel_irqfd *kirqfd)
> +{
> +	lockdep_assert_held(&irqfds_lock);
> +
> +	list_del_init(&kirqfd->list);
> +	queue_work(irqfd_cleanup_wq, &kirqfd->shutdown);
> +}
> +
> +static void irqfd_shutdown(struct work_struct *work)
> +{
> +	struct privcmd_kernel_irqfd *kirqfd =
> +		container_of(work, struct privcmd_kernel_irqfd, shutdown);
> +	u64 cnt;
> +
> +	eventfd_ctx_remove_wait_queue(kirqfd->eventfd, &kirqfd->wait, &cnt);
> +	eventfd_ctx_put(kirqfd->eventfd);
> +	kfree(kirqfd);
> +}
> +
> +static void irqfd_inject(struct privcmd_kernel_irqfd *kirqfd)
> +{
> +	u64 cnt;
> +	long rc;
> +
> +	eventfd_ctx_do_read(kirqfd->eventfd, &cnt);
> +
> +	xen_preemptible_hcall_begin();
> +	rc = HYPERVISOR_dm_op(kirqfd->dom, 1, &kirqfd->xbufs);
> +	xen_preemptible_hcall_end();
> +
> +	/* Don't repeat the error message for consecutive failures */
> +	if (rc && !kirqfd->error) {
> +		pr_err("Failed to configure irq for guest domain: %d\n",
> +		       kirqfd->dom);
> +	}
> +
> +	kirqfd->error = !!rc;

No need for the "!!".

> +}
> +
> +static int
> +irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key)
> +{
> +	struct privcmd_kernel_irqfd *kirqfd =
> +		container_of(wait, struct privcmd_kernel_irqfd, wait);
> +	__poll_t flags = key_to_poll(key);
> +
> +	if (flags & EPOLLIN)
> +		irqfd_inject(kirqfd);
> +
> +	if (flags & EPOLLHUP) {
> +		mutex_lock(&irqfds_lock);
> +		irqfd_deactivate(kirqfd);
> +		mutex_unlock(&irqfds_lock);
> +	}
> +
> +	return 0;
> +}
> +
> +static void
> +irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
> +{
> +	struct privcmd_kernel_irqfd *kirqfd =
> +		container_of(pt, struct privcmd_kernel_irqfd, pt);
> +
> +	add_wait_queue_priority(wqh, &kirqfd->wait);
> +}
> +
> +static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
> +{
> +	struct privcmd_kernel_irqfd *kirqfd, *tmp;
> +	struct eventfd_ctx *eventfd;
> +	__poll_t events;
> +	struct fd f;
> +	void *dm_op;
> +	int ret;
> +
> +	kirqfd = kzalloc(sizeof(*kirqfd) + irqfd->size, GFP_KERNEL);
> +	if (!kirqfd)
> +		return -ENOMEM;
> +	dm_op = kirqfd + 1;
> +
> +	if (copy_from_user(dm_op, irqfd->dm_op, irqfd->size)) {
> +		ret = -EFAULT;
> +		goto error_kfree;
> +	}
> +
> +	kirqfd->xbufs.size = irqfd->size;
> +	set_xen_guest_handle(kirqfd->xbufs.h, dm_op);
> +	kirqfd->dom = irqfd->dom;
> +	INIT_LIST_HEAD(&kirqfd->list);

I don't think INIT_LIST_HEAD() is needed here. list_add_tail() doesn't
require that.

> +	INIT_WORK(&kirqfd->shutdown, irqfd_shutdown);
> +
> +	f = fdget(irqfd->fd);
> +	if (!f.file) {
> +		ret = -EBADF;
> +		goto error_kfree;
> +	}
> +
> +	eventfd = eventfd_ctx_fileget(f.file);
> +	if (IS_ERR(eventfd)) {
> +		ret = PTR_ERR(eventfd);
> +		goto error_fd_put;
> +	}
> +
> +	kirqfd->eventfd = eventfd;

I don't see why you need the local eventfd variable.

> +
> +	/*
> +	 * Install our own custom wake-up handling so we are notified via a
> +	 * callback whenever someone signals the underlying eventfd.
> +	 */
> +	init_waitqueue_func_entry(&kirqfd->wait, irqfd_wakeup);
> +	init_poll_funcptr(&kirqfd->pt, irqfd_poll_func);
> +
> +	mutex_lock(&irqfds_lock);
> +
> +	list_for_each_entry(tmp, &irqfds_list, list) {
> +		if (kirqfd->eventfd == tmp->eventfd) {
> +			ret = -EBUSY;
> +			mutex_unlock(&irqfds_lock);
> +			goto error_eventfd;
> +		}
> +	}
> +
> +	list_add_tail(&kirqfd->list, &irqfds_list);
> +	mutex_unlock(&irqfds_lock);
> +
> +	/*
> +	 * Check if there was an event already pending on the eventfd before we
> +	 * registered, and trigger it as if we didn't miss it.
> +	 */
> +	events = vfs_poll(f.file, &kirqfd->pt);
> +	if (events & EPOLLIN)
> +		irqfd_inject(kirqfd);
> +
> +	/*
> +	 * Do not drop the file until the kirqfd is fully initialized, otherwise
> +	 * we might race against the EPOLLHUP.
> +	 */
> +	fdput(f);
> +	return 0;
> +
> +error_eventfd:
> +	eventfd_ctx_put(eventfd);
> +
> +error_fd_put:
> +	fdput(f);
> +
> +error_kfree:
> +	kfree(kirqfd);
> +	return ret;
> +}

Juergen


Download attachment "OpenPGP_0xB0DE9DD628BF132F.asc" of type "application/pgp-keys" (3099 bytes)

Download attachment "OpenPGP_signature.asc" of type "application/pgp-signature" (496 bytes)

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ