linux-kernel - Re: [PATCH RFC] vhost: convert pre sorted vhost memory array to interval tree

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20160119162406.7f0c2008@nial.brq.redhat.com>
Date:	Tue, 19 Jan 2016 16:24:06 +0100
From:	Igor Mammedov <imammedo@...hat.com>
To:	Jason Wang <jasowang@...hat.com>
Cc:	mst@...hat.com, kvm@...r.kernel.org,
	virtualization@...ts.linux-foundation.org, netdev@...r.kernel.org,
	linux-kernel@...r.kernel.org
Subject: Re: [PATCH RFC] vhost: convert pre sorted vhost memory array to
 interval tree

On Mon, 18 Jan 2016 10:42:29 +0800
Jason Wang <jasowang@...hat.com> wrote:

> Current pre-sorted memory region array has some limitations for future
> device IOTLB conversion:
> 
> 1) need extra work for adding and removing a single region, and it's
>    expected to be slow because of sorting or memory re-allocation.
> 2) need extra work of removing a large range which may intersect
>    several regions with different size.
> 3) need trick for a replacement policy like LRU
> 
> To overcome the above shortcomings, this patch convert it to interval
> tree which can easily address the above issue with almost no extra
> work.
> 
> The patch could be used for:
> 
> - Extend the current API and only let the userspace to send diffs of
>   memory table.
> - Simplify Device IOTLB implementation.

I'm curios how performance changes on translate_desc() hot-path in
default case and in the case of 256 memory regions?

> 
> Signed-off-by: Jason Wang <jasowang@...hat.com>
> ---
>  drivers/vhost/net.c   |   8 +--
>  drivers/vhost/vhost.c | 179 +++++++++++++++++++++++++++-----------------------
>  drivers/vhost/vhost.h |  27 ++++++--
>  3 files changed, 125 insertions(+), 89 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 9eda69e..481db96 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -968,20 +968,20 @@ static long vhost_net_reset_owner(struct vhost_net *n)
>  	struct socket *tx_sock = NULL;
>  	struct socket *rx_sock = NULL;
>  	long err;
> -	struct vhost_memory *memory;
> +	struct vhost_umem *umem;
>  
>  	mutex_lock(&n->dev.mutex);
>  	err = vhost_dev_check_owner(&n->dev);
>  	if (err)
>  		goto done;
> -	memory = vhost_dev_reset_owner_prepare();
> -	if (!memory) {
> +	umem = vhost_dev_reset_owner_prepare();
> +	if (!umem) {
>  		err = -ENOMEM;
>  		goto done;
>  	}
>  	vhost_net_stop(n, &tx_sock, &rx_sock);
>  	vhost_net_flush(n);
> -	vhost_dev_reset_owner(&n->dev, memory);
> +	vhost_dev_reset_owner(&n->dev, umem);
>  	vhost_net_vq_reset(n);
>  done:
>  	mutex_unlock(&n->dev.mutex);
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index ad2146a..851dce8 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -27,6 +27,7 @@
>  #include <linux/cgroup.h>
>  #include <linux/module.h>
>  #include <linux/sort.h>
> +#include <linux/interval_tree_generic.h>
>  
>  #include "vhost.h"
>  
> @@ -42,6 +43,10 @@ enum {
>  #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
>  #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
>  
> +INTERVAL_TREE_DEFINE(struct vhost_umem_node,
> +		     rb, __u64, __subtree_last,
> +		     START, LAST, , vhost_umem_interval_tree);
> +
>  #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
>  static void vhost_vq_reset_user_be(struct vhost_virtqueue *vq)
>  {
> @@ -275,7 +280,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>  	vq->call_ctx = NULL;
>  	vq->call = NULL;
>  	vq->log_ctx = NULL;
> -	vq->memory = NULL;
> +	vq->umem = NULL;
>  	vq->is_le = virtio_legacy_is_little_endian();
>  	vhost_vq_reset_user_be(vq);
>  }
> @@ -381,7 +386,7 @@ void vhost_dev_init(struct vhost_dev *dev,
>  	mutex_init(&dev->mutex);
>  	dev->log_ctx = NULL;
>  	dev->log_file = NULL;
> -	dev->memory = NULL;
> +	dev->umem = NULL;
>  	dev->mm = NULL;
>  	spin_lock_init(&dev->work_lock);
>  	INIT_LIST_HEAD(&dev->work_list);
> @@ -486,27 +491,36 @@ err_mm:
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_set_owner);
>  
> -struct vhost_memory *vhost_dev_reset_owner_prepare(void)
> +static void *vhost_kvzalloc(unsigned long size)
> +{
> +	void *n = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
> +
> +	if (!n)
> +		n = vzalloc(size);
> +	return n;
> +}
> +
> +struct vhost_umem *vhost_dev_reset_owner_prepare(void)
>  {
> -	return kmalloc(offsetof(struct vhost_memory, regions), GFP_KERNEL);
> +	return vhost_kvzalloc(sizeof(struct vhost_umem));
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare);
>  
>  /* Caller should have device mutex */
> -void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_memory *memory)
> +void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_umem *umem)
>  {
>  	int i;
>  
>  	vhost_dev_cleanup(dev, true);
>  
>  	/* Restore memory to default empty mapping. */
> -	memory->nregions = 0;
> -	dev->memory = memory;
> +	INIT_LIST_HEAD(&umem->umem_list);
> +	dev->umem = umem;
>  	/* We don't need VQ locks below since vhost_dev_cleanup makes sure
>  	 * VQs aren't running.
>  	 */
>  	for (i = 0; i < dev->nvqs; ++i)
> -		dev->vqs[i]->memory = memory;
> +		dev->vqs[i]->umem = umem;
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_reset_owner);
>  
> @@ -523,6 +537,19 @@ void vhost_dev_stop(struct vhost_dev *dev)
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_stop);
>  
> +static void vhost_umem_clean(struct vhost_dev *dev)
> +{
> +	struct vhost_umem_node *node, *tmp;
> +	struct vhost_umem *umem = dev->umem;
> +
> +	list_for_each_entry_safe(node, tmp, &umem->umem_list, link) {
> +		vhost_umem_interval_tree_remove(node, &umem->umem_tree);
> +		list_del(&node->link);
> +		kvfree(node);
> +	}
> +	kvfree(umem);
> +}
> +
>  /* Caller should have device mutex if and only if locked is set */
>  void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
>  {
> @@ -549,8 +576,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
>  		fput(dev->log_file);
>  	dev->log_file = NULL;
>  	/* No one will access memory at this point */
> -	kvfree(dev->memory);
> -	dev->memory = NULL;
> +	vhost_umem_clean(dev);
> +	dev->umem = NULL;
>  	WARN_ON(!list_empty(&dev->work_list));
>  	if (dev->worker) {
>  		kthread_stop(dev->worker);
> @@ -576,25 +603,25 @@ static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
>  }
>  
>  /* Caller should have vq mutex and device mutex. */
> -static int vq_memory_access_ok(void __user *log_base, struct vhost_memory *mem,
> +static int vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem,
>  			       int log_all)
>  {
> -	int i;
> +	struct vhost_umem_node *node;
>  
> -	if (!mem)
> +	if (!umem)
>  		return 0;
>  
> -	for (i = 0; i < mem->nregions; ++i) {
> -		struct vhost_memory_region *m = mem->regions + i;
> -		unsigned long a = m->userspace_addr;
> -		if (m->memory_size > ULONG_MAX)
> +	list_for_each_entry(node, &umem->umem_list, link) {
> +		unsigned long a = node->userspace_addr;
> +
> +		if (node->size > ULONG_MAX)
>  			return 0;
>  		else if (!access_ok(VERIFY_WRITE, (void __user *)a,
> -				    m->memory_size))
> +				    node->size))
>  			return 0;
>  		else if (log_all && !log_access_ok(log_base,
> -						   m->guest_phys_addr,
> -						   m->memory_size))
> +						   node->start,
> +						   node->size))
>  			return 0;
>  	}
>  	return 1;
> @@ -602,7 +629,7 @@ static int vq_memory_access_ok(void __user *log_base, struct vhost_memory *mem,
>  
>  /* Can we switch to this memory table? */
>  /* Caller should have device mutex but not vq mutex */
> -static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem,
> +static int memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem,
>  			    int log_all)
>  {
>  	int i;
> @@ -615,7 +642,8 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem,
>  		log = log_all || vhost_has_feature(d->vqs[i], VHOST_F_LOG_ALL);
>  		/* If ring is inactive, will check when it's enabled. */
>  		if (d->vqs[i]->private_data)
> -			ok = vq_memory_access_ok(d->vqs[i]->log_base, mem, log);
> +			ok = vq_memory_access_ok(d->vqs[i]->log_base,
> +						 umem, log);
>  		else
>  			ok = 1;
>  		mutex_unlock(&d->vqs[i]->mutex);
> @@ -642,7 +670,7 @@ static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
>  /* Caller should have device mutex but not vq mutex */
>  int vhost_log_access_ok(struct vhost_dev *dev)
>  {
> -	return memory_access_ok(dev, dev->memory, 1);
> +	return memory_access_ok(dev, dev->umem, 1);
>  }
>  EXPORT_SYMBOL_GPL(vhost_log_access_ok);
>  
> @@ -653,7 +681,7 @@ static int vq_log_access_ok(struct vhost_virtqueue *vq,
>  {
>  	size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
>  
> -	return vq_memory_access_ok(log_base, vq->memory,
> +	return vq_memory_access_ok(log_base, vq->umem,
>  				   vhost_has_feature(vq, VHOST_F_LOG_ALL)) &&
>  		(!vq->log_used || log_access_ok(log_base, vq->log_addr,
>  					sizeof *vq->used +
> @@ -669,28 +697,12 @@ int vhost_vq_access_ok(struct vhost_virtqueue *vq)
>  }
>  EXPORT_SYMBOL_GPL(vhost_vq_access_ok);
>  
> -static int vhost_memory_reg_sort_cmp(const void *p1, const void *p2)
> -{
> -	const struct vhost_memory_region *r1 = p1, *r2 = p2;
> -	if (r1->guest_phys_addr < r2->guest_phys_addr)
> -		return 1;
> -	if (r1->guest_phys_addr > r2->guest_phys_addr)
> -		return -1;
> -	return 0;
> -}
> -
> -static void *vhost_kvzalloc(unsigned long size)
> -{
> -	void *n = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
> -
> -	if (!n)
> -		n = vzalloc(size);
> -	return n;
> -}
> -
>  static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
>  {
> -	struct vhost_memory mem, *newmem, *oldmem;
> +	struct vhost_memory mem, *newmem;
> +	struct vhost_memory_region *region;
> +	struct vhost_umem_node *node;
> +	struct vhost_umem *newumem, *oldumem;
>  	unsigned long size = offsetof(struct vhost_memory, regions);
>  	int i;
>  
> @@ -710,24 +722,51 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
>  		kvfree(newmem);
>  		return -EFAULT;
>  	}
> -	sort(newmem->regions, newmem->nregions, sizeof(*newmem->regions),
> -		vhost_memory_reg_sort_cmp, NULL);
>  
> -	if (!memory_access_ok(d, newmem, 0)) {
> +	newumem = vhost_kvzalloc(sizeof(*newumem));
> +	if (!newumem) {
>  		kvfree(newmem);
> -		return -EFAULT;
> +		return -ENOMEM;
> +	}
> +
> +	newumem->umem_tree = RB_ROOT;
> +	INIT_LIST_HEAD(&newumem->umem_list);
> +
> +	for (region = newmem->regions;
> +	     region < newmem->regions + mem.nregions;
> +	     region++) {
> +		node = vhost_kvzalloc(sizeof(*node));
> +		if (!node)
> +			goto err;
> +		node->start = region->guest_phys_addr;
> +		node->size = region->memory_size;
> +		node->last = node->start + node->size - 1;
> +		node->userspace_addr = region->userspace_addr;
> +		INIT_LIST_HEAD(&node->link);
> +		list_add_tail(&node->link, &newumem->umem_list);
> +		vhost_umem_interval_tree_insert(node, &newumem->umem_tree);
>  	}
> -	oldmem = d->memory;
> -	d->memory = newmem;
> +
> +	if (!memory_access_ok(d, newumem, 0))
> +		goto err;
> +
> +	oldumem = d->umem;
> +	d->umem = newumem;
>  
>  	/* All memory accesses are done under some VQ mutex. */
>  	for (i = 0; i < d->nvqs; ++i) {
>  		mutex_lock(&d->vqs[i]->mutex);
> -		d->vqs[i]->memory = newmem;
> +		d->vqs[i]->umem = newumem;
>  		mutex_unlock(&d->vqs[i]->mutex);
>  	}
> -	kvfree(oldmem);
> +
> +	kvfree(newmem);
>  	return 0;
> +
> +err:
> +	vhost_umem_clean(d);
> +	kvfree(newmem);
> +	return -EFAULT;
>  }
>  
>  long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp)
> @@ -1017,28 +1056,6 @@ done:
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_ioctl);
>  
> -static const struct vhost_memory_region *find_region(struct vhost_memory *mem,
> -						     __u64 addr, __u32 len)
> -{
> -	const struct vhost_memory_region *reg;
> -	int start = 0, end = mem->nregions;
> -
> -	while (start < end) {
> -		int slot = start + (end - start) / 2;
> -		reg = mem->regions + slot;
> -		if (addr >= reg->guest_phys_addr)
> -			end = slot;
> -		else
> -			start = slot + 1;
> -	}
> -
> -	reg = mem->regions + start;
> -	if (addr >= reg->guest_phys_addr &&
> -		reg->guest_phys_addr + reg->memory_size > addr)
> -		return reg;
> -	return NULL;
> -}
> -
>  /* TODO: This is really inefficient.  We need something like get_user()
>   * (instruction directly accesses the data, with an exception table entry
>   * returning -EFAULT). See Documentation/x86/exception-tables.txt.
> @@ -1180,29 +1197,29 @@ EXPORT_SYMBOL_GPL(vhost_init_used);
>  static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
>  			  struct iovec iov[], int iov_size)
>  {
> -	const struct vhost_memory_region *reg;
> -	struct vhost_memory *mem;
> +	const struct vhost_umem_node *node;
> +	struct vhost_umem *umem = vq->umem;
>  	struct iovec *_iov;
>  	u64 s = 0;
>  	int ret = 0;
>  
> -	mem = vq->memory;
>  	while ((u64)len > s) {
>  		u64 size;
>  		if (unlikely(ret >= iov_size)) {
>  			ret = -ENOBUFS;
>  			break;
>  		}
> -		reg = find_region(mem, addr, len);
> -		if (unlikely(!reg)) {
> +		node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
> +							addr, addr + len - 1);
> +		if (node == NULL || node->start > addr) {
>  			ret = -EFAULT;
>  			break;
>  		}
>  		_iov = iov + ret;
> -		size = reg->memory_size - addr + reg->guest_phys_addr;
> +		size = node->size - addr + node->start;
>  		_iov->iov_len = min((u64)len - s, size);
>  		_iov->iov_base = (void __user *)(unsigned long)
> -			(reg->userspace_addr + addr - reg->guest_phys_addr);
> +			(node->userspace_addr + addr - node->start);
>  		s += size;
>  		addr += size;
>  		++ret;
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index d3f7674..5d64393 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -52,6 +52,25 @@ struct vhost_log {
>  	u64 len;
>  };
>  
> +#define START(node) ((node)->start)
> +#define LAST(node) ((node)->last)
> +
> +struct vhost_umem_node {
> +	struct rb_node rb;
> +	struct list_head link;
> +	__u64 start;
> +	__u64 last;
> +	__u64 size;
> +	__u64 userspace_addr;
> +	__u64 flags_padding;
> +	__u64 __subtree_last;
> +};
> +
> +struct vhost_umem {
> +	struct rb_root umem_tree;
> +	struct list_head umem_list;
> +};
> +
>  /* The virtqueue structure describes a queue attached to a device. */
>  struct vhost_virtqueue {
>  	struct vhost_dev *dev;
> @@ -100,7 +119,7 @@ struct vhost_virtqueue {
>  	struct iovec *indirect;
>  	struct vring_used_elem *heads;
>  	/* Protected by virtqueue mutex. */
> -	struct vhost_memory *memory;
> +	struct vhost_umem *umem;
>  	void *private_data;
>  	u64 acked_features;
>  	/* Log write descriptors */
> @@ -117,7 +136,6 @@ struct vhost_virtqueue {
>  };
>  
>  struct vhost_dev {
> -	struct vhost_memory *memory;
>  	struct mm_struct *mm;
>  	struct mutex mutex;
>  	struct vhost_virtqueue **vqs;
> @@ -127,14 +145,15 @@ struct vhost_dev {
>  	spinlock_t work_lock;
>  	struct list_head work_list;
>  	struct task_struct *worker;
> +	struct vhost_umem *umem;
>  };
>  
>  void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs);
>  long vhost_dev_set_owner(struct vhost_dev *dev);
>  bool vhost_dev_has_owner(struct vhost_dev *dev);
>  long vhost_dev_check_owner(struct vhost_dev *);
> -struct vhost_memory *vhost_dev_reset_owner_prepare(void);
> -void vhost_dev_reset_owner(struct vhost_dev *, struct vhost_memory *);
> +struct vhost_umem *vhost_dev_reset_owner_prepare(void);
> +void vhost_dev_reset_owner(struct vhost_dev *, struct vhost_umem *);
>  void vhost_dev_cleanup(struct vhost_dev *, bool locked);
>  void vhost_dev_stop(struct vhost_dev *);
>  long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp);