Date:	Thu, 13 Dec 2012 11:38:49 -0500
From:	Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>
To:	Mats Petersson <mats.petersson@...rix.com>
Cc:	xen-devel@...ts.xen.org, JBeulich@...e.com,
	Ian.Campbell@...rix.com, linux-kernel@...r.kernel.org,
	david.vrabel@...rix.com, mats@...netcatfish.com
Subject: Re: [PATCH V4] xen/privcmd: improve-performance-of-mapping-of-guest

On Wed, Dec 12, 2012 at 10:09:38PM +0000, Mats Petersson wrote:
> One comment asked for more details on the improvements:
> Using a small test program that maps guest memory into Dom0
> (repeatedly, for "Iterations" rounds, mapping the same first "Num Pages"):

I missed this for my 3.8 queue. I will queue it up next
week and send it to Linus after a review.
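The heart of the patch is the retry loop around HYPERVISOR_mmu_update(): when an entry fails, record a per-page error and keep mapping the rest of the batch instead of bailing out. A minimal userspace sketch of that loop, with mock_mmu_update() standing in for the real hypercall (the mock and its -1 error code are illustrative, not kernel API):

```c
#include <assert.h>
#include <stddef.h>

/* Hypothetical stand-in for HYPERVISOR_mmu_update(): "maps" entries,
 * failing on any mfn equal to 0.  On failure it reports how many
 * entries were completed via *done, mirroring the hypercall's contract. */
static int mock_mmu_update(const unsigned long *mfns, int count, int *done)
{
	int i;

	for (i = 0; i < count; i++) {
		if (mfns[i] == 0UL) {
			*done = i;	/* entries before the failure succeeded */
			return -1;	/* stand-in error code */
		}
	}
	*done = count;
	return 0;
}

/* The error-skipping loop from do_remap_mfn(), reduced to its core:
 * record a per-entry error, skip the failed entry, and continue until
 * the whole batch has been attempted. */
static int map_batch(const unsigned long *mfns, int batch, int *err_ptr)
{
	int index = 0, done = 0, batch_left = batch, last_err = 0;

	do {
		int err = mock_mmu_update(&mfns[index], batch_left, &done);
		if (err < 0) {
			last_err = err_ptr[index + done] = err;
			done++;		/* skip the failed entry */
		}
		batch_left -= done;
		index += done;
	} while (batch_left);

	return last_err;
}
```

With a batch of {1, 0, 3, 0, 5} the loop records an error for the two zero entries, still maps the other three, and returns the last error seen.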

> Iterations    Num Pages    Time 3.7rc4    Time with this patch
> 5000          4096         76.107         37.027
> 10000         2048         75.703         37.177
> 20000         1024         75.893         37.247
> So a little better than twice as fast.
> 
> Using this patch in migration, using "time" to measure the overall
> time it takes to migrate a guest (Debian Squeeze 6.0, 1024MB guest
> memory, one network card, one disk, guest at login prompt):
> Time 3.7rc5    Time with this patch
> 6.697          5.667
> Since migration involves a whole lot of other things, it's only about
> 15% faster - but still a good improvement. Similar measurement with a
> guest that is running code to "dirty" memory shows about 23%
> improvement, as it spends more time copying dirtied memory.
> 
> As discussed elsewhere, a good deal more can be had from improving the
> munmap system call, but it is a little tricky to get this in without
> worsening the non-PVOPS kernel, so I will have another look at this.
> 
> ---
> Update since last posting: 
> I have just run some benchmarks of a 16GB guest, and the improvement
> with this patch is around 23-30% for the overall copy time, and 42%
> shorter downtime on a "busy" guest (writing in a loop to about 8GB of
> memory). I think this is definitely worth having.
> 
> The V4 patch consists of cosmetic adjustments: a spelling mistake in a
> comment fixed, a condition in an if-statement reversed to avoid an
> "else" branch, and an empty line added by accident reverted to its
> previous state. Thanks to Jan Beulich for the comments.
> 
> The V3 of the patch contains suggested improvements from:
> David Vrabel - make it two distinct external functions, doc-comments.
> Ian Campbell - use one common function for the main work.
> Jan Beulich  - found a bug and pointed out some whitespace problems.
> 
> 
> 
> Signed-off-by: Mats Petersson <mats.petersson@...rix.com>
> 
> ---
>  arch/x86/xen/mmu.c    |  132 +++++++++++++++++++++++++++++++++++++++++++------
>  drivers/xen/privcmd.c |   55 +++++++++++++++++----
>  include/xen/xen-ops.h |    5 ++
>  3 files changed, 169 insertions(+), 23 deletions(-)
> 
> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
> index dcf5f2d..a67774f 100644
> --- a/arch/x86/xen/mmu.c
> +++ b/arch/x86/xen/mmu.c
> @@ -2477,7 +2477,8 @@ void __init xen_hvm_init_mmu_ops(void)
>  #define REMAP_BATCH_SIZE 16
>  
>  struct remap_data {
> -	unsigned long mfn;
> +	unsigned long *mfn;
> +	bool contiguous;
>  	pgprot_t prot;
>  	struct mmu_update *mmu_update;
>  };
> @@ -2486,7 +2487,13 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
>  				 unsigned long addr, void *data)
>  {
>  	struct remap_data *rmd = data;
> -	pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
> +	pte_t pte = pte_mkspecial(pfn_pte(*rmd->mfn, rmd->prot));
> +	/* If the range is contiguous, advance the mfn value itself;
> +	 * otherwise advance the pointer to the next mfn in the array. */
> +	if (rmd->contiguous)
> +		(*rmd->mfn)++;
> +	else
> +		rmd->mfn++;
>  
>  	rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
>  	rmd->mmu_update->val = pte_val_ma(pte);
> @@ -2495,16 +2502,34 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
>  	return 0;
>  }
>  
> -int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
> -			       unsigned long addr,
> -			       unsigned long mfn, int nr,
> -			       pgprot_t prot, unsigned domid)
> -{
> +/* do_remap_mfn() - helper function to map foreign pages
> + * @vma:     the vma for the pages to be mapped into
> + * @addr:    the address at which to map the pages
> + * @mfn:     pointer to array of MFNs to map
> + * @nr:      the number of entries in the MFN array
> + * @err_ptr: pointer to per-page error array (NULL for a contiguous range)
> + * @prot:    page protection mask
> + * @domid:   id of the domain that we are mapping from
> + *
> + * This function takes an array of mfns and maps nr pages from that into
> + * this kernel's memory. The owner of the pages is defined by domid. Where the
> + * pages are mapped is determined by addr, and vma is used for "accounting" of
> + * the pages.
> + *
> + * Return value is zero for success, negative for failure.
> + *
> + * Note that err_ptr is used to indicate whether *mfn
> + * is a list or a "first mfn of a contiguous range". */
> +static int do_remap_mfn(struct vm_area_struct *vma,
> +			unsigned long addr,
> +			unsigned long *mfn, int nr,
> +			int *err_ptr, pgprot_t prot,
> +			unsigned domid)
> +{
> +	int err, last_err = 0;
>  	struct remap_data rmd;
>  	struct mmu_update mmu_update[REMAP_BATCH_SIZE];
> -	int batch;
>  	unsigned long range;
> -	int err = 0;
>  
>  	if (xen_feature(XENFEAT_auto_translated_physmap))
>  		return -EINVAL;
> @@ -2515,9 +2540,15 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
>  
>  	rmd.mfn = mfn;
>  	rmd.prot = prot;
> +	/* We use err_ptr to indicate whether we are doing a contiguous
> +	 * or a discontiguous mapping. */
> +	rmd.contiguous = !err_ptr;
>  
>  	while (nr) {
> -		batch = min(REMAP_BATCH_SIZE, nr);
> +		int index = 0;
> +		int done = 0;
> +		int batch = min(REMAP_BATCH_SIZE, nr);
> +		int batch_left = batch;
>  		range = (unsigned long)batch << PAGE_SHIFT;
>  
>  		rmd.mmu_update = mmu_update;
> @@ -2526,19 +2557,92 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
>  		if (err)
>  			goto out;
>  
> -		err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid);
> -		if (err < 0)
> -			goto out;
> +		/* We record the error for each page that gives an error, but
> +		 * continue mapping until the whole set is done */
> +		do {
> +			err = HYPERVISOR_mmu_update(&mmu_update[index],
> +						    batch_left, &done, domid);
> +			if (err < 0) {
> +				if (!err_ptr)
> +					goto out;
> +				last_err = err_ptr[index + done] = err;
> +				/* increment done so we skip the failed item */
> +				done++;
> +			}
> +			batch_left -= done;
> +			index += done;
> +		} while (batch_left);
>  
>  		nr -= batch;
>  		addr += range;
> +		if (err_ptr)
> +			err_ptr += batch;
>  	}
>  
> -	err = 0;
> +	err = last_err;
>  out:
>  
>  	xen_flush_tlb_all();
>  
>  	return err;
>  }
> +
> +/* xen_remap_domain_mfn_range() - Used to map foreign pages
> + * @vma:   the vma for the pages to be mapped into
> + * @addr:  the address at which to map the pages
> + * @mfn:   the first MFN to map
> + * @nr:    the number of consecutive mfns to map
> + * @prot:  page protection mask
> + * @domid: id of the domain that we are mapping from
> + *
> + * This function takes an mfn and maps nr consecutive pages from it into this kernel's
> + * memory. The owner of the pages is defined by domid. Where the pages are
> + * mapped is determined by addr, and vma is used for "accounting" of the
> + * pages.
> + *
> + * Return value is zero for success, negative ERRNO value for failure.
> + */
> +int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
> +			       unsigned long addr,
> +			       unsigned long mfn, int nr,
> +			       pgprot_t prot, unsigned domid)
> +{
> +	return do_remap_mfn(vma, addr, &mfn, nr, NULL, prot, domid);
> +}
>  EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
> +
> +/* xen_remap_domain_mfn_array() - Used to map an array of foreign pages
> + * @vma:     the vma for the pages to be mapped into
> + * @addr:    the address at which to map the pages
> + * @mfn:     pointer to array of MFNs to map
> + * @nr:      the number of entries in the MFN array
> + * @err_ptr: pointer to array of integers, one per MFN, for an error
> + *           value for each page. The err_ptr must not be NULL.
> + * @prot:    page protection mask
> + * @domid:   id of the domain that we are mapping from
> + *
> + * This function takes an array of mfns and maps nr pages from that into this
> + * kernel's memory. The owner of the pages is defined by domid. Where the pages
> + * are mapped is determined by addr, and vma is used for "accounting" of the
> + * pages. The err_ptr array is filled in for any page that is not successfully
> + * mapped in.
> + *
> + * Return value is zero for success, negative ERRNO value for failure.
> + * Note that the error value -ENOENT is considered a "retry", so when this
> + * error code is seen, another call should be made with the list of pages that
> + * are marked as -ENOENT in the err_ptr array.
> + */
> +int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
> +			       unsigned long addr,
> +			       unsigned long *mfn, int nr,
> +			       int *err_ptr, pgprot_t prot,
> +			       unsigned domid)
> +{
> +	/* We BUG_ON because passing a NULL err_ptr is a programmer error,
> +	 * and the consequences only show up later, when it is hard to tell
> +	 * that this was the actual cause of "wrong memory was mapped in".
> +	 */
> +	BUG_ON(err_ptr == NULL);
> +	return do_remap_mfn(vma, addr, mfn, nr, err_ptr, prot, domid);
> +}
> +EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array);
> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
> index 71f5c45..75f6e86 100644
> --- a/drivers/xen/privcmd.c
> +++ b/drivers/xen/privcmd.c
> @@ -151,6 +151,40 @@ static int traverse_pages(unsigned nelem, size_t size,
>  	return ret;
>  }
>  
> +/*
> + * Similar to traverse_pages, but use each page as a "block" of
> + * data to be processed as one unit.
> + */
> +static int traverse_pages_block(unsigned nelem, size_t size,
> +				struct list_head *pos,
> +				int (*fn)(void *data, int nr, void *state),
> +				void *state)
> +{
> +	void *pagedata;
> +	unsigned pageidx;
> +	int ret = 0;
> +
> +	BUG_ON(size > PAGE_SIZE);
> +
> +	pageidx = PAGE_SIZE;
> +
> +	while (nelem) {
> +		int nr = (PAGE_SIZE/size);
> +		struct page *page;
> +		if (nr > nelem)
> +			nr = nelem;
> +		pos = pos->next;
> +		page = list_entry(pos, struct page, lru);
> +		pagedata = page_address(page);
> +		ret = (*fn)(pagedata, nr, state);
> +		if (ret)
> +			break;
> +		nelem -= nr;
> +	}
> +
> +	return ret;
> +}
> +
>  struct mmap_mfn_state {
>  	unsigned long va;
>  	struct vm_area_struct *vma;
> @@ -250,7 +284,7 @@ struct mmap_batch_state {
>  	 *      0 for no errors
>  	 *      1 if at least one error has happened (and no
>  	 *          -ENOENT errors have happened)
> -	 *      -ENOENT if at least 1 -ENOENT has happened.
> +	 *      -ENOENT if at least one -ENOENT has happened.
>  	 */
>  	int global_error;
>  	/* An array for individual errors */
> @@ -260,17 +294,20 @@ struct mmap_batch_state {
>  	xen_pfn_t __user *user_mfn;
>  };
>  
> -static int mmap_batch_fn(void *data, void *state)
> +static int mmap_batch_fn(void *data, int nr, void *state)
>  {
>  	xen_pfn_t *mfnp = data;
>  	struct mmap_batch_state *st = state;
>  	int ret;
>  
> -	ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1,
> -					 st->vma->vm_page_prot, st->domain);
> +	BUG_ON(nr < 0);
>  
> -	/* Store error code for second pass. */
> -	*(st->err++) = ret;
> +	ret = xen_remap_domain_mfn_array(st->vma,
> +					 st->va & PAGE_MASK,
> +					 mfnp, nr,
> +					 st->err,
> +					 st->vma->vm_page_prot,
> +					 st->domain);
>  
>  	/* And see if it affects the global_error. */
>  	if (ret < 0) {
> @@ -282,7 +319,7 @@ static int mmap_batch_fn(void *data, void *state)
>  				st->global_error = 1;
>  		}
>  	}
> -	st->va += PAGE_SIZE;
> +	st->va += PAGE_SIZE * nr;
>  
>  	return 0;
>  }
> @@ -378,8 +415,8 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
>  	state.err           = err_array;
>  
>  	/* mmap_batch_fn guarantees ret == 0 */
> -	BUG_ON(traverse_pages(m.num, sizeof(xen_pfn_t),
> -			     &pagelist, mmap_batch_fn, &state));
> +	BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t),
> +				    &pagelist, mmap_batch_fn, &state));
>  
>  	up_write(&mm->mmap_sem);
>  
> diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
> index 6a198e4..22cad75 100644
> --- a/include/xen/xen-ops.h
> +++ b/include/xen/xen-ops.h
> @@ -29,4 +29,9 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
>  			       unsigned long mfn, int nr,
>  			       pgprot_t prot, unsigned domid);
>  
> +int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
> +			       unsigned long addr,
> +			       unsigned long *mfn, int nr,
> +			       int *err_ptr, pgprot_t prot,
> +			       unsigned domid);
>  #endif /* INCLUDE_XEN_OPS_H */
> -- 
> 1.7.9.5
> 
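For readers skimming the diff: the key trick in remap_area_mfn_pte_fn() is that a single code path serves both a contiguous range (bump the mfn value) and an array of mfns (advance the pointer). That advance step can be sketched in isolation (a minimal userspace model, not the kernel code itself):

```c
#include <assert.h>
#include <stdbool.h>

/* Reduced model of struct remap_data: either a single start mfn for a
 * contiguous range, or a pointer walked along an array of mfns. */
struct remap_state {
	unsigned long *mfn;
	bool contiguous;
};

/* Mirrors the advance step in remap_area_mfn_pte_fn(): take the current
 * mfn for this pte, then either bump the value (contiguous range) or
 * advance the pointer to the next array entry. */
static unsigned long next_mfn(struct remap_state *st)
{
	unsigned long cur = *st->mfn;

	if (st->contiguous)
		(*st->mfn)++;	/* range: next frame is cur + 1 */
	else
		st->mfn++;	/* array: next frame is the next element */
	return cur;
}
```

Starting from mfn 100 in contiguous mode, successive calls yield 100, 101, 102; in array mode they simply walk the caller's list. This is why do_remap_mfn() can use !err_ptr to select between the two behaviours without duplicating the pte-filling code.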