lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-ID: <20210703230253.GA2242521@roeck-us.net>
Date:   Sat, 3 Jul 2021 16:02:53 -0700
From:   Guenter Roeck <linux@...ck-us.net>
To:     Dennis Zhou <dennis@...nel.org>
Cc:     Tejun Heo <tj@...nel.org>, Christoph Lameter <cl@...ux.com>,
        linux-mm@...ck.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH] percpu: flush tlb in pcpu_reclaim_populated()

On Sat, Jul 03, 2021 at 09:05:23PM +0000, Dennis Zhou wrote:
> Prior to "percpu: implement partial chunk depopulation",
> pcpu_depopulate_chunk() was called only on the destruction path. This
> meant the virtual address range was on its way back to vmalloc which
> will handle flushing the tlbs for us.
> 
> However, with pcpu_reclaim_populated(), we are now calling
> pcpu_depopulate_chunk() during the active lifecycle of a chunk.
> Therefore, we need to flush the tlb as well otherwise we can end up
> accessing the wrong page through an invalid tlb mapping as reported in
> [1].
> 
> [1] https://lore.kernel.org/lkml/20210702191140.GA3166599@roeck-us.net/
> 
> Fixes: f183324133ea ("percpu: implement partial chunk depopulation")
> Reported-by: Guenter Roeck <linux@...ck-us.net>
> Signed-off-by: Dennis Zhou <dennis@...nel.org>

Tested-by: Guenter Roeck <linux@...ck-us.net>

Guenter

> ---
> I think I'm happier with this. It does the same thing as [2] but moves
> the flush to the caller so we can batch per chunk.
> 
> [2] https://lore.kernel.org/lkml/20210703040449.3213210-1-dennis@kernel.org/
> 
>  mm/percpu-km.c |  6 ++++++
>  mm/percpu-vm.c |  5 +++--
>  mm/percpu.c    | 29 +++++++++++++++++++++++------
>  3 files changed, 32 insertions(+), 8 deletions(-)
> 
> diff --git a/mm/percpu-km.c b/mm/percpu-km.c
> index c9d529dc7651..fe31aa19db81 100644
> --- a/mm/percpu-km.c
> +++ b/mm/percpu-km.c
> @@ -32,6 +32,12 @@
>  
>  #include <linux/log2.h>
>  
> +static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
> +				      int page_start, int page_end)
> +{
> +	/* nothing */
> +}
> +
>  static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
>  			       int page_start, int page_end, gfp_t gfp)
>  {
> diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
> index ee5d89fcd66f..2054c9213c43 100644
> --- a/mm/percpu-vm.c
> +++ b/mm/percpu-vm.c
> @@ -303,6 +303,9 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
>   * For each cpu, depopulate and unmap pages [@page_start,@page_end)
>   * from @chunk.
>   *
> + * Caller is required to call pcpu_post_unmap_tlb_flush() if not returning the
> + * region back to vmalloc() which will lazily flush the tlb.
> + *
>   * CONTEXT:
>   * pcpu_alloc_mutex.
>   */
> @@ -324,8 +327,6 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
>  
>  	pcpu_unmap_pages(chunk, pages, page_start, page_end);
>  
> -	/* no need to flush tlb, vmalloc will handle it lazily */
> -
>  	pcpu_free_pages(chunk, pages, page_start, page_end);
>  }
>  
> diff --git a/mm/percpu.c b/mm/percpu.c
> index b4cebeca4c0c..8d8efd668f76 100644
> --- a/mm/percpu.c
> +++ b/mm/percpu.c
> @@ -1572,6 +1572,7 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
>   *
>   * pcpu_populate_chunk		- populate the specified range of a chunk
>   * pcpu_depopulate_chunk	- depopulate the specified range of a chunk
> + * pcpu_post_unmap_tlb_flush	- flush tlb for the specified range of a chunk
>   * pcpu_create_chunk		- create a new chunk
>   * pcpu_destroy_chunk		- destroy a chunk, always preceded by full depop
>   * pcpu_addr_to_page		- translate address to physical address
> @@ -1581,6 +1582,8 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
>  			       int page_start, int page_end, gfp_t gfp);
>  static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
>  				  int page_start, int page_end);
> +static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
> +				      int page_start, int page_end);
>  static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
>  static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
>  static struct page *pcpu_addr_to_page(void *addr);
> @@ -2137,11 +2140,12 @@ static void pcpu_reclaim_populated(void)
>  {
>  	struct pcpu_chunk *chunk;
>  	struct pcpu_block_md *block;
> +	int freed_page_start, freed_page_end;
>  	int i, end;
> +	bool reintegrate;
>  
>  	lockdep_assert_held(&pcpu_lock);
>  
> -restart:
>  	/*
>  	 * Once a chunk is isolated to the to_depopulate list, the chunk is no
>  	 * longer discoverable to allocations whom may populate pages.  The only
> @@ -2157,6 +2161,9 @@ static void pcpu_reclaim_populated(void)
>  		 * Scan chunk's pages in the reverse order to keep populated
>  		 * pages close to the beginning of the chunk.
>  		 */
> +		freed_page_start = chunk->nr_pages;
> +		freed_page_end = 0;
> +		reintegrate = false;
>  		for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) {
>  			/* no more work to do */
>  			if (chunk->nr_empty_pop_pages == 0)
> @@ -2164,8 +2171,8 @@ static void pcpu_reclaim_populated(void)
>  
>  			/* reintegrate chunk to prevent atomic alloc failures */
>  			if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
> -				pcpu_reintegrate_chunk(chunk);
> -				goto restart;
> +				reintegrate = true;
> +				goto end_chunk;
>  			}
>  
>  			/*
> @@ -2194,16 +2201,26 @@ static void pcpu_reclaim_populated(void)
>  			spin_lock_irq(&pcpu_lock);
>  
>  			pcpu_chunk_depopulated(chunk, i + 1, end + 1);
> +			freed_page_start = min(freed_page_start, i + 1);
> +			freed_page_end = max(freed_page_end, end + 1);
>  
>  			/* reset the range and continue */
>  			end = -1;
>  		}
>  
> -		if (chunk->free_bytes == pcpu_unit_size)
> +end_chunk:
> +		/* batch tlb flush per chunk to amortize cost */
> +		if (freed_page_start < freed_page_end) {
> +			pcpu_post_unmap_tlb_flush(chunk,
> +						  freed_page_start,
> +						  freed_page_end);
> +		}
> +
> +		if (reintegrate || chunk->free_bytes == pcpu_unit_size)
>  			pcpu_reintegrate_chunk(chunk);
>  		else
> -			list_move(&chunk->list,
> -				  &pcpu_chunk_lists[pcpu_sidelined_slot]);
> +			list_move_tail(&chunk->list,
> +				       &pcpu_chunk_lists[pcpu_sidelined_slot]);
>  	}
>  }
>  
> -- 
> 2.32.0.93.g670b81a890-goog
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ