Message-ID: <Pine.LNX.4.64.0705161235490.10660@schroedinger.engr.sgi.com>
Date:	Wed, 16 May 2007 12:53:07 -0700 (PDT)
From:	Christoph Lameter <clameter@....com>
To:	Peter Zijlstra <a.p.zijlstra@...llo.nl>
cc:	Matt Mackall <mpm@...enic.com>, linux-kernel@...r.kernel.org,
	linux-mm@...ck.org, Thomas Graf <tgraf@...g.ch>,
	David Miller <davem@...emloft.net>,
	Andrew Morton <akpm@...ux-foundation.org>,
	Daniel Phillips <phillips@...gle.com>,
	Pekka Enberg <penberg@...helsinki.fi>
Subject: Re: [PATCH 0/5] make slab gfp fair

On Wed, 16 May 2007, Peter Zijlstra wrote:

> If this 4k cpu system ever gets to touch the new lock it has way deeper
> problems than a bouncing cache-line.

So it's of no use on NUMA?

> Please look at it more carefully.
> 
> We differentiate pages allocated at the level where GFP_ATOMIC starts to
> fail. By not updating the percpu slabs those are retried every time,
> except for ALLOC_NO_WATERMARKS allocations; those are served from the
> ->reserve_slab.
> 
> Once a regular slab allocation succeeds again, the ->reserve_slab is
> cleaned up and never looked at again until we're in distress again.

A single slab? That may give you only a single object in the extreme case
(an order-0 slab backing a ~4k object cache holds exactly one object). Are
you sure that this solution is generic enough?

The problem here is that you may take the spinlock and hand the reserve slab
out to one cpu, but then (AFAICT) other cpus still cannot get their
high-priority allocations satisfied. Some comments follow.
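
Before those, let me make sure I read the intended policy right. Here it is
as a tiny user-space model (entirely my own sketch, not the patch:
reserve_lock and the reserve[] array only loosely stand in for
->reserve_slab, and malloc() stands in for the page allocator, which in the
kernel is what gets to dip below the watermarks for the refill):

#include <pthread.h>
#include <stdlib.h>

#define RESERVE_OBJECTS 8		/* roughly one slab's worth of objects */

static pthread_mutex_t reserve_lock = PTHREAD_MUTEX_INITIALIZER;
static void *reserve[RESERVE_OBJECTS];
static int reserve_count;

/* stand-in for the regular allocation path; assumed to fail under pressure */
static void *normal_alloc(size_t size)
{
	return malloc(size);
}

static void *model_alloc(size_t size, int no_watermarks)
{
	void *p = normal_alloc(size);

	if (p) {
		/* the regular path works again: retire the reserve */
		pthread_mutex_lock(&reserve_lock);
		while (reserve_count)
			free(reserve[--reserve_count]);
		pthread_mutex_unlock(&reserve_lock);
		return p;
	}

	if (!no_watermarks)
		return NULL;	/* ordinary callers simply fail */

	/* distressed caller: refill the shared reserve if needed and use it */
	pthread_mutex_lock(&reserve_lock);
	if (!reserve_count) {
		int i;

		for (i = 0; i < RESERVE_OBJECTS; i++) {
			void *obj = malloc(size);

			if (!obj)
				break;
			reserve[reserve_count++] = obj;
		}
	}
	p = reserve_count ? reserve[--reserve_count] : NULL;
	pthread_mutex_unlock(&reserve_lock);
	return p;
}

If that reading is right, then every cpu in distress funnels through the one
reserve_lock and the one reserve slab, which is what worries me below.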


> Signed-off-by: Peter Zijlstra <a.p.zijlstra@...llo.nl>
> ---
>  include/linux/slub_def.h |    2 +
>  mm/slub.c                |   85 ++++++++++++++++++++++++++++++++++++++++++-----
>  2 files changed, 78 insertions(+), 9 deletions(-)
> 
> Index: linux-2.6-git/include/linux/slub_def.h
> ===================================================================
> --- linux-2.6-git.orig/include/linux/slub_def.h
> +++ linux-2.6-git/include/linux/slub_def.h
> @@ -46,6 +46,8 @@ struct kmem_cache {
>  	struct list_head list;	/* List of slab caches */
>  	struct kobject kobj;	/* For sysfs */
>  
> +	struct page *reserve_slab;
> +
>  #ifdef CONFIG_NUMA
>  	int defrag_ratio;
>  	struct kmem_cache_node *node[MAX_NUMNODES];
> Index: linux-2.6-git/mm/slub.c
> ===================================================================
> --- linux-2.6-git.orig/mm/slub.c
> +++ linux-2.6-git/mm/slub.c
> @@ -20,11 +20,13 @@
>  #include <linux/mempolicy.h>
>  #include <linux/ctype.h>
>  #include <linux/kallsyms.h>
> +#include "internal.h"
>  
>  /*
>   * Lock order:
> - *   1. slab_lock(page)
> - *   2. slab->list_lock
> + *   1. reserve_lock
> + *   2. slab_lock(page)
> + *   3. node->list_lock
>   *
>   *   The slab_lock protects operations on the object of a particular
>   *   slab and its metadata in the page struct. If the slab lock
> @@ -259,6 +261,8 @@ static int sysfs_slab_alias(struct kmem_
>  static void sysfs_slab_remove(struct kmem_cache *s) {}
>  #endif
>  
> +static DEFINE_SPINLOCK(reserve_lock);
> +
>  /********************************************************************
>   * 			Core slab cache functions
>   *******************************************************************/
> @@ -1007,7 +1011,7 @@ static void setup_object(struct kmem_cac
>  		s->ctor(object, s, 0);
>  }
>  
> -static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
> +static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node, int *rank)
>  {
>  	struct page *page;
>  	struct kmem_cache_node *n;
> @@ -1025,6 +1029,7 @@ static struct page *new_slab(struct kmem
>  	if (!page)
>  		goto out;
>  
> +	*rank = page->rank;
>  	n = get_node(s, page_to_nid(page));
>  	if (n)
>  		atomic_long_inc(&n->nr_slabs);
> @@ -1311,7 +1316,7 @@ static void unfreeze_slab(struct kmem_ca
>  /*
>   * Remove the cpu slab
>   */
> -static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu)
> +static void __deactivate_slab(struct kmem_cache *s, struct page *page)
>  {
>  	/*
>  	 * Merge cpu freelist into freelist. Typically we get here
> @@ -1330,10 +1335,15 @@ static void deactivate_slab(struct kmem_
>  		page->freelist = object;
>  		page->inuse--;
>  	}
> -	s->cpu_slab[cpu] = NULL;
>  	unfreeze_slab(s, page);
>  }

So you want to spill the lockless_freelist back without deactivating the
slab? Why are you using the lockless_freelist at all here? If you do not use
it, then you can simply call unfreeze_slab() and there is no need for this
split.
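
To spell out what I mean: as far as I can tell the only difference between
the old and the new helper is who clears the cpu slab pointer, roughly (my
summary of the quoted hunks, not new code):

/* before the patch */
static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu)
{
	/* ... merge the cpu lockless_freelist back into page->freelist ... */
	s->cpu_slab[cpu] = NULL;
	unfreeze_slab(s, page);
}

/* after the patch */
static void __deactivate_slab(struct kmem_cache *s, struct page *page)
{
	/* ... same merge of the lockless_freelist ... */
	unfreeze_slab(s, page);
}

If the reserve slab is only ever fed from page->freelist (as the got_reserve
path later in the patch suggests), the merge is a no-op for it and
unfreeze_slab() alone would do.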

> @@ -1395,6 +1405,7 @@ static void *__slab_alloc(struct kmem_ca
>  {
>  	void **object;
>  	int cpu = smp_processor_id();
> +	int rank = 0;
>  
>  	if (!page)
>  		goto new_slab;
> @@ -1424,10 +1435,26 @@ new_slab:
>  	if (page) {
>  		s->cpu_slab[cpu] = page;
>  		goto load_freelist;
> -	}
> +	} else if (unlikely(gfp_to_alloc_flags(gfpflags) & ALLOC_NO_WATERMARKS))
> +		goto try_reserve;

Ok, so we are trying to allocate a slab and do not get one, thus ->
try_reserve. But this only works if we use the reserve slab after explicitly
flushing the cpu slabs. Otherwise the slab may be full and we get to
alloc_slab.

>  
> -	page = new_slab(s, gfpflags, node);
> -	if (page) {

> +alloc_slab:
> +	page = new_slab(s, gfpflags, node, &rank);
> +	if (page && rank) {

Huh? You mean !page?

> +		if (unlikely(s->reserve_slab)) {
> +			struct page *reserve;
> +
> +			spin_lock(&reserve_lock);
> +			reserve = s->reserve_slab;
> +			s->reserve_slab = NULL;
> +			spin_unlock(&reserve_lock);
> +
> +			if (reserve) {
> +				slab_lock(reserve);
> +				__deactivate_slab(s, reserve);
> +				putback_slab(s, reserve);

Remove the above two lines (they are wrong regardless) and simply make the
reserve slab the cpu slab instead.

> +			}
> +		}
>  		cpu = smp_processor_id();
>  		if (s->cpu_slab[cpu]) {
>  			/*
> @@ -1455,6 +1482,18 @@ new_slab:
>  		SetSlabFrozen(page);
>  		s->cpu_slab[cpu] = page;
>  		goto load_freelist;
> +	} else if (page) {
> +		spin_lock(&reserve_lock);
> +		if (s->reserve_slab) {
> +			discard_slab(s, page);
> +			page = s->reserve_slab;
> +		}
> +		slab_lock(page);
> +		SetPageActive(page);
> +		s->reserve_slab = page;
> +		spin_unlock(&reserve_lock);
> +
> +		goto got_reserve;
>  	}
>  	return NULL;
>  debug:
> @@ -1470,6 +1509,31 @@ debug:
>  	page->freelist = object[page->offset];
>  	slab_unlock(page);
>  	return object;
> +
> +try_reserve:
> +	spin_lock(&reserve_lock);
> +	page = s->reserve_slab;
> +	if (!page) {
> +		spin_unlock(&reserve_lock);
> +		goto alloc_slab;
> +	}
> +
> +	slab_lock(page);
> +	if (!page->freelist) {
> +		s->reserve_slab = NULL;
> +		spin_unlock(&reserve_lock);
> +		__deactivate_slab(s, page);

Replace this with unfreeze_slab().

> +		putback_slab(s, page);

This puts the slab back twice; __deactivate_slab() already ends in
unfreeze_slab().

> +		goto alloc_slab;
> +	}
> +	spin_unlock(&reserve_lock);
> +
> +got_reserve:
> +	object = page->freelist;
> +	page->inuse++;
> +	page->freelist = object[page->offset];
> +	slab_unlock(page);
> +	return object;


