Date:   Fri, 3 Mar 2017 09:39:50 +0900
From:   Byungchul Park <byungchul.park@....com>
To:     Peter Zijlstra <peterz@...radead.org>
Cc:     mingo@...nel.org, tglx@...utronix.de, walken@...gle.com,
        boqun.feng@...il.com, kirill@...temov.name,
        linux-kernel@...r.kernel.org, linux-mm@...ck.org,
        iamjoonsoo.kim@....com, akpm@...ux-foundation.org,
        npiggin@...il.com, kernel-team@....com,
        Michal Hocko <mhocko@...nel.org>,
        Nikolay Borisov <nborisov@...e.com>,
        Mel Gorman <mgorman@...e.de>
Subject: Re: [PATCH v5 06/13] lockdep: Implement crossrelease feature

On Thu, Mar 02, 2017 at 02:40:31PM +0100, Peter Zijlstra wrote:
> [*] A while ago someone, and I cannot find the email just now, asked if
> we could not implement the RECLAIM_FS inversion stuff with a 'fake' lock

It looks interesting to me.

> like we use for other things like workqueues etc. I think this should be
> possible, which would allow reducing the 'irq' states and reduce the
> number of __bfs() lookups we do.
> 
> Removing the 1 IRQ state would result in 4 fewer __bfs() walks if I'm
> not mistaken, more than making up for the 1 we'd have to add to detect
> redundant links.

OK.
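
If I understand the idea correctly, the 'fake' lock is just a static
lockdep_map with no real lock behind it: reclaim holds it across the
whole reclaim path, so locks taken inside reclaim order after it, and
every __GFP_FS allocation site does an empty acquire/release pair, so
locks held across an allocation order before it. An inversion then
shows up as an ordinary lockdep cycle through the map, with no
dedicated usage state needed. A minimal sketch of the pattern as I
read it from your patch below; the demo_* names are mine, only for
illustration:

	#include <linux/lockdep.h>

	/* A map with no lock behind it; it exists only to record deps. */
	static struct lockdep_map demo_reclaim_map =
		STATIC_LOCKDEP_MAP_INIT("demo_reclaim", &demo_reclaim_map);

	static void demo_reclaim(void)	/* stand-in for direct reclaim */
	{
		lock_map_acquire(&demo_reclaim_map);
		/* any lock acquired here now orders after the map */
		lock_map_release(&demo_reclaim_map);
	}

	static void demo_alloc(void)	/* stand-in for a __GFP_FS alloc */
	{
		/*
		 * The empty pair records that this context could have
		 * entered reclaim, even when it does not this time:
		 * any lock held here now orders before the map.
		 */
		lock_map_acquire(&demo_reclaim_map);
		lock_map_release(&demo_reclaim_map);
	}

That would also explain the XXX_LOCK_USAGE_STATES change below, since
each state accounts for 4 usage bits.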

Thanks,
Byungchul

> 
> 
>  include/linux/lockdep.h         | 11 +-----
>  include/linux/sched.h           |  1 -
>  kernel/locking/lockdep.c        | 87 +----------------------------------------
>  kernel/locking/lockdep_states.h |  1 -
>  mm/internal.h                   | 40 +++++++++++++++++++
>  mm/page_alloc.c                 | 13 ++++--
>  mm/slab.h                       |  7 +++-
>  mm/slob.c                       |  8 +++-
>  mm/vmscan.c                     | 13 +++---
>  9 files changed, 71 insertions(+), 110 deletions(-)
> 
> diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
> index 1e327bb..6ba1a65 100644
> --- a/include/linux/lockdep.h
> +++ b/include/linux/lockdep.h
> @@ -29,7 +29,7 @@ extern int lock_stat;
>   * We'd rather not expose kernel/lockdep_states.h this wide, but we do need
>   * the total number of states... :-(
>   */
> -#define XXX_LOCK_USAGE_STATES		(1+3*4)
> +#define XXX_LOCK_USAGE_STATES		(1+2*4)
>  
>  /*
>   * NR_LOCKDEP_CACHING_CLASSES ... Number of classes
> @@ -361,10 +361,6 @@ static inline void lock_set_subclass(struct lockdep_map *lock,
>  	lock_set_class(lock, lock->name, lock->key, subclass, ip);
>  }
>  
> -extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask);
> -extern void lockdep_clear_current_reclaim_state(void);
> -extern void lockdep_trace_alloc(gfp_t mask);
> -
>  struct pin_cookie { unsigned int val; };
>  
>  #define NIL_COOKIE (struct pin_cookie){ .val = 0U, }
> @@ -373,7 +369,7 @@ extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock);
>  extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie);
>  extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);
>  
> -# define INIT_LOCKDEP				.lockdep_recursion = 0, .lockdep_reclaim_gfp = 0,
> +# define INIT_LOCKDEP				.lockdep_recursion = 0,
>  
>  #define lockdep_depth(tsk)	(debug_locks ? (tsk)->lockdep_depth : 0)
>  
> @@ -413,9 +409,6 @@ static inline void lockdep_on(void)
>  # define lock_release(l, n, i)			do { } while (0)
>  # define lock_set_class(l, n, k, s, i)		do { } while (0)
>  # define lock_set_subclass(l, s, i)		do { } while (0)
> -# define lockdep_set_current_reclaim_state(g)	do { } while (0)
> -# define lockdep_clear_current_reclaim_state()	do { } while (0)
> -# define lockdep_trace_alloc(g)			do { } while (0)
>  # define lockdep_info()				do { } while (0)
>  # define lockdep_init_map(lock, name, key, sub) \
>  		do { (void)(name); (void)(key); } while (0)
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index d67eee8..0fa8a8f 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -806,7 +806,6 @@ struct task_struct {
>  	int				lockdep_depth;
>  	unsigned int			lockdep_recursion;
>  	struct held_lock		held_locks[MAX_LOCK_DEPTH];
> -	gfp_t				lockdep_reclaim_gfp;
>  #endif
>  
>  #ifdef CONFIG_UBSAN
> diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
> index a95e5d1..1051600 100644
> --- a/kernel/locking/lockdep.c
> +++ b/kernel/locking/lockdep.c
> @@ -343,14 +343,12 @@ EXPORT_SYMBOL(lockdep_on);
>  #if VERBOSE
>  # define HARDIRQ_VERBOSE	1
>  # define SOFTIRQ_VERBOSE	1
> -# define RECLAIM_VERBOSE	1
>  #else
>  # define HARDIRQ_VERBOSE	0
>  # define SOFTIRQ_VERBOSE	0
> -# define RECLAIM_VERBOSE	0
>  #endif
>  
> -#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE
> +#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE
>  /*
>   * Quick filtering for interesting events:
>   */
> @@ -2553,14 +2551,6 @@ static int SOFTIRQ_verbose(struct lock_class *class)
>  	return 0;
>  }
>  
> -static int RECLAIM_FS_verbose(struct lock_class *class)
> -{
> -#if RECLAIM_VERBOSE
> -	return class_filter(class);
> -#endif
> -	return 0;
> -}
> -
>  #define STRICT_READ_CHECKS	1
>  
>  static int (*state_verbose_f[])(struct lock_class *class) = {
> @@ -2856,51 +2846,6 @@ void trace_softirqs_off(unsigned long ip)
>  		debug_atomic_inc(redundant_softirqs_off);
>  }
>  
> -static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
> -{
> -	struct task_struct *curr = current;
> -
> -	if (unlikely(!debug_locks))
> -		return;
> -
> -	/* no reclaim without waiting on it */
> -	if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
> -		return;
> -
> -	/* this guy won't enter reclaim */
> -	if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
> -		return;
> -
> -	/* We're only interested __GFP_FS allocations for now */
> -	if (!(gfp_mask & __GFP_FS))
> -		return;
> -
> -	/*
> -	 * Oi! Can't be having __GFP_FS allocations with IRQs disabled.
> -	 */
> -	if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
> -		return;
> -
> -	mark_held_locks(curr, RECLAIM_FS);
> -}
> -
> -static void check_flags(unsigned long flags);
> -
> -void lockdep_trace_alloc(gfp_t gfp_mask)
> -{
> -	unsigned long flags;
> -
> -	if (unlikely(current->lockdep_recursion))
> -		return;
> -
> -	raw_local_irq_save(flags);
> -	check_flags(flags);
> -	current->lockdep_recursion = 1;
> -	__lockdep_trace_alloc(gfp_mask, flags);
> -	current->lockdep_recursion = 0;
> -	raw_local_irq_restore(flags);
> -}
> -
>  static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
>  {
>  	/*
> @@ -2946,22 +2891,6 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
>  		}
>  	}
>  
> -	/*
> -	 * We reuse the irq context infrastructure more broadly as a general
> -	 * context checking code. This tests GFP_FS recursion (a lock taken
> -	 * during reclaim for a GFP_FS allocation is held over a GFP_FS
> -	 * allocation).
> -	 */
> -	if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) {
> -		if (hlock->read) {
> -			if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ))
> -					return 0;
> -		} else {
> -			if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS))
> -					return 0;
> -		}
> -	}
> -
>  	return 1;
>  }
>  
> @@ -3020,10 +2949,6 @@ static inline int separate_irq_context(struct task_struct *curr,
>  	return 0;
>  }
>  
> -void lockdep_trace_alloc(gfp_t gfp_mask)
> -{
> -}
> -
>  #endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
>  
>  /*
> @@ -3859,16 +3784,6 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
>  }
>  EXPORT_SYMBOL_GPL(lock_unpin_lock);
>  
> -void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
> -{
> -	current->lockdep_reclaim_gfp = gfp_mask;
> -}
> -
> -void lockdep_clear_current_reclaim_state(void)
> -{
> -	current->lockdep_reclaim_gfp = 0;
> -}
> -
>  #ifdef CONFIG_LOCK_STAT
>  static int
>  print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
> diff --git a/kernel/locking/lockdep_states.h b/kernel/locking/lockdep_states.h
> index 995b0cc..35ca09f 100644
> --- a/kernel/locking/lockdep_states.h
> +++ b/kernel/locking/lockdep_states.h
> @@ -6,4 +6,3 @@
>   */
>  LOCKDEP_STATE(HARDIRQ)
>  LOCKDEP_STATE(SOFTIRQ)
> -LOCKDEP_STATE(RECLAIM_FS)
> diff --git a/mm/internal.h b/mm/internal.h
> index ccfc2a2..88b9107 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -15,6 +15,8 @@
>  #include <linux/mm.h>
>  #include <linux/pagemap.h>
>  #include <linux/tracepoint-defs.h>
> +#include <linux/lockdep.h>
> +#include <linux/sched/mm.h>
>  
>  /*
>   * The set of flags that only affect watermark checking and reclaim
> @@ -498,4 +500,42 @@ extern const struct trace_print_flags pageflag_names[];
>  extern const struct trace_print_flags vmaflag_names[];
>  extern const struct trace_print_flags gfpflag_names[];
>  
> +
> +#ifdef CONFIG_LOCKDEP
> +extern struct lockdep_map __fs_reclaim_map;
> +
> +static inline bool __need_fs_reclaim(gfp_t gfp_mask)
> +{
> +	gfp_mask = memalloc_noio_flags(gfp_mask);
> +
> +	/* no reclaim without waiting on it */
> +	if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
> +		return false;
> +
> +	/* this guy won't enter reclaim */
> +	if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
> +		return false;
> +
> +	/* We're only interested in __GFP_FS allocations for now */
> +	if (!(gfp_mask & __GFP_FS))
> +		return false;
> +
> +	return true;
> +}
> +
> +static inline void fs_reclaim_acquire(gfp_t gfp_mask)
> +{
> +	if (__need_fs_reclaim(gfp_mask))
> +		lock_map_acquire(&__fs_reclaim_map);
> +}
> +static inline void fs_reclaim_release(gfp_t gfp_mask)
> +{
> +	if (__need_fs_reclaim(gfp_mask))
> +		lock_map_release(&__fs_reclaim_map);
> +}
> +#else
> +static inline void fs_reclaim_acquire(gfp_t gfp_mask) { }
> +static inline void fs_reclaim_release(gfp_t gfp_mask) { }
> +#endif
> +
>  #endif	/* __MM_INTERNAL_H */
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index eaa64d2..85ea8bf 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3387,6 +3387,12 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
>  }
>  #endif /* CONFIG_COMPACTION */
>  
> +
> +#ifdef CONFIG_LOCKDEP
> +struct lockdep_map __fs_reclaim_map =
> +	STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
> +#endif
> +
>  /* Perform direct synchronous page reclaim */
>  static int
>  __perform_reclaim(gfp_t gfp_mask, unsigned int order,
> @@ -3400,7 +3406,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
>  	/* We now go into synchronous reclaim */
>  	cpuset_memory_pressure_bump();
>  	current->flags |= PF_MEMALLOC;
> -	lockdep_set_current_reclaim_state(gfp_mask);
> +	fs_reclaim_acquire(gfp_mask);
>  	reclaim_state.reclaimed_slab = 0;
>  	current->reclaim_state = &reclaim_state;
>  
> @@ -3408,7 +3414,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
>  								ac->nodemask);
>  
>  	current->reclaim_state = NULL;
> -	lockdep_clear_current_reclaim_state();
> +	fs_reclaim_release(gfp_mask);
>  	current->flags &= ~PF_MEMALLOC;
>  
>  	cond_resched();
> @@ -3913,7 +3919,8 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
>  			*alloc_flags |= ALLOC_CPUSET;
>  	}
>  
> -	lockdep_trace_alloc(gfp_mask);
> +	fs_reclaim_acquire(gfp_mask);
> +	fs_reclaim_release(gfp_mask);
>  
>  	might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
>  
> diff --git a/mm/slab.h b/mm/slab.h
> index 65e7c3f..753f552 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -44,6 +44,8 @@ struct kmem_cache {
>  #include <linux/kmemleak.h>
>  #include <linux/random.h>
>  
> +#include "internal.h"
> +
>  /*
>   * State of the slab allocator.
>   *
> @@ -428,7 +430,10 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
>  						     gfp_t flags)
>  {
>  	flags &= gfp_allowed_mask;
> -	lockdep_trace_alloc(flags);
> +
> +	fs_reclaim_acquire(flags);
> +	fs_reclaim_release(flags);
> +
>  	might_sleep_if(gfpflags_allow_blocking(flags));
>  
>  	if (should_failslab(s, flags))
> diff --git a/mm/slob.c b/mm/slob.c
> index eac04d43..3e32280 100644
> --- a/mm/slob.c
> +++ b/mm/slob.c
> @@ -73,6 +73,8 @@
>  #include <linux/atomic.h>
>  
>  #include "slab.h"
> +#include "internal.h"
> +
>  /*
>   * slob_block has a field 'units', which indicates size of block if +ve,
>   * or offset of next block if -ve (in SLOB_UNITs).
> @@ -432,7 +434,8 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
>  
>  	gfp &= gfp_allowed_mask;
>  
> -	lockdep_trace_alloc(gfp);
> +	fs_reclaim_acquire(gfp);
> +	fs_reclaim_release(gfp);
>  
>  	if (size < PAGE_SIZE - align) {
>  		if (!size)
> @@ -538,7 +541,8 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
>  
>  	flags &= gfp_allowed_mask;
>  
> -	lockdep_trace_alloc(flags);
> +	fs_reclaim_acquire(flags);
> +	fs_reclaim_release(flags);
>  
>  	if (c->size < PAGE_SIZE) {
>  		b = slob_alloc(c->size, flags, c->align, node);
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index bc8031e..2f57e36 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3418,8 +3418,6 @@ static int kswapd(void *p)
>  	};
>  	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
>  
> -	lockdep_set_current_reclaim_state(GFP_KERNEL);
> -
>  	if (!cpumask_empty(cpumask))
>  		set_cpus_allowed_ptr(tsk, cpumask);
>  	current->reclaim_state = &reclaim_state;
> @@ -3475,7 +3473,9 @@ static int kswapd(void *p)
>  		 */
>  		trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
>  						alloc_order);
> +		fs_reclaim_acquire(GFP_KERNEL);
>  		reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
> +		fs_reclaim_release(GFP_KERNEL);
>  		if (reclaim_order < alloc_order)
>  			goto kswapd_try_sleep;
>  
> @@ -3485,7 +3485,6 @@ static int kswapd(void *p)
>  
>  	tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
>  	current->reclaim_state = NULL;
> -	lockdep_clear_current_reclaim_state();
>  
>  	return 0;
>  }
> @@ -3550,14 +3549,14 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
>  	unsigned long nr_reclaimed;
>  
>  	p->flags |= PF_MEMALLOC;
> -	lockdep_set_current_reclaim_state(sc.gfp_mask);
> +	fs_reclaim_acquire(sc.gfp_mask);
>  	reclaim_state.reclaimed_slab = 0;
>  	p->reclaim_state = &reclaim_state;
>  
>  	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
>  
>  	p->reclaim_state = NULL;
> -	lockdep_clear_current_reclaim_state();
> +	fs_reclaim_release(sc.gfp_mask);
>  	p->flags &= ~PF_MEMALLOC;
>  
>  	return nr_reclaimed;
> @@ -3741,7 +3740,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
>  	 * and RECLAIM_UNMAP.
>  	 */
>  	p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
> -	lockdep_set_current_reclaim_state(gfp_mask);
> +	fs_reclaim_acquire(gfp_mask);
>  	reclaim_state.reclaimed_slab = 0;
>  	p->reclaim_state = &reclaim_state;
>  
> @@ -3756,8 +3755,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
>  	}
>  
>  	p->reclaim_state = NULL;
> +	fs_reclaim_release(gfp_mask);
>  	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
> -	lockdep_clear_current_reclaim_state();
>  	return sc.nr_reclaimed >= nr_pages;
>  }
>  
