Message-Id: <1442270495-1655259-15-git-send-email-green@linuxhacker.ru>
Date:	Mon, 14 Sep 2015 18:41:30 -0400
From:	green@...uxhacker.ru
To:	Greg Kroah-Hartman <gregkh@...uxfoundation.org>,
	devel@...verdev.osuosl.org,
	Andreas Dilger <andreas.dilger@...el.com>
Cc:	Linux Kernel Mailing List <linux-kernel@...r.kernel.org>,
	Ann Koehler <amk@...y.com>, Oleg Drokin <oleg.drokin@...el.com>
Subject: [PATCH 14/19] staging/lustre/obdclass: Eliminate hash bucket scans in lu_cache_shrink

From: Ann Koehler <amk@...y.com>

The lu_cache_shrink slab shrinker is too slow, accounting for > 90% of
the time spent in shrink_slab when allocating huge pages. Most of its
time is spent iterating over the buckets in each site's object hash
table to compute the number of freeable objects. This iteration is
eliminated by adding an lru length count to the lu_site struct. A
percpu counter is used to maintain the lru length, so that the
lu_site does not need to be locked when an object is accessed through
the hash table. The counter is updated whenever an object is added to
or removed from any bucket's lru list. The number of freeable objects
is the sum of the counter's values across all cpus.

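For reference, the counting scheme amounts to the sketch below. It uses
the generic kernel percpu_counter API purely for illustration; the patch
itself keeps the count in the site's lprocfs stats (LU_SS_LRU_LEN) and
clamps a transiently negative sum in ls_stats_read(). All identifiers
here (demo_site, demo_lru_add, ...) are hypothetical.

#include <linux/gfp.h>
#include <linux/percpu_counter.h>

struct demo_site {
	struct percpu_counter lru_len;	/* objects on the per-bucket lru lists */
};

static int demo_site_init(struct demo_site *s)
{
	return percpu_counter_init(&s->lru_len, 0, GFP_KERNEL);
}

/* called with only the hash bucket locked, never the whole site */
static void demo_lru_add(struct demo_site *s)
{
	percpu_counter_inc(&s->lru_len);
}

static void demo_lru_del(struct demo_site *s)
{
	percpu_counter_dec(&s->lru_len);
}

/* shrinker ->count_objects() side: no bucket iteration, no site-wide lock */
static unsigned long demo_shrink_count(struct demo_site *s)
{
	/*
	 * The sum of per-cpu deltas can be transiently negative when a
	 * decrement on one cpu is seen before the matching increment on
	 * another, so clamp at zero.
	 */
	return percpu_counter_sum_positive(&s->lru_len);
}

Because increments and decrements land on whichever cpu touches the
object, an exact length would require a locked cross-cpu sum; the
shrinker only needs an estimate, so the cheap clamped sum is enough.
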
Signed-off-by: Ann Koehler <amk@...y.com>
Reviewed-on: http://review.whamcloud.com/14066
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-6365
Reviewed-by: Mike Pershin <mike.pershin@...el.com>
Reviewed-by: Andreas Dilger <andreas.dilger@...el.com>
Reviewed-by: Alex Zhuravlev <alexey.zhuravlev@...el.com>
Signed-off-by: Oleg Drokin <oleg.drokin@...el.com>
---
 drivers/staging/lustre/lustre/include/lu_object.h  |  1 +
 drivers/staging/lustre/lustre/obdclass/lu_object.c | 66 ++++++++++++++--------
 2 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/drivers/staging/lustre/lustre/include/lu_object.h b/drivers/staging/lustre/lustre/include/lu_object.h
index ea13a82..96e271d 100644
--- a/drivers/staging/lustre/lustre/include/lu_object.h
+++ b/drivers/staging/lustre/lustre/include/lu_object.h
@@ -584,6 +584,7 @@ enum {
 	LU_SS_CACHE_RACE,
 	LU_SS_CACHE_DEATH_RACE,
 	LU_SS_LRU_PURGED,
+	LU_SS_LRU_LEN,	/* # of objects in lsb_lru lists */
 	LU_SS_LAST_STAT
 };
 
diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c
index 4f7899f..c892e82 100644
--- a/drivers/staging/lustre/lustre/obdclass/lu_object.c
+++ b/drivers/staging/lustre/lustre/obdclass/lu_object.c
@@ -59,6 +59,7 @@
 #include <linux/list.h>
 
 static void lu_object_free(const struct lu_env *env, struct lu_object *o);
+static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx);
 
 /**
  * Decrease reference counter on object. If last reference is freed, return
@@ -126,6 +127,9 @@ void lu_object_put(const struct lu_env *env, struct lu_object *o)
 		LASSERT(list_empty(&top->loh_lru));
 		list_add_tail(&top->loh_lru, &bkt->lsb_lru);
 		bkt->lsb_lru_len++;
+		lprocfs_counter_incr(site->ls_stats, LU_SS_LRU_LEN);
+		CDEBUG(D_INODE, "Add %p to site lru. hash: %p, bkt: %p, lru_len: %ld\n",
+		       o, site->ls_obj_hash, bkt, bkt->lsb_lru_len);
 		cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1);
 		return;
 	}
@@ -174,16 +178,18 @@ void lu_object_unhash(const struct lu_env *env, struct lu_object *o)
 	top = o->lo_header;
 	set_bit(LU_OBJECT_HEARD_BANSHEE, &top->loh_flags);
 	if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) {
-		struct cfs_hash *obj_hash = o->lo_dev->ld_site->ls_obj_hash;
+		struct lu_site *site = o->lo_dev->ld_site;
+		struct cfs_hash *obj_hash = site->ls_obj_hash;
 		struct cfs_hash_bd bd;
 
 		cfs_hash_bd_get_and_lock(obj_hash, &top->loh_fid, &bd, 1);
 		if (!list_empty(&top->loh_lru)) {
 			struct lu_site_bkt_data *bkt;
 
-		list_del_init(&top->loh_lru);
+			list_del_init(&top->loh_lru);
 			bkt = cfs_hash_bd_extra_get(obj_hash, &bd);
 			bkt->lsb_lru_len--;
+			lprocfs_counter_decr(site->ls_stats, LU_SS_LRU_LEN);
 		}
 		cfs_hash_bd_del_locked(obj_hash, &bd, &top->loh_hash);
 		cfs_hash_bd_unlock(obj_hash, &bd, 1);
@@ -355,6 +361,7 @@ int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr)
 					       &bd2, &h->loh_hash);
 			list_move(&h->loh_lru, &dispose);
 			bkt->lsb_lru_len--;
+			lprocfs_counter_decr(s->ls_stats, LU_SS_LRU_LEN);
 			if (did_sth == 0)
 				did_sth = 1;
 
@@ -568,8 +575,9 @@ static struct lu_object *htable_lookup(struct lu_site *s,
 		cfs_hash_get(s->ls_obj_hash, hnode);
 		lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT);
 		if (!list_empty(&h->loh_lru)) {
-		list_del_init(&h->loh_lru);
+			list_del_init(&h->loh_lru);
 			bkt->lsb_lru_len--;
+			lprocfs_counter_decr(s->ls_stats, LU_SS_LRU_LEN);
 		}
 		return lu_object_top(h);
 	}
@@ -1029,6 +1037,12 @@ int lu_site_init(struct lu_site *s, struct lu_device *top)
 			     0, "cache_death_race", "cache_death_race");
 	lprocfs_counter_init(s->ls_stats, LU_SS_LRU_PURGED,
 			     0, "lru_purged", "lru_purged");
+	/*
+	 * Unlike other counters, lru_len can be decremented so
+	 * need lc_sum instead of just lc_count
+	 */
+	lprocfs_counter_init(s->ls_stats, LU_SS_LRU_LEN,
+			     LPROCFS_CNTR_AVGMINMAX, "lru_len", "lru_len");
 
 	INIT_LIST_HEAD(&s->ls_linkage);
 	s->ls_top_dev = top;
@@ -1817,27 +1831,21 @@ static void lu_site_stats_get(struct cfs_hash *hs,
 
 
 /*
- * There exists a potential lock inversion deadlock scenario when using
- * Lustre on top of ZFS. This occurs between one of ZFS's
- * buf_hash_table.ht_lock's, and Lustre's lu_sites_guard lock. Essentially,
- * thread A will take the lu_sites_guard lock and sleep on the ht_lock,
- * while thread B will take the ht_lock and sleep on the lu_sites_guard
- * lock. Obviously neither thread will wake and drop their respective hold
- * on their lock.
+ * lu_cache_shrink_count returns the number of cached objects that are
+ * candidates to be freed by shrink_slab(). A counter, which tracks
+ * the number of items in the site's lru, is maintained in the per cpu
+ * stats of each site. The counter is incremented when an object is added
+ * to a site's lru and decremented when one is removed. The number of
+ * free-able objects is the sum of all per cpu counters for all sites.
  *
- * To prevent this from happening we must ensure the lu_sites_guard lock is
- * not taken while down this code path. ZFS reliably does not set the
- * __GFP_FS bit in its code paths, so this can be used to determine if it
- * is safe to take the lu_sites_guard lock.
- *
- * Ideally we should accurately return the remaining number of cached
- * objects without taking the  lu_sites_guard lock, but this is not
- * possible in the current implementation.
+ * Using a per cpu counter is a compromise solution to concurrent access:
+ * lu_object_put() can update the counter without locking the site and
+ * lu_cache_shrink_count can sum the counters without locking each
+ * ls_obj_hash bucket.
  */
 static unsigned long lu_cache_shrink_count(struct shrinker *sk,
 					   struct shrink_control *sc)
 {
-	struct lu_site_stats stats;
 	struct lu_site *s;
 	struct lu_site *tmp;
 	unsigned long cached = 0;
@@ -1847,14 +1855,14 @@ static unsigned long lu_cache_shrink_count(struct shrinker *sk,
 
 	mutex_lock(&lu_sites_guard);
 	list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) {
-		memset(&stats, 0, sizeof(stats));
-		lu_site_stats_get(s->ls_obj_hash, &stats, 0);
-		cached += stats.lss_total - stats.lss_busy;
+		cached += ls_stats_read(s->ls_stats, LU_SS_LRU_LEN);
 	}
 	mutex_unlock(&lu_sites_guard);
 
 	cached = (cached / 100) * sysctl_vfs_cache_pressure;
-	CDEBUG(D_INODE, "%ld objects cached\n", cached);
+	CDEBUG(D_INODE, "%ld objects cached, cache pressure %d\n",
+	       cached, sysctl_vfs_cache_pressure);
+
 	return cached;
 }
 
@@ -1988,6 +1996,13 @@ static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx)
 	struct lprocfs_counter ret;
 
 	lprocfs_stats_collect(stats, idx, &ret);
+	if (idx == LU_SS_LRU_LEN)
+		/*
+		 * protect against counter on cpu A being decremented
+		 * before counter is incremented on cpu B; unlikely
+		 */
+		return (__u32)((ret.lc_sum > 0) ? ret.lc_sum : 0);
+
 	return (__u32)ret.lc_count;
 }
 
@@ -2002,7 +2017,7 @@ int lu_site_stats_print(const struct lu_site *s, struct seq_file *m)
 	memset(&stats, 0, sizeof(stats));
 	lu_site_stats_get(s->ls_obj_hash, &stats, 1);
 
-	seq_printf(m, "%d/%d %d/%d %d %d %d %d %d %d %d\n",
+	seq_printf(m, "%d/%d %d/%d %d %d %d %d %d %d %d %d\n",
 		   stats.lss_busy,
 		   stats.lss_total,
 		   stats.lss_populated,
@@ -2013,7 +2028,8 @@ int lu_site_stats_print(const struct lu_site *s, struct seq_file *m)
 		   ls_stats_read(s->ls_stats, LU_SS_CACHE_MISS),
 		   ls_stats_read(s->ls_stats, LU_SS_CACHE_RACE),
 		   ls_stats_read(s->ls_stats, LU_SS_CACHE_DEATH_RACE),
-		   ls_stats_read(s->ls_stats, LU_SS_LRU_PURGED));
+		   ls_stats_read(s->ls_stats, LU_SS_LRU_PURGED),
+		   ls_stats_read(s->ls_stats, LU_SS_LRU_LEN));
 	return 0;
 }
 EXPORT_SYMBOL(lu_site_stats_print);
-- 
2.1.0
