Message-ID: <20140122053248.GX10038@linux.vnet.ibm.com>
Date:	Tue, 21 Jan 2014 21:32:48 -0800
From:	"Paul E. McKenney" <paulmck@...ux.vnet.ibm.com>
To:	Steven Whitehouse <swhiteho@...hat.com>
Cc:	linux-kernel@...r.kernel.org, cluster-devel@...hat.com,
	Abhijith Das <adas@...hat.com>, sasha.levin@...cle.com
Subject: Re: [PATCH 17/24] GFS2: Use RCU/hlist_bl based hash for quotas

On Mon, Jan 20, 2014 at 12:23:40PM +0000, Steven Whitehouse wrote:
> Prior to this patch, GFS2 kept all the quotas for each
> super block in a single linked list. This is rather slow
> when there are large numbers of quotas.
> 
> This patch introduces a hlist_bl based hash table, similar
> to the one used for glocks. The initial look up of the quota
> is now lockless in the case where it is already cached,
> although we still have to take the per quota spinlock in
> order to bump the ref count. Either way though, this is a
> big improvement on what was there before.
> 
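[ Aside for readers following along: the lockless read path described
  above is the usual RCU lookup followed by a conditional reference
  bump.  A minimal sketch of the pattern -- not the exact GFS2 code,
  and lookup_in_bucket() is a stand-in for the real
  gfs2_qd_search_bucket() that appears below:

	rcu_read_lock();
	qd = lookup_in_bucket(hash, qid);	/* lockless hlist_bl walk */
	if (qd && !lockref_get_not_dead(&qd->qd_lockref))
		qd = NULL;		/* raced with teardown; treat as a miss */
	rcu_read_unlock();

  lockref_get_not_dead() is what takes the per-quota lock (or its
  cmpxchg fast path), which is the spinlock referred to above. ]
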
> The qd_lock and the per super block list is preserved, for
> the time being. However it is intended that since this is no
> longer used for its original role, it should be possible to
> shrink the number of items on that list in due course and
> remove the requirement to take qd_lock in qd_get.
> 
> Signed-off-by: Steven Whitehouse <swhiteho@...hat.com>
> Cc: Abhijith Das <adas@...hat.com>
> Cc: Paul E. McKenney <paulmck@...ux.vnet.ibm.com>

Interesting!  I thought that Sasha Levin had a hash table in the works,
but I don't see it, so CCing him.

A few questions and comments below.

							Thanx, Paul

> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index a99f60c..59d99ec 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -428,10 +428,13 @@ enum {
>  };
> 
>  struct gfs2_quota_data {
> +	struct hlist_bl_node qd_hlist;
>  	struct list_head qd_list;
>  	struct kqid qd_id;
> +	struct gfs2_sbd *qd_sbd;
>  	struct lockref qd_lockref;
>  	struct list_head qd_lru;
> +	unsigned qd_hash;
> 
>  	unsigned long qd_flags;		/* QDF_... */
> 
> @@ -450,6 +453,7 @@ struct gfs2_quota_data {
> 
>  	u64 qd_sync_gen;
>  	unsigned long qd_last_warn;
> +	struct rcu_head qd_rcu;
>  };
> 
>  struct gfs2_trans {
> diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
> index 0650db2..c272e73 100644
> --- a/fs/gfs2/main.c
> +++ b/fs/gfs2/main.c
> @@ -76,6 +76,7 @@ static int __init init_gfs2_fs(void)
> 
>  	gfs2_str2qstr(&gfs2_qdot, ".");
>  	gfs2_str2qstr(&gfs2_qdotdot, "..");
> +	gfs2_quota_hash_init();
> 
>  	error = gfs2_sys_init();
>  	if (error)
> diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
> index 1b6b367..a1df01d 100644
> --- a/fs/gfs2/quota.c
> +++ b/fs/gfs2/quota.c
> @@ -52,6 +52,10 @@
>  #include <linux/dqblk_xfs.h>
>  #include <linux/lockref.h>
>  #include <linux/list_lru.h>
> +#include <linux/rcupdate.h>
> +#include <linux/rculist_bl.h>
> +#include <linux/bit_spinlock.h>
> +#include <linux/jhash.h>
> 
>  #include "gfs2.h"
>  #include "incore.h"
> @@ -67,10 +71,43 @@
>  #include "inode.h"
>  #include "util.h"
> 
> -/* Lock order: qd_lock -> qd->lockref.lock -> lru lock */
> +#define GFS2_QD_HASH_SHIFT      12

Should this be a function of the number of CPUs?  (Might not be an issue
if the really big systems don't use GFS2.)
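
Something like the following, perhaps -- an untested sketch only, and
the statically sized qd_hash_table below would then need to become
dynamically allocated at init time:

	/* Illustrative only: scale the bucket count with the CPU count. */
	unsigned int shift = max(GFS2_QD_HASH_SHIFT,
				 ilog2(num_possible_cpus()) + 6);
	unsigned int nbuckets = 1U << shift;

The "+ 6" scaling factor is of course pulled out of thin air.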

> +#define GFS2_QD_HASH_SIZE       (1 << GFS2_QD_HASH_SHIFT)
> +#define GFS2_QD_HASH_MASK       (GFS2_QD_HASH_SIZE - 1)
> +
> +/* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */
>  static DEFINE_SPINLOCK(qd_lock);
>  struct list_lru gfs2_qd_lru;
> 
> +static struct hlist_bl_head qd_hash_table[GFS2_QD_HASH_SIZE];
> +
> +static unsigned int gfs2_qd_hash(const struct gfs2_sbd *sdp,
> +				 const struct kqid qid)
> +{
> +	unsigned int h;
> +
> +	h = jhash(&sdp, sizeof(struct gfs2_sbd *), 0);
> +	h = jhash(&qid, sizeof(struct kqid), h);
> +
> +	return h & GFS2_QD_HASH_MASK;
> +}
> +
> +static inline void spin_lock_bucket(unsigned int hash)
> +{
> +        hlist_bl_lock(&qd_hash_table[hash]);
> +}
> +
> +static inline void spin_unlock_bucket(unsigned int hash)
> +{
> +        hlist_bl_unlock(&qd_hash_table[hash]);
> +}
> +
> +static void gfs2_qd_dealloc(struct rcu_head *rcu)
> +{
> +	struct gfs2_quota_data *qd = container_of(rcu, struct gfs2_quota_data, qd_rcu);
> +	kmem_cache_free(gfs2_quotad_cachep, qd);
> +}
> +
>  static void gfs2_qd_dispose(struct list_head *list)
>  {
>  	struct gfs2_quota_data *qd;
> @@ -87,6 +124,10 @@ static void gfs2_qd_dispose(struct list_head *list)
>  		list_del(&qd->qd_list);
>  		spin_unlock(&qd_lock);
> 
> +		spin_lock_bucket(qd->qd_hash);
> +		hlist_bl_del_rcu(&qd->qd_hlist);
> +		spin_unlock_bucket(qd->qd_hash);
> +

Good, removed from the RCU-traversed list before invoking call_rcu().
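
For anyone less familiar with this pattern, the required ordering in
sketch form, using the code from this patch:

	spin_lock_bucket(qd->qd_hash);
	hlist_bl_del_rcu(&qd->qd_hlist);  /* (1) new readers can't find it */
	spin_unlock_bucket(qd->qd_hash);
	...
	call_rcu(&qd->qd_rcu, gfs2_qd_dealloc); /* (2) wait out old readers */
	/* (3) gfs2_qd_dealloc() frees qd only after a grace period */

Freeing before the hlist_bl_del_rcu(), or calling kmem_cache_free()
directly as the old code did, would let a concurrent lockless reader
walk into freed memory.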

>  		gfs2_assert_warn(sdp, !qd->qd_change);
>  		gfs2_assert_warn(sdp, !qd->qd_slot_count);
>  		gfs2_assert_warn(sdp, !qd->qd_bh_count);
> @@ -95,7 +136,7 @@ static void gfs2_qd_dispose(struct list_head *list)
>  		atomic_dec(&sdp->sd_quota_count);
> 
>  		/* Delete it from the common reclaim list */
> -		kmem_cache_free(gfs2_quotad_cachep, qd);
> +		call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
>  	}
>  }
> 
> @@ -165,83 +206,95 @@ static u64 qd2offset(struct gfs2_quota_data *qd)
>  	return offset;
>  }
> 
> -static int qd_alloc(struct gfs2_sbd *sdp, struct kqid qid,
> -		    struct gfs2_quota_data **qdp)
> +static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, struct kqid qid)
>  {
>  	struct gfs2_quota_data *qd;
>  	int error;
> 
>  	qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS);
>  	if (!qd)
> -		return -ENOMEM;
> +		return NULL;
> 
> +	qd->qd_sbd = sdp;
>  	qd->qd_lockref.count = 1;
>  	spin_lock_init(&qd->qd_lockref.lock);
>  	qd->qd_id = qid;
>  	qd->qd_slot = -1;
>  	INIT_LIST_HEAD(&qd->qd_lru);
> +	qd->qd_hash = hash;
> 
>  	error = gfs2_glock_get(sdp, qd2index(qd),
>  			      &gfs2_quota_glops, CREATE, &qd->qd_gl);
>  	if (error)
>  		goto fail;
> 
> -	*qdp = qd;
> -
> -	return 0;
> +	return qd;
> 
>  fail:
>  	kmem_cache_free(gfs2_quotad_cachep, qd);
> -	return error;
> +	return NULL;
>  }
> 
> -static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
> -		  struct gfs2_quota_data **qdp)
> +static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash,
> +						     const struct gfs2_sbd *sdp,
> +						     struct kqid qid)
>  {
> -	struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
> -	int error, found;
> -
> -	*qdp = NULL;
> +	struct gfs2_quota_data *qd;
> +	struct hlist_bl_node *h;
> 
> -	for (;;) {
> -		found = 0;
> -		spin_lock(&qd_lock);
> -		list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
> -			if (qid_eq(qd->qd_id, qid) &&
> -			    lockref_get_not_dead(&qd->qd_lockref)) {
> -				list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
> -				found = 1;
> -				break;
> -			}
> +	hlist_bl_for_each_entry_rcu(qd, h, &qd_hash_table[hash], qd_hlist) {

All callers of gfs2_qd_search_bucket() either hold the update-side lock
(acquired via spin_lock_bucket()) or have invoked rcu_read_lock(), so
this is good.
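
In other words, both calling patterns that appear in this patch are
safe:

	/* Read side, as in qd_get()'s fast path: */
	rcu_read_lock();
	qd = gfs2_qd_search_bucket(hash, sdp, qid);
	rcu_read_unlock();

	/* Update side, as in qd_get()'s insertion path: */
	spin_lock_bucket(hash);
	qd = gfs2_qd_search_bucket(hash, sdp, qid);
	...
	spin_unlock_bucket(hash);

Holding the bucket's bit-spinlock excludes all concurrent updates to
that chain, so the traversal cannot race with a deletion even without
rcu_read_lock().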

> +		if (!qid_eq(qd->qd_id, qid))
> +			continue;
> +		if (qd->qd_sbd != sdp)
> +			continue;
> +		if (lockref_get_not_dead(&qd->qd_lockref)) {
> +			list_lru_del(&gfs2_qd_lru, &qd->qd_lru);

list_lru_del() does acquire a lock, but it comes from a per-node array
whose size depends on the NODES_SHIFT Kconfig variable.  That array
appears to range from 8 up to 1024 entries, depending on the rest of
the configuration, so contention should be acceptable.

> +			return qd;
>  		}
> +	}
> 
> -		if (!found)
> -			qd = NULL;
> +	return NULL;
> +}
> 
> -		if (!qd && new_qd) {
> -			qd = new_qd;
> -			list_add(&qd->qd_list, &sdp->sd_quota_list);
> -			atomic_inc(&sdp->sd_quota_count);
> -			new_qd = NULL;
> -		}
> 
> -		spin_unlock(&qd_lock);
> +static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
> +		  struct gfs2_quota_data **qdp)
> +{
> +	struct gfs2_quota_data *qd, *new_qd;
> +	unsigned int hash = gfs2_qd_hash(sdp, qid);
> 
> -		if (qd) {
> -			if (new_qd) {
> -				gfs2_glock_put(new_qd->qd_gl);
> -				kmem_cache_free(gfs2_quotad_cachep, new_qd);
> -			}
> -			*qdp = qd;
> -			return 0;
> -		}
> +	rcu_read_lock();
> +	*qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
> +	rcu_read_unlock();
> 
> -		error = qd_alloc(sdp, qid, &new_qd);
> -		if (error)
> -			return error;
> +	if (qd)
> +		return 0;
> +
> +	new_qd = qd_alloc(hash, sdp, qid);
> +	if (!new_qd)
> +		return -ENOMEM;
> +
> +	spin_lock(&qd_lock);
> +	spin_lock_bucket(hash);
> +	*qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
> +	if (qd == NULL) {
> +		*qdp = new_qd;
> +		list_add(&new_qd->qd_list, &sdp->sd_quota_list);
> +		hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]);
> +		atomic_inc(&sdp->sd_quota_count);
> +	}
> +	spin_unlock_bucket(hash);
> +	spin_unlock(&qd_lock);
> +
> +	if (qd) {
> +		gfs2_glock_put(new_qd->qd_gl);
> +		kmem_cache_free(gfs2_quotad_cachep, new_qd);
>  	}
> +
> +	return 0;
>  }
> 
> +
>  static void qd_hold(struct gfs2_quota_data *qd)
>  {
>  	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
> @@ -1215,6 +1268,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
>  	unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift;
>  	unsigned int x, slot = 0;
>  	unsigned int found = 0;
> +	unsigned int hash;
>  	u64 dblock;
>  	u32 extlen = 0;
>  	int error;
> @@ -1272,8 +1326,9 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
>  			if (!qc_change)
>  				continue;
> 
> -			error = qd_alloc(sdp, qc_id, &qd);
> -			if (error) {
> +			hash = gfs2_qd_hash(sdp, qc_id);
> +			qd = qd_alloc(hash, sdp, qc_id);
> +			if (qd == NULL) {
>  				brelse(bh);
>  				goto fail;
>  			}
> @@ -1289,6 +1344,10 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
>  			atomic_inc(&sdp->sd_quota_count);
>  			spin_unlock(&qd_lock);
> 
> +			spin_lock_bucket(hash);
> +			hlist_bl_add_head_rcu(&qd->qd_hlist, &qd_hash_table[hash]);
> +			spin_unlock_bucket(hash);
> +
>  			found++;
>  		}
> 
> @@ -1335,11 +1394,16 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
>  		spin_unlock(&qd->qd_lockref.lock);
> 
>  		list_del(&qd->qd_list);
> +
>  		/* Also remove if this qd exists in the reclaim list */
>  		list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
>  		atomic_dec(&sdp->sd_quota_count);
>  		spin_unlock(&qd_lock);
> 
> +		spin_lock_bucket(qd->qd_hash);
> +		hlist_bl_del_rcu(&qd->qd_hlist);

Might just be my unfamiliarity with this code, but it took me a bit
to see the difference between ->qd_hlist and ->qd_list.  Of course, until
I spotted the difference, I was wondering why you were removing the
item twice.  ;-)

> +		spin_unlock_bucket(qd->qd_hash);
> +
>  		if (!qd->qd_lockref.count) {
>  			gfs2_assert_warn(sdp, !qd->qd_change);
>  			gfs2_assert_warn(sdp, !qd->qd_slot_count);
> @@ -1348,7 +1412,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
>  		gfs2_assert_warn(sdp, !qd->qd_bh_count);
> 
>  		gfs2_glock_put(qd->qd_gl);
> -		kmem_cache_free(gfs2_quotad_cachep, qd);
> +		call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
> 
>  		spin_lock(&qd_lock);
>  	}
> @@ -1643,3 +1707,11 @@ const struct quotactl_ops gfs2_quotactl_ops = {
>  	.get_dqblk	= gfs2_get_dqblk,
>  	.set_dqblk	= gfs2_set_dqblk,
>  };
> +
> +void __init gfs2_quota_hash_init(void)
> +{
> +	unsigned i;
> +
> +	for(i = 0; i < GFS2_QD_HASH_SIZE; i++)
> +		INIT_HLIST_BL_HEAD(&qd_hash_table[i]);
> +}
> diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
> index 96e4f34..55d506e 100644
> --- a/fs/gfs2/quota.h
> +++ b/fs/gfs2/quota.h
> @@ -57,5 +57,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
>  extern const struct quotactl_ops gfs2_quotactl_ops;
>  extern struct shrinker gfs2_qd_shrinker;
>  extern struct list_lru gfs2_qd_lru;
> +extern void __init gfs2_quota_hash_init(void);
> 
>  #endif /* __QUOTA_DOT_H__ */
> -- 
> 1.8.3.1
> 
