[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <4F9993BF.7020104@redhat.com>
Date: Thu, 26 Apr 2012 13:28:15 -0500
From: Eric Sandeen <sandeen@...hat.com>
To: "J. Bruce Fields" <bfields@...hat.com>
CC: ext4 development <linux-ext4@...r.kernel.org>,
Jan Kara <jack@...e.cz>,
Bernd Schubert <bernd.schubert@...m.fraunhofer.de>
Subject: Re: ext3: return 32/64-bit dir name hash according to usage type
On 4/26/12 1:26 PM, J. Bruce Fields wrote:
> How are you testing this?
Basically as was suggested in the first ext4 patch series:
I created a filesystem with 600,000 files in a dir:
for x in $(seq 1 600000); do touch $x; done
then exported and mounted that fs to localhost:
mount -t nfs localhost:/mnt/export /mnt/nfs
and then looked for dups:
ls -l | dup -d
If you have more .... substantial nfs testing I could do I'm all ears :)
-Eric
> --b.
>
> On Thu, Apr 26, 2012 at 01:10:39PM -0500, Eric Sandeen wrote:
>> This is based on commit d1f5273e9adb40724a85272f248f210dc4ce919a
>> ext4: return 32/64-bit dir name hash according to usage type
>> by Fan Yong <yong.fan@...mcloud.com>
>>
>> Traditionally ext2/3/4 has returned a 32-bit hash value from llseek()
>> to appease NFSv2, which can only handle a 32-bit cookie for seekdir()
>> and telldir(). However, this causes problems if there are 32-bit hash
>> collisions, since the NFSv2 server can get stuck resending the same
>> entries from the directory repeatedly.
>>
>> Allow ext3 to return a full 64-bit hash (both major and minor) for
>> telldir to decrease the chance of hash collisions.
>>
>> This patch does implement a new ext3_dir_llseek op, because with 64-bit
>> hashes, nfs will attempt to seek to a hash "offset" which is much
>> larger than ext3's s_maxbytes. So for dx dirs, we call
>> generic_file_llseek_size() with the appropriate max hash value as the
>> maximum seekable size. Otherwise we just pass through to
>> generic_file_llseek().
>>
>> Patch-updated-by: Bernd Schubert <bernd.schubert@...m.fraunhofer.de>
>> Patch-updated-by: Eric Sandeen <sandeen@...hat.com>
>> (blame us if something is not correct)
>>
>> Signed-off-by: Eric Sandeen <sandeen@...hat.com>
>> ---
>>
>> diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
>> index cc761ad..92490e9 100644
>> --- a/fs/ext3/dir.c
>> +++ b/fs/ext3/dir.c
>> @@ -21,30 +21,15 @@
>> *
>> */
>>
>> +#include <linux/compat.h>
>> #include "ext3.h"
>>
>> static unsigned char ext3_filetype_table[] = {
>> DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
>> };
>>
>> -static int ext3_readdir(struct file *, void *, filldir_t);
>> static int ext3_dx_readdir(struct file * filp,
>> void * dirent, filldir_t filldir);
>> -static int ext3_release_dir (struct inode * inode,
>> - struct file * filp);
>> -
>> -const struct file_operations ext3_dir_operations = {
>> - .llseek = generic_file_llseek,
>> - .read = generic_read_dir,
>> - .readdir = ext3_readdir, /* we take BKL. needed?*/
>> - .unlocked_ioctl = ext3_ioctl,
>> -#ifdef CONFIG_COMPAT
>> - .compat_ioctl = ext3_compat_ioctl,
>> -#endif
>> - .fsync = ext3_sync_file, /* BKL held */
>> - .release = ext3_release_dir,
>> -};
>> -
>>
>> static unsigned char get_dtype(struct super_block *sb, int filetype)
>> {
>> @@ -55,6 +40,25 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
>> return (ext3_filetype_table[filetype]);
>> }
>>
>> +/**
>> + * Check if the given dir-inode refers to an htree-indexed directory
>> + * (or a directory which chould potentially get coverted to use htree
>> + * indexing).
>> + *
>> + * Return 1 if it is a dx dir, 0 if not
>> + */
>> +static int is_dx_dir(struct inode *inode)
>> +{
>> + struct super_block *sb = inode->i_sb;
>> +
>> + if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
>> + EXT3_FEATURE_COMPAT_DIR_INDEX) &&
>> + ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
>> + ((inode->i_size >> sb->s_blocksize_bits) == 1)))
>> + return 1;
>> +
>> + return 0;
>> +}
>>
>> int ext3_check_dir_entry (const char * function, struct inode * dir,
>> struct ext3_dir_entry_2 * de,
>> @@ -94,18 +98,13 @@ static int ext3_readdir(struct file * filp,
>> unsigned long offset;
>> int i, stored;
>> struct ext3_dir_entry_2 *de;
>> - struct super_block *sb;
>> int err;
>> struct inode *inode = filp->f_path.dentry->d_inode;
>> + struct super_block *sb = inode->i_sb;
>> int ret = 0;
>> int dir_has_error = 0;
>>
>> - sb = inode->i_sb;
>> -
>> - if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
>> - EXT3_FEATURE_COMPAT_DIR_INDEX) &&
>> - ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
>> - ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
>> + if (is_dx_dir(inode)) {
>> err = ext3_dx_readdir(filp, dirent, filldir);
>> if (err != ERR_BAD_DX_DIR) {
>> ret = err;
>> @@ -227,22 +226,87 @@ out:
>> return ret;
>> }
>>
>> +static inline int is_32bit_api(void)
>> +{
>> +#ifdef CONFIG_COMPAT
>> + return is_compat_task();
>> +#else
>> + return (BITS_PER_LONG == 32);
>> +#endif
>> +}
>> +
>> /*
>> * These functions convert from the major/minor hash to an f_pos
>> - * value.
>> + * value for dx directories
>> *
>> - * Currently we only use major hash numer. This is unfortunate, but
>> - * on 32-bit machines, the same VFS interface is used for lseek and
>> - * llseek, so if we use the 64 bit offset, then the 32-bit versions of
>> - * lseek/telldir/seekdir will blow out spectacularly, and from within
>> - * the ext2 low-level routine, we don't know if we're being called by
>> - * a 64-bit version of the system call or the 32-bit version of the
>> - * system call. Worse yet, NFSv2 only allows for a 32-bit readdir
>> - * cookie. Sigh.
>> + * Upper layer (for example NFS) should specify FMODE_32BITHASH or
>> + * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted
>> + * directly on both 32-bit and 64-bit nodes, under such case, neither
>> + * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
>> */
>> -#define hash2pos(major, minor) (major >> 1)
>> -#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff)
>> -#define pos2min_hash(pos) (0)
>> +static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
>> +{
>> + if ((filp->f_mode & FMODE_32BITHASH) ||
>> + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
>> + return major >> 1;
>> + else
>> + return ((__u64)(major >> 1) << 32) | (__u64)minor;
>> +}
>> +
>> +static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
>> +{
>> + if ((filp->f_mode & FMODE_32BITHASH) ||
>> + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
>> + return (pos << 1) & 0xffffffff;
>> + else
>> + return ((pos >> 32) << 1) & 0xffffffff;
>> +}
>> +
>> +static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
>> +{
>> + if ((filp->f_mode & FMODE_32BITHASH) ||
>> + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
>> + return 0;
>> + else
>> + return pos & 0xffffffff;
>> +}
>> +
>> +/*
>> + * Return 32- or 64-bit end-of-file for dx directories
>> + */
>> +static inline loff_t ext3_get_htree_eof(struct file *filp)
>> +{
>> + if ((filp->f_mode & FMODE_32BITHASH) ||
>> + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
>> + return EXT3_HTREE_EOF_32BIT;
>> + else
>> + return EXT3_HTREE_EOF_64BIT;
>> +}
>> +
>> +
>> +/*
>> + * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both
>> + * non-htree and htree directories, where the "offset" is in terms
>> + * of the filename hash value instead of the byte offset.
>> + *
>> + * Because we may return a 64-bit hash that is well beyond s_maxbytes,
>> + * we need to pass the max hash as the maximum allowable offset in
>> + * the htree directory case.
>> + *
>> + * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX)
>> + * will be invalid once the directory was converted into a dx directory
>> + */
>> +loff_t ext3_dir_llseek(struct file *file, loff_t offset, int origin)
>> +{
>> + struct inode *inode = file->f_mapping->host;
>> + int dx_dir = is_dx_dir(inode);
>> +
>> + if (likely(dx_dir))
>> + return generic_file_llseek_size(file, offset, origin,
>> + ext3_get_htree_eof(file));
>> + else
>> + return generic_file_llseek(file, offset, origin);
>> +}
>>
>> /*
>> * This structure holds the nodes of the red-black tree used to store
>> @@ -303,15 +367,16 @@ static void free_rb_tree_fname(struct rb_root *root)
>> }
>>
>>
>> -static struct dir_private_info *ext3_htree_create_dir_info(loff_t pos)
>> +static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp,
>> + loff_t pos)
>> {
>> struct dir_private_info *p;
>>
>> p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
>> if (!p)
>> return NULL;
>> - p->curr_hash = pos2maj_hash(pos);
>> - p->curr_minor_hash = pos2min_hash(pos);
>> + p->curr_hash = pos2maj_hash(filp, pos);
>> + p->curr_minor_hash = pos2min_hash(filp, pos);
>> return p;
>> }
>>
>> @@ -401,7 +466,7 @@ static int call_filldir(struct file * filp, void * dirent,
>> printk("call_filldir: called with null fname?!?\n");
>> return 0;
>> }
>> - curr_pos = hash2pos(fname->hash, fname->minor_hash);
>> + curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
>> while (fname) {
>> error = filldir(dirent, fname->name,
>> fname->name_len, curr_pos,
>> @@ -426,13 +491,13 @@ static int ext3_dx_readdir(struct file * filp,
>> int ret;
>>
>> if (!info) {
>> - info = ext3_htree_create_dir_info(filp->f_pos);
>> + info = ext3_htree_create_dir_info(filp, filp->f_pos);
>> if (!info)
>> return -ENOMEM;
>> filp->private_data = info;
>> }
>>
>> - if (filp->f_pos == EXT3_HTREE_EOF)
>> + if (filp->f_pos == ext3_get_htree_eof(filp))
>> return 0; /* EOF */
>>
>> /* Some one has messed with f_pos; reset the world */
>> @@ -440,8 +505,8 @@ static int ext3_dx_readdir(struct file * filp,
>> free_rb_tree_fname(&info->root);
>> info->curr_node = NULL;
>> info->extra_fname = NULL;
>> - info->curr_hash = pos2maj_hash(filp->f_pos);
>> - info->curr_minor_hash = pos2min_hash(filp->f_pos);
>> + info->curr_hash = pos2maj_hash(filp, filp->f_pos);
>> + info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
>> }
>>
>> /*
>> @@ -473,7 +538,7 @@ static int ext3_dx_readdir(struct file * filp,
>> if (ret < 0)
>> return ret;
>> if (ret == 0) {
>> - filp->f_pos = EXT3_HTREE_EOF;
>> + filp->f_pos = ext3_get_htree_eof(filp);
>> break;
>> }
>> info->curr_node = rb_first(&info->root);
>> @@ -493,7 +558,7 @@ static int ext3_dx_readdir(struct file * filp,
>> info->curr_minor_hash = fname->minor_hash;
>> } else {
>> if (info->next_hash == ~0) {
>> - filp->f_pos = EXT3_HTREE_EOF;
>> + filp->f_pos = ext3_get_htree_eof(filp);
>> break;
>> }
>> info->curr_hash = info->next_hash;
>> @@ -512,3 +577,15 @@ static int ext3_release_dir (struct inode * inode, struct file * filp)
>>
>> return 0;
>> }
>> +
>> +const struct file_operations ext3_dir_operations = {
>> + .llseek = ext3_dir_llseek,
>> + .read = generic_read_dir,
>> + .readdir = ext3_readdir,
>> + .unlocked_ioctl = ext3_ioctl,
>> +#ifdef CONFIG_COMPAT
>> + .compat_ioctl = ext3_compat_ioctl,
>> +#endif
>> + .fsync = ext3_sync_file,
>> + .release = ext3_release_dir,
>> +};
>> diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h
>> index b6515fd..fe5bef7 100644
>> --- a/fs/ext3/ext3.h
>> +++ b/fs/ext3/ext3.h
>> @@ -920,7 +920,11 @@ struct dx_hash_info
>> u32 *seed;
>> };
>>
>> -#define EXT3_HTREE_EOF 0x7fffffff
>> +
>> +/* 32 and 64 bit signed EOF for dx directories */
>> +#define EXT3_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1)
>> +#define EXT3_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1)
>> +
>>
>> /*
>> * Control parameters used by ext3_htree_next_block
>> diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
>> index d10231d..ede315c 100644
>> --- a/fs/ext3/hash.c
>> +++ b/fs/ext3/hash.c
>> @@ -198,8 +198,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
>> return -1;
>> }
>> hash = hash & ~1;
>> - if (hash == (EXT3_HTREE_EOF << 1))
>> - hash = (EXT3_HTREE_EOF-1) << 1;
>> + if (hash == (EXT3_HTREE_EOF_32BIT << 1))
>> + hash = (EXT3_HTREE_EOF_32BIT - 1) << 1;
>> hinfo->hash = hash;
>> hinfo->minor_hash = minor_hash;
>> return 0;
>>
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists