lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <r2k87f94c371004201421se796c0ekbae28495e62a9eb7@mail.gmail.com>
Date:	Tue, 20 Apr 2010 17:21:03 -0400
From:	Greg Freemyer <greg.freemyer@...il.com>
To:	Lukas Czerner <lczerner@...hat.com>
Cc:	linux-ext4@...r.kernel.org, Jeff Moyer <jmoyer@...hat.com>,
	Edward Shishkin <eshishki@...hat.com>,
	Eric Sandeen <esandeen@...hat.com>,
	Ric Wheeler <rwheeler@...hat.com>,
	Mark Lord <kernel@...savvy.com>
Subject: Re: [PATCH 2/2] Add batched discard support for ext4.

Mark,

This is the patch implementing the new discard logic.

You did the benchmarking last year, but I thought you found calling
trim one contiguous sector range at a time was too inefficient.

See my highlight below:

On Mon, Apr 19, 2010 at 6:55 AM, Lukas Czerner <lczerner@...hat.com> wrote:
> Create an ioctl which walks through all the free extents in each
> allocating group and discard those extents. As an addition to improve
> its performance one can specify minimum free extent length, so ioctl
> will not bother with shorter extents.
>
> This of course means, that with each invocation the ioctl must walk
> through whole file system, checking and discarding free extents, which
> is not very efficient. The best way to avoid this is to keep track of
> deleted (freed) blocks. Then the ioctl have to trim just those free
> extents which were recently freed.
>
> In order to implement this I have added new bitmap into ext4_group_info
> (bb_bitmap_deleted) which stores recently freed blocks. The ioctl then
> walk through bb_bitmap_deleted, compare deleted extents with free
> extents trim them and then removes it from the bb_bitmap_deleted.
>
> But you may notice, that there is one problem. bb_bitmap_deleted does
> not survive umount. To bypass the problem the first ioctl call have to
> walk through whole file system trimming all free extents.
>
> Signed-off-by: Lukas Czerner <lczerner@...hat.com>
> ---
>  fs/ext4/ext4.h    |    4 +
>  fs/ext4/mballoc.c |  207 ++++++++++++++++++++++++++++++++++++++++++++++++++---
>  fs/ext4/super.c   |    1 +
>  3 files changed, 202 insertions(+), 10 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index bf938cf..e25f672 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1437,6 +1437,8 @@ extern int ext4_mb_add_groupinfo(struct super_block *sb,
>  extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
>  extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
>                                                ext4_group_t, int);
> +extern int ext4_trim_fs(unsigned int, struct super_block *);
> +
>  /* inode.c */
>  struct buffer_head *ext4_getblk(handle_t *, struct inode *,
>                                                ext4_lblk_t, int, int *);
> @@ -1682,6 +1684,8 @@ struct ext4_group_info {
>  #ifdef DOUBLE_CHECK
>        void            *bb_bitmap;
>  #endif
> +       void            *bb_bitmap_deleted;
> +       ext4_grpblk_t   bb_deleted;
>        struct rw_semaphore alloc_sem;
>        ext4_grpblk_t   bb_counters[];  /* Nr of free power-of-two-block
>                                         * regions, index is order.
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index bde9d0b..fbc83fe 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2255,6 +2255,9 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
>        INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
>        init_rwsem(&meta_group_info[i]->alloc_sem);
>        meta_group_info[i]->bb_free_root = RB_ROOT;
> +       meta_group_info[i]->bb_deleted = -1;
> +
> +
>
>  #ifdef DOUBLE_CHECK
>        {
> @@ -2469,6 +2472,7 @@ int ext4_mb_release(struct super_block *sb)
>  #ifdef DOUBLE_CHECK
>                        kfree(grinfo->bb_bitmap);
>  #endif
> +                       kfree(grinfo->bb_bitmap_deleted);
>                        ext4_lock_group(sb, i);
>                        ext4_mb_cleanup_pa(grinfo);
>                        ext4_unlock_group(sb, i);
> @@ -2528,6 +2532,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
>        int err, count = 0, count2 = 0;
>        struct ext4_free_data *entry;
>        struct list_head *l, *ltmp;
> +       void *bitmap_deleted = NULL;
>
>        list_for_each_safe(l, ltmp, &txn->t_private_list) {
>                entry = list_entry(l, struct ext4_free_data, list);
> @@ -2543,6 +2548,14 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
>                /* there are blocks to put in buddy to make them really free */
>                count += entry->count;
>                count2++;
> +
> +               if (test_opt(sb, DISCARD) &&
> +                       (db->bb_bitmap_deleted == NULL) &&
> +                       (db->bb_deleted >= 0)) {
> +                       bitmap_deleted =
> +                               kmalloc(sb->s_blocksize, GFP_KERNEL);
> +               }
> +
>                ext4_lock_group(sb, entry->group);
>                /* Take it out of per group rb tree */
>                rb_erase(&entry->node, &(db->bb_free_root));
> @@ -2555,17 +2568,24 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
>                        page_cache_release(e4b.bd_buddy_page);
>                        page_cache_release(e4b.bd_bitmap_page);
>                }
> -               ext4_unlock_group(sb, entry->group);
> -               if (test_opt(sb, DISCARD)) {
> -                       ext4_fsblk_t discard_block;
> -
> -                       discard_block = entry->start_blk +
> -                               ext4_group_first_block_no(sb, entry->group);
> -                       trace_ext4_discard_blocks(sb,
> -                                       (unsigned long long)discard_block,
> -                                       entry->count);
> -                       sb_issue_discard(sb, discard_block, entry->count);
> +               if (test_opt(sb, DISCARD) && (db->bb_deleted >= 0)) {
> +                       if (db->bb_bitmap_deleted == NULL) {
> +                               db->bb_bitmap_deleted = bitmap_deleted;
> +                               BUG_ON(db->bb_bitmap_deleted == NULL);
> +
> +                               bitmap_deleted = NULL;
> +                               mb_clear_bits(db->bb_bitmap_deleted,
> +                                       0, EXT4_BLOCKS_PER_GROUP(sb));
> +                       }
> +
> +                       db->bb_deleted += entry->count;
> +                       mb_set_bits(db->bb_bitmap_deleted, entry->start_blk,
> +                                entry->count);
>                }
> +               ext4_unlock_group(sb, entry->group);
> +
> +               kfree(bitmap_deleted);
> +
>                kmem_cache_free(ext4_free_ext_cachep, entry);
>                ext4_mb_release_desc(&e4b);
>        }
> @@ -4639,3 +4659,170 @@ error_return:
>                kmem_cache_free(ext4_ac_cachep, ac);
>        return;
>  }
> +
> +/* Trim "count" blocks starting at "start" in "group"
> + * This must be called under group lock
> + */
> +void ext4_trim_extent(struct super_block *sb, int start, int count,
> +               ext4_group_t group, struct ext4_buddy *e4b)
> +{
> +       ext4_fsblk_t discard_block;
> +       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
> +       struct ext4_free_extent ex;
> +
> +       assert_spin_locked(ext4_group_lock_ptr(sb, group));
> +
> +       ex.fe_start = start;
> +       ex.fe_group = group;
> +       ex.fe_len = count;
> +
> +       mb_mark_used(e4b, &ex);
> +       ext4_unlock_group(sb, group);
> +
> +       discard_block = (ext4_fsblk_t)group *
> +                       EXT4_BLOCKS_PER_GROUP(sb)
> +                       + start
> +                       + le32_to_cpu(es->s_first_data_block);
> +       trace_ext4_discard_blocks(sb,
> +                       (unsigned long long)discard_block,
> +                       count);
> +       sb_issue_discard(sb, discard_block, count);
> +
> +       ext4_lock_group(sb, group);
> +       mb_free_blocks(NULL, e4b, start, ex.fe_len);
> +}

Mark, unless I'm missing something, sb_issue_discard() above is going
to trigger a trim command for just the one range.  I thought the
benchmarks you did showed that a collection of ranges needed to be
built, then a single trim command invoked that trimmed that group of
ranges.

Greg

> +
> +/* Trim all free blocks, which have at least minlen length */
> +ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
> +               ext4_grpblk_t minblocks)
> +{
> +       void *bitmap;
> +       ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
> +       ext4_grpblk_t start, next, count = 0;
> +       ext4_group_t group;
> +
> +       BUG_ON(e4b == NULL);
> +
> +       bitmap = e4b->bd_bitmap;
> +       group = e4b->bd_group;
> +       start = 0;
> +       ext4_lock_group(sb, group);
> +
> +       while (start < max) {
> +
> +               start = mb_find_next_zero_bit(bitmap, max, start);
> +               if (start >= max)
> +                       break;
> +               next = mb_find_next_bit(bitmap, max, start);
> +
> +               if ((next - start) >= minblocks) {
> +
> +                       count += next - start;
> +                       ext4_trim_extent(sb, start,
> +                               next - start, group, e4b);
> +
> +               }
> +               start = next + 1;
> +       }
> +
> +       e4b->bd_info->bb_deleted = 0;
> +       ext4_unlock_group(sb, group);
> +
> +       return count;
> +}
> +
> +/* Trim only blocks which is free and in bb_bitmap_deleted */
> +ext4_grpblk_t ext4_trim_deleted(struct super_block *sb, struct ext4_buddy *e4b,
> +               ext4_grpblk_t minblocks)
> +{
> +       void *bitmap;
> +       struct ext4_group_info *grp;
> +       ext4_group_t group;
> +       ext4_grpblk_t max, next, count = 0, start = 0;
> +
> +       BUG_ON(e4b == NULL);
> +
> +       bitmap = e4b->bd_bitmap;
> +       group = e4b->bd_group;
> +       grp = ext4_get_group_info(sb, group);
> +
> +       if (grp->bb_deleted < minblocks)
> +               return 0;
> +
> +       ext4_lock_group(sb, group);
> +
> +       while (start < EXT4_BLOCKS_PER_GROUP(sb)) {
> +               start = mb_find_next_bit(grp->bb_bitmap_deleted,
> +                       EXT4_BLOCKS_PER_GROUP(sb), start);
> +               max = mb_find_next_zero_bit(grp->bb_bitmap_deleted,
> +                       EXT4_BLOCKS_PER_GROUP(sb), start);
> +
> +               while (start < max) {
> +                       start = mb_find_next_zero_bit(bitmap, max, start);
> +                       if (start >= max)
> +                               break;
> +                       next = mb_find_next_bit(bitmap, max, start);
> +                       if (next > max)
> +                               next = max;
> +
> +                       if ((next - start) >= minblocks) {
> +                               count += next - start;
> +
> +                               ext4_trim_extent(sb, start,
> +                                       next - start, group, e4b);
> +
> +                               mb_clear_bits(grp->bb_bitmap_deleted,
> +                                       start, next - start);
> +                       }
> +                       start = next + 1;
> +               }
> +       }
> +       grp->bb_deleted -= count;
> +
> +       ext4_unlock_group(sb, group);
> +
> +       ext4_debug("trimmed %d blocks in the group %d\n",
> +               count, group);
> +
> +       return count;
> +}
> +
> +int ext4_trim_fs(unsigned int minlen, struct super_block *sb)
> +{
> +       struct ext4_buddy e4b;
> +       struct ext4_group_info *grp;
> +       ext4_group_t group;
> +       ext4_group_t ngroups = ext4_get_groups_count(sb);
> +       ext4_grpblk_t minblocks;
> +
> +       if (!test_opt(sb, DISCARD))
> +               return 0;
> +
> +       minblocks = DIV_ROUND_UP(minlen, sb->s_blocksize);
> +       if (unlikely(minblocks > EXT4_BLOCKS_PER_GROUP(sb)))
> +               return -EINVAL;
> +
> +       for (group = 0; group < ngroups; group++) {
> +               int err;
> +
> +               err = ext4_mb_load_buddy(sb, group, &e4b);
> +               if (err) {
> +                       ext4_error(sb, "Error in loading buddy "
> +                                       "information for %u", group);
> +                       continue;
> +               }
> +               grp = ext4_get_group_info(sb, group);
> +
> +               if (grp->bb_deleted < 0) {
> +                       /* First run after mount */
> +                       ext4_trim_all_free(sb, &e4b, minblocks);
> +               } else if (grp->bb_deleted >= minblocks) {
> +                       /* Trim only blocks deleted since first run */
> +                       ext4_trim_deleted(sb, &e4b, minblocks);
> +               }
> +
> +               ext4_mb_release_desc(&e4b);
> +       }
> +
> +       return 0;
> +}
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index e14d22c..253eb98 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1109,6 +1109,7 @@ static const struct super_operations ext4_sops = {
>        .quota_write    = ext4_quota_write,
>  #endif
>        .bdev_try_to_free_page = bdev_try_to_free_page,
> +       .trim_fs        = ext4_trim_fs
>  };
>
>  static const struct super_operations ext4_nojournal_sops = {
> --
> 1.6.6.1
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ