commit 9c69b141d8815d6ecba409c2ac119dd4d6f1ef76 Author: Ojaswin Mujoo Date: Wed Jan 18 17:14:49 2023 +0530 ext4: Retry rbtree walk after deleteing PA This commit has use after free issue and possibly other issues. Just an initial draft Signed-off-by: Ojaswin Mujoo diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 273a98bcaa0d..66a9ef416d00 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4073,6 +4073,7 @@ ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, new_end = *end; /* check we don't cross already preallocated blocks */ +retry: read_lock(&ei->i_prealloc_lock); for (iter = ei->i_prealloc_node.rb_node; iter; iter = ext4_mb_pa_rb_next_iter(new_start, tmp_pa_start, iter)) { @@ -4089,12 +4090,34 @@ ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, if (is_overlap) { /* * Normalized range overlaps with range of this - * deleted PA, that means we might need it in - * near future. Since the PA is yet to be - * removed from inode PA tree and freed, lets - * just undelete it. + * deleted PA. It can be tricky to decide how to + * descend in the tree so it's easier to just + * delete the PA from rbtree and retry the + * operation. Since such a case is rare and + * usually only happens when FS is under high + * utilization, the performance hit due to retry + * should be minimal. */ - ext4_mb_mark_pa_inuse(ac->ac_sb, tmp_pa); + spin_unlock(&tmp_pa->pa_lock); + read_unlock(&ei->i_prealloc_lock); + /* + * !!BUG!! As we giveup the tree and PA locks, + * it might be possible that the group discard + * path has already removed this PA from the + * tree or the pa has already been freed. This + * results in use after free when acquiring + * pa_lock below. + */ + write_lock(&ei->i_prealloc_lock); + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_removed == 0) { + rb_erase(&tmp_pa->pa_node.inode_node, + &ei->i_prealloc_node); + tmp_pa->pa_removed = 1; + } + spin_unlock(&tmp_pa->pa_lock); + write_unlock(&ei->i_prealloc_lock); + goto retry; } else { spin_unlock(&tmp_pa->pa_lock); continue; @@ -4493,15 +4516,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* found preallocated blocks, use them */ spin_lock(&tmp_pa->pa_lock); - if (tmp_pa->pa_free) { - if (tmp_pa->pa_deleted == 1) { - /* - * Since PA is yet to be deleted from inode PA - * rbtree, just undelete it and use it. - */ - ext4_mb_mark_pa_inuse(ac->ac_sb, tmp_pa); - } - + if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free) { atomic_inc(&tmp_pa->pa_count); ext4_mb_use_inode_pa(ac, tmp_pa); spin_unlock(&tmp_pa->pa_lock); @@ -5061,33 +5076,15 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, list_del_rcu(&pa->pa_node.lg_list); spin_unlock(pa->pa_node_lock.lg_lock); } else { - /* - * The allocation path might undelete a PA - * incase it expects it to be used again in near - * future. In that case, rollback the ongoing delete - * operation and don't remove the PA from inode PA - * tree. - * - * TODO: See if we need pa_lock since there might no - * path that contends with it once the rbtree writelock - * is taken. - */ write_lock(pa->pa_node_lock.inode_lock); spin_lock(&pa->pa_lock); - if (pa->pa_deleted == 0) { - free -= pa->pa_free; - list_add(&pa->pa_group_list, - &grp->bb_prealloc_list); - list_del(&pa->u.pa_tmp_list); - - spin_unlock(&pa->pa_lock); - write_unlock(pa->pa_node_lock.inode_lock); - continue; + if (pa->pa_removed == 0) { + ei = EXT4_I(pa->pa_inode); + rb_erase(&pa->pa_node.inode_node, + &ei->i_prealloc_node); + pa->pa_removed = 1; } spin_unlock(&pa->pa_lock); - - ei = EXT4_I(pa->pa_inode); - rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); write_unlock(pa->pa_node_lock.inode_lock); } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 6d85ee8674a6..3ea5701215fc 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -120,7 +120,9 @@ struct ext4_prealloc_space { } u; spinlock_t pa_lock; atomic_t pa_count; - unsigned pa_deleted; + unsigned pa_deleted; /* Has PA been marked deleted */ + unsigned pa_removed; /* Has PA been removed from its + respecitve data structure */ ext4_fsblk_t pa_pstart; /* phys. block */ ext4_lblk_t pa_lstart; /* log. block */ ext4_grpblk_t pa_len; /* len of preallocated chunk */