lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20210630155438.GD13743@locust>
Date:   Wed, 30 Jun 2021 08:54:38 -0700
From:   "Darrick J. Wong" <djwong@...nel.org>
To:     Jan Kara <jack@...e.cz>
Cc:     Ted Tso <tytso@....edu>, linux-ext4@...r.kernel.org
Subject: Re: [PATCH 3/4] ext4: Speedup ext4 orphan inode handling

On Wed, Jun 16, 2021 at 12:56:54PM +0200, Jan Kara wrote:
> Ext4 orphan inode handling is a bottleneck for workloads which heavily
> truncate / unlink small files since it contends on the global
> s_orphan_mutex lock (and generally it's difficult to improve scalability
> of the ondisk linked list of orphaned inodes).
> 
> This patch implements new way of handling orphan inodes. Instead of
> linking orphaned inode into a linked list, we store it's inode number in
> a new special file which we call "orphan file". Currently we still
> protect the orphan file with a spinlock for simplicity but even in this
> setting we can substantially reduce the length of the critical section
> and thus speedup some workloads.
> 
> Note that the change is backwards compatible when the filesystem is
> clean - the existence of the orphan file is a compat feature, we set
> another ro-compat feature indicating orphan file needs scanning for
> orphaned inodes when mounting filesystem read-write. This ro-compat
> feature gets cleared on unmount / remount read-only.
> 
> Some performance data from 80 CPU Xeon Server with 512 GB of RAM,
> filesystem located on SSD, average of 5 runs:
> 
> stress-orphan (microbenchmark truncating files byte-by-byte from N
> processes in parallel)
> 
> Threads Time            Time
>         Vanilla         Patched
>   1       1.057200        0.945600
>   2       1.680400        1.331800
>   4       2.547000        1.995000
>   8       7.049400        6.424200
>  16      14.827800       14.937600
>  32      40.948200       33.038200
>  64      87.787400       60.823600
> 128     206.504000      122.941400
> 
> So we can see significant wins all over the board.
> 
> Signed-off-by: Jan Kara <jack@...e.cz>
> ---
>  fs/ext4/ext4.h   |  70 +++++++++--
>  fs/ext4/orphan.c | 319 ++++++++++++++++++++++++++++++++++++++++++-----
>  fs/ext4/super.c  |  34 ++++-
>  3 files changed, 379 insertions(+), 44 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 33508487516f..83298c0b6dae 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1025,7 +1025,14 @@ struct ext4_inode_info {
>  	 */
>  	struct rw_semaphore xattr_sem;
>  
> -	struct list_head i_orphan;	/* unlinked but open inodes */
> +	/*
> +	 * Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise
> +	 * i_orphan is used.
> +	 */
> +	union {
> +		struct list_head i_orphan;	/* unlinked but open inodes */
> +		unsigned int i_orphan_idx;	/* Index in orphan file */
> +	};
>  
>  	/* Fast commit related info */
>  
> @@ -1419,7 +1426,8 @@ struct ext4_super_block {
>  	__u8    s_last_error_errcode;
>  	__le16  s_encoding;		/* Filename charset encoding */
>  	__le16  s_encoding_flags;	/* Filename charset encoding flags */
> -	__le32	s_reserved[95];		/* Padding to the end of the block */
> +	__le32  s_orphan_file_inum;	/* Inode for tracking orphan inodes */
> +	__le32	s_reserved[94];		/* Padding to the end of the block */
>  	__le32	s_checksum;		/* crc32c(superblock) */
>  };
>  
> @@ -1440,6 +1448,7 @@ struct ext4_super_block {
>  
>  /* Types of ext4 journal triggers */
>  enum ext4_journal_trigger_type {
> +	EXT4_JTR_ORPHAN_FILE,
>  	EXT4_JTR_NONE	/* This must be the last entry for indexing to work! */
>  };
>  
> @@ -1456,6 +1465,36 @@ static inline struct ext4_journal_trigger *EXT4_TRIGGER(
>  	return container_of(trigger, struct ext4_journal_trigger, tr_triggers);
>  }
>  
> +#define EXT4_ORPHAN_BLOCK_MAGIC 0x0b10ca04
> +
> +/* Structure at the tail of orphan block */
> +struct ext4_orphan_block_tail {
> +	__le32 ob_magic;
> +	__le32 ob_checksum;
> +};

Can you add the ondisk format changes to the appropriate place in
Documentation/filesystems/ext4/ please?

--D

> +
> +static inline int ext4_inodes_per_orphan_block(struct super_block *sb)
> +{
> +	return (sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)) /
> +			sizeof(u32);
> +}
> +
> +struct ext4_orphan_block {
> +	int ob_free_entries;	/* Number of free orphan entries in block */
> +	struct buffer_head *ob_bh;	/* Buffer for orphan block */
> +};
> +
> +/*
> + * Info about orphan file.
> + */
> +struct ext4_orphan_info {
> +	spinlock_t of_lock;
> +	int of_blocks;			/* Number of orphan blocks in a file */
> +	__u32 of_csum_seed;		/* Checksum seed for orphan file */
> +	struct ext4_orphan_block *of_binfo;	/* Array with info about orphan
> +						 * file blocks */
> +};
> +
>  /*
>   * fourth extended-fs super-block data in memory
>   */
> @@ -1509,9 +1548,11 @@ struct ext4_sb_info {
>  
>  	/* Journaling */
>  	struct journal_s *s_journal;
> -	struct list_head s_orphan;
> -	struct mutex s_orphan_lock;
>  	unsigned long s_ext4_flags;		/* Ext4 superblock flags */
> +	struct mutex s_orphan_lock;	/* Protects on disk list changes */
> +	struct list_head s_orphan;	/* List of orphaned inodes in on disk
> +					   list */
> +	struct ext4_orphan_info s_orphan_info;
>  	unsigned long s_commit_interval;
>  	u32 s_max_batch_time;
>  	u32 s_min_batch_time;
> @@ -1846,6 +1887,7 @@ enum {
>  	EXT4_STATE_LUSTRE_EA_INODE,	/* Lustre-style ea_inode */
>  	EXT4_STATE_VERITY_IN_PROGRESS,	/* building fs-verity Merkle tree */
>  	EXT4_STATE_FC_COMMITTING,	/* Fast commit ongoing */
> +	EXT4_STATE_ORPHAN_FILE,		/* Inode orphaned in orphan file */
>  };
>  
>  #define EXT4_INODE_BIT_FNS(name, field, offset)				\
> @@ -1947,6 +1989,7 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
>   */
>  #define EXT4_FEATURE_COMPAT_FAST_COMMIT		0x0400
>  #define EXT4_FEATURE_COMPAT_STABLE_INODES	0x0800
> +#define EXT4_FEATURE_COMPAT_ORPHAN_FILE		0x1000	/* Orphan file exists */
>  
>  #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER	0x0001
>  #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE	0x0002
> @@ -1955,6 +1998,8 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
>  #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM		0x0010
>  #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK	0x0020
>  #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE	0x0040
> +#define EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT	0x0080 /* Orphan file may be
> +							  non-empty */
>  #define EXT4_FEATURE_RO_COMPAT_QUOTA		0x0100
>  #define EXT4_FEATURE_RO_COMPAT_BIGALLOC		0x0200
>  /*
> @@ -1964,6 +2009,7 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
>   * GDT_CSUM bits are mutually exclusive.
>   */
>  #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM	0x0400
> +/* 0x0800 Reserved for EXT4_FEATURE_RO_COMPAT_REPLICA */
>  #define EXT4_FEATURE_RO_COMPAT_READONLY		0x1000
>  #define EXT4_FEATURE_RO_COMPAT_PROJECT		0x2000
>  #define EXT4_FEATURE_RO_COMPAT_VERITY		0x8000
> @@ -2050,6 +2096,7 @@ EXT4_FEATURE_COMPAT_FUNCS(dir_index,		DIR_INDEX)
>  EXT4_FEATURE_COMPAT_FUNCS(sparse_super2,	SPARSE_SUPER2)
>  EXT4_FEATURE_COMPAT_FUNCS(fast_commit,		FAST_COMMIT)
>  EXT4_FEATURE_COMPAT_FUNCS(stable_inodes,	STABLE_INODES)
> +EXT4_FEATURE_COMPAT_FUNCS(orphan_file,		ORPHAN_FILE)
>  
>  EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super,	SPARSE_SUPER)
>  EXT4_FEATURE_RO_COMPAT_FUNCS(large_file,	LARGE_FILE)
> @@ -2064,6 +2111,7 @@ EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum,	METADATA_CSUM)
>  EXT4_FEATURE_RO_COMPAT_FUNCS(readonly,		READONLY)
>  EXT4_FEATURE_RO_COMPAT_FUNCS(project,		PROJECT)
>  EXT4_FEATURE_RO_COMPAT_FUNCS(verity,		VERITY)
> +EXT4_FEATURE_RO_COMPAT_FUNCS(orphan_present,	ORPHAN_PRESENT)
>  
>  EXT4_FEATURE_INCOMPAT_FUNCS(compression,	COMPRESSION)
>  EXT4_FEATURE_INCOMPAT_FUNCS(filetype,		FILETYPE)
> @@ -2097,7 +2145,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold,		CASEFOLD)
>  					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
>  					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
>  
> -#define EXT4_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
> +#define EXT4_FEATURE_COMPAT_SUPP	(EXT4_FEATURE_COMPAT_EXT_ATTR| \
> +					 EXT4_FEATURE_COMPAT_ORPHAN_FILE)
>  #define EXT4_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
>  					 EXT4_FEATURE_INCOMPAT_RECOVER| \
>  					 EXT4_FEATURE_INCOMPAT_META_BG| \
> @@ -2122,7 +2171,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold,		CASEFOLD)
>  					 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
>  					 EXT4_FEATURE_RO_COMPAT_QUOTA |\
>  					 EXT4_FEATURE_RO_COMPAT_PROJECT |\
> -					 EXT4_FEATURE_RO_COMPAT_VERITY)
> +					 EXT4_FEATURE_RO_COMPAT_VERITY |\
> +					 EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT)
>  
>  #define EXTN_FEATURE_FUNCS(ver) \
>  static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
> @@ -2172,7 +2222,6 @@ static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
>  	return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
>  }
>  
> -
>  /*
>   * Default values for user and/or group using reserved blocks
>   */
> @@ -3751,6 +3800,13 @@ extern int ext4_orphan_add(handle_t *, struct inode *);
>  extern int ext4_orphan_del(handle_t *, struct inode *);
>  extern void ext4_orphan_cleanup(struct super_block *sb,
>  				struct ext4_super_block *es);
> +extern void ext4_release_orphan_info(struct super_block *sb);
> +extern int ext4_init_orphan_info(struct super_block *sb);
> +extern int ext4_orphan_file_empty(struct super_block *sb);
> +extern void ext4_orphan_file_block_trigger(
> +				struct jbd2_buffer_trigger_type *triggers,
> +				struct buffer_head *bh,
> +				void *data, size_t size);
>  
>  /*
>   * Add new method to test whether block and inode bitmaps are properly
> diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
> index 732b16ef655b..ac22667b7fd5 100644
> --- a/fs/ext4/orphan.c
> +++ b/fs/ext4/orphan.c
> @@ -8,6 +8,52 @@
>  #include "ext4.h"
>  #include "ext4_jbd2.h"
>  
> +static int ext4_orphan_file_add(handle_t *handle, struct inode *inode)
> +{
> +	int i, j;
> +	struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info;
> +	int ret = 0;
> +	__le32 *bdata;
> +	int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb);
> +
> +	spin_lock(&oi->of_lock);
> +	for (i = 0; i < oi->of_blocks && !oi->of_binfo[i].ob_free_entries; i++);
> +	if (i == oi->of_blocks) {
> +		spin_unlock(&oi->of_lock);
> +		/*
> +		 * For now we don't grow or shrink orphan file. We just use
> +		 * whatever was allocated at mke2fs time. The additional
> +		 * credits we would have to reserve for each orphan inode
> +		 * operation just don't seem worth it.
> +		 */
> +		return -ENOSPC;
> +	}
> +	oi->of_binfo[i].ob_free_entries--;
> +	spin_unlock(&oi->of_lock);
> +
> +	/*
> +	 * Get access to orphan block. We have dropped of_lock but since we
> +	 * have decremented number of free entries we are guaranteed free entry
> +	 * in our block.
> +	 */
> +	ret = ext4_journal_get_write_access(handle, inode->i_sb,
> +				oi->of_binfo[i].ob_bh, EXT4_JTR_ORPHAN_FILE);
> +	if (ret)
> +		return ret;
> +
> +	bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
> +	spin_lock(&oi->of_lock);
> +	/* Find empty slot in a block */
> +	for (j = 0; j < inodes_per_ob && bdata[j]; j++);
> +	BUG_ON(j == inodes_per_ob);
> +	bdata[j] = cpu_to_le32(inode->i_ino);
> +	EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j;
> +	ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
> +	spin_unlock(&oi->of_lock);
> +
> +	return ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[i].ob_bh);
> +}
> +
>  /*
>   * ext4_orphan_add() links an unlinked or truncated inode into a list of
>   * such inodes, starting at the superblock, in case we crash before the
> @@ -34,10 +80,10 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
>  	WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
>  		     !inode_is_locked(inode));
>  	/*
> -	 * Exit early if inode already is on orphan list. This is a big speedup
> -	 * since we don't have to contend on the global s_orphan_lock.
> +	 * Inode orphaned in orphan file or in orphan list?
>  	 */
> -	if (!list_empty(&EXT4_I(inode)->i_orphan))
> +	if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) ||
> +	    !list_empty(&EXT4_I(inode)->i_orphan))
>  		return 0;
>  
>  	/*
> @@ -49,6 +95,16 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
>  	ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
>  		  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
>  
> +	if (sbi->s_orphan_info.of_blocks) {
> +		err = ext4_orphan_file_add(handle, inode);
> +		/*
> +		 * Fallback to normal orphan list of orphan file is
> +		 * out of space
> +		 */
> +		if (err != -ENOSPC)
> +			return err;
> +	}
> +
>  	BUFFER_TRACE(sbi->s_sbh, "get_write_access");
>  	err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
>  					    EXT4_JTR_NONE);
> @@ -103,6 +159,37 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
>  	return err;
>  }
>  
> +static int ext4_orphan_file_del(handle_t *handle, struct inode *inode)
> +{
> +	struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info;
> +	__le32 *bdata;
> +	int blk, off;
> +	int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb);
> +	int ret = 0;
> +
> +	if (!handle)
> +		goto out;
> +	blk = EXT4_I(inode)->i_orphan_idx / inodes_per_ob;
> +	off = EXT4_I(inode)->i_orphan_idx % inodes_per_ob;
> +
> +	ret = ext4_journal_get_write_access(handle, inode->i_sb,
> +				oi->of_binfo[blk].ob_bh, EXT4_JTR_ORPHAN_FILE);
> +	if (ret)
> +		goto out;
> +
> +	bdata = (__le32 *)(oi->of_binfo[blk].ob_bh->b_data);
> +	spin_lock(&oi->of_lock);
> +	bdata[off] = 0;
> +	oi->of_binfo[blk].ob_free_entries++;
> +	spin_unlock(&oi->of_lock);
> +	ret = ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[blk].ob_bh);
> +out:
> +	ext4_clear_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
> +	INIT_LIST_HEAD(&EXT4_I(inode)->i_orphan);
> +
> +	return ret;
> +}
> +
>  /*
>   * ext4_orphan_del() removes an unlinked or truncated inode from the list
>   * of such inodes stored on disk, because it is finally being cleaned up.
> @@ -121,6 +208,9 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
>  
>  	WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
>  		     !inode_is_locked(inode));
> +	if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE))
> +		return ext4_orphan_file_del(handle, inode);
> +
>  	/* Do this quick check before taking global s_orphan_lock. */
>  	if (list_empty(&ei->i_orphan))
>  		return 0;
> @@ -196,6 +286,39 @@ static int ext4_quota_on_mount(struct super_block *sb, int type)
>  					EXT4_SB(sb)->s_jquota_fmt, type);
>  }
>  
> +static void ext4_process_orphan(struct inode *inode,
> +				int *nr_truncates, int *nr_orphans)
> +{
> +	struct super_block *sb = inode->i_sb;
> +	int ret;
> +
> +	dquot_initialize(inode);
> +	if (inode->i_nlink) {
> +		if (test_opt(sb, DEBUG))
> +			ext4_msg(sb, KERN_DEBUG,
> +				"%s: truncating inode %lu to %lld bytes",
> +				__func__, inode->i_ino, inode->i_size);
> +		jbd_debug(2, "truncating inode %lu to %lld bytes\n",
> +			  inode->i_ino, inode->i_size);
> +		inode_lock(inode);
> +		truncate_inode_pages(inode->i_mapping, inode->i_size);
> +		ret = ext4_truncate(inode);
> +		if (ret)
> +			ext4_std_error(inode->i_sb, ret);
> +		inode_unlock(inode);
> +		(*nr_truncates)++;
> +	} else {
> +		if (test_opt(sb, DEBUG))
> +			ext4_msg(sb, KERN_DEBUG,
> +				"%s: deleting unreferenced inode %lu",
> +				__func__, inode->i_ino);
> +		jbd_debug(2, "deleting unreferenced inode %lu\n",
> +			  inode->i_ino);
> +		(*nr_orphans)++;
> +	}
> +	iput(inode);  /* The delete magic happens here! */
> +}
> +
>  /* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
>   * the superblock) which were deleted from all directories, but held open by
>   * a process at the time of a crash.  We walk the list and try to delete these
> @@ -216,12 +339,17 @@ static int ext4_quota_on_mount(struct super_block *sb, int type)
>  void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es)
>  {
>  	unsigned int s_flags = sb->s_flags;
> -	int ret, nr_orphans = 0, nr_truncates = 0;
> +	int nr_orphans = 0, nr_truncates = 0;
> +	struct inode *inode;
> +	int i, j;
>  #ifdef CONFIG_QUOTA
>  	int quota_update = 0;
> -	int i;
>  #endif
> -	if (!es->s_last_orphan) {
> +	__le32 *bdata;
> +	struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
> +	int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
> +
> +	if (!es->s_last_orphan && !oi->of_blocks) {
>  		jbd_debug(4, "no orphan inodes to clean up\n");
>  		return;
>  	}
> @@ -285,8 +413,6 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es)
>  #endif
>  
>  	while (es->s_last_orphan) {
> -		struct inode *inode;
> -
>  		/*
>  		 * We may have encountered an error during cleanup; if
>  		 * so, skip the rest.
> @@ -304,31 +430,21 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es)
>  		}
>  
>  		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
> -		dquot_initialize(inode);
> -		if (inode->i_nlink) {
> -			if (test_opt(sb, DEBUG))
> -				ext4_msg(sb, KERN_DEBUG,
> -					"%s: truncating inode %lu to %lld bytes",
> -					__func__, inode->i_ino, inode->i_size);
> -			jbd_debug(2, "truncating inode %lu to %lld bytes\n",
> -				  inode->i_ino, inode->i_size);
> -			inode_lock(inode);
> -			truncate_inode_pages(inode->i_mapping, inode->i_size);
> -			ret = ext4_truncate(inode);
> -			if (ret)
> -				ext4_std_error(inode->i_sb, ret);
> -			inode_unlock(inode);
> -			nr_truncates++;
> -		} else {
> -			if (test_opt(sb, DEBUG))
> -				ext4_msg(sb, KERN_DEBUG,
> -					"%s: deleting unreferenced inode %lu",
> -					__func__, inode->i_ino);
> -			jbd_debug(2, "deleting unreferenced inode %lu\n",
> -				  inode->i_ino);
> -			nr_orphans++;
> +		ext4_process_orphan(inode, &nr_truncates, &nr_orphans);
> +	}
> +
> +	for (i = 0; i < oi->of_blocks; i++) {
> +		bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
> +		for (j = 0; j < inodes_per_ob; j++) {
> +			if (!bdata[j])
> +				continue;
> +			inode = ext4_orphan_get(sb, le32_to_cpu(bdata[j]));
> +			if (IS_ERR(inode))
> +				continue;
> +			ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
> +			EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j;
> +			ext4_process_orphan(inode, &nr_truncates, &nr_orphans);
>  		}
> -		iput(inode);  /* The delete magic happens here! */
>  	}
>  
>  #define PLURAL(x) (x), ((x) == 1) ? "" : "s"
> @@ -350,3 +466,142 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es)
>  #endif
>  	sb->s_flags = s_flags; /* Restore SB_RDONLY status */
>  }
> +
> +void ext4_release_orphan_info(struct super_block *sb)
> +{
> +	int i;
> +	struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
> +
> +	if (!oi->of_blocks)
> +		return;
> +	for (i = 0; i < oi->of_blocks; i++)
> +		brelse(oi->of_binfo[i].ob_bh);
> +	kfree(oi->of_binfo);
> +}
> +
> +static struct ext4_orphan_block_tail *ext4_orphan_block_tail(
> +						struct super_block *sb,
> +						struct buffer_head *bh)
> +{
> +	return (struct ext4_orphan_block_tail *)(bh->b_data + sb->s_blocksize -
> +				sizeof(struct ext4_orphan_block_tail));
> +}
> +
> +static int ext4_orphan_file_block_csum_verify(struct super_block *sb,
> +					      struct buffer_head *bh)
> +{
> +	__u32 calculated;
> +	int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
> +	struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
> +	struct ext4_orphan_block_tail *ot;
> +
> +	if (!ext4_has_metadata_csum(sb))
> +		return 1;
> +
> +	ot = ext4_orphan_block_tail(sb, bh);
> +	calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed,
> +				 (__u8 *)bh->b_data,
> +				 inodes_per_ob * sizeof(__u32));
> +	return le32_to_cpu(ot->ob_checksum) == calculated;
> +}
> +
> +/* This gets called only when checksumming is enabled */
> +void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers,
> +				    struct buffer_head *bh,
> +				    void *data, size_t size)
> +{
> +	struct super_block *sb = EXT4_TRIGGER(triggers)->sb;
> +	__u32 csum;
> +	int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
> +	struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
> +	struct ext4_orphan_block_tail *ot;
> +
> +	csum = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, (__u8 *)data,
> +			   inodes_per_ob * sizeof(__u32));
> +	ot = ext4_orphan_block_tail(sb, bh);
> +	ot->ob_checksum = cpu_to_le32(csum);
> +}
> +
> +int ext4_init_orphan_info(struct super_block *sb)
> +{
> +	struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
> +	struct inode *inode;
> +	int i, j;
> +	int ret;
> +	int free;
> +	__le32 *bdata;
> +	int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
> +	struct ext4_orphan_block_tail *ot;
> +	ino_t orphan_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_orphan_file_inum);
> +
> +	spin_lock_init(&oi->of_lock);
> +
> +	if (!ext4_has_feature_orphan_file(sb))
> +		return 0;
> +
> +	inode = ext4_iget(sb, orphan_ino, EXT4_IGET_NORMAL);
> +	if (IS_ERR(inode)) {
> +		ext4_msg(sb, KERN_ERR, "get orphan inode failed");
> +		return PTR_ERR(inode);
> +	}
> +	oi->of_blocks = inode->i_size >> sb->s_blocksize_bits;
> +	oi->of_csum_seed = EXT4_I(inode)->i_csum_seed;
> +	oi->of_binfo = kmalloc(oi->of_blocks*sizeof(struct ext4_orphan_block),
> +			       GFP_KERNEL);
> +	if (!oi->of_binfo) {
> +		ret = -ENOMEM;
> +		goto out_put;
> +	}
> +	for (i = 0; i < oi->of_blocks; i++) {
> +		oi->of_binfo[i].ob_bh = ext4_bread(NULL, inode, i, 0);
> +		if (IS_ERR(oi->of_binfo[i].ob_bh)) {
> +			ret = PTR_ERR(oi->of_binfo[i].ob_bh);
> +			goto out_free;
> +		}
> +		if (!oi->of_binfo[i].ob_bh) {
> +			ret = -EIO;
> +			goto out_free;
> +		}
> +		ot = ext4_orphan_block_tail(sb, oi->of_binfo[i].ob_bh);
> +		if (le32_to_cpu(ot->ob_magic) != EXT4_ORPHAN_BLOCK_MAGIC) {
> +			ext4_error(sb, "orphan file block %d: bad magic", i);
> +			ret = -EIO;
> +			goto out_free;
> +		}
> +		if (!ext4_orphan_file_block_csum_verify(sb,
> +						oi->of_binfo[i].ob_bh)) {
> +			ext4_error(sb, "orphan file block %d: bad checksum", i);
> +			ret = -EIO;
> +			goto out_free;
> +		}
> +		bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
> +		free = 0;
> +		for (j = 0; j < inodes_per_ob; j++)
> +			if (bdata[j] == 0)
> +				free++;
> +		oi->of_binfo[i].ob_free_entries = free;
> +	}
> +	iput(inode);
> +	return 0;
> +out_free:
> +	for (i--; i >= 0; i--)
> +		brelse(oi->of_binfo[i].ob_bh);
> +	kfree(oi->of_binfo);
> +out_put:
> +	iput(inode);
> +	return ret;
> +}
> +
> +int ext4_orphan_file_empty(struct super_block *sb)
> +{
> +	struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
> +	int i;
> +	int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
> +
> +	if (!ext4_has_feature_orphan_file(sb))
> +		return 1;
> +	for (i = 0; i < oi->of_blocks; i++)
> +		if (oi->of_binfo[i].ob_free_entries != inodes_per_ob)
> +			return 0;
> +	return 1;
> +}
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 6e43c8546dc5..06f63b0cd988 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1164,6 +1164,7 @@ static void ext4_put_super(struct super_block *sb)
>  
>  	flush_work(&sbi->s_error_work);
>  	destroy_workqueue(sbi->rsv_conversion_wq);
> +	ext4_release_orphan_info(sb);
>  
>  	/*
>  	 * Unregister sysfs before destroying jbd2 journal.
> @@ -1189,6 +1190,7 @@ static void ext4_put_super(struct super_block *sb)
>  
>  	if (!sb_rdonly(sb) && !aborted) {
>  		ext4_clear_feature_journal_needs_recovery(sb);
> +		ext4_clear_feature_orphan_present(sb);
>  		es->s_state = cpu_to_le16(sbi->s_mount_state);
>  	}
>  	if (!sb_rdonly(sb))
> @@ -2695,8 +2697,11 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
>  		es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
>  	le16_add_cpu(&es->s_mnt_count, 1);
>  	ext4_update_tstamp(es, s_mtime);
> -	if (sbi->s_journal)
> +	if (sbi->s_journal) {
>  		ext4_set_feature_journal_needs_recovery(sb);
> +		if (ext4_has_feature_orphan_file(sb))
> +			ext4_set_feature_orphan_present(sb);
> +	}
>  
>  	err = ext4_commit_super(sb);
>  done:
> @@ -3971,6 +3976,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  		silent = 1;
>  		goto cantfind_ext4;
>  	}
> +	ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE,
> +				ext4_orphan_file_block_trigger);
>  
>  	/* Load the checksum driver */
>  	sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
> @@ -4635,6 +4642,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  	sb->s_root = NULL;
>  
>  	needs_recovery = (es->s_last_orphan != 0 ||
> +			  ext4_has_feature_orphan_present(sb) ||
>  			  ext4_has_feature_journal_needs_recovery(sb));
>  
>  	if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb))
> @@ -4924,12 +4932,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  	if (err)
>  		goto failed_mount7;
>  
> +	err = ext4_init_orphan_info(sb);
> +	if (err)
> +		goto failed_mount8;
>  #ifdef CONFIG_QUOTA
>  	/* Enable quota usage during mount. */
>  	if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) {
>  		err = ext4_enable_quotas(sb);
>  		if (err)
> -			goto failed_mount8;
> +			goto failed_mount9;
>  	}
>  #endif  /* CONFIG_QUOTA */
>  
> @@ -4948,7 +4959,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  		ext4_msg(sb, KERN_INFO, "recovery complete");
>  		err = ext4_mark_recovery_complete(sb, es);
>  		if (err)
> -			goto failed_mount8;
> +			goto failed_mount9;
>  	}
>  	if (EXT4_SB(sb)->s_journal) {
>  		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
> @@ -4994,6 +5005,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  		ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
>  	goto failed_mount;
>  
> +failed_mount9:
> +	ext4_release_orphan_info(sb);
>  failed_mount8:
>  	ext4_unregister_sysfs(sb);
>  	kobject_put(&sbi->s_kobj);
> @@ -5505,8 +5518,15 @@ static int ext4_mark_recovery_complete(struct super_block *sb,
>  	if (err < 0)
>  		goto out;
>  
> -	if (ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) {
> +	if (sb_rdonly(sb) && (ext4_has_feature_journal_needs_recovery(sb) ||
> +	    ext4_has_feature_orphan_present(sb))) {
> +		if (!ext4_orphan_file_empty(sb)) {
> +			ext4_error(sb, "Orphan file not empty on read-only fs.");
> +			err = -EFSCORRUPTED;
> +			goto out;
> +		}
>  		ext4_clear_feature_journal_needs_recovery(sb);
> +		ext4_clear_feature_orphan_present(sb);
>  		ext4_commit_super(sb);
>  	}
>  out:
> @@ -5649,6 +5669,8 @@ static int ext4_freeze(struct super_block *sb)
>  
>  		/* Journal blocked and flushed, clear needs_recovery flag. */
>  		ext4_clear_feature_journal_needs_recovery(sb);
> +		if (ext4_orphan_file_empty(sb))
> +			ext4_clear_feature_orphan_present(sb);
>  	}
>  
>  	error = ext4_commit_super(sb);
> @@ -5671,6 +5693,8 @@ static int ext4_unfreeze(struct super_block *sb)
>  	if (EXT4_SB(sb)->s_journal) {
>  		/* Reset the needs_recovery flag before the fs is unlocked. */
>  		ext4_set_feature_journal_needs_recovery(sb);
> +		if (ext4_has_feature_orphan_file(sb))
> +			ext4_set_feature_orphan_present(sb);
>  	}
>  
>  	ext4_commit_super(sb);
> @@ -5876,7 +5900,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
>  			 * around from a previously readonly bdev mount,
>  			 * require a full umount/remount for now.
>  			 */
> -			if (es->s_last_orphan) {
> +			if (es->s_last_orphan || !ext4_orphan_file_empty(sb)) {
>  				ext4_msg(sb, KERN_WARNING, "Couldn't "
>  				       "remount RDWR because of unprocessed "
>  				       "orphan inode list.  Please "
> -- 
> 2.26.2
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ