lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <87k30ic0lw.fsf@openvz.org>
Date:	Mon, 19 Jan 2015 18:21:31 +0400
From:	Dmitry Monakhov <dmonlist@...il.com>
To:	Theodore Ts'o <tytso@....edu>,
	Linux Filesystem Development List 
	<linux-fsdevel@...r.kernel.org>
Cc:	Ext4 Developers List <linux-ext4@...r.kernel.org>,
	Theodore Ts'o <tytso@....edu>
Subject: Re: [PATCH-v7 1/3] vfs: add support for a lazytime mount option


Theodore Ts'o <tytso@....edu> writes:

> Add a new mount option which enables a new "lazytime" mode.  This mode
> causes atime, mtime, and ctime updates to only be made to the
> in-memory version of the inode.  The on-disk times will only get
> updated (a) when the inode needs to be updated for some non-time
> related change, (b) when userspace calls fsync(), syncfs() or sync(), or
> (c) just before an undeleted inode is evicted from memory.
>
> This is OK according to POSIX because there are no guarantees after a
> crash unless userspace explicitly requests via a fsync(2) call.
>
> For workloads which feature a large number of random writes to a
> preallocated file, the lazytime mount option significantly reduces
> writes to the inode table.  The repeated 4k writes to a single block
> will result in undesirable stress on flash devices and SMR disk
> drives.  Even on conventional HDD's, the repeated writes to the inode
> table block will trigger Adjacent Track Interference (ATI) remediation
> latencies, which very negatively impact long tail latencies --- which
> is a very big deal for web serving tiers (for example).
>
> Google-Bug-Id: 18297052
>
> Signed-off-by: Theodore Ts'o <tytso@....edu>
> ---
>  fs/ext4/inode.c                  |  6 ++++
>  fs/fs-writeback.c                | 64 ++++++++++++++++++++++++++++++++--------
>  fs/gfs2/file.c                   |  4 +--
>  fs/inode.c                       | 56 +++++++++++++++++++++++++----------
>  fs/jfs/file.c                    |  2 +-
>  fs/libfs.c                       |  2 +-
>  fs/proc_namespace.c              |  1 +
>  fs/sync.c                        |  8 +++++
>  include/linux/backing-dev.h      |  1 +
>  include/linux/fs.h               |  5 ++++
>  include/trace/events/writeback.h | 60 ++++++++++++++++++++++++++++++++++++-
>  include/uapi/linux/fs.h          |  4 ++-
>  mm/backing-dev.c                 | 10 +++++--
>  13 files changed, 187 insertions(+), 36 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 5653fa4..628df5b 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4840,11 +4840,17 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
>   * If the inode is marked synchronous, we don't honour that here - doing
>   * so would cause a commit on atime updates, which we don't bother doing.
>   * We handle synchronous inodes at the highest possible level.
> + *
> + * If only the I_DIRTY_TIME flag is set, we can skip everything.  If
> + * I_DIRTY_TIME and I_DIRTY_SYNC are set, the only inode fields we need
> + * to copy into the on-disk inode structure are the timestamp fields.
>   */
>  void ext4_dirty_inode(struct inode *inode, int flags)
>  {
>  	handle_t *handle;
>  
> +	if (flags == I_DIRTY_TIME)
> +		return;
>  	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
>  	if (IS_ERR(handle))
>  		goto out;
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index ef9bef1..d5e02b8 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -247,14 +247,19 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
>  	return ret;
>  }
>  
> +#define EXPIRE_DIRTY_ATIME 0x0001
> +
>  /*
>   * Move expired (dirtied before work->older_than_this) dirty inodes from
>   * @delaying_queue to @dispatch_queue.
>   */
>  static int move_expired_inodes(struct list_head *delaying_queue,
>  			       struct list_head *dispatch_queue,
> +			       int flags,
>  			       struct wb_writeback_work *work)
>  {
> +	unsigned long *older_than_this = NULL;
> +	unsigned long expire_time;
>  	LIST_HEAD(tmp);
>  	struct list_head *pos, *node;
>  	struct super_block *sb = NULL;
> @@ -262,13 +267,21 @@ static int move_expired_inodes(struct list_head *delaying_queue,
>  	int do_sb_sort = 0;
>  	int moved = 0;
>  
> +	if ((flags & EXPIRE_DIRTY_ATIME) == 0)
> +		older_than_this = work->older_than_this;
> +	else if ((work->reason == WB_REASON_SYNC) == 0) {
> +		expire_time = jiffies - (HZ * 86400);
> +		older_than_this = &expire_time;
> +	}
>  	while (!list_empty(delaying_queue)) {
>  		inode = wb_inode(delaying_queue->prev);
> -		if (work->older_than_this &&
> -		    inode_dirtied_after(inode, *work->older_than_this))
> +		if (older_than_this &&
> +		    inode_dirtied_after(inode, *older_than_this))
>  			break;
>  		list_move(&inode->i_wb_list, &tmp);
>  		moved++;
> +		if (flags & EXPIRE_DIRTY_ATIME)
> +			set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
>  		if (sb_is_blkdev_sb(inode->i_sb))
>  			continue;
>  		if (sb && sb != inode->i_sb)
> @@ -309,9 +322,12 @@ out:
>  static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
>  {
>  	int moved;
> +
>  	assert_spin_locked(&wb->list_lock);
>  	list_splice_init(&wb->b_more_io, &wb->b_io);
> -	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
> +	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
> +	moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
> +				     EXPIRE_DIRTY_ATIME, work);
>  	trace_writeback_queue_io(wb, work, moved);
>  }
>  
> @@ -435,6 +451,8 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
>  		 * updates after data IO completion.
>  		 */
>  		redirty_tail(inode, wb);
> +	} else if (inode->i_state & I_DIRTY_TIME) {
> +		list_move(&inode->i_wb_list, &wb->b_dirty_time);
>  	} else {
>  		/* The inode is clean. Remove from writeback lists. */
>  		list_del_init(&inode->i_wb_list);
> @@ -482,11 +500,18 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
>  	/* Clear I_DIRTY_PAGES if we've written out all dirty pages */
>  	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
>  		inode->i_state &= ~I_DIRTY_PAGES;
> -	dirty = inode->i_state & I_DIRTY;
> -	inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
> +	dirty = inode->i_state & (I_DIRTY_SYNC | I_DIRTY_DATASYNC);
> +	if ((dirty && (inode->i_state & I_DIRTY_TIME)) ||
> +	    (inode->i_state & I_DIRTY_TIME_EXPIRED)) {
> +		dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
> +		trace_writeback_lazytime(inode);
> +	}
> +	inode->i_state &= ~dirty;
>  	spin_unlock(&inode->i_lock);
> +	if (dirty & I_DIRTY_TIME)
> +		mark_inode_dirty_sync(inode);
>  	/* Don't write the inode if only I_DIRTY_PAGES was set */
> -	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
> +	if (dirty) {
>  		int err = write_inode(inode, wbc);
>  		if (ret == 0)
>  			ret = err;
> @@ -534,7 +559,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
>  	 * make sure inode is on some writeback list and leave it there unless
>  	 * we have completely cleaned the inode.
>  	 */
> -	if (!(inode->i_state & I_DIRTY) &&
> +	if (!(inode->i_state & I_DIRTY_ALL) &&
>  	    (wbc->sync_mode != WB_SYNC_ALL ||
>  	     !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
>  		goto out;
> @@ -549,7 +574,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
>  	 * If inode is clean, remove it from writeback lists. Otherwise don't
>  	 * touch it. See comment above for explanation.
>  	 */
> -	if (!(inode->i_state & I_DIRTY))
> +	if (!(inode->i_state & I_DIRTY_ALL))
>  		list_del_init(&inode->i_wb_list);
>  	spin_unlock(&wb->list_lock);
>  	inode_sync_complete(inode);
> @@ -691,7 +716,7 @@ static long writeback_sb_inodes(struct super_block *sb,
>  		wrote += write_chunk - wbc.nr_to_write;
>  		spin_lock(&wb->list_lock);
>  		spin_lock(&inode->i_lock);
> -		if (!(inode->i_state & I_DIRTY))
> +		if (!(inode->i_state & I_DIRTY_ALL))
>  			wrote++;
>  		requeue_inode(inode, wb, &wbc);
>  		inode_sync_complete(inode);
> @@ -1129,16 +1154,20 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
>   * page->mapping->host, so the page-dirtying time is recorded in the internal
>   * blockdev inode.
>   */
> +#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
>  void __mark_inode_dirty(struct inode *inode, int flags)
>  {
>  	struct super_block *sb = inode->i_sb;
>  	struct backing_dev_info *bdi = NULL;
> +	int dirtytime;
> +
> +	trace_writeback_mark_inode_dirty(inode, flags);
>  
>  	/*
>  	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
>  	 * dirty the inode itself
>  	 */
> -	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
> +	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) {
>  		trace_writeback_dirty_inode_start(inode, flags);
>  
>  		if (sb->s_op->dirty_inode)
> @@ -1146,6 +1175,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>  
>  		trace_writeback_dirty_inode(inode, flags);
>  	}
> +	if (flags & I_DIRTY_INODE)
> +		flags &= ~I_DIRTY_TIME;
> +	dirtytime = flags & I_DIRTY_TIME;
TYPO? When any I_DIRTY_INODE bit is set in 'flags', 'dirtytime' is always
zero here, because the I_DIRTY_TIME bit has already been cleared.
You probably want to compute it before clearing the bit:
	dirtytime = flags & I_DIRTY_TIME;
	if (flags & I_DIRTY_INODE)
		flags &= ~I_DIRTY_TIME;
>  
>  	/*
>  	 * make sure that changes are seen by all cpus before we test i_state
> @@ -1154,16 +1186,22 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>  	smp_mb();
>  
>  	/* avoid the locking if we can */
> -	if ((inode->i_state & flags) == flags)
> +	if (((inode->i_state & flags) == flags) ||
> +	    (dirtytime && (inode->i_state & I_DIRTY_INODE)))
>  		return;
>  
>  	if (unlikely(block_dump))
>  		block_dump___mark_inode_dirty(inode);
>  
>  	spin_lock(&inode->i_lock);
> +	if (dirtytime && (inode->i_state & I_DIRTY_INODE))
> +		return;
>  	if ((inode->i_state & flags) != flags) {
>  		const int was_dirty = inode->i_state & I_DIRTY;
>  
> +		if (dirtytime && (inode->i_state & I_DIRTY_INODE))
> +			inode->i_state &= ~I_DIRTY_TIME;
> +
>  		inode->i_state |= flags;
>  
>  		/*
> @@ -1210,8 +1248,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>  			}
>  
>  			inode->dirtied_when = jiffies;
> -			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
> +			list_move(&inode->i_wb_list, dirtytime ?
> +				  &bdi->wb.b_dirty_time : &bdi->wb.b_dirty);
>  			spin_unlock(&bdi->wb.list_lock);
> +			trace_writeback_dirty_inode_enqueue(inode);
>  
>  			if (wakeup_bdi)
>  				bdi_wakeup_thread_delayed(bdi);
> diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
> index 80dd44d..e584bf9 100644
> --- a/fs/gfs2/file.c
> +++ b/fs/gfs2/file.c
> @@ -654,7 +654,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
>  {
>  	struct address_space *mapping = file->f_mapping;
>  	struct inode *inode = mapping->host;
> -	int sync_state = inode->i_state & I_DIRTY;
> +	int sync_state = inode->i_state & I_DIRTY_ALL;
>  	struct gfs2_inode *ip = GFS2_I(inode);
>  	int ret = 0, ret1 = 0;
>  
> @@ -667,7 +667,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
>  	if (!gfs2_is_jdata(ip))
>  		sync_state &= ~I_DIRTY_PAGES;
>  	if (datasync)
> -		sync_state &= ~I_DIRTY_SYNC;
> +		sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME);
>  
>  	if (sync_state) {
>  		ret = sync_inode_metadata(inode, 1);
> diff --git a/fs/inode.c b/fs/inode.c
> index 26753ba..dc48a23 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -18,6 +18,7 @@
>  #include <linux/buffer_head.h> /* for inode_has_buffers */
>  #include <linux/ratelimit.h>
>  #include <linux/list_lru.h>
> +#include <trace/events/writeback.h>
>  #include "internal.h"
>  
>  /*
> @@ -30,7 +31,7 @@
>   * inode_sb_list_lock protects:
>   *   sb->s_inodes, inode->i_sb_list
>   * bdi->wb.list_lock protects:
> - *   bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
> + *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list
>   * inode_hash_lock protects:
>   *   inode_hashtable, inode->i_hash
>   *
> @@ -414,7 +415,8 @@ static void inode_lru_list_add(struct inode *inode)
>   */
>  void inode_add_lru(struct inode *inode)
>  {
> -	if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) &&
> +	if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
> +				I_FREEING | I_WILL_FREE)) &&
>  	    !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE)
>  		inode_lru_list_add(inode);
>  }
> @@ -645,7 +647,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
>  			spin_unlock(&inode->i_lock);
>  			continue;
>  		}
> -		if (inode->i_state & I_DIRTY && !kill_dirty) {
> +		if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
>  			spin_unlock(&inode->i_lock);
>  			busy = 1;
>  			continue;
> @@ -1430,11 +1432,20 @@ static void iput_final(struct inode *inode)
>   */
>  void iput(struct inode *inode)
>  {
> -	if (inode) {
> -		BUG_ON(inode->i_state & I_CLEAR);
> -
> -		if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
> -			iput_final(inode);
> +	if (!inode)
> +		return;
> +	BUG_ON(inode->i_state & I_CLEAR);
> +retry:
> +	if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
> +		if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
> +			atomic_inc(&inode->i_count);
> +			inode->i_state &= ~I_DIRTY_TIME;
> +			spin_unlock(&inode->i_lock);
> +			trace_writeback_lazytime_iput(inode);
> +			mark_inode_dirty_sync(inode);
> +			goto retry;
> +		}
> +		iput_final(inode);
>  	}
>  }
>  EXPORT_SYMBOL(iput);
> @@ -1493,14 +1504,9 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
>  	return 0;
>  }
>  
> -/*
> - * This does the actual work of updating an inodes time or version.  Must have
> - * had called mnt_want_write() before calling this.
> - */
> -static int update_time(struct inode *inode, struct timespec *time, int flags)
> +int generic_update_time(struct inode *inode, struct timespec *time, int flags)
>  {
> -	if (inode->i_op->update_time)
> -		return inode->i_op->update_time(inode, time, flags);
> +	int iflags = I_DIRTY_TIME;
>  
>  	if (flags & S_ATIME)
>  		inode->i_atime = *time;
> @@ -1510,9 +1516,27 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
>  		inode->i_ctime = *time;
>  	if (flags & S_MTIME)
>  		inode->i_mtime = *time;
> -	mark_inode_dirty_sync(inode);
> +
> +	if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION))
> +		iflags |= I_DIRTY_SYNC;
> +	__mark_inode_dirty(inode, iflags);
>  	return 0;
>  }
> +EXPORT_SYMBOL(generic_update_time);
> +
> +/*
> + * This does the actual work of updating an inodes time or version.  Must have
> + * had called mnt_want_write() before calling this.
> + */
> +static int update_time(struct inode *inode, struct timespec *time, int flags)
> +{
> +	int (*update_time)(struct inode *, struct timespec *, int);
> +
> +	update_time = inode->i_op->update_time ? inode->i_op->update_time :
> +		generic_update_time;
> +
> +	return update_time(inode, time, flags);
> +}
>  
>  /**
>   *	touch_atime	-	update the access time
> diff --git a/fs/jfs/file.c b/fs/jfs/file.c
> index 33aa0cc..10815f8 100644
> --- a/fs/jfs/file.c
> +++ b/fs/jfs/file.c
> @@ -39,7 +39,7 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
>  		return rc;
>  
>  	mutex_lock(&inode->i_mutex);
> -	if (!(inode->i_state & I_DIRTY) ||
> +	if (!(inode->i_state & I_DIRTY_ALL) ||
>  	    (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
>  		/* Make sure committed changes hit the disk */
>  		jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
> diff --git a/fs/libfs.c b/fs/libfs.c
> index 171d284..7cb9cef 100644
> --- a/fs/libfs.c
> +++ b/fs/libfs.c
> @@ -948,7 +948,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
>  
>  	mutex_lock(&inode->i_mutex);
>  	ret = sync_mapping_buffers(inode->i_mapping);
> -	if (!(inode->i_state & I_DIRTY))
> +	if (!(inode->i_state & I_DIRTY_ALL))
>  		goto out;
>  	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
>  		goto out;
> diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
> index 73ca174..f98234a 100644
> --- a/fs/proc_namespace.c
> +++ b/fs/proc_namespace.c
> @@ -44,6 +44,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb)
>  		{ MS_SYNCHRONOUS, ",sync" },
>  		{ MS_DIRSYNC, ",dirsync" },
>  		{ MS_MANDLOCK, ",mand" },
> +		{ MS_LAZYTIME, ",lazytime" },
>  		{ 0, NULL }
>  	};
>  	const struct proc_fs_info *fs_infop;
> diff --git a/fs/sync.c b/fs/sync.c
> index bdc729d..6ac7bf0 100644
> --- a/fs/sync.c
> +++ b/fs/sync.c
> @@ -177,8 +177,16 @@ SYSCALL_DEFINE1(syncfs, int, fd)
>   */
>  int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
>  {
> +	struct inode *inode = file->f_mapping->host;
> +
>  	if (!file->f_op->fsync)
>  		return -EINVAL;
> +	if (!datasync && (inode->i_state & I_DIRTY_TIME)) {
> +		spin_lock(&inode->i_lock);
> +		inode->i_state &= ~I_DIRTY_TIME;
> +		spin_unlock(&inode->i_lock);
> +		mark_inode_dirty_sync(inode);
> +	}
>  	return file->f_op->fsync(file, start, end, datasync);
>  }
>  EXPORT_SYMBOL(vfs_fsync_range);
> diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
> index 5da6012..4cdf733 100644
> --- a/include/linux/backing-dev.h
> +++ b/include/linux/backing-dev.h
> @@ -55,6 +55,7 @@ struct bdi_writeback {
>  	struct list_head b_dirty;	/* dirty inodes */
>  	struct list_head b_io;		/* parked for writeback */
>  	struct list_head b_more_io;	/* parked for more writeback */
> +	struct list_head b_dirty_time;	/* time stamps are dirty */
>  	spinlock_t list_lock;		/* protects the b_* lists */
>  };
>  
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 9ab779e..bf00e98 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1720,8 +1720,12 @@ struct super_operations {
>  #define __I_DIO_WAKEUP		9
>  #define I_DIO_WAKEUP		(1 << I_DIO_WAKEUP)
>  #define I_LINKABLE		(1 << 10)
> +#define I_DIRTY_TIME		(1 << 11)
> +#define __I_DIRTY_TIME_EXPIRED	12
> +#define I_DIRTY_TIME_EXPIRED	(1 << __I_DIRTY_TIME_EXPIRED)
>  
>  #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
> +#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
>  
>  extern void __mark_inode_dirty(struct inode *, int);
>  static inline void mark_inode_dirty(struct inode *inode)
> @@ -1884,6 +1888,7 @@ extern int current_umask(void);
>  
>  extern void ihold(struct inode * inode);
>  extern void iput(struct inode *);
> +extern int generic_update_time(struct inode *, struct timespec *, int);
>  
>  static inline struct inode *file_inode(const struct file *f)
>  {
> diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
> index cee02d6..5ecb4c2 100644
> --- a/include/trace/events/writeback.h
> +++ b/include/trace/events/writeback.h
> @@ -18,6 +18,8 @@
>  		{I_FREEING,		"I_FREEING"},		\
>  		{I_CLEAR,		"I_CLEAR"},		\
>  		{I_SYNC,		"I_SYNC"},		\
> +		{I_DIRTY_TIME,		"I_DIRTY_TIME"},	\
> +		{I_DIRTY_TIME_EXPIRED,	"I_DIRTY_TIME_EXPIRED"}, \
>  		{I_REFERENCED,		"I_REFERENCED"}		\
>  	)
>  
> @@ -68,6 +70,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
>  	TP_STRUCT__entry (
>  		__array(char, name, 32)
>  		__field(unsigned long, ino)
> +		__field(unsigned long, state)
>  		__field(unsigned long, flags)
>  	),
>  
> @@ -78,16 +81,25 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
>  		strncpy(__entry->name,
>  			bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32);
>  		__entry->ino		= inode->i_ino;
> +		__entry->state		= inode->i_state;
>  		__entry->flags		= flags;
>  	),
>  
> -	TP_printk("bdi %s: ino=%lu flags=%s",
> +	TP_printk("bdi %s: ino=%lu state=%s flags=%s",
>  		__entry->name,
>  		__entry->ino,
> +		show_inode_state(__entry->state),
>  		show_inode_state(__entry->flags)
>  	)
>  );
>  
> +DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty,
> +
> +	TP_PROTO(struct inode *inode, int flags),
> +
> +	TP_ARGS(inode, flags)
> +);
> +
>  DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start,
>  
>  	TP_PROTO(struct inode *inode, int flags),
> @@ -598,6 +610,52 @@ DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode,
>  	TP_ARGS(inode, wbc, nr_to_write)
>  );
>  
> +DECLARE_EVENT_CLASS(writeback_lazytime_template,
> +	TP_PROTO(struct inode *inode),
> +
> +	TP_ARGS(inode),
> +
> +	TP_STRUCT__entry(
> +		__field(	dev_t,	dev			)
> +		__field(unsigned long,	ino			)
> +		__field(unsigned long,	state			)
> +		__field(	__u16, mode			)
> +		__field(unsigned long, dirtied_when		)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->dev	= inode->i_sb->s_dev;
> +		__entry->ino	= inode->i_ino;
> +		__entry->state	= inode->i_state;
> +		__entry->mode	= inode->i_mode;
> +		__entry->dirtied_when = inode->dirtied_when;
> +	),
> +
> +	TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->ino, __entry->dirtied_when,
> +		  show_inode_state(__entry->state), __entry->mode)
> +);
> +
> +DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime,
> +	TP_PROTO(struct inode *inode),
> +
> +	TP_ARGS(inode)
> +);
> +
> +DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime_iput,
> +	TP_PROTO(struct inode *inode),
> +
> +	TP_ARGS(inode)
> +);
> +
> +DEFINE_EVENT(writeback_lazytime_template, writeback_dirty_inode_enqueue,
> +
> +	TP_PROTO(struct inode *inode),
> +
> +	TP_ARGS(inode)
> +);
> +
>  #endif /* _TRACE_WRITEBACK_H */
>  
>  /* This part must be outside protection */
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index 3735fa0..9b964a5 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -90,6 +90,7 @@ struct inodes_stat_t {
>  #define MS_KERNMOUNT	(1<<22) /* this is a kern_mount call */
>  #define MS_I_VERSION	(1<<23) /* Update inode I_version field */
>  #define MS_STRICTATIME	(1<<24) /* Always perform atime updates */
> +#define MS_LAZYTIME	(1<<25) /* Update the on-disk [acm]times lazily */
>  
>  /* These sb flags are internal to the kernel */
>  #define MS_NOSEC	(1<<28)
> @@ -100,7 +101,8 @@ struct inodes_stat_t {
>  /*
>   * Superblock flags that can be altered by MS_REMOUNT
>   */
> -#define MS_RMT_MASK	(MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION)
> +#define MS_RMT_MASK	(MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION|\
> +			 MS_LAZYTIME)
>  
>  /*
>   * Old magic mount flag and mask
> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
> index 0ae0df5..915feea 100644
> --- a/mm/backing-dev.c
> +++ b/mm/backing-dev.c
> @@ -69,10 +69,10 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
>  	unsigned long background_thresh;
>  	unsigned long dirty_thresh;
>  	unsigned long bdi_thresh;
> -	unsigned long nr_dirty, nr_io, nr_more_io;
> +	unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
>  	struct inode *inode;
>  
> -	nr_dirty = nr_io = nr_more_io = 0;
> +	nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
>  	spin_lock(&wb->list_lock);
>  	list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
>  		nr_dirty++;
> @@ -80,6 +80,9 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
>  		nr_io++;
>  	list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
>  		nr_more_io++;
> +	list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list)
> +		if (inode->i_state & I_DIRTY_TIME)
> +			nr_dirty_time++;
>  	spin_unlock(&wb->list_lock);
>  
>  	global_dirty_limits(&background_thresh, &dirty_thresh);
> @@ -98,6 +101,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
>  		   "b_dirty:            %10lu\n"
>  		   "b_io:               %10lu\n"
>  		   "b_more_io:          %10lu\n"
> +		   "b_dirty_time:       %10lu\n"
>  		   "bdi_list:           %10u\n"
>  		   "state:              %10lx\n",
>  		   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
> @@ -111,6 +115,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
>  		   nr_dirty,
>  		   nr_io,
>  		   nr_more_io,
> +		   nr_dirty_time,
>  		   !list_empty(&bdi->bdi_list), bdi->state);
>  #undef K
>  
> @@ -418,6 +423,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
>  	INIT_LIST_HEAD(&wb->b_dirty);
>  	INIT_LIST_HEAD(&wb->b_io);
>  	INIT_LIST_HEAD(&wb->b_more_io);
> +	INIT_LIST_HEAD(&wb->b_dirty_time);
>  	spin_lock_init(&wb->list_lock);
>  	INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
>  }
> -- 
> 2.1.0
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@...r.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ