[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1397647830-24444-3-git-send-email-wenqing.lz@taobao.com>
Date: Wed, 16 Apr 2014 19:30:28 +0800
From: Zheng Liu <gnehzuil.liu@...il.com>
To: linux-ext4@...r.kernel.org
Cc: "Theodore Ts'o" <tytso@....edu>,
Andreas Dilger <adilger.kernel@...ger.ca>,
Jan Kara <jack@...e.cz>, Zheng Liu <wenqing.lz@...bao.com>
Subject: [RFC PATCH v2 2/4] ext4: track extent status tree shrinker delay statictics
From: Zheng Liu <wenqing.lz@...bao.com>
This commit adds some statictics in extent status tree shrinker. The
purpose to add these is that we want to collect more details when we
encounter a stall caused by extent status tree shrinker. Here we count
the following statictics:
stats:
the number of all objects on all extent status trees
the number of reclaimable objects on lru list
the last sorted interval
the number of inodes on lru list
average:
scan time for shrinking some objects
the number of shrunk objects
maximum:
the inode that has max nr. of objects on lru list
the maximum scan time for shrinking some objects
The output looks like below:
$ cat /proc/fs/ext4/sda1/es_shrinker_info
stats:
28228 objects
6341 reclaimable objects
586 ms last sorted interval
250 inodes on lru list
average:
153 us scan time
128 shrunk objects
maximum:
255 inode (255 objects, 198 reclaimable)
125723 us max scan time
If the lru list has never been sorted, the following line will not be
printed:
586ms last sorted interval
If there is an empty lru list, the following lines also will not be
printed:
250 inodes on lru list
...
maximum:
255 inode (255 objects, 198 reclaimable)
0 us max scan time
Meanwhile in this commit a new trace point is defined to print some
details in __ext4_es_shrink().
Cc: "Theodore Ts'o" <tytso@....edu>
Cc: Andreas Dilger <adilger.kernel@...ger.ca>
Cc: Jan Kara <jack@...e.cz>
Reviewed-by: Jan Kara <jack@...e.cz>
Signed-off-by: Zheng Liu <wenqing.lz@...bao.com>
---
fs/ext4/ext4.h | 4 +-
fs/ext4/extents_status.c | 176 ++++++++++++++++++++++++++++++++++++++++---
fs/ext4/extents_status.h | 11 ++-
fs/ext4/super.c | 17 ++---
include/trace/events/ext4.h | 31 ++++++++
5 files changed, 216 insertions(+), 23 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f1c65dc..89f3dfa 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -890,6 +890,7 @@ struct ext4_inode_info {
struct ext4_es_tree i_es_tree;
rwlock_t i_es_lock;
struct list_head i_es_lru;
+ unsigned int i_es_all_nr; /* protected by i_es_lock */
unsigned int i_es_lru_nr; /* protected by i_es_lock */
unsigned long i_touch_when; /* jiffies of last accessing */
@@ -1329,8 +1330,7 @@ struct ext4_sb_info {
/* Reclaim extents from extent status tree */
struct shrinker s_es_shrinker;
struct list_head s_es_lru;
- unsigned long s_es_last_sorted;
- struct percpu_counter s_extent_cache_cnt;
+ struct ext4_es_stats s_es_stats;
struct mb_cache *s_mb_cache;
spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 090e69a..9d977a3 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -11,6 +11,8 @@
*/
#include <linux/rbtree.h>
#include <linux/list_sort.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
#include "ext4.h"
#include "extents_status.h"
@@ -313,19 +315,27 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
*/
if (!ext4_es_is_delayed(es)) {
EXT4_I(inode)->i_es_lru_nr++;
- percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
+ percpu_counter_inc(&EXT4_SB(inode->i_sb)->
+ s_es_stats.es_stats_lru_cnt);
}
+ EXT4_I(inode)->i_es_all_nr++;
+ percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
+
return es;
}
static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
{
+ EXT4_I(inode)->i_es_all_nr--;
+ percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
+
/* Decrease the lru counter when this es is not delayed */
if (!ext4_es_is_delayed(es)) {
BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
EXT4_I(inode)->i_es_lru_nr--;
- percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
+ percpu_counter_dec(&EXT4_SB(inode->i_sb)->
+ s_es_stats.es_stats_lru_cnt);
}
kmem_cache_free(ext4_es_cachep, es);
@@ -927,11 +937,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
struct ext4_inode_info *locked_ei)
{
struct ext4_inode_info *ei;
+ struct ext4_es_stats *es_stats;
struct list_head *cur, *tmp;
LIST_HEAD(skipped);
+ ktime_t start_time;
+ u64 scan_time;
int nr_shrunk = 0;
int retried = 0, skip_precached = 1, nr_skipped = 0;
+ es_stats = &sbi->s_es_stats;
+ start_time = ktime_get();
spin_lock(&sbi->s_es_lru_lock);
retry:
@@ -942,7 +957,8 @@ retry:
* If we have already reclaimed all extents from extent
* status tree, just stop the loop immediately.
*/
- if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
+ if (percpu_counter_read_positive(
+ &es_stats->es_stats_lru_cnt) == 0)
break;
ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
@@ -952,7 +968,7 @@ retry:
* time. Normally we try hard to avoid shrinking
* precached inodes, but we will as a last resort.
*/
- if ((sbi->s_es_last_sorted < ei->i_touch_when) ||
+ if ((es_stats->es_stats_last_sorted < ei->i_touch_when) ||
(skip_precached && ext4_test_inode_state(&ei->vfs_inode,
EXT4_STATE_EXT_PRECACHED))) {
nr_skipped++;
@@ -986,7 +1002,7 @@ retry:
if ((nr_shrunk == 0) && nr_skipped && !retried) {
retried++;
list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
- sbi->s_es_last_sorted = jiffies;
+ es_stats->es_stats_last_sorted = jiffies;
ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
i_es_lru);
/*
@@ -1004,6 +1020,22 @@ retry:
if (locked_ei && nr_shrunk == 0)
nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
+ scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+ if (likely(es_stats->es_stats_scan_time))
+ es_stats->es_stats_scan_time = (scan_time +
+ es_stats->es_stats_scan_time*3) / 4;
+ else
+ es_stats->es_stats_scan_time = scan_time;
+ if (scan_time > es_stats->es_stats_max_scan_time)
+ es_stats->es_stats_max_scan_time = scan_time;
+ if (likely(es_stats->es_stats_shrunk))
+ es_stats->es_stats_shrunk = (nr_shrunk +
+ es_stats->es_stats_shrunk*3) / 4;
+ else
+ es_stats->es_stats_shrunk = nr_shrunk;
+
+ trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached,
+ nr_skipped, retried);
return nr_shrunk;
}
@@ -1014,7 +1046,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
struct ext4_sb_info *sbi;
sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
- nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
+ nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
return nr;
}
@@ -1027,7 +1059,7 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
int nr_to_scan = sc->nr_to_scan;
int ret, nr_shrunk;
- ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
+ ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
if (!nr_to_scan)
@@ -1039,19 +1071,143 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
return nr_shrunk;
}
-void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
+static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos)
+{
+ return *pos ? NULL : SEQ_START_TOKEN;
+}
+
+static void *
+ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ return NULL;
+}
+
+static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
+{
+ struct ext4_sb_info *sbi = seq->private;
+ struct ext4_es_stats *es_stats = &sbi->s_es_stats;
+ struct ext4_inode_info *ei, *max = NULL;
+ unsigned int inode_cnt = 0;
+
+ if (v != SEQ_START_TOKEN)
+ return 0;
+
+ /* here we just find an inode that has the max nr. of objects */
+ spin_lock(&sbi->s_es_lru_lock);
+ list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) {
+ inode_cnt++;
+ if (max && max->i_es_all_nr < ei->i_es_all_nr)
+ max = ei;
+ else if (!max)
+ max = ei;
+ }
+ spin_unlock(&sbi->s_es_lru_lock);
+
+ seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n",
+ percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
+ percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt));
+ if (es_stats->es_stats_last_sorted != 0)
+ seq_printf(seq, " %u ms last sorted interval\n",
+ jiffies_to_msecs(jiffies -
+ es_stats->es_stats_last_sorted));
+ if (inode_cnt)
+ seq_printf(seq, " %d inodes on lru list\n", inode_cnt);
+
+ seq_printf(seq, "average:\n %llu us scan time\n",
+ div_u64(es_stats->es_stats_scan_time, 1000));
+ seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk);
+ if (inode_cnt)
+ seq_printf(seq,
+ "maximum:\n %lu inode (%u objects, %u reclaimable)\n"
+ " %llu us max scan time\n",
+ max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr,
+ div_u64(es_stats->es_stats_max_scan_time, 1000));
+
+ return 0;
+}
+
+static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v)
{
+}
+
+static const struct seq_operations ext4_es_seq_shrinker_info_ops = {
+ .start = ext4_es_seq_shrinker_info_start,
+ .next = ext4_es_seq_shrinker_info_next,
+ .stop = ext4_es_seq_shrinker_info_stop,
+ .show = ext4_es_seq_shrinker_info_show,
+};
+
+static int
+ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = seq_open(file, &ext4_es_seq_shrinker_info_ops);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = PDE_DATA(inode);
+ }
+
+ return ret;
+}
+
+static int
+ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file)
+{
+ return seq_release(inode, file);
+}
+
+static const struct file_operations ext4_es_seq_shrinker_info_fops = {
+ .owner = THIS_MODULE,
+ .open = ext4_es_seq_shrinker_info_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = ext4_es_seq_shrinker_info_release,
+};
+
+int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
+{
+ int err;
+
INIT_LIST_HEAD(&sbi->s_es_lru);
spin_lock_init(&sbi->s_es_lru_lock);
- sbi->s_es_last_sorted = 0;
+ sbi->s_es_stats.es_stats_last_sorted = 0;
+ sbi->s_es_stats.es_stats_shrunk = 0;
+ sbi->s_es_stats.es_stats_scan_time = 0;
+ sbi->s_es_stats.es_stats_max_scan_time = 0;
+ err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0);
+ if (err)
+ return err;
+ err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0);
+ if (err)
+ goto err1;
+
sbi->s_es_shrinker.scan_objects = ext4_es_scan;
sbi->s_es_shrinker.count_objects = ext4_es_count;
sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
- register_shrinker(&sbi->s_es_shrinker);
+ err = register_shrinker(&sbi->s_es_shrinker);
+ if (err)
+ goto err2;
+
+ if (sbi->s_proc)
+ proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
+ &ext4_es_seq_shrinker_info_fops, sbi);
+
+ return 0;
+
+err2:
+ percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
+err1:
+ percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
+ return err;
}
void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
{
+ if (sbi->s_proc)
+ remove_proc_entry("es_shrinker_info", sbi->s_proc);
+ percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
+ percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
unregister_shrinker(&sbi->s_es_shrinker);
}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index f1b62a4..647c3c9 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -64,6 +64,15 @@ struct ext4_es_tree {
struct extent_status *cache_es; /* recently accessed extent */
};
+struct ext4_es_stats {
+ unsigned long es_stats_last_sorted;
+ unsigned long es_stats_shrunk;
+ u64 es_stats_scan_time;
+ u64 es_stats_max_scan_time;
+ struct percpu_counter es_stats_all_cnt;
+ struct percpu_counter es_stats_lru_cnt;
+};
+
extern int __init ext4_init_es(void);
extern void ext4_exit_es(void);
extern void ext4_es_init_tree(struct ext4_es_tree *tree);
@@ -138,7 +147,7 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es,
(pb & ~ES_MASK));
}
-extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_lru_add(struct inode *inode);
extern void ext4_es_lru_del(struct inode *inode);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f3c6670..df0fb22 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -820,7 +820,6 @@ static void ext4_put_super(struct super_block *sb)
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
- percpu_counter_destroy(&sbi->s_extent_cache_cnt);
brelse(sbi->s_sbh);
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
@@ -884,6 +883,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ext4_es_init_tree(&ei->i_es_tree);
rwlock_init(&ei->i_es_lock);
INIT_LIST_HEAD(&ei->i_es_lru);
+ ei->i_es_all_nr = 0;
ei->i_es_lru_nr = 0;
ei->i_touch_when = 0;
ei->i_reserved_data_blocks = 0;
@@ -3890,10 +3890,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_err_report.data = (unsigned long) sb;
/* Register extent status tree shrinker */
- ext4_es_register_shrinker(sbi);
-
- err = percpu_counter_init(&sbi->s_freeclusters_counter,
- ext4_count_free_clusters(sb));
+ err = ext4_es_register_shrinker(sbi);
+ if (!err) {
+ err = percpu_counter_init(&sbi->s_freeclusters_counter,
+ ext4_count_free_clusters(sb));
+ }
if (!err) {
err = percpu_counter_init(&sbi->s_freeinodes_counter,
ext4_count_free_inodes(sb));
@@ -3905,9 +3906,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (!err) {
err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0);
}
- if (!err) {
- err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0);
- }
if (err) {
ext4_msg(sb, KERN_ERR, "insufficient memory");
goto failed_mount3;
@@ -4220,8 +4218,8 @@ failed_mount_wq:
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
}
-failed_mount3:
ext4_es_unregister_shrinker(sbi);
+failed_mount3:
del_timer_sync(&sbi->s_err_report);
if (sbi->s_flex_groups)
ext4_kvfree(sbi->s_flex_groups);
@@ -4229,7 +4227,6 @@ failed_mount3:
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
- percpu_counter_destroy(&sbi->s_extent_cache_cnt);
if (sbi->s_mmp_tsk)
kthread_stop(sbi->s_mmp_tsk);
failed_mount2:
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index c1d7a7c..996e58b 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -2460,6 +2460,37 @@ TRACE_EVENT(ext4_collapse_range,
__entry->offset, __entry->len)
);
+TRACE_EVENT(ext4_es_shrink,
+ TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time,
+ int skip_precached, int nr_skipped, int retried),
+
+ TP_ARGS(sb, nr_shrunk, scan_time, skip_precached, nr_skipped, retried),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( int, nr_shrunk )
+ __field( unsigned long long, scan_time )
+ __field( int, skip_precached )
+ __field( int, nr_skipped )
+ __field( int, retried )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = sb->s_dev;
+ __entry->nr_shrunk = nr_shrunk;
+ __entry->scan_time = div_u64(scan_time, 1000);
+ __entry->skip_precached = skip_precached;
+ __entry->nr_skipped = nr_skipped;
+ __entry->retried = retried;
+ ),
+
+ TP_printk("dev %d,%d nr_shrunk %d, scan_time %llu skip_precached %d "
+ "nr_skipped %d retried %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk,
+ __entry->scan_time, __entry->skip_precached,
+ __entry->nr_skipped, __entry->retried)
+);
+
#endif /* _TRACE_EXT4_H */
/* This part must be outside protection */
--
1.7.9.7
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists