[<prev] [next>] [day] [month] [year] [list]
Message-Id: <1332288013-14632-1-git-send-email-asharma@fb.com>
Date: Tue, 20 Mar 2012 17:00:13 -0700
From: Arun Sharma <asharma@...com>
To: linux-kernel@...r.kernel.org
Cc: Arun Sharma <asharma@...com>, Tejun Heo <tj@...nel.org>,
axboe@...nel.dk, mingo@...hat.com, rostedt@...dmis.org,
fweisbec@...il.com, teravest@...gle.com, slavapestov@...gle.com,
ctalbott@...gle.com, dsharp@...gle.com,
Namhyung Kim <namhyung.kim@....com>
Subject: [RFC][PATCH] trace: add last read file to sched_stat_iowait
This is a lighter weight, but potentially imprecise way of figuring
out which files are responsible for a process to enter iowait.
It only keeps track of read code paths (vfs_read) and ignores
writeback related iowaits.
Sample output:
localhost ~ # perf record -ae sched:sched_stat_iowait -- sleep 10 &
<do some io>
localhosfd~ # perf script
swapper 0 [000] 176.082523: sched_stat_iowait: comm=cat pid=1387 delay=1362 [ns] ino=116899
swapper 0 [000] 176.082523: sched_stat_iowait: comm=cat pid=1387 delay=1362 [ns] ino=116899
swapper 0 [000] 176.082523: sched_stat_iowait: comm=cat pid=1387 delay=1362 [ns] ino=116899
localhost ~ # find /usr -inum 116899
/usr/tests/sets/files/testStaticFileSet.py
Signed-off-by: Arun Sharma <asharma@...com>
Cc: Tejun Heo <tj@...nel.org>
Cc: axboe@...nel.dk
Cc: mingo@...hat.com
Cc: rostedt@...dmis.org
Cc: fweisbec@...il.com
Cc: teravest@...gle.com
Cc: slavapestov@...gle.com
Cc: ctalbott@...gle.com
Cc: dsharp@...gle.com
Cc: Namhyung Kim <namhyung.kim@....com>
Cc: linux-kernel@...r.kernel.org
diff --git a/fs/file.c b/fs/file.c
index 4c6992d..eb77c95 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -304,6 +304,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
spin_lock_init(&newf->file_lock);
newf->next_fd = 0;
+ newf->last_read = NULL;
new_fdt = &newf->fdtab;
new_fdt->max_fds = NR_OPEN_DEFAULT;
new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
diff --git a/fs/read_write.c b/fs/read_write.c
index 5ad4248..c5a8e13 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -8,6 +8,7 @@
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/file.h>
+#include <linux/fdtable.h>
#include <linux/uio.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
@@ -375,6 +376,8 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
ret = rw_verify_area(READ, file, pos, count);
if (ret >= 0) {
count = ret;
+ if (current->files)
+ current->files->last_read = file;
if (file->f_op->read)
ret = file->f_op->read(file, buf, count, pos);
else
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 82163c4..0868880 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -53,6 +53,7 @@ struct files_struct {
*/
spinlock_t file_lock ____cacheline_aligned_in_smp;
int next_fd;
+ struct file *last_read;
struct embedded_fd_set close_on_exec_init;
struct embedded_fd_set open_fds_init;
struct file __rcu * fd_array[NR_OPEN_DEFAULT];
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index e33ed1b..3e96ea7 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -326,9 +326,41 @@ DEFINE_EVENT(sched_stat_template, sched_stat_sleep,
* Tracepoint for accounting iowait time (time the task is not runnable
* due to waiting on IO to complete).
*/
-DEFINE_EVENT(sched_stat_template, sched_stat_iowait,
- TP_PROTO(struct task_struct *tsk, u64 delay),
- TP_ARGS(tsk, delay));
+TRACE_EVENT(sched_stat_iowait,
+
+ TP_PROTO(struct task_struct *tsk, u64 delay, struct file *file),
+
+ TP_ARGS(tsk, delay, file),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( u64, delay )
+ __field( ino_t, ino )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->delay = delay;
+ if (file) {
+ struct dentry *dentry = file->f_path.dentry;
+ struct inode *inode = NULL;
+ if (dentry)
+ inode = dentry->d_inode;
+ if (inode)
+ __entry->ino = inode->i_ino;
+ }
+ )
+ TP_perf_assign(
+ __perf_count(delay);
+ ),
+
+ TP_printk("comm=%s pid=%d delay=%Lu [ns] ino=%ld",
+ __entry->comm, __entry->pid,
+ (unsigned long long)__entry->delay,
+ (unsigned long) __entry->ino)
+);
/*
* Tracepoint for accounting blocked time (time the task is in uninterruptible).
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index aca16b8..5283990 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,7 @@
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
+#include <linux/fdtable.h>
#include <trace/events/sched.h>
@@ -1025,9 +1026,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
if (tsk) {
if (tsk->in_iowait) {
+ struct file *last_read_file = NULL;
+ if (tsk->files)
+ last_read_file = tsk->files->last_read;
se->statistics.iowait_sum += delta;
se->statistics.iowait_count++;
- trace_sched_stat_iowait(tsk, delta);
+ trace_sched_stat_iowait(tsk, delta, last_read_file);
}
trace_sched_stat_blocked(tsk, delta);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists