---
 block/blk-sysfs.c           |   25 +
 include/linux/backing-dev.h |    1 
 include/linux/fs.h          |   44 +++
 include/linux/radix-tree.h  |    2 
 lib/radix-tree.c            |   37 ++
 mm/Kconfig                  |   18 +
 mm/Makefile                 |    1 
 mm/filemap.c                |    2 
 mm/readahead.c              |  166 +++++++++--
 mm/readahead_trace.c        |  486 ++++++++++++++++++++++++++++++++++
 10 files changed, 754 insertions(+), 28 deletions(-)

--- mm.orig/mm/readahead.c
+++ mm/mm/readahead.c
@@ -16,6 +16,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/pagevec.h>
 #include <linux/pagemap.h>
+#include <linux/marker.h>
 
 void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 {
@@ -204,6 +205,10 @@ int force_page_cache_readahead(struct ad
 		offset += this_chunk;
 		nr_to_read -= this_chunk;
 	}
+
+	trace_mark(readahead_fadvise, "%p %p %lu %lu %d",
+		   mapping, filp, offset, nr_to_read, ret);
+
 	return ret;
 }
 
@@ -217,10 +222,18 @@ int force_page_cache_readahead(struct ad
 int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 			pgoff_t offset, unsigned long nr_to_read)
 {
+	int actual;
+
 	if (bdi_read_congested(mapping->backing_dev_info))
 		return -1;
 
-	return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
+	actual = __do_page_cache_readahead(mapping, filp, offset,
+					   nr_to_read, 0);
+
+	trace_mark(readahead_mmap, "%p %p %lu %lu %d",
+		   mapping, filp, offset, nr_to_read, actual);
+
+	return actual;
 }
 
 /*
@@ -248,14 +261,21 @@ subsys_initcall(readahead_init);
 /*
  * Submit IO for the read-ahead request in file_ra_state.
  */
-static unsigned long ra_submit(struct file_ra_state *ra,
-		struct address_space *mapping, struct file *filp)
+static unsigned long ra_submit(struct address_space *mapping,
+			       struct file *filp,
+			       pgoff_t offset,
+			       unsigned long req_size,
+			       struct file_ra_state *ra,
+			       int async)
 {
 	int actual;
 
 	actual = __do_page_cache_readahead(mapping, filp,
 					ra->start, ra->size, ra->async_size);
 
+	trace_mark(readahead_generic, "%p %p %lu %lu %p %d %d",
+		   mapping, filp, offset, req_size, ra, actual, async);
+
 	return actual;
 }
 
@@ -337,6 +357,60 @@ static unsigned long get_next_ra_size(st
  */
 
 /*
+ * Count contiguously cached pages from @offset-1 to @offset-@max,
+ * this count is a conservative estimation of
+ *	- length of the sequential read sequence, or
+ *	- thrashing threshold in memory tight systems
+ */
+static unsigned long count_history_pages(struct address_space *mapping,
+					 struct file_ra_state *ra,
+					 pgoff_t offset, unsigned long max)
+{
+	pgoff_t head;
+
+	rcu_read_lock();
+	head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
+	rcu_read_unlock();
+
+	return offset - 1 - head;
+}
+
+/*
+ * page cache context based read-ahead
+ */
+static int try_context_readahead(struct address_space *mapping,
+				 struct file_ra_state *ra,
+				 pgoff_t offset,
+				 unsigned long req_size,
+				 unsigned long max)
+{
+	unsigned long size;
+
+	size = count_history_pages(mapping, ra, offset, max);
+
+	/*
+	 * no history pages:
+	 * it could be a random read
+	 */
+	if (!size)
+		return 0;
+
+	/*
+	 * starts from beginning of file:
+	 * it is a strong indication of long-run stream (or whole-file-read)
+	 */
+	if (size >= offset)
+		size *= 2;
+
+	ra_set_pattern(ra, RA_PATTERN_HAS_TRAIL);
+	ra->start = offset;
+	ra->size = get_init_ra_size(size + req_size, max);
+	ra->async_size = ra->size;
+
+	return 1;
+}
+
+/*
 + * A minimal readahead algorithm for trivial sequential/random reads.
  */
 static unsigned long
@@ -345,34 +419,30 @@ ondemand_readahead(struct address_space
 		   bool hit_readahead_marker, pgoff_t offset,
 		   unsigned long req_size)
 {
-	int max = ra->ra_pages;	/* max readahead pages */
-	pgoff_t prev_offset;
-	int sequential;
+	unsigned long max = ra->ra_pages;	/* max readahead pages */
+	pgoff_t prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT;
+
+	/*
+	 * start of file
+	 */
+	if (!offset) {
+		ra_set_pattern(ra, RA_PATTERN_INITIAL0);
+		goto initial_readahead;
+	}
 
 	/*
 	 * It's the expected callback offset, assume sequential access.
 	 * Ramp up sizes, and push forward the readahead window.
 	 */
-	if (offset && (offset == (ra->start + ra->size - ra->async_size) ||
-			offset == (ra->start + ra->size))) {
+	if (offset == (ra->start + ra->size - ra->async_size) ||
+	    offset == (ra->start + ra->size)) {
+		ra_set_pattern(ra, RA_PATTERN_SUBSEQUENT);
 		ra->start += ra->size;
 		ra->size = get_next_ra_size(ra, max);
 		ra->async_size = ra->size;
 		goto readit;
 	}
 
-	prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT;
-	sequential = offset - prev_offset <= 1UL || req_size > max;
-
-	/*
-	 * Standalone, small read.
-	 * Read as is, and do not pollute the readahead state.
-	 */
-	if (!hit_readahead_marker && !sequential) {
-		return __do_page_cache_readahead(mapping, filp,
-						offset, req_size, 0);
-	}
-
 	/*
 	 * Hit a marked page without valid readahead state.
 	 * E.g. interleaved reads.
@@ -389,6 +459,7 @@ ondemand_readahead(struct address_space
 		if (!start || start - offset > max)
 			return 0;
 
+		ra_set_pattern(ra, RA_PATTERN_HIT_MARKER);
 		ra->start = start;
 		ra->size = start - offset;	/* old async_size */
 		ra->size = get_next_ra_size(ra, max);
@@ -397,18 +468,59 @@ ondemand_readahead(struct address_space
 	}
 
 	/*
-	 * It may be one of
-	 *	- first read on start of file
-	 *	- sequential cache miss
-	 *	- oversize random read
-	 * Start readahead for it.
+	 * oversize read
+	 */
+	if (req_size > max) {
+		ra_set_pattern(ra, RA_PATTERN_OVERSIZE);
+		goto initial_readahead;
+	}
+
+	/*
+	 * sequential cache miss
+	 */
+	if (offset - prev_offset <= 1UL) {
+		ra_set_pattern(ra, RA_PATTERN_INITIAL);
+		goto initial_readahead;
+	}
+
+	/*
+	 * Query the page cache and look for the traces (cached history pages)
+	 * that a sequential stream would leave behind.
+	 */
+	if (mapping->backing_dev_info->context_readahead != 0 &&
+	    try_context_readahead(mapping, ra, offset, req_size, max))
+		goto readit;
+
+	/*
+	 * Standalone, small read.
+	 * Read as is, and do not pollute the readahead state.
 	 */
+	ra_set_pattern(ra, RA_PATTERN_RANDOM);
+	ra->start = offset;
+	ra->size = req_size;
+	ra->async_size = 0;
+	goto readit;
+
+initial_readahead:
 	ra->start = offset;
 	ra->size = get_init_ra_size(req_size, max);
-	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
+	if (req_size <= max / 2)
+		ra->async_size = ra->size - req_size;
+	else
+		ra->async_size = ra->size;
 
 readit:
-	return ra_submit(ra, mapping, filp);
+	/*
+	 * Will this read hit the readahead marker made by itself?
+	 */
+	if (offset + min(req_size, max) >
+	    ra->start + ra->size - ra->async_size) {
+		ra->async_size = get_next_ra_size(ra, max);
+		ra->size += ra->async_size;
+	}
+
+	return ra_submit(mapping, filp, offset, req_size, ra,
+			 hit_readahead_marker);
 }
 
 /**
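[Note: the window ramp-up above relies on the existing helper get_next_ra_size(), which this patch does not modify. As a point of reference, in kernels of this vintage it looks roughly like the sketch below — small windows are quadrupled, larger ones doubled, and the result is capped at the ra_pages maximum. The readit: block reuses it to merge the next asynchronous window into the current one whenever the current read would immediately hit its own readahead marker:

	static unsigned long get_next_ra_size(struct file_ra_state *ra,
					      unsigned long max)
	{
		unsigned long cur = ra->size;
		unsigned long newsize;

		if (cur < max / 16)
			newsize = 4 * cur;	/* ramp up fast while small */
		else
			newsize = 2 * cur;	/* then keep doubling */

		return min(newsize, max);	/* never exceed ra_pages */
	}
]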
--- mm.orig/block/blk-sysfs.c
+++ mm/block/blk-sysfs.c
@@ -75,6 +75,24 @@ queue_requests_store(struct request_queu
 	return ret;
 }
 
+static ssize_t queue_cxt_ra_show(struct request_queue *q, char *page)
+{
+	unsigned int cra = q->backing_dev_info.context_readahead;
+
+	return queue_var_show(cra, page);
+}
+
+static ssize_t
+queue_cxt_ra_store(struct request_queue *q, const char *page, size_t count)
+{
+	unsigned long cra;
+	ssize_t ret = queue_var_store(&cra, page, count);
+
+	q->backing_dev_info.context_readahead = cra;
+
+	return ret;
+}
+
 static ssize_t queue_ra_show(struct request_queue *q, char *page)
 {
 	int ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10);
@@ -163,6 +181,12 @@ static struct queue_sysfs_entry queue_re
 	.store = queue_requests_store,
 };
 
+static struct queue_sysfs_entry queue_cxt_ra_entry = {
+	.attr = {.name = "context_readahead", .mode = S_IRUGO | S_IWUSR },
+	.show = queue_cxt_ra_show,
+	.store = queue_cxt_ra_store,
+};
+
 static struct queue_sysfs_entry queue_ra_entry = {
 	.attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_ra_show,
@@ -200,6 +224,7 @@ static struct queue_sysfs_entry queue_no
 static struct attribute *default_attrs[] = {
 	&queue_requests_entry.attr,
 	&queue_ra_entry.attr,
+	&queue_cxt_ra_entry.attr,
 	&queue_max_hw_sectors_entry.attr,
 	&queue_max_sectors_entry.attr,
 	&queue_iosched_entry.attr,
--- mm.orig/include/linux/backing-dev.h
+++ mm/include/linux/backing-dev.h
@@ -41,6 +41,7 @@ enum bdi_stat_item {
 
 struct backing_dev_info {
 	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
+	int context_readahead;	/* enable page cache context readahead */
 	unsigned long state;	/* Always use atomic bitops on this */
 	unsigned int capabilities; /* Device capabilities */
 	congested_fn *congested_fn; /* Function pointer if device is md/dm */
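[Note: the new queue attribute defaults to off and sits next to the existing read_ahead_kb knob. Toggling it at run time, assuming a disk named sda:

	# cat /sys/block/sda/queue/context_readahead
	0
	# echo 1 > /sys/block/sda/queue/context_readahead
]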
--- mm.orig/lib/radix-tree.c
+++ mm/lib/radix-tree.c
@@ -665,6 +665,43 @@ unsigned long radix_tree_next_hole(struc
 }
 EXPORT_SYMBOL(radix_tree_next_hole);
 
+/**
+ * radix_tree_prev_hole - find the previous hole (not-present entry)
+ * @root:	tree root
+ * @index:	index key
+ * @max_scan:	maximum range to search
+ *
+ * Search backwards in the range [max(index-max_scan+1, 0), index]
+ * for the first hole.
+ *
+ * Returns: the index of the hole if found, otherwise returns an index
+ * outside of the set specified (in which case 'index - return >= max_scan'
+ * will be true). In the rare case of wrap-around, ULONG_MAX will be returned.
+ *
+ * radix_tree_prev_hole may be called under rcu_read_lock. However, like
+ * radix_tree_gang_lookup, this will not atomically search a snapshot of
+ * the tree at a single point in time. For example, if a hole is created
+ * at index 10, then subsequently a hole is created at index 5,
+ * radix_tree_prev_hole covering both indexes may return 5 if called under
+ * rcu_read_lock.
+ */
+unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
+				   unsigned long index, unsigned long max_scan)
+{
+	unsigned long i;
+
+	for (i = 0; i < max_scan; i++) {
+		if (!radix_tree_lookup(root, index))
+			break;
+		index--;
+		if (index == ULONG_MAX)	/* wrapped around from zero */
+			break;
+	}
+
+	return index;
+}
+EXPORT_SYMBOL(radix_tree_prev_hole);
+
 static unsigned int
 __lookup(struct radix_tree_node *slot, void ***results, unsigned long index,
 	unsigned int max_items, unsigned long *next_index)
--- mm.orig/include/linux/radix-tree.h
+++ mm/include/linux/radix-tree.h
@@ -167,6 +167,8 @@ radix_tree_gang_lookup_slot(struct radix
 			unsigned long first_index, unsigned int max_items);
 unsigned long radix_tree_next_hole(struct radix_tree_root *root,
 				unsigned long index, unsigned long max_scan);
+unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
+				unsigned long index, unsigned long max_scan);
 int radix_tree_preload(gfp_t gfp_mask);
 void radix_tree_init(void);
 void *radix_tree_tag_set(struct radix_tree_root *root,
--- mm.orig/include/linux/fs.h
+++ mm/include/linux/fs.h
@@ -789,11 +789,55 @@ struct file_ra_state {
 					   there are only # of pages ahead */
 
 	unsigned int ra_pages;		/* Maximum readahead window */
+	unsigned int flags;
 	int mmap_miss;			/* Cache miss stat for mmap accesses */
 	loff_t prev_pos;		/* Cache last read() position */
 };
 
 /*
+ * Which policy made the decision to issue the current read-ahead IO?
+ */
+#define	RA_PATTERN_SHIFT	0
+#define	RA_PATTERN_MASK		0xf
+
+enum readahead_pattern {
+	RA_PATTERN_NONE,
+	RA_PATTERN_INITIAL0,	/* start of file */
+	RA_PATTERN_INITIAL,
+	RA_PATTERN_SUBSEQUENT,
+	RA_PATTERN_HIT_MARKER,
+	RA_PATTERN_HAS_TRAIL,
+	RA_PATTERN_OVERSIZE,
+	RA_PATTERN_REVERSE,
+	RA_PATTERN_STRIDE,
+	RA_PATTERN_THRASH,
+	RA_PATTERN_MMAP_AROUND,
+	RA_PATTERN_FADVISE,
+	RA_PATTERN_RANDOM,
+	RA_PATTERN_ALL,		/* for collecting stats */
+	RA_PATTERN_MAX
+};
+
+static inline int ra_pattern(struct file_ra_state *ra)
+{
+	int pattern = (ra->flags >> RA_PATTERN_SHIFT) & RA_PATTERN_MASK;
+
+	if (pattern >= RA_PATTERN_MAX)
+		pattern = RA_PATTERN_NONE;
+
+	return pattern;
+}
+
+/*
+ * Which method is issuing this read-ahead?
+ */
+static inline void ra_set_pattern(struct file_ra_state *ra, int pattern)
+{
+	ra->flags = (ra->flags & ~RA_PATTERN_MASK) |
+		    (pattern << RA_PATTERN_SHIFT);
+}
+
+/*
  * Check if @index falls in the readahead windows.
  */
 static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
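[Note: a minimal user-space sketch of the new bit-field helpers, assuming the enum and the two inlines are copied out of fs.h — not kernel code. ra_set_pattern() masks out the old pattern before storing the new one, so bits outside RA_PATTERN_MASK survive, and ra_pattern() degrades out-of-range values to RA_PATTERN_NONE:

	#include <assert.h>

	int main(void)
	{
		struct file_ra_state ra = { .flags = 0 };

		ra_set_pattern(&ra, RA_PATTERN_SUBSEQUENT);
		assert(ra_pattern(&ra) == RA_PATTERN_SUBSEQUENT);

		/* a new pattern replaces the old one, it is not OR-ed in */
		ra_set_pattern(&ra, RA_PATTERN_RANDOM);
		assert(ra_pattern(&ra) == RA_PATTERN_RANDOM);

		return 0;
	}
]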
--- /dev/null
+++ mm/mm/readahead_trace.c
@@ -0,0 +1,486 @@
+/*
+ * Readahead I/O tracing and accounting
+ *
+ * Copyright 2008 Wu Fengguang, Intel Corporation
+ * Subject to the GNU Public License, version 2 or later.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/splice.h>
+#include <linux/version.h>
+#include <linux/kprobes.h>
+#include <linux/marker.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+
+int option_read_jprobes;
+u32 option_trace_enable;
+u32 option_trace_pid;
+u32 option_trace_ino;
+
+struct readahead_trace_probe {
+	const char *name;
+	const char *format;
+	marker_probe_func *probe_func;
+	int pattern;
+};
+
+enum ra_account {
+	/* number of readaheads */
+	RA_ACCOUNT_COUNT,
+	RA_ACCOUNT_EOF,
+	RA_ACCOUNT_SYNC,
+	/* number of pages */
+	RA_ACCOUNT_SIZE,
+	RA_ACCOUNT_ASIZE,
+	RA_ACCOUNT_ACTUAL,
+	/* end mark */
+	RA_ACCOUNT_MAX,
+};
+
+static const char * const ra_pattern_names[] = {
+	[RA_PATTERN_NONE]	 = "none",
+	[RA_PATTERN_INITIAL0]	 = "initial0",
+	[RA_PATTERN_INITIAL]	 = "initial",
+	[RA_PATTERN_SUBSEQUENT]	 = "subsequent",
+	[RA_PATTERN_HIT_MARKER]	 = "marker",
+	[RA_PATTERN_HAS_TRAIL]	 = "trail",
+	[RA_PATTERN_OVERSIZE]	 = "oversize",
+	[RA_PATTERN_REVERSE]	 = "reverse",
+	[RA_PATTERN_STRIDE]	 = "stride",
+	[RA_PATTERN_THRASH]	 = "thrash",
+	[RA_PATTERN_MMAP_AROUND] = "mmap",
+	[RA_PATTERN_FADVISE]	 = "fadvise",
+	[RA_PATTERN_RANDOM]	 = "random",
+	[RA_PATTERN_ALL]	 = "all",
+};
+
+static unsigned long ra_stats[RA_PATTERN_MAX][RA_ACCOUNT_MAX];
+
+/*
+ * readahead tracing
+ */
+
+static void do_readahead_stats(struct address_space *mapping,
+			       struct file_ra_state *ra,
+			       int actual,
+			       int async,
+			       int pattern)
+{
+	pgoff_t lastpage = (i_size_read(mapping->host) - 1) >> PAGE_CACHE_SHIFT;
+
+	ra_stats[pattern][RA_ACCOUNT_COUNT]++;
+	ra_stats[pattern][RA_ACCOUNT_SIZE] += ra->size;
+	ra_stats[pattern][RA_ACCOUNT_ASIZE] += ra->async_size;
+	ra_stats[pattern][RA_ACCOUNT_ACTUAL] += actual;
+	if (ra->start + ra->size > lastpage)
+		ra_stats[pattern][RA_ACCOUNT_EOF]++;
+	if (!async)
+		ra_stats[pattern][RA_ACCOUNT_SYNC]++;
+}
+
+static void do_readahead_trace(struct address_space *mapping,
+			       struct file *filp,
+			       pgoff_t offset,
+			       unsigned long req_size,
+			       struct file_ra_state *ra,
+			       int actual,
+			       int async)
+{
+	int pattern = ra_pattern(ra);
+
+	BUILD_BUG_ON(ARRAY_SIZE(ra_pattern_names) != RA_PATTERN_MAX);
+
+	do_readahead_stats(mapping, ra, actual, async, pattern);
+	do_readahead_stats(mapping, ra, actual, async, RA_PATTERN_ALL);
+
+	if (!option_trace_enable)
+		return;
+	if (option_trace_pid && option_trace_pid != current->pid)
+		return;
+	if (option_trace_ino && option_trace_ino != mapping->host->i_ino)
+		return;
+
+	printk(KERN_DEBUG "readahead-%s(pid=%d(%s), dev=%02x:%02x(%s), "
+	       "ino=%lu(%s), req=%lu+%lu, ra=%lu+%d-%d, async=%d) = %d\n",
+	       ra_pattern_names[pattern],
+	       current->pid, current->comm,
+	       MAJOR(mapping->host->i_sb->s_dev),
+	       MINOR(mapping->host->i_sb->s_dev),
+	       mapping->host->i_sb->s_id,
+	       mapping->host->i_ino,
+	       filp->f_path.dentry->d_name.name,
+	       offset, req_size,
+	       ra->start, ra->size, ra->async_size,
+	       async,
+	       actual);
+}
+
+static void readahead_trace(void *probe_private, void *call_data,
+			    const char *format, va_list *args)
+{
+	struct address_space *mapping;
+	struct file *filp;
+	pgoff_t offset;
+	unsigned long req_size;
+	struct file_ra_state *ra;
+	int actual;
+	int async;
+
+	mapping = va_arg(*args, typeof(mapping));
+	filp = va_arg(*args, typeof(filp));
+	offset = va_arg(*args, typeof(offset));
+	req_size = va_arg(*args, typeof(req_size));
+	ra = va_arg(*args, typeof(ra));
+	actual = va_arg(*args, typeof(actual));
+	async = va_arg(*args, typeof(async));
+
+	do_readahead_trace(mapping, filp, offset, req_size, ra, actual, async);
+}
+
+static void plain_readahead_trace(void *probe_private, void *call_data,
+				  const char *format, va_list *args)
+{
+	struct readahead_trace_probe *probe = probe_private;
+	struct address_space *mapping;
+	struct file *filp;
+	struct file_ra_state ra = {0};
+	int actual;
+
+	mapping = va_arg(*args, typeof(mapping));
+	filp = va_arg(*args, typeof(filp));
+	ra.start = va_arg(*args, typeof(ra.start));
+	ra.size = va_arg(*args, unsigned long);	/* the marker passes %lu */
+	actual = va_arg(*args, typeof(actual));
+
+	ra_set_pattern(&ra, probe->pattern);
+
+	do_readahead_trace(mapping, filp, 0, 0, &ra, actual, 0);
+}
+
+/*
+ * read tracing
+ */
+
+static void read_trace(const char func_name[],
+		       struct file *file,
+		       loff_t pos, size_t count)
+{
+	struct inode *inode = file->f_mapping->host;
+
+	if (!option_trace_enable)
+		return;
+	if (option_trace_pid && option_trace_pid != current->pid)
+		return;
+	if (option_trace_ino && option_trace_ino != inode->i_ino)
+		return;
+
+	printk(KERN_DEBUG "%s(pid=%d(%s), dev=%02x:%02x(%s), ino=%lu(%s), "
+	       "pos=%llu, count=%zu)\n",
+	       func_name,
+	       current->pid, current->comm,
+	       MAJOR(inode->i_sb->s_dev),
+	       MINOR(inode->i_sb->s_dev),
+	       inode->i_sb->s_id,
+	       inode->i_ino,
+	       file->f_path.dentry->d_name.name,
+	       pos, count);
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24)
+static ssize_t jdo_generic_file_splice_read(struct file *in,
+					    loff_t *ppos,
+					    struct pipe_inode_info *pipe,
+					    size_t len,
+					    unsigned int flags)
+{
+	read_trace("generic_file_splice_read", in, *ppos, len);
+	jprobe_return();
+	return 0;
+}
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,25)
+static void jdo_do_generic_file_read(struct file *filp,
+				     loff_t *ppos,
+				     read_descriptor_t *desc,
+				     read_actor_t actor)
+{
+	read_trace("do_generic_file_read", filp, *ppos, desc->count);
+	jprobe_return();
+}
+#else
+static void jdo_do_generic_mapping_read(struct address_space *mapping,
+					struct file_ra_state *_ra,
+					struct file *filp,
+					loff_t *ppos,
+					read_descriptor_t *desc,
+					read_actor_t actor)
+{
+	read_trace("do_generic_mapping_read", filp, *ppos, desc->count);
+	jprobe_return();
+}
+#endif
+
+/*
+ * jprobes
+ */
+
+#define jprobe_entry(func)			\
+{						\
+	.entry = jdo_##func,			\
+	.kp.symbol_name = __stringify(func)	\
+}
+
+static struct jprobe jprobe_array[] = {
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,25)
+	jprobe_entry(do_generic_file_read),
+#else
+	jprobe_entry(do_generic_mapping_read),
+#endif
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24)
+	jprobe_entry(generic_file_splice_read),
+#endif
+};
+
+static void insert_jprobes(void)
+{
+	int err;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(jprobe_array); i++) {
+		err = register_jprobe(jprobe_array + i);
+		if (err)
+			printk(KERN_ERR "unable to register probe %s\n",
+			       jprobe_array[i].kp.symbol_name);
+	}
+}
+
+static void remove_jprobes(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(jprobe_array); i++)
+		unregister_jprobe(jprobe_array + i);
+}
+
+/*
+ * marker probes
+ */
+
+static struct readahead_trace_probe probe_array[] =
+{
+	{
+		.name	    = "readahead_generic",
+		.format	    = "%p %p %lu %lu %p %d %d",
+		.probe_func = readahead_trace,
+	},
+	{
+		.name	    = "readahead_mmap",
+		.format	    = "%p %p %lu %lu %d",
+		.probe_func = plain_readahead_trace,
+		.pattern    = RA_PATTERN_MMAP_AROUND,
+	},
+	{
+		.name	    = "readahead_fadvise",
+		.format	    = "%p %p %lu %lu %d",
+		.probe_func = plain_readahead_trace,
+		.pattern    = RA_PATTERN_FADVISE,
+	},
+};
+
+static void insert_marker(void)
+{
+	int err;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(probe_array); i++) {
+		err = marker_probe_register(probe_array[i].name,
+					    probe_array[i].format,
+					    probe_array[i].probe_func,
+					    probe_array + i);
+		if (err)
+			printk(KERN_ERR "unable to register probe %s\n",
+			       probe_array[i].name);
+	}
+}
+
+static void remove_marker(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(probe_array); i++)
+		marker_probe_unregister(probe_array[i].name,
+					probe_array[i].probe_func,
+					probe_array + i);
+
+	synchronize_sched();
+}
+
+/*
+ * file operations for readahead_stats
+ */
+
+static int readahead_stats_show(struct seq_file *s, void *_)
+{
+	unsigned long i;
+	unsigned long count;
+
+	seq_printf(s, "%-10s %10s %10s %10s %10s %10s %10s\n",
+		   "pattern",
+		   "count", "sync_count", "eof_count",
+		   "size", "async_size", "actual");
+
+	for (i = 0; i < RA_PATTERN_MAX; i++) {
+		count = ra_stats[i][RA_ACCOUNT_COUNT];
+		if (count == 0)
+			count = 1;	/* avoid divide-by-zero below */
+
+		seq_printf(s, "%-10s %10lu %10lu %10lu %10lu %10lu %10lu\n",
+			   ra_pattern_names[i],
+			   ra_stats[i][RA_ACCOUNT_COUNT],
+			   ra_stats[i][RA_ACCOUNT_SYNC],
+			   ra_stats[i][RA_ACCOUNT_EOF],
+			   ra_stats[i][RA_ACCOUNT_SIZE] / count,
+			   ra_stats[i][RA_ACCOUNT_ASIZE] / count,
+			   ra_stats[i][RA_ACCOUNT_ACTUAL] / count);
+	}
+
+	return 0;
+}
+
+static int readahead_stats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, readahead_stats_show, NULL);
+}
+
+static ssize_t readahead_stats_write(struct file *file, const char __user *buf,
+				     size_t size, loff_t *offset)
+{
+	/* any write resets all counters */
+	memset(ra_stats, 0, sizeof(ra_stats));
+	return size;
+}
+
+static const struct file_operations readahead_stats_fops = {
+	.owner		= THIS_MODULE,
+	.open		= readahead_stats_open,
+	.write		= readahead_stats_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+/*
+ * file operations for read_jprobes
+ */
+
+static int read_jprobes_set(void *data, u64 val)
+{
+	WARN_ON(data != &option_read_jprobes);
+
+	if (val && !option_read_jprobes)
+		insert_jprobes();
+	else if (!val && option_read_jprobes)
+		remove_jprobes();
+
+	*(int *)data = val;
+
+	return 0;
+}
+static int read_jprobes_get(void *data, u64 *val)
+{
+	*val = *(int *)data;
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(read_jprobes_fops,
+			read_jprobes_get, read_jprobes_set, "%llu\n");
+
+/*
+ * debugfs entries: readahead/stats etc.
+ */
+
+struct readahead_debug {
+	struct dentry *root;
+	struct dentry *stats;
+	struct dentry *trace_enable;
+	struct dentry *trace_pid;
+	struct dentry *read_jprobes;
+};
+static struct readahead_debug debug;
+
+static void remove_debugfs(void)
+{
+	/* debugfs_remove(NULL) is a no-op, so partial setups are fine */
+	debugfs_remove(debug.read_jprobes);
+	debugfs_remove(debug.trace_enable);
+	debugfs_remove(debug.trace_pid);
+	debugfs_remove(debug.stats);
+	debugfs_remove(debug.root);
+}
+
+static int create_debugfs(void)
+{
+	debug.root = debugfs_create_dir("readahead", NULL);
+	if (!debug.root)
+		goto error_debugfs;
+
+	debug.stats = debugfs_create_file("stats", 0644, debug.root,
+					  NULL, &readahead_stats_fops);
+	if (!debug.stats)
+		goto error_debugfs;
+
+	debug.trace_enable = debugfs_create_bool("trace_enable", 0644,
+						 debug.root,
+						 &option_trace_enable);
+	if (!debug.trace_enable)
+		goto error_debugfs;
+
+	debug.trace_pid = debugfs_create_u32("trace_pid", 0644, debug.root,
+					     &option_trace_pid);
+	if (!debug.trace_pid)
+		goto error_debugfs;
+
+	debug.read_jprobes = debugfs_create_file("read_jprobes", 0644,
+						 debug.root,
+						 &option_read_jprobes,
+						 &read_jprobes_fops);
+	if (!debug.read_jprobes)
+		goto error_debugfs;
+
+	return 0;
+
+error_debugfs:
+	printk(KERN_ERR "readahead: failed to create debugfs entries\n");
+	remove_debugfs();
+	return -ENOMEM;
+}
+
+/*
+ * module init/exit
+ */
+
+static int __init readahead_probe_init(void)
+{
+	create_debugfs();
+	insert_marker();
+	return 0;
+}
+
+static void __exit readahead_probe_exit(void)
+{
+	if (option_read_jprobes)
+		remove_jprobes();
+	remove_marker();
+	remove_debugfs();
+}
+
+module_init(readahead_probe_init);
+module_exit(readahead_probe_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Wu Fengguang");
+MODULE_DESCRIPTION("readahead tracing and accounting");
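[Note: once the module is loaded, each readahead shows up as a KERN_DEBUG line in the kernel log, gated by the debugfs knobs above. An illustrative session — the numbers are made up, but the line layout follows the printk format in do_readahead_trace():

	# mount -t debugfs none /sys/kernel/debug
	# echo 1 > /sys/kernel/debug/readahead/trace_enable
	# cat /bin/ls > /dev/null
	# dmesg | tail -1
	readahead-initial0(pid=3210(cat), dev=08:02(sda2), ino=49799(ls), req=0+8, ra=0+16-8, async=0) = 16

trace_pid narrows the output to one process, and read_jprobes additionally hooks the read paths with jprobes so that every read()/splice() request is logged as well.]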
--- mm.orig/mm/Kconfig
+++ mm/mm/Kconfig
@@ -208,3 +208,21 @@ config VIRT_TO_BUS
 
 config MMU_NOTIFIER
 	bool
+
+config READAHEAD_TRACE
+	tristate "Readahead tracing"
+	select MARKERS
+	select KPROBES
+	select DEBUG_FS
+	help
+	  This module injects code to show detailed readahead traces and to
+	  do readahead event accounting.
+
+	  To actually get the data:
+
+	  # mount -t debugfs none /sys/kernel/debug
+
+	  After that you can do the following:
+
+	  # cat /sys/kernel/debug/readahead/stats	# check counters
+	  # echo > /sys/kernel/debug/readahead/stats	# reset counters
--- mm.orig/mm/Makefile
+++ mm/mm/Makefile
@@ -35,3 +35,4 @@ obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
+obj-$(CONFIG_READAHEAD_TRACE) += readahead_trace.o
--- mm.orig/mm/filemap.c
+++ mm/mm/filemap.c
@@ -982,7 +982,7 @@ static void shrink_readahead_size_eio(st
  * This is really ugly. But the goto's actually try to clarify some
  * of the logic when it comes to error handling etc.
  */
-static void do_generic_file_read(struct file *filp, loff_t *ppos,
+void do_generic_file_read(struct file *filp, loff_t *ppos,
 		read_descriptor_t *desc, read_actor_t actor)
 {
 	struct address_space *mapping = filp->f_mapping;
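[Note: a hypothetical dump of the stats file, with invented numbers, to show the layout produced by readahead_stats_show(): count/sync_count/eof_count are event totals, while size, async_size and actual are per-pattern averages in pages. Patterns that never fired (none, trail, oversize, reverse, stride, thrash) print as all-zero rows and are trimmed here:

	# cat /sys/kernel/debug/readahead/stats
	pattern         count sync_count  eof_count       size async_size     actual
	initial0          393        393         17          6          5          6
	initial            73         73          6          5          4          5
	subsequent        460          0         28         26         26         25
	marker             46          0          4         23         23         22
	mmap              182        182         28         24          0         23
	fadvise             2          2          0         16          0         16
	random            160        160          5          1          0          1
	all              1316        810         88         14         11         13
]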