Index: linux-2.6/fs/nfs/write.c
===================================================================
--- linux-2.6.orig/fs/nfs/write.c
+++ linux-2.6/fs/nfs/write.c
@@ -237,10 +237,8 @@ static void nfs_end_page_writeback(struc
 	struct nfs_server *nfss = NFS_SERVER(inode);
 
 	end_page_writeback(page);
-	if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) {
+	if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
 		clear_bdi_congested(&nfss->backing_dev_info, WRITE);
-		congestion_end(WRITE);
-	}
 }
 
 /*
@@ -466,6 +464,7 @@ nfs_mark_request_commit(struct nfs_page
 	set_bit(PG_NEED_COMMIT, &(req)->wb_flags);
 	spin_unlock(&nfsi->req_lock);
 	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
 	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 }
 
@@ -552,6 +551,8 @@ static void nfs_cancel_commit_list(struc
 	while(!list_empty(head)) {
 		req = nfs_list_entry(head->next);
 		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
+				BDI_RECLAIMABLE);
 		nfs_list_remove_request(req);
 		clear_bit(PG_NEED_COMMIT, &(req)->wb_flags);
 		nfs_inode_remove_request(req);
@@ -1207,6 +1208,8 @@ nfs_commit_list(struct inode *inode, str
 		nfs_list_remove_request(req);
 		nfs_mark_request_commit(req);
 		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
+				BDI_RECLAIMABLE);
 		nfs_clear_page_writeback(req);
 	}
 	return -ENOMEM;
@@ -1232,6 +1235,8 @@ static void nfs_commit_done(struct rpc_t
 		nfs_list_remove_request(req);
 		clear_bit(PG_NEED_COMMIT, &(req)->wb_flags);
 		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
+				BDI_RECLAIMABLE);
 
 		dprintk("NFS: commit (%s/%Ld %d@%Ld)",
 			req->wb_context->dentry->d_inode->i_sb->s_id,
Index: linux-2.6/include/linux/backing-dev.h
===================================================================
--- linux-2.6.orig/include/linux/backing-dev.h
+++ linux-2.6/include/linux/backing-dev.h
@@ -8,6 +8,9 @@
 #ifndef _LINUX_BACKING_DEV_H
 #define _LINUX_BACKING_DEV_H
 
+#include <linux/percpu_counter.h>
+#include <linux/log2.h>
+#include <linux/proportions.h>
 #include <asm/atomic.h>
 
 struct page;
@@ -24,6 +27,14 @@ enum bdi_state {
 
 typedef int (congested_fn)(void *, int);
 
+enum bdi_stat_item {
+	BDI_RECLAIMABLE,
+	BDI_WRITEBACK,
+	NR_BDI_STAT_ITEMS
+};
+
+#define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
+
 struct backing_dev_info {
 	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
 	unsigned long state;	/* Always use atomic bitops on this */
@@ -32,8 +43,90 @@ struct backing_dev_info {
 	void *congested_data;	/* Pointer to aux data for congested func */
 	void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
 	void *unplug_io_data;
+
+	struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
+
+	struct prop_local_percpu completions;
+	int dirty_exceeded;
 };
 
+int bdi_init(struct backing_dev_info *bdi);
+void bdi_destroy(struct backing_dev_info *bdi);
+
+static inline void __mod_bdi_stat(struct backing_dev_info *bdi,
+		enum bdi_stat_item item, s64 amount)
+{
+	__percpu_counter_add(&bdi->bdi_stat[item], amount, BDI_STAT_BATCH);
+}
+
+static inline void __inc_bdi_stat(struct backing_dev_info *bdi,
+		enum bdi_stat_item item)
+{
+	__mod_bdi_stat(bdi, item, 1);
+}
+
+static inline void inc_bdi_stat(struct backing_dev_info *bdi,
+		enum bdi_stat_item item)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__inc_bdi_stat(bdi, item);
+	local_irq_restore(flags);
+}
+
+static inline void __dec_bdi_stat(struct backing_dev_info *bdi,
+		enum bdi_stat_item item)
+{
+	__mod_bdi_stat(bdi, item, -1);
+}
+
+static inline void dec_bdi_stat(struct backing_dev_info *bdi,
+		enum bdi_stat_item item)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__dec_bdi_stat(bdi, item);
+	local_irq_restore(flags);
+}
+
+static inline s64 bdi_stat(struct backing_dev_info *bdi,
+		enum bdi_stat_item item)
+{
+	return percpu_counter_read_positive(&bdi->bdi_stat[item]);
+}
+
+static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi,
+		enum bdi_stat_item item)
+{
+	return percpu_counter_sum_positive(&bdi->bdi_stat[item]);
+}
+
+static inline s64 bdi_stat_sum(struct backing_dev_info *bdi,
+		enum bdi_stat_item item)
+{
+	s64 sum;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	sum = __bdi_stat_sum(bdi, item);
+	local_irq_restore(flags);
+
+	return sum;
+}
+
+/*
+ * maximal error of a stat counter.
+ */
+static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi)
+{
+#ifdef CONFIG_SMP
+	return nr_cpu_ids * BDI_STAT_BATCH;
+#else
+	return 1;
+#endif
+}
 
 /*
  * Flags in backing_dev_info::capability
@@ -94,7 +187,6 @@ void clear_bdi_congested(struct backing_
 void set_bdi_congested(struct backing_dev_info *bdi, int rw);
 long congestion_wait(int rw, long timeout);
 long congestion_wait_interruptible(int rw, long timeout);
-void congestion_end(int rw);
 
 #define bdi_cap_writeback_dirty(bdi) \
 	(!((bdi)->capabilities & BDI_CAP_NO_WRITEBACK))
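
A minimal usage sketch of the accessors above (not part of the patch; the function is hypothetical). Writeback accounting pairs an increment at submission with a decrement at completion; the __-prefixed variants may only be used where interrupts are already disabled, e.g. under write_lock_irqsave(), while the plain variants disable interrupts themselves:

	static void example_account_writeback(struct backing_dev_info *bdi)
	{
		inc_bdi_stat(bdi, BDI_WRITEBACK);	/* page handed to the device */

		/* ... I/O completes, typically with irqs off ... */
		__dec_bdi_stat(bdi, BDI_WRITEBACK);

		/*
		 * Reads trade accuracy for speed: bdi_stat() may be off by up
		 * to bdi_stat_error() == nr_cpu_ids * BDI_STAT_BATCH, while
		 * bdi_stat_sum() folds all per-cpu deltas under the lock.
		 */
	}
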
Index: linux-2.6/mm/backing-dev.c
===================================================================
--- linux-2.6.orig/mm/backing-dev.c
+++ linux-2.6/mm/backing-dev.c
@@ -5,6 +5,41 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 
+int bdi_init(struct backing_dev_info *bdi)
+{
+	int i, j;
+	int err;
+
+	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
+		err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
+		if (err)
+			goto err;
+	}
+
+	bdi->dirty_exceeded = 0;
+	err = prop_local_init_percpu(&bdi->completions);
+
+	if (err) {
+err:
+		for (j = 0; j < i; j++)
+			percpu_counter_destroy(&bdi->bdi_stat[j]);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL(bdi_init);
+
+void bdi_destroy(struct backing_dev_info *bdi)
+{
+	int i;
+
+	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
+		percpu_counter_destroy(&bdi->bdi_stat[i]);
+
+	prop_local_destroy_percpu(&bdi->completions);
+}
+EXPORT_SYMBOL(bdi_destroy);
+
 static wait_queue_head_t congestion_wqh[2] = {
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
@@ -70,16 +105,3 @@ long congestion_wait_interruptible(int r
 	return ret;
 }
 EXPORT_SYMBOL(congestion_wait_interruptible);
-
-/**
- * congestion_end - wake up sleepers on a congested backing_dev_info
- * @rw: READ or WRITE
- */
-void congestion_end(int rw)
-{
-	wait_queue_head_t *wqh = &congestion_wqh[rw];
-
-	if (waitqueue_active(wqh))
-		wake_up(wqh);
-}
-EXPORT_SYMBOL(congestion_end);
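
The expected lifecycle for the new pair is the usual init/destroy discipline; a sketch (not part of the patch; my_device is hypothetical) for a driver embedding a backing_dev_info:

	struct my_device {
		struct backing_dev_info bdi;
	};

	static int my_device_setup(struct my_device *dev)
	{
		int err;

		err = bdi_init(&dev->bdi);	/* may fail with -ENOMEM */
		if (err)
			return err;
		/* ... further setup ... */
		return 0;
	}

	static void my_device_teardown(struct my_device *dev)
	{
		/* called exactly once per successful bdi_init() */
		bdi_destroy(&dev->bdi);
	}
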
Index: linux-2.6/fs/ext2/balloc.c
===================================================================
--- linux-2.6.orig/fs/ext2/balloc.c
+++ linux-2.6/fs/ext2/balloc.c
@@ -124,7 +124,7 @@ static int reserve_blocks(struct super_b
 		return 0;
 	}
 
-	percpu_counter_mod(&sbi->s_freeblocks_counter, -count);
+	percpu_counter_sub(&sbi->s_freeblocks_counter, count);
 	sb->s_dirt = 1;
 	return count;
 }
@@ -134,7 +134,7 @@ static void release_blocks(struct super_
 	if (count) {
 		struct ext2_sb_info *sbi = EXT2_SB(sb);
 
-		percpu_counter_mod(&sbi->s_freeblocks_counter, count);
+		percpu_counter_add(&sbi->s_freeblocks_counter, count);
 		sb->s_dirt = 1;
 	}
 }
Index: linux-2.6/fs/ext2/ialloc.c
===================================================================
--- linux-2.6.orig/fs/ext2/ialloc.c
+++ linux-2.6/fs/ext2/ialloc.c
@@ -542,7 +542,7 @@ got:
 		goto fail;
 	}
 
-	percpu_counter_mod(&sbi->s_freeinodes_counter, -1);
+	percpu_counter_add(&sbi->s_freeinodes_counter, -1);
 	if (S_ISDIR(mode))
 		percpu_counter_inc(&sbi->s_dirs_counter);
 
Index: linux-2.6/fs/ext3/balloc.c
===================================================================
--- linux-2.6.orig/fs/ext3/balloc.c
+++ linux-2.6/fs/ext3/balloc.c
@@ -570,7 +570,7 @@ do_more:
 		cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) +
 			group_freed);
 	spin_unlock(sb_bgl_lock(sbi, block_group));
-	percpu_counter_mod(&sbi->s_freeblocks_counter, count);
+	percpu_counter_add(&sbi->s_freeblocks_counter, count);
 
 	/* We dirtied the bitmap block */
 	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -1633,7 +1633,7 @@ allocated:
 	gdp->bg_free_blocks_count =
 			cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
-	percpu_counter_mod(&sbi->s_freeblocks_counter, -num);
+	percpu_counter_sub(&sbi->s_freeblocks_counter, num);
 
 	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
 	err = ext3_journal_dirty_metadata(handle, gdp_bh);
Index: linux-2.6/fs/ext3/resize.c
===================================================================
--- linux-2.6.orig/fs/ext3/resize.c
+++ linux-2.6/fs/ext3/resize.c
@@ -884,9 +884,9 @@ int ext3_group_add(struct super_block *s
 		input->reserved_blocks);
 
 	/* Update the free space counts */
-	percpu_counter_mod(&sbi->s_freeblocks_counter,
+	percpu_counter_add(&sbi->s_freeblocks_counter,
 			   input->free_blocks_count);
-	percpu_counter_mod(&sbi->s_freeinodes_counter,
+	percpu_counter_add(&sbi->s_freeinodes_counter,
 			   EXT3_INODES_PER_GROUP(sb));
 
 	ext3_journal_dirty_metadata(handle, sbi->s_sbh);
Index: linux-2.6/fs/ext4/balloc.c
===================================================================
--- linux-2.6.orig/fs/ext4/balloc.c
+++ linux-2.6/fs/ext4/balloc.c
@@ -587,7 +587,7 @@ do_more:
 		cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) +
 			group_freed);
 	spin_unlock(sb_bgl_lock(sbi, block_group));
-	percpu_counter_mod(&sbi->s_freeblocks_counter, count);
+	percpu_counter_add(&sbi->s_freeblocks_counter, count);
 
 	/* We dirtied the bitmap block */
 	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -1647,7 +1647,7 @@ allocated:
 	gdp->bg_free_blocks_count =
 			cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
-	percpu_counter_mod(&sbi->s_freeblocks_counter, -num);
+	percpu_counter_sub(&sbi->s_freeblocks_counter, num);
 
 	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
 	err = ext4_journal_dirty_metadata(handle, gdp_bh);
Index: linux-2.6/fs/ext4/resize.c
===================================================================
--- linux-2.6.orig/fs/ext4/resize.c
+++ linux-2.6/fs/ext4/resize.c
@@ -893,9 +893,9 @@ int ext4_group_add(struct super_block *s
 		input->reserved_blocks);
 
 	/* Update the free space counts */
-	percpu_counter_mod(&sbi->s_freeblocks_counter,
+	percpu_counter_add(&sbi->s_freeblocks_counter,
 			   input->free_blocks_count);
-	percpu_counter_mod(&sbi->s_freeinodes_counter,
+	percpu_counter_add(&sbi->s_freeinodes_counter,
 			   EXT4_INODES_PER_GROUP(sb));
 
 	ext4_journal_dirty_metadata(handle, sbi->s_sbh);
Index: linux-2.6/include/linux/percpu_counter.h
===================================================================
--- linux-2.6.orig/include/linux/percpu_counter.h
+++ linux-2.6/include/linux/percpu_counter.h
@@ -26,20 +26,43 @@ struct percpu_counter {
 #define FBC_BATCH	(NR_CPUS*4)
 #endif
 
-static inline void percpu_counter_init(struct percpu_counter *fbc, s64 amount)
+static inline
+int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
 {
 	spin_lock_init(&fbc->lock);
 	fbc->count = amount;
 	fbc->counters = alloc_percpu(s32);
+	if (!fbc->counters)
+		return -ENOMEM;
+	return 0;
 }
 
+int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount);
+
 static inline void percpu_counter_destroy(struct percpu_counter *fbc)
 {
 	free_percpu(fbc->counters);
 }
 
-void percpu_counter_mod(struct percpu_counter *fbc, s32 amount);
-s64 percpu_counter_sum(struct percpu_counter *fbc);
+void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
+void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
+s64 __percpu_counter_sum(struct percpu_counter *fbc);
+
+static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
+{
+	__percpu_counter_add(fbc, amount, FBC_BATCH);
+}
+
+static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
+{
+	s64 ret = __percpu_counter_sum(fbc);
+	return ret < 0 ? 0 : ret;
+}
+
+static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
+{
+	return __percpu_counter_sum(fbc);
+}
 
 static inline s64 percpu_counter_read(struct percpu_counter *fbc)
 {
@@ -67,17 +90,28 @@ struct percpu_counter {
 	s64 count;
 };
 
-static inline void percpu_counter_init(struct percpu_counter *fbc, s64 amount)
+static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
 {
 	fbc->count = amount;
+	return 0;
 }
 
+#define percpu_counter_init_irq percpu_counter_init
+
 static inline void percpu_counter_destroy(struct percpu_counter *fbc)
 {
 }
 
+static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
+{
+	fbc->count = amount;
+}
+
+#define __percpu_counter_add(fbc, amount, batch) \
+	percpu_counter_add(fbc, amount)
+
 static inline void
-percpu_counter_mod(struct percpu_counter *fbc, s32 amount)
+percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 {
 	preempt_disable();
 	fbc->count += amount;
@@ -94,21 +128,31 @@ static inline s64 percpu_counter_read_po
 	return fbc->count;
 }
 
-static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
+static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
 {
 	return percpu_counter_read_positive(fbc);
 }
 
+static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
+{
+	return percpu_counter_read(fbc);
+}
+
 #endif	/* CONFIG_SMP */
 
 static inline void percpu_counter_inc(struct percpu_counter *fbc)
 {
-	percpu_counter_mod(fbc, 1);
+	percpu_counter_add(fbc, 1);
 }
 
 static inline void percpu_counter_dec(struct percpu_counter *fbc)
 {
-	percpu_counter_mod(fbc, -1);
+	percpu_counter_add(fbc, -1);
+}
+
+static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount)
+{
+	percpu_counter_add(fbc, -amount);
 }
 
 #endif /* _LINUX_PERCPU_COUNTER_H */
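
A conversion sketch for the renamed API (not part of the patch; the function is hypothetical), summarising what the ext2/3/4 hunks above and below do mechanically:

	static struct percpu_counter events;

	static int example_percpu_counter(void)
	{
		int err = percpu_counter_init(&events, 0);	/* now reports -ENOMEM */
		if (err)
			return err;

		percpu_counter_add(&events, 16);	/* was percpu_counter_mod(&events, 16) */
		percpu_counter_sub(&events, 4);		/* new helper; was _mod(&events, -4) */

		/*
		 * percpu_counter_sum() now returns the signed sum; callers
		 * that relied on the old clamping to zero were switched to
		 * percpu_counter_sum_positive(), cf. the statfs hunks below.
		 */
		percpu_counter_destroy(&events);
		return 0;
	}
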
Index: linux-2.6/lib/percpu_counter.c
===================================================================
--- linux-2.6.orig/lib/percpu_counter.c
+++ linux-2.6/lib/percpu_counter.c
@@ -5,15 +5,41 @@
 #include <linux/percpu_counter.h>
 #include <linux/module.h>
 
-void percpu_counter_mod(struct percpu_counter *fbc, s32 amount)
+void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
 {
-	long count;
+	int cpu;
+
+	spin_lock(&fbc->lock);
+	for_each_possible_cpu(cpu) {
+		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
+		*pcount = 0;
+	}
+	fbc->count = amount;
+	spin_unlock(&fbc->lock);
+}
+EXPORT_SYMBOL(percpu_counter_set);
+
+static struct lock_class_key percpu_counter_irqsafe;
+
+int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount)
+{
+	int err;
+
+	err = percpu_counter_init(fbc, amount);
+	if (!err)
+		lockdep_set_class(&fbc->lock, &percpu_counter_irqsafe);
+	return err;
+}
+
+void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
+{
+	s64 count;
 	s32 *pcount;
 	int cpu = get_cpu();
 
 	pcount = per_cpu_ptr(fbc->counters, cpu);
 	count = *pcount + amount;
-	if (count >= FBC_BATCH || count <= -FBC_BATCH) {
+	if (count >= batch || count <= -batch) {
 		spin_lock(&fbc->lock);
 		fbc->count += count;
 		*pcount = 0;
@@ -23,13 +49,13 @@ void percpu_counter_mod(struct percpu_co
 	}
 	put_cpu();
 }
-EXPORT_SYMBOL(percpu_counter_mod);
+EXPORT_SYMBOL(__percpu_counter_add);
 
 /*
  * Add up all the per-cpu counts, return the result.  This is a more accurate
  * but much slower version of percpu_counter_read_positive()
 */
-s64 percpu_counter_sum(struct percpu_counter *fbc)
+s64 __percpu_counter_sum(struct percpu_counter *fbc)
 {
 	s64 ret;
 	int cpu;
@@ -41,6 +67,6 @@ s64 percpu_counter_sum(struct percpu_cou
 		ret += *pcount;
 	}
 	spin_unlock(&fbc->lock);
-	return ret < 0 ? 0 : ret;
+	return ret;
 }
-EXPORT_SYMBOL(percpu_counter_sum);
+EXPORT_SYMBOL(__percpu_counter_sum);
Index: linux-2.6/fs/ext3/super.c
===================================================================
--- linux-2.6.orig/fs/ext3/super.c
+++ linux-2.6/fs/ext3/super.c
@@ -1406,6 +1406,7 @@ static int ext3_fill_super (struct super
 	int i;
 	int needs_recovery;
 	__le32 features;
+	int err;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
@@ -1665,12 +1666,16 @@ static int ext3_fill_super (struct super
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
 
-	percpu_counter_init(&sbi->s_freeblocks_counter,
+	err = percpu_counter_init(&sbi->s_freeblocks_counter,
 		ext3_count_free_blocks(sb));
-	percpu_counter_init(&sbi->s_freeinodes_counter,
+	err |= percpu_counter_init(&sbi->s_freeinodes_counter,
 		ext3_count_free_inodes(sb));
-	percpu_counter_init(&sbi->s_dirs_counter,
+	err |= percpu_counter_init(&sbi->s_dirs_counter,
 		ext3_count_dirs(sb));
+	if (err) {
+		printk(KERN_ERR "EXT3-fs: insufficient memory\n");
+		goto failed_mount3;
+	}
 
 	/* per fileystem reservation list head & lock */
 	spin_lock_init(&sbi->s_rsv_window_lock);
@@ -2448,12 +2453,12 @@ static int ext3_statfs (struct dentry *
 	buf->f_type = EXT3_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead;
-	buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter);
+	buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
 	buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
 	if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
 		buf->f_bavail = 0;
 	buf->f_files = le32_to_cpu(es->s_inodes_count);
-	buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter);
+	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
 	buf->f_namelen = EXT3_NAME_LEN;
 	fsid = le64_to_cpup((void *)es->s_uuid) ^
 	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
Index: linux-2.6/fs/ext4/super.c
===================================================================
--- linux-2.6.orig/fs/ext4/super.c
+++ linux-2.6/fs/ext4/super.c
@@ -1465,6 +1465,7 @@ static int ext4_fill_super (struct super
 	int needs_recovery;
 	__le32 features;
 	__u64 blocks_count;
+	int err;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
@@ -1737,12 +1738,16 @@ static int ext4_fill_super (struct super
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
 
-	percpu_counter_init(&sbi->s_freeblocks_counter,
+	err = percpu_counter_init(&sbi->s_freeblocks_counter,
 		ext4_count_free_blocks(sb));
-	percpu_counter_init(&sbi->s_freeinodes_counter,
+	err |= percpu_counter_init(&sbi->s_freeinodes_counter,
 		ext4_count_free_inodes(sb));
-	percpu_counter_init(&sbi->s_dirs_counter,
+	err |= percpu_counter_init(&sbi->s_dirs_counter,
 		ext4_count_dirs(sb));
+	if (err) {
+		printk(KERN_ERR "EXT4-fs: insufficient memory\n");
+		goto failed_mount3;
+	}
 
 	/* per fileystem reservation list head & lock */
 	spin_lock_init(&sbi->s_rsv_window_lock);
@@ -2523,12 +2528,12 @@ static int ext4_statfs (struct dentry *
 	buf->f_type = EXT4_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = ext4_blocks_count(es) - overhead;
-	buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter);
+	buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
 	buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
 	if (buf->f_bfree < ext4_r_blocks_count(es))
 		buf->f_bavail = 0;
 	buf->f_files = le32_to_cpu(es->s_inodes_count);
-	buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter);
+	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
 	buf->f_namelen = EXT4_NAME_LEN;
 	fsid = le64_to_cpup((void *)es->s_uuid) ^
 	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
Index: linux-2.6/fs/file_table.c
===================================================================
--- linux-2.6.orig/fs/file_table.c
+++ linux-2.6/fs/file_table.c
@@ -98,7 +98,7 @@ struct file *get_empty_filp(void)
 		 * percpu_counters are inaccurate.  Do an expensive check before
 		 * we go and fail.
 		 */
-		if (percpu_counter_sum(&nr_files) >= files_stat.max_files)
+		if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files)
 			goto over;
 	}
 
Index: linux-2.6/fs/ext2/super.c
===================================================================
--- linux-2.6.orig/fs/ext2/super.c
+++ linux-2.6/fs/ext2/super.c
@@ -652,6 +652,7 @@ static int ext2_fill_super(struct super_
 	int db_count;
 	int i, j;
 	__le32 features;
+	int err;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
@@ -907,12 +908,16 @@ static int ext2_fill_super(struct super_
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
 
-	percpu_counter_init(&sbi->s_freeblocks_counter,
+	err = percpu_counter_init(&sbi->s_freeblocks_counter,
 				ext2_count_free_blocks(sb));
-	percpu_counter_init(&sbi->s_freeinodes_counter,
+	err |= percpu_counter_init(&sbi->s_freeinodes_counter,
 				ext2_count_free_inodes(sb));
-	percpu_counter_init(&sbi->s_dirs_counter,
+	err |= percpu_counter_init(&sbi->s_dirs_counter,
 				ext2_count_dirs(sb));
+	if (err) {
+		printk(KERN_ERR "EXT2-fs: insufficient memory\n");
+		goto failed_mount3;
+	}
 
 	/*
 	 * set up enough so that it can read an inode
 	 */
Index: linux-2.6/block/ll_rw_blk.c
===================================================================
--- linux-2.6.orig/block/ll_rw_blk.c
+++ linux-2.6/block/ll_rw_blk.c
@@ -1783,6 +1783,7 @@ static void blk_release_queue(struct kob
 
 	blk_trace_shutdown(q);
 
+	bdi_destroy(&q->backing_dev_info);
 	kmem_cache_free(requestq_cachep, q);
 }
 
@@ -1835,6 +1836,7 @@ static struct kobj_type queue_ktype;
 
 request_queue_t *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 {
+	int err;
 	request_queue_t *q;
 
 	q = kmem_cache_alloc_node(requestq_cachep, gfp_mask, node_id);
@@ -1842,15 +1844,20 @@ request_queue_t *blk_alloc_queue_node(gf
 		return NULL;
 
 	memset(q, 0, sizeof(*q));
+	q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
+	q->backing_dev_info.unplug_io_data = q;
+	err = bdi_init(&q->backing_dev_info);
+	if (err) {
+		kmem_cache_free(requestq_cachep, q);
+		return NULL;
+	}
+
 	init_timer(&q->unplug_timer);
 
 	snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue");
 	q->kobj.ktype = &queue_ktype;
 	kobject_init(&q->kobj);
 
-	q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
-	q->backing_dev_info.unplug_io_data = q;
-
 	mutex_init(&q->sysfs_lock);
 
 	return q;
@@ -3984,6 +3991,73 @@ static ssize_t queue_max_hw_sectors_show
 	return queue_var_show(max_hw_sectors_kb, (page));
 }
 
+static ssize_t queue_nr_reclaimable_show(struct request_queue *q, char *page)
+{
+	unsigned long long nr_reclaimable =
+		bdi_stat(&q->backing_dev_info, BDI_RECLAIMABLE);
+
+	return sprintf(page, "%llu\n",
+			nr_reclaimable << (PAGE_CACHE_SHIFT - 10));
+}
+
+static ssize_t queue_nr_writeback_show(struct request_queue *q, char *page)
+{
+	unsigned long long nr_writeback =
+		bdi_stat(&q->backing_dev_info, BDI_WRITEBACK);
+
+	return sprintf(page, "%llu\n",
+			nr_writeback << (PAGE_CACHE_SHIFT - 10));
+}
+
+extern void bdi_writeout_fraction(struct backing_dev_info *bdi,
+		long *numerator, long *denominator);
+
+static ssize_t queue_nr_cache_ratio_show(struct request_queue *q, char *page)
+{
+	long scale, div;
+
+	bdi_writeout_fraction(&q->backing_dev_info, &scale, &div);
+	scale *= 1024;
+	scale /= div;
+
+	return sprintf(page, "%ld\n", scale);
+}
+
+static ssize_t queue_nr_cache_num_show(struct request_queue *q, char *page)
+{
+	long scale, div;
+
+	bdi_writeout_fraction(&q->backing_dev_info, &scale, &div);
+
+	return sprintf(page, "%ld\n", scale);
+}
+
+static ssize_t queue_nr_cache_denom_show(struct request_queue *q, char *page)
+{
+	long scale, div;
+
+	bdi_writeout_fraction(&q->backing_dev_info, &scale, &div);
+
+	return sprintf(page, "%ld\n", div);
+}
+
+extern void
+get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
+		struct backing_dev_info *bdi);
+
+static ssize_t queue_nr_cache_size_show(struct request_queue *q, char *page)
+{
+	long background, dirty, bdi_dirty;
+	get_dirty_limits(&background, &dirty, &bdi_dirty, &q->backing_dev_info);
+	return sprintf(page, "%ld\n", bdi_dirty);
+}
+
+static ssize_t queue_nr_cache_total_show(struct request_queue *q, char *page)
+{
+	long background, dirty, bdi_dirty;
+	get_dirty_limits(&background, &dirty, &bdi_dirty, &q->backing_dev_info);
+	return sprintf(page, "%ld\n", dirty);
+}
+
 
 static struct queue_sysfs_entry queue_requests_entry = {
 	.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
@@ -4008,6 +4082,41 @@ static struct queue_sysfs_entry queue_ma
 	.show = queue_max_hw_sectors_show,
 };
 
+static struct queue_sysfs_entry queue_reclaimable_entry = {
+	.attr = {.name = "reclaimable_kb", .mode = S_IRUGO },
+	.show = queue_nr_reclaimable_show,
+};
+
+static struct queue_sysfs_entry queue_writeback_entry = {
+	.attr = {.name = "writeback_kb", .mode = S_IRUGO },
+	.show = queue_nr_writeback_show,
+};
+
+static struct queue_sysfs_entry queue_cache_ratio_entry = {
+	.attr = {.name = "cache_ratio", .mode = S_IRUGO },
+	.show = queue_nr_cache_ratio_show,
+};
+
+static struct queue_sysfs_entry queue_cache_num_entry = {
+	.attr = {.name = "cache_num", .mode = S_IRUGO },
+	.show = queue_nr_cache_num_show,
+};
+
+static struct queue_sysfs_entry queue_cache_denom_entry = {
+	.attr = {.name = "cache_denom", .mode = S_IRUGO },
+	.show = queue_nr_cache_denom_show,
+};
+
+static struct queue_sysfs_entry queue_cache_size_entry = {
+	.attr = {.name = "cache_size", .mode = S_IRUGO },
+	.show = queue_nr_cache_size_show,
+};
+
+static struct queue_sysfs_entry queue_cache_total_entry = {
+	.attr = {.name = "cache_total", .mode = S_IRUGO },
+	.show = queue_nr_cache_total_show,
+};
+
 static struct queue_sysfs_entry queue_iosched_entry = {
 	.attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
 	.show = elv_iosched_show,
@@ -4019,6 +4128,13 @@ static struct attribute *default_attrs[]
 	&queue_ra_entry.attr,
 	&queue_max_hw_sectors_entry.attr,
 	&queue_max_sectors_entry.attr,
+	&queue_reclaimable_entry.attr,
+	&queue_writeback_entry.attr,
+	&queue_cache_ratio_entry.attr,
+	&queue_cache_num_entry.attr,
+	&queue_cache_denom_entry.attr,
+	&queue_cache_size_entry.attr,
+	&queue_cache_total_entry.attr,
 	&queue_iosched_entry.attr,
 	NULL,
};
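
A userspace sketch for the new queue attributes (not part of the patch; the device name is made up). reclaimable_kb and writeback_kb report KiB, i.e. the page counts shifted by PAGE_CACHE_SHIFT - 10, and cache_ratio is the device's writeout fraction scaled by 1024 (cache_num/cache_denom * 1024):

	#include <stdio.h>

	int main(void)
	{
		char buf[64];
		FILE *f = fopen("/sys/block/sda/queue/cache_ratio", "r");

		if (!f)
			return 1;
		if (fgets(buf, sizeof(buf), f))
			printf("device earned %s/1024 of the dirty limit\n", buf);
		fclose(f);
		return 0;
	}
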
{.name = "cache_total", .mode = S_IRUGO }, + .show = queue_nr_cache_total_show, +}; + static struct queue_sysfs_entry queue_iosched_entry = { .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR }, .show = elv_iosched_show, @@ -4019,6 +4128,13 @@ static struct attribute *default_attrs[] &queue_ra_entry.attr, &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, + &queue_reclaimable_entry.attr, + &queue_writeback_entry.attr, + &queue_cache_ratio_entry.attr, + &queue_cache_num_entry.attr, + &queue_cache_denom_entry.attr, + &queue_cache_size_entry.attr, + &queue_cache_total_entry.attr, &queue_iosched_entry.attr, NULL, }; Index: linux-2.6/drivers/block/rd.c =================================================================== --- linux-2.6.orig/drivers/block/rd.c +++ linux-2.6/drivers/block/rd.c @@ -411,6 +411,9 @@ static void __exit rd_cleanup(void) blk_cleanup_queue(rd_queue[i]); } unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); + + bdi_destroy(&rd_file_backing_dev_info); + bdi_destroy(&rd_backing_dev_info); } /* @@ -419,7 +422,19 @@ static void __exit rd_cleanup(void) static int __init rd_init(void) { int i; - int err = -ENOMEM; + int err; + + err = bdi_init(&rd_backing_dev_info); + if (err) + goto out2; + + err = bdi_init(&rd_file_backing_dev_info); + if (err) { + bdi_destroy(&rd_backing_dev_info); + goto out2; + } + + err = -ENOMEM; if (rd_blocksize > PAGE_SIZE || rd_blocksize < 512 || (rd_blocksize & (rd_blocksize-1))) { @@ -473,6 +488,9 @@ out: put_disk(rd_disks[i]); blk_cleanup_queue(rd_queue[i]); } + bdi_destroy(&rd_backing_dev_info); + bdi_destroy(&rd_file_backing_dev_info); +out2: return err; } Index: linux-2.6/drivers/char/mem.c =================================================================== --- linux-2.6.orig/drivers/char/mem.c +++ linux-2.6/drivers/char/mem.c @@ -977,6 +977,11 @@ static struct class *mem_class; static int __init chr_dev_init(void) { int i; + int err; + + err = bdi_init(&zero_bdi); + if (err) + return err; if (register_chrdev(MEM_MAJOR,"mem",&memory_fops)) printk("unable to get major %d for memory devs\n", MEM_MAJOR); Index: linux-2.6/fs/char_dev.c =================================================================== --- linux-2.6.orig/fs/char_dev.c +++ linux-2.6/fs/char_dev.c @@ -546,6 +546,7 @@ static struct kobject *base_probe(dev_t void __init chrdev_init(void) { cdev_map = kobj_map_init(base_probe, &chrdevs_lock); + bdi_init(&directly_mappable_cdev_bdi); } Index: linux-2.6/fs/fuse/inode.c =================================================================== --- linux-2.6.orig/fs/fuse/inode.c +++ linux-2.6/fs/fuse/inode.c @@ -401,6 +401,7 @@ static int fuse_show_options(struct seq_ static struct fuse_conn *new_conn(void) { struct fuse_conn *fc; + int err; fc = kzalloc(sizeof(*fc), GFP_KERNEL); if (fc) { @@ -416,10 +417,17 @@ static struct fuse_conn *new_conn(void) atomic_set(&fc->num_waiting, 0); fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; fc->bdi.unplug_io_fn = default_unplug_io_fn; + err = bdi_init(&fc->bdi); + if (err) { + kfree(fc); + fc = NULL; + goto out; + } fc->reqctr = 0; fc->blocked = 1; get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); } +out: return fc; } @@ -429,6 +437,7 @@ void fuse_conn_put(struct fuse_conn *fc) if (fc->destroy_req) fuse_request_free(fc->destroy_req); mutex_destroy(&fc->inst_mutex); + bdi_destroy(&fc->bdi); kfree(fc); } } Index: linux-2.6/fs/nfs/client.c =================================================================== --- linux-2.6.orig/fs/nfs/client.c +++ linux-2.6/fs/nfs/client.c @@ 
Index: linux-2.6/fs/nfs/client.c
===================================================================
--- linux-2.6.orig/fs/nfs/client.c
+++ linux-2.6/fs/nfs/client.c
@@ -658,6 +658,7 @@ static void nfs_server_set_fsinfo(struct
 	if (server->rsize > NFS_MAX_FILE_IO_SIZE)
 		server->rsize = NFS_MAX_FILE_IO_SIZE;
 	server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
 
 	if (server->wsize > max_rpc_payload)
@@ -708,6 +709,10 @@ static int nfs_probe_fsinfo(struct nfs_s
 		goto out_error;
 
 	nfs_server_set_fsinfo(server, &fsinfo);
+	error = bdi_init(&server->backing_dev_info);
+	if (error)
+		goto out_error;
+
 	/* Get some general file system info */
 	if (server->namelen == 0) {
@@ -787,6 +792,7 @@ void nfs_free_server(struct nfs_server *
 	nfs_put_client(server->nfs_client);
 
 	nfs_free_iostats(server->io_stats);
+	bdi_destroy(&server->backing_dev_info);
 	kfree(server);
 	nfs_release_automount_timer();
 	dprintk("<-- nfs_free_server()\n");
Index: linux-2.6/fs/hugetlbfs/inode.c
===================================================================
--- linux-2.6.orig/fs/hugetlbfs/inode.c
+++ linux-2.6/fs/hugetlbfs/inode.c
@@ -802,11 +802,17 @@ static int __init init_hugetlbfs_fs(void
 	int error;
 	struct vfsmount *vfsmount;
 
+	error = bdi_init(&hugetlbfs_backing_dev_info);
+	if (error)
+		return error;
+
 	hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
 					sizeof(struct hugetlbfs_inode_info),
 					0, 0, init_once, NULL);
-	if (hugetlbfs_inode_cachep == NULL)
-		return -ENOMEM;
+	if (hugetlbfs_inode_cachep == NULL) {
+		error = -ENOMEM;
+		goto out2;
+	}
 
 	error = register_filesystem(&hugetlbfs_fs_type);
 	if (error)
@@ -824,6 +828,8 @@ static int __init init_hugetlbfs_fs(void
  out:
 	if (error)
 		kmem_cache_destroy(hugetlbfs_inode_cachep);
+ out2:
+	bdi_destroy(&hugetlbfs_backing_dev_info);
 	return error;
 }
 
@@ -831,6 +837,7 @@ static void __exit exit_hugetlbfs_fs(voi
 {
 	kmem_cache_destroy(hugetlbfs_inode_cachep);
 	unregister_filesystem(&hugetlbfs_fs_type);
+	bdi_destroy(&hugetlbfs_backing_dev_info);
 }
 
 module_init(init_hugetlbfs_fs)
Index: linux-2.6/fs/ocfs2/dlm/dlmfs.c
===================================================================
--- linux-2.6.orig/fs/ocfs2/dlm/dlmfs.c
+++ linux-2.6/fs/ocfs2/dlm/dlmfs.c
@@ -588,13 +588,19 @@ static int __init init_dlmfs_fs(void)
 
 	dlmfs_print_version();
 
+	status = bdi_init(&dlmfs_backing_dev_info);
+	if (status)
+		return status;
+
 	dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
 				sizeof(struct dlmfs_inode_private),
 				0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
 					SLAB_MEM_SPREAD),
 				dlmfs_init_once, NULL);
-	if (!dlmfs_inode_cache)
-		return -ENOMEM;
+	if (!dlmfs_inode_cache) {
+		status = -ENOMEM;
+		goto bail;
+	}
 	cleanup_inode = 1;
 
 	user_dlm_worker = create_singlethread_workqueue("user_dlm");
@@ -611,6 +615,7 @@ bail:
 			kmem_cache_destroy(dlmfs_inode_cache);
 		if (cleanup_worker)
 			destroy_workqueue(user_dlm_worker);
+		bdi_destroy(&dlmfs_backing_dev_info);
 	} else
 		printk("OCFS2 User DLM kernel interface loaded\n");
 	return status;
@@ -624,6 +629,8 @@ static void __exit exit_dlmfs_fs(void)
 	destroy_workqueue(user_dlm_worker);
 
 	kmem_cache_destroy(dlmfs_inode_cache);
+
+	bdi_destroy(&dlmfs_backing_dev_info);
 }
 
 MODULE_AUTHOR("Oracle");
Index: linux-2.6/fs/configfs/configfs_internal.h
===================================================================
--- linux-2.6.orig/fs/configfs/configfs_internal.h
+++ linux-2.6/fs/configfs/configfs_internal.h
@@ -55,6 +55,8 @@ extern int configfs_is_root(struct confi
 
 extern struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent *);
 extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *));
+extern int configfs_inode_init(void);
+extern void configfs_inode_exit(void);
 
 extern int configfs_create_file(struct config_item *,
				const struct configfs_attribute *);
 extern int configfs_make_dirent(struct configfs_dirent *,
Index: linux-2.6/fs/configfs/inode.c
===================================================================
--- linux-2.6.orig/fs/configfs/inode.c
+++ linux-2.6/fs/configfs/inode.c
@@ -256,4 +256,13 @@ void configfs_hash_and_remove(struct den
 	mutex_unlock(&dir->d_inode->i_mutex);
 }
 
+int __init configfs_inode_init(void)
+{
+	return bdi_init(&configfs_backing_dev_info);
+}
+
+void __exit configfs_inode_exit(void)
+{
+	bdi_destroy(&configfs_backing_dev_info);
+}
Index: linux-2.6/fs/configfs/mount.c
===================================================================
--- linux-2.6.orig/fs/configfs/mount.c
+++ linux-2.6/fs/configfs/mount.c
@@ -154,8 +154,16 @@ static int __init configfs_init(void)
 		subsystem_unregister(&config_subsys);
 		kmem_cache_destroy(configfs_dir_cachep);
 		configfs_dir_cachep = NULL;
+		goto out;
 	}
 
+	err = configfs_inode_init();
+	if (err) {
+		unregister_filesystem(&configfs_fs_type);
+		subsystem_unregister(&config_subsys);
+		kmem_cache_destroy(configfs_dir_cachep);
+		configfs_dir_cachep = NULL;
+	}
 out:
 	return err;
 }
@@ -166,6 +174,7 @@ static void __exit configfs_exit(void)
 	subsystem_unregister(&config_subsys);
 	kmem_cache_destroy(configfs_dir_cachep);
 	configfs_dir_cachep = NULL;
+	configfs_inode_exit();
 }
 
 MODULE_AUTHOR("Oracle");
Index: linux-2.6/fs/ramfs/inode.c
===================================================================
--- linux-2.6.orig/fs/ramfs/inode.c
+++ linux-2.6/fs/ramfs/inode.c
@@ -222,7 +222,17 @@ module_exit(exit_ramfs_fs)
 
 int __init init_rootfs(void)
 {
-	return register_filesystem(&rootfs_fs_type);
+	int err;
+
+	err = bdi_init(&ramfs_backing_dev_info);
+	if (err)
+		return err;
+
+	err = register_filesystem(&rootfs_fs_type);
+	if (err)
+		bdi_destroy(&ramfs_backing_dev_info);
+
+	return err;
 }
 
 MODULE_LICENSE("GPL");
Index: linux-2.6/fs/sysfs/inode.c
===================================================================
--- linux-2.6.orig/fs/sysfs/inode.c
+++ linux-2.6/fs/sysfs/inode.c
@@ -44,6 +44,11 @@ void sysfs_delete_inode(struct inode *in
 	return generic_delete_inode(inode);
 }
 
+int __init sysfs_inode_init(void)
+{
+	return bdi_init(&sysfs_backing_dev_info);
+}
+
 int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
 {
 	struct inode * inode = dentry->d_inode;
Index: linux-2.6/fs/sysfs/mount.c
===================================================================
--- linux-2.6.orig/fs/sysfs/mount.c
+++ linux-2.6/fs/sysfs/mount.c
@@ -98,6 +98,10 @@ int __init sysfs_init(void)
 	if (!sysfs_dir_cachep)
 		goto out;
 
+	err = sysfs_inode_init();
+	if (err)
+		goto out_err;
+
 	err = register_filesystem(&sysfs_fs_type);
 	if (!err) {
 		sysfs_mount = kern_mount(&sysfs_fs_type);
Index: linux-2.6/fs/sysfs/sysfs.h
===================================================================
--- linux-2.6.orig/fs/sysfs/sysfs.h
+++ linux-2.6/fs/sysfs/sysfs.h
@@ -17,6 +17,7 @@ extern struct kmem_cache *sysfs_dir_cach
 extern void sysfs_delete_inode(struct inode *inode);
 extern struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent *);
 extern int sysfs_create(struct dentry *, int mode, int (*init)(struct inode *));
+extern int sysfs_inode_init(void);
 
 extern int sysfs_dirent_exist(struct sysfs_dirent *, const unsigned char *);
 extern int sysfs_make_dirent(struct sysfs_dirent *, struct dentry *, void *,
Index: linux-2.6/mm/shmem.c
===================================================================
--- linux-2.6.orig/mm/shmem.c
+++ linux-2.6/mm/shmem.c
@@ -2490,6 +2490,10 @@ static int __init init_tmpfs(void)
 {
 	int error;
 
+	error = bdi_init(&shmem_backing_dev_info);
+	if (error)
+		goto out4;
+
 	error = init_inodecache();
 	if (error)
 		goto out3;
@@ -2514,6 +2518,8 @@ out1:
 out2:
 	destroy_inodecache();
 out3:
+	bdi_destroy(&shmem_backing_dev_info);
+out4:
 	shm_mnt = ERR_PTR(error);
 	return error;
 }
Index: linux-2.6/mm/swap.c
===================================================================
--- linux-2.6.orig/mm/swap.c
+++ linux-2.6/mm/swap.c
@@ -505,6 +505,10 @@ void __init swap_setup(void)
 {
 	unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);
 
+#ifdef CONFIG_SWAP
+	bdi_init(swapper_space.backing_dev_info);
+#endif
+
 	/* Use a smaller cluster for small-memory machines */
 	if (megs < 16)
 		page_cluster = 2;
Index: linux-2.6/mm/readahead.c
===================================================================
--- linux-2.6.orig/mm/readahead.c
+++ linux-2.6/mm/readahead.c
@@ -75,6 +75,12 @@ static inline void ra_off(struct file_ra
 	return;
 }
 
+static int __init readahead_init(void)
+{
+	return bdi_init(&default_backing_dev_info);
+}
+subsys_initcall(readahead_init);
+
 /*
  * Set the initial window size, round to next power of 2 and square
  * for small size, x 4 for medium, and x 2 for large
Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c
+++ linux-2.6/fs/buffer.c
@@ -726,6 +726,8 @@ int __set_page_dirty_buffers(struct page
 		if (page->mapping) {	/* Race with truncate? */
 			if (mapping_cap_account_dirty(mapping)) {
 				__inc_zone_page_state(page, NR_FILE_DIRTY);
+				__inc_bdi_stat(mapping->backing_dev_info,
+						BDI_RECLAIMABLE);
 				task_io_account_write(PAGE_CACHE_SIZE);
 			}
 			radix_tree_tag_set(&mapping->page_tree,
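
The hunk above, together with the page-writeback and truncate hunks that follow, maintains one invariant: every transition of a page's dirty state moves the zone counter and the per-BDI counter together, so the throttling logic can trust the BDI numbers. A sketch of the two halves (not part of the patch; the helpers are hypothetical — the dirtying side runs under the mapping's tree_lock with interrupts off, hence the __-prefixed variant):

	static void example_account_dirtied(struct page *page,
					    struct address_space *mapping)
	{
		__inc_zone_page_state(page, NR_FILE_DIRTY);
		__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
	}

	static void example_account_cleaned(struct page *page,
					    struct address_space *mapping)
	{
		dec_zone_page_state(page, NR_FILE_DIRTY);
		dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
	}
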
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c
+++ linux-2.6/mm/page-writeback.c
@@ -2,6 +2,7 @@
  * mm/page-writeback.c
  *
  * Copyright (C) 2002, Linus Torvalds.
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  *
  * Contains functions related to writing back dirty pages at the
  * address_space level.
@@ -49,8 +50,6 @@
  */
 static long ratelimit_pages = 32;
 
-static int dirty_exceeded __cacheline_aligned_in_smp;	/* Dirty mem may be over limit */
-
 /*
  * When balance_dirty_pages decides that the caller needs to perform some
  * non-background writeback, this is how many pages it will attempt to write.
@@ -103,6 +102,141 @@ EXPORT_SYMBOL(laptop_mode);
 static void background_writeout(unsigned long _min_pages);
 
 /*
+ * Scale the writeback cache size proportional to the relative writeout speeds.
+ *
+ * We do this by keeping a floating proportion between BDIs, based on page
+ * writeback completions [end_page_writeback()]. Those devices that write out
+ * pages fastest will get the larger share, while the slower will get a smaller
+ * share.
+ *
+ * We use page writeout completions because we are interested in getting rid of
+ * dirty pages. Having them written out is the primary goal.
+ *
+ * We introduce a concept of time, a period over which we measure these events,
+ * because demand can/will vary over time. The length of this period itself is
+ * measured in page writeback completions.
+ */
+static struct prop_descriptor vm_completions;
+static struct prop_descriptor vm_dirties;
+
+static unsigned long determine_dirtyable_memory(void);
+
+/*
+ * couple the period to the dirty_ratio:
+ *
+ *   period/2 ~ roundup_pow_of_two(dirty limit)
+ */
+static int calc_period_shift(void)
+{
+	unsigned long dirty_total;
+
+	dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100;
+	return 2 + ilog2(dirty_total - 1);
+}
+
+/*
+ * update the period when the dirty ratio changes.
+ */
+int dirty_ratio_handler(ctl_table *table, int write,
+		struct file *filp, void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int old_ratio = vm_dirty_ratio;
+	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
+		int shift = calc_period_shift();
+		prop_change_shift(&vm_completions, shift);
+		prop_change_shift(&vm_dirties, shift);
+	}
+	return ret;
+}
+
+/*
+ * Increment the BDI's writeout completion count and the global writeout
+ * completion count. Called from test_clear_page_writeback().
+ */
+static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
+{
+	__prop_inc_percpu(&vm_completions, &bdi->completions);
+}
+
+static inline void task_dirty_inc(struct task_struct *tsk)
+{
+	prop_inc_single(&vm_dirties, &tsk->dirties);
+}
+
+/*
+ * Obtain an accurate fraction of the BDI's portion.
+ */
+void bdi_writeout_fraction(struct backing_dev_info *bdi,
+		long *numerator, long *denominator)
+{
+	if (bdi_cap_writeback_dirty(bdi)) {
+		prop_fraction_percpu(&vm_completions, &bdi->completions,
+				numerator, denominator);
+	} else {
+		*numerator = 0;
+		*denominator = 1;
+	}
+}
+
+/*
+ * Clip the earned share of dirty pages to that which is actually available.
+ * This avoids exceeding the total dirty_limit when the floating averages
+ * fluctuate too quickly.
+ */
+static void
+clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty)
+{
+	long avail_dirty;
+
+	avail_dirty = dirty -
+		(global_page_state(NR_FILE_DIRTY) +
+		 global_page_state(NR_WRITEBACK) +
+		 global_page_state(NR_UNSTABLE_NFS));
+
+	if (avail_dirty < 0)
+		avail_dirty = 0;
+
+	avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
+		bdi_stat(bdi, BDI_WRITEBACK);
+
+	*pbdi_dirty = min(*pbdi_dirty, avail_dirty);
+}
+
+static inline void task_dirties_fraction(struct task_struct *tsk,
+		long *numerator, long *denominator)
+{
+	prop_fraction_single(&vm_dirties, &tsk->dirties,
+			numerator, denominator);
+}
+
+/*
+ * scale the dirty limit
+ *
+ * task specific dirty limit:
+ *
+ *   dirty -= (dirty/8) * p_{t}
+ */
+void task_dirty_limit(struct task_struct *tsk, long *pdirty)
+{
+	long numerator, denominator;
+	long dirty = *pdirty;
+	long long inv = dirty >> 3;
+
+	task_dirties_fraction(tsk, &numerator, &denominator);
+	inv *= numerator;
+	do_div(inv, denominator);
+
+	dirty -= inv;
+	if (dirty < *pdirty/2)
+		dirty = *pdirty/2;
+
+	*pdirty = dirty;
+}
+
+/*
  * Work out the current dirty-memory clamping and background writeout
  * thresholds.
  *
@@ -157,9 +291,9 @@ static unsigned long determine_dirtyable
 	return x + 1;	/* Ensure that we never return 0 */
 }
 
-static void
-get_dirty_limits(long *pbackground, long *pdirty,
-					struct address_space *mapping)
+void
+get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
+		 struct backing_dev_info *bdi)
 {
 	int background_ratio;		/* Percentages */
 	int dirty_ratio;
@@ -193,6 +327,23 @@ get_dirty_limits(long *pbackground, long
 	}
 	*pbackground = background;
 	*pdirty = dirty;
+
+	if (bdi) {
+		long long bdi_dirty = dirty;
+		long numerator, denominator;
+
+		/*
+		 * Calculate this BDI's share of the dirty ratio.
+		 */
+		bdi_writeout_fraction(bdi, &numerator, &denominator);
+
+		bdi_dirty *= numerator;
+		do_div(bdi_dirty, denominator);
+
+		*pbdi_dirty = bdi_dirty;
+		clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
+		task_dirty_limit(current, pbdi_dirty);
+	}
 }
 
 /*
@@ -204,9 +355,11 @@ get_dirty_limits(long *pbackground, long
  */
 static void balance_dirty_pages(struct address_space *mapping)
 {
-	long nr_reclaimable;
+	long bdi_nr_reclaimable;
+	long bdi_nr_writeback;
 	long background_thresh;
 	long dirty_thresh;
+	long bdi_thresh;
 	unsigned long pages_written = 0;
 	unsigned long write_chunk = sync_writeback_pages();
 
@@ -221,15 +374,15 @@ static void balance_dirty_pages(struct a
 			.range_cyclic	= 1,
 		};
 
-		get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
-		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-					global_page_state(NR_UNSTABLE_NFS);
-		if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
-			dirty_thresh)
+		get_dirty_limits(&background_thresh, &dirty_thresh,
+				&bdi_thresh, bdi);
+		bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+		bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
 			break;
 
-		if (!dirty_exceeded)
-			dirty_exceeded = 1;
+		if (!bdi->dirty_exceeded)
+			bdi->dirty_exceeded = 1;
 
 		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
 		 * Unstable writes are a feature of certain networked
@@ -237,16 +390,37 @@ static void balance_dirty_pages(struct a
 		 * written to the server's write cache, but has not yet
 		 * been flushed to permanent storage.
 		 */
-		if (nr_reclaimable) {
+		if (bdi_nr_reclaimable) {
 			writeback_inodes(&wbc);
-			get_dirty_limits(&background_thresh,
-					 	&dirty_thresh, mapping);
-			nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-					global_page_state(NR_UNSTABLE_NFS);
-			if (nr_reclaimable +
-				global_page_state(NR_WRITEBACK)
-					<= dirty_thresh)
-				break;
+
+			get_dirty_limits(&background_thresh, &dirty_thresh,
+				       &bdi_thresh, bdi);
+
+			/*
+			 * In order to avoid the stacked BDI deadlock we need
+			 * to ensure we accurately count the 'dirty' pages when
+			 * the threshold is low.
+			 *
+			 * Otherwise it would be possible to get thresh+n pages
+			 * reported dirty, even though there are thresh-m pages
+			 * actually dirty; with m+n sitting in the percpu
+			 * deltas.
+			 */
+			if (bdi_thresh < 2*bdi_stat_error(bdi)) {
+				bdi_nr_reclaimable =
+					bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+				bdi_nr_writeback =
+					bdi_stat_sum(bdi, BDI_WRITEBACK);
+			} else {
+				bdi_nr_reclaimable =
+					bdi_stat(bdi, BDI_RECLAIMABLE);
+				bdi_nr_writeback =
+					bdi_stat(bdi, BDI_WRITEBACK);
+			}
+
+			if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+				break;
+
 			pages_written += write_chunk - wbc.nr_to_write;
 			if (pages_written >= write_chunk)
 				break;		/* We've done our duty */
@@ -254,9 +428,9 @@ static void balance_dirty_pages(struct a
 		congestion_wait(WRITE, HZ/10);
 	}
 
-	if (nr_reclaimable + global_page_state(NR_WRITEBACK)
-		<= dirty_thresh && dirty_exceeded)
-			dirty_exceeded = 0;
+	if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
+			bdi->dirty_exceeded)
+		bdi->dirty_exceeded = 0;
 
 	if (writeback_in_progress(bdi))
 		return;		/* pdflush is already working this queue */
@@ -270,7 +444,9 @@ static void balance_dirty_pages(struct a
 	 * background_thresh, to keep the amount of dirty memory low.
 	 */
 	if ((laptop_mode && pages_written) ||
-	    (!laptop_mode && (nr_reclaimable > background_thresh)))
+	    (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
+			      + global_page_state(NR_UNSTABLE_NFS)
+			      > background_thresh)))
 		pdflush_operation(background_writeout, 0);
 }
 
@@ -306,7 +482,7 @@ void balance_dirty_pages_ratelimited_nr(
 	unsigned long *p;
 
 	ratelimit = ratelimit_pages;
-	if (dirty_exceeded)
+	if (mapping->backing_dev_info->dirty_exceeded)
 		ratelimit = 8;
 
 	/*
@@ -342,7 +518,7 @@ void throttle_vm_writeout(gfp_t gfp_mask
 	}
 
 	for ( ; ; ) {
-		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 
 		/*
 		 * Boost the allowable dirty threshold a bit for page
@@ -377,7 +553,7 @@ static void background_writeout(unsigned
 		long background_thresh;
 		long dirty_thresh;
 
-		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 		if (global_page_state(NR_FILE_DIRTY) +
 			global_page_state(NR_UNSTABLE_NFS) < background_thresh
 				&& min_pages <= 0)
@@ -582,9 +758,15 @@ static struct notifier_block __cpuinitda
  */
 void __init page_writeback_init(void)
 {
+	int shift;
+
 	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
+
+	shift = calc_period_shift();
+	prop_descriptor_init(&vm_completions, shift);
+	prop_descriptor_init(&vm_dirties, shift);
 }
 
 /**
@@ -828,6 +1010,8 @@ int __set_page_dirty_nobuffers(struct pa
 			BUG_ON(mapping2 != mapping);
 			if (mapping_cap_account_dirty(mapping)) {
 				__inc_zone_page_state(page, NR_FILE_DIRTY);
+				__inc_bdi_stat(mapping->backing_dev_info,
+						BDI_RECLAIMABLE);
 				task_io_account_write(PAGE_CACHE_SIZE);
 			}
 			radix_tree_tag_set(&mapping->page_tree,
@@ -860,7 +1044,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage
  * If the mapping doesn't provide a set_page_dirty a_op, then
 * just fall through and assume that it wants buffer_heads.
 */
-int fastcall set_page_dirty(struct page *page)
+static int __set_page_dirty(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
 
@@ -878,6 +1062,14 @@ int fastcall set_page_dirty(struct page
 	}
 	return 0;
 }
+
+int fastcall set_page_dirty(struct page *page)
+{
+	int ret = __set_page_dirty(page);
+	if (ret)
+		task_dirty_inc(current);
+	return ret;
+}
 EXPORT_SYMBOL(set_page_dirty);
 
 /*
@@ -954,6 +1146,8 @@ int clear_page_dirty_for_io(struct page
 			set_page_dirty(page);
 		if (TestClearPageDirty(page)) {
 			dec_zone_page_state(page, NR_FILE_DIRTY);
+			dec_bdi_stat(mapping->backing_dev_info,
+					BDI_RECLAIMABLE);
 			return 1;
 		}
 		return 0;
@@ -968,14 +1162,20 @@ int test_clear_page_writeback(struct pag
 	int ret;
 
 	if (mapping) {
+		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		unsigned long flags;
 
 		write_lock_irqsave(&mapping->tree_lock, flags);
 		ret = TestClearPageWriteback(page);
-		if (ret)
+		if (ret) {
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
+			if (bdi_cap_writeback_dirty(bdi)) {
+				__dec_bdi_stat(bdi, BDI_WRITEBACK);
+				__bdi_writeout_inc(bdi);
+			}
+		}
 		write_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
 		ret = TestClearPageWriteback(page);
@@ -989,14 +1189,18 @@ int test_set_page_writeback(struct page
 	int ret;
 
 	if (mapping) {
+		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		unsigned long flags;
 
 		write_lock_irqsave(&mapping->tree_lock, flags);
 		ret = TestSetPageWriteback(page);
-		if (!ret)
+		if (!ret) {
 			radix_tree_tag_set(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
+			if (bdi_cap_writeback_dirty(bdi))
+				__inc_bdi_stat(bdi, BDI_WRITEBACK);
+		}
 		if (!PageDirty(page))
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
Index: linux-2.6/mm/truncate.c
===================================================================
--- linux-2.6.orig/mm/truncate.c
+++ linux-2.6/mm/truncate.c
@@ -72,6 +72,8 @@ void cancel_dirty_page(struct page *page
 	struct address_space *mapping = page->mapping;
 	if (mapping && mapping_cap_account_dirty(mapping)) {
 		dec_zone_page_state(page, NR_FILE_DIRTY);
+		dec_bdi_stat(mapping->backing_dev_info,
+				BDI_RECLAIMABLE);
 		if (account_size)
 			task_io_account_cancelled_write(account_size);
 	}
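
A worked example (illustrative numbers, not from the patch) of how the new get_dirty_limits() splits the global limit. Suppose the global dirty_thresh works out to 1000 pages and bdi_writeout_fraction() reports numerator 3, denominator 8, i.e. this device completed 3 of the last 8 page writebacks:

	bdi_dirty = 1000 * 3 / 8 = 375 pages

clip_bdi_dirty_limit() then caps that at the dirty headroom actually available, and task_dirty_limit() subtracts up to an eighth of the result in proportion to the current task's share of recent dirtying, so a heavy dirtier hits its per-BDI threshold a little earlier than an occasional one.
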
Index: linux-2.6/lib/proportions.c
===================================================================
--- /dev/null
+++ linux-2.6/lib/proportions.c
@@ -0,0 +1,385 @@
+/*
+ * Floating proportions
+ *
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
+ *
+ * Description:
+ *
+ * The floating proportion is a time derivative with an exponentially decaying
+ * history:
+ *
+ *   p_{j} = \Sum_{i=0} (dx_{j}/dt_{-i}) / 2^(1+i)
+ *
+ * Where j is an element from {prop_local}, x_{j} is j's number of events,
+ * and i the time period over which the differential is taken. So d/dt_{-i} is
+ * the differential over the i-th last period.
+ *
+ * The decaying history gives smooth transitions. The time differential carries
+ * the notion of speed.
+ *
+ * The denominator is 2^(1+i) because we want the series to be normalised, ie.
+ *
+ *   \Sum_{i=0} 1/2^(1+i) = 1
+ *
+ * Furthermore, if we measure time (t) in the same events as x; so that:
+ *
+ *   t = \Sum_{j} x_{j}
+ *
+ * we get that:
+ *
+ *   \Sum_{j} p_{j} = 1
+ *
+ * Writing this in an iterative fashion we get (dropping the 'd's):
+ *
+ * if (++x_{j}, ++t > period)
+ *   t /= 2;
+ *   for_each (j)
+ *     x_{j} /= 2;
+ *
+ * so that:
+ *
+ *   p_{j} = x_{j} / t;
+ *
+ * We optimize away the '/= 2' for the global time delta by noting that:
+ *
+ * if (++t > period) t /= 2:
+ *
+ * Can be approximated by:
+ *
+ *   period/2 + (++t % period/2)
+ *
+ * [ Furthermore, when we choose period to be 2^n it can be written in terms of
+ *   binary operations and wraparound artefacts disappear. ]
+ *
+ * Also note that this yields a natural counter of the elapsed periods:
+ *
+ *   c = t / (period/2)
+ *
+ * [ Its monotonic increasing property can be applied to mitigate the wrap-
+ *   around issue. ]
+ *
+ * This allows us to do away with the loop over all prop_locals on each period
+ * expiration. By remembering the period count under which it was last accessed
+ * as c_{j}, we can obtain the number of 'missed' cycles from:
+ *
+ *   c - c_{j}
+ *
+ * We can then lazily catch up to the global period count every time we are
+ * going to use x_{j}, by doing:
+ *
+ *   x_{j} /= 2^(c - c_{j}), c_{j} = c
+ */
+
+#include <linux/proportions.h>
+#include <linux/rcupdate.h>
+
+/*
+ * Limit the time part in order to ensure there are some bits left for the
+ * cycle counter.
+ */
+#define PROP_MAX_SHIFT (3*BITS_PER_LONG/4)
+
+int prop_descriptor_init(struct prop_descriptor *pd, int shift)
+{
+	int err;
+
+	if (shift > PROP_MAX_SHIFT)
+		shift = PROP_MAX_SHIFT;
+
+	pd->index = 0;
+	pd->pg[0].shift = shift;
+	mutex_init(&pd->mutex);
+	err = percpu_counter_init_irq(&pd->pg[0].events, 0);
+	if (err)
+		goto out;
+
+	err = percpu_counter_init_irq(&pd->pg[1].events, 0);
+	if (err)
+		percpu_counter_destroy(&pd->pg[0].events);
+
+out:
+	return err;
+}
+
+/*
+ * We have two copies, and flip between them to make it seem like an atomic
+ * update. The update is not really atomic wrt the events counter, but
+ * it is internally consistent with the bit layout depending on shift.
+ *
+ * We copy the events count, move the bits around and flip the index.
+ */
+void prop_change_shift(struct prop_descriptor *pd, int shift)
+{
+	int index;
+	int offset;
+	u64 events;
+	unsigned long flags;
+
+	if (shift > PROP_MAX_SHIFT)
+		shift = PROP_MAX_SHIFT;
+
+	mutex_lock(&pd->mutex);
+
+	index = pd->index ^ 1;
+	offset = pd->pg[pd->index].shift - shift;
+	if (!offset)
+		goto out;
+
+	pd->pg[index].shift = shift;
+
+	local_irq_save(flags);
+	events = percpu_counter_sum(&pd->pg[pd->index].events);
+	if (offset < 0)
+		events <<= -offset;
+	else
+		events >>= offset;
+	percpu_counter_set(&pd->pg[index].events, events);
+
+	/*
+	 * ensure the new pg is fully written before the switch
+	 */
+	smp_wmb();
+	pd->index = index;
+	local_irq_restore(flags);
+
+	synchronize_rcu();
+
+out:
+	mutex_unlock(&pd->mutex);
+}
+
+/*
+ * wrap the access to the data in an rcu_read_lock() section;
 * this is used to track the active references.
+ */
+static struct prop_global *prop_get_global(struct prop_descriptor *pd)
+{
+	int index;
+
+	rcu_read_lock();
+	index = pd->index;
+	/*
+	 * match the wmb from the index flip in prop_change_shift()
+	 */
+	smp_rmb();
+	return &pd->pg[index];
+}
+
+static void prop_put_global(struct prop_descriptor *pd, struct prop_global *pg)
+{
+	rcu_read_unlock();
+}
+
+static void
+prop_adjust_shift(int *pl_shift, unsigned long *pl_period, int new_shift)
+{
+	int offset = *pl_shift - new_shift;
+
+	if (!offset)
+		return;
+
+	if (offset < 0)
+		*pl_period <<= -offset;
+	else
+		*pl_period >>= offset;
+
+	*pl_shift = new_shift;
+}
+
+/*
+ * PERCPU
+ */
+
+int prop_local_init_percpu(struct prop_local_percpu *pl)
+{
+	spin_lock_init(&pl->lock);
+	pl->shift = 0;
+	pl->period = 0;
+	return percpu_counter_init_irq(&pl->events, 0);
+}
+
+void prop_local_destroy_percpu(struct prop_local_percpu *pl)
+{
+	percpu_counter_destroy(&pl->events);
+}
+
+/*
+ * Catch up with missed period expirations.
+ *
+ *   until (c_{j} == c)
+ *     x_{j} -= x_{j}/2;
+ *     c_{j}++;
+ */
+static
+void prop_norm_percpu(struct prop_global *pg, struct prop_local_percpu *pl)
+{
+	unsigned long period = 1UL << (pg->shift - 1);
+	unsigned long period_mask = ~(period - 1);
+	unsigned long global_period;
+	unsigned long flags;
+
+	global_period = percpu_counter_read(&pg->events);
+	global_period &= period_mask;
+
+	/*
+	 * Fast path - check if the local and global period count still match
+	 * outside of the lock.
+	 */
+	if (pl->period == global_period)
+		return;
+
+	spin_lock_irqsave(&pl->lock, flags);
+	prop_adjust_shift(&pl->shift, &pl->period, pg->shift);
+	period = 1UL << (pg->shift - 1);
+	/*
+	 * For each missed period, we halve the local counter.
+	 * basically:
+	 *   pl->events >> (global_period - pl->period);
+	 *
+	 * but since the distributed nature of percpu counters makes division
+	 * rather hard, use a regular subtraction loop. This is safe, because
+	 * the events will only ever be incremented, hence the subtraction
+	 * can never result in a negative number.
+	 */
+	while (pl->period != global_period) {
+		unsigned long val = percpu_counter_read(&pl->events);
+		unsigned long half = (val + 1) >> 1;
+
+		/*
+		 * Half of zero won't be much less, break out.
+		 * This limits the loop to shift iterations, even
+		 * if we missed a million.
+		 */
+		if (!val)
+			break;
+
+		percpu_counter_add(&pl->events, -half);
+		pl->period += period;
+	}
+	pl->period = global_period;
+	spin_unlock_irqrestore(&pl->lock, flags);
+}
+
+/*
+ * ++x_{j}, ++t
+ */
+void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl)
+{
+	struct prop_global *pg = prop_get_global(pd);
+
+	prop_norm_percpu(pg, pl);
+	percpu_counter_add(&pl->events, 1);
+	percpu_counter_add(&pg->events, 1);
+	prop_put_global(pd, pg);
+}
+
+/*
+ * Obtain a fraction of this proportion
+ *
+ *   p_{j} = x_{j} / (period/2 + t % period/2)
+ */
+void prop_fraction_percpu(struct prop_descriptor *pd,
+		struct prop_local_percpu *pl,
+		long *numerator, long *denominator)
+{
+	struct prop_global *pg = prop_get_global(pd);
+	unsigned long period_2 = 1UL << (pg->shift - 1);
+	unsigned long counter_mask = period_2 - 1;
+	unsigned long global_count;
+
+	prop_norm_percpu(pg, pl);
+	*numerator = percpu_counter_read_positive(&pl->events);
+
+	global_count = percpu_counter_read(&pg->events);
+	*denominator = period_2 + (global_count & counter_mask);
+
+	prop_put_global(pd, pg);
+}
+
+/*
+ * SINGLE
+ */
+
+int prop_local_init_single(struct prop_local_single *pl)
+{
+	spin_lock_init(&pl->lock);
+	pl->shift = 0;
+	pl->period = 0;
+	pl->events = 0;
+	return 0;
+}
+
+void prop_local_destroy_single(struct prop_local_single *pl)
+{
+}
+
+/*
+ * Catch up with missed period expirations.
+ */
+static
+void prop_norm_single(struct prop_global *pg, struct prop_local_single *pl)
+{
+	unsigned long period = 1UL << (pg->shift - 1);
+	unsigned long period_mask = ~(period - 1);
+	unsigned long global_period;
+	unsigned long flags;
+
+	global_period = percpu_counter_read(&pg->events);
+	global_period &= period_mask;
+
+	/*
+	 * Fast path - check if the local and global period count still match
+	 * outside of the lock.
+	 */
+	if (pl->period == global_period)
+		return;
+
+	spin_lock_irqsave(&pl->lock, flags);
+	prop_adjust_shift(&pl->shift, &pl->period, pg->shift);
+	/*
+	 * For each missed period, we halve the local counter.
+	 */
+	period = (global_period - pl->period) >> (pg->shift - 1);
+	if (likely(period < BITS_PER_LONG))
+		pl->events >>= period;
+	else
+		pl->events = 0;
+	pl->period = global_period;
+	spin_unlock_irqrestore(&pl->lock, flags);
+}
+
+/*
+ * ++x_{j}, ++t
+ */
+void __prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl)
+{
+	struct prop_global *pg = prop_get_global(pd);
+
+	prop_norm_single(pg, pl);
+	pl->events++;
+	percpu_counter_add(&pg->events, 1);
+	prop_put_global(pd, pg);
+}
+
+/*
+ * Obtain a fraction of this proportion
+ *
+ *   p_{j} = x_{j} / (period/2 + t % period/2)
+ */
+void prop_fraction_single(struct prop_descriptor *pd,
+		struct prop_local_single *pl,
+		long *numerator, long *denominator)
+{
+	struct prop_global *pg = prop_get_global(pd);
+	unsigned long period_2 = 1UL << (pg->shift - 1);
+	unsigned long counter_mask = period_2 - 1;
+	unsigned long global_count;
+
+	prop_norm_single(pg, pl);
+	*numerator = pl->events;
+
+	global_count = percpu_counter_read(&pg->events);
+	*denominator = period_2 + (global_count & counter_mask);
+
+	prop_put_global(pd, pg);
+}
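
A numeric walk-through of the lazy catch-up above (illustrative numbers, not from the patch): with shift = 4 the half-period is 1 << 3 = 8 events. Suppose the global counter t reads 25, so the global period count is c = 25 / 8 = 3, and a local counter was last normalised in period c_{j} = 1 holding x_{j} = 12 events. Two periods were missed, so prop_norm_single() halves the local count twice:

	x_{j} = 12 >> (3 - 1) = 3,  c_{j} = 3

after which prop_fraction_single() reports the pair

	numerator = 3,  denominator = 8 + (25 % 8) = 9

The percpu variant computes the same thing, but halves by repeated subtraction because a distributed counter cannot be shifted atomically.
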
Index: linux-2.6/include/linux/proportions.h
===================================================================
--- /dev/null
+++ linux-2.6/include/linux/proportions.h
@@ -0,0 +1,119 @@
+/*
+ * Floating proportions
+ *
+ *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
+ *
+ * This file contains the public data structure and API definitions.
+ */
+
+#ifndef _LINUX_PROPORTIONS_H
+#define _LINUX_PROPORTIONS_H
+
+#include <linux/percpu_counter.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+
+struct prop_global {
+	/*
+	 * The period over which we differentiate
+	 *
+	 *   period = 2^shift
+	 */
+	int shift;
+	/*
+	 * The total event counter aka 'time'.
+	 *
+	 * Treated as an unsigned long; the lower 'shift - 1' bits are the
+	 * counter bits, the remaining upper bits the period counter.
+	 */
+	struct percpu_counter events;
+};
+
+/*
+ * global proportion descriptor
+ *
+ * this is needed to consistently flip prop_global structures.
+ */
+struct prop_descriptor {
+	int index;
+	struct prop_global pg[2];
+	struct mutex mutex;		/* serialize the prop_global switch */
+};
+
+int prop_descriptor_init(struct prop_descriptor *pd, int shift);
+void prop_change_shift(struct prop_descriptor *pd, int new_shift);
+
+/*
+ * ----- PERCPU ------
+ */
+
+struct prop_local_percpu {
+	/*
+	 * the local events counter
+	 */
+	struct percpu_counter events;
+
+	/*
+	 * snapshot of the last seen global state
+	 */
+	int shift;
+	unsigned long period;
+	spinlock_t lock;		/* protect the snapshot state */
+};
+
+int prop_local_init_percpu(struct prop_local_percpu *pl);
+void prop_local_destroy_percpu(struct prop_local_percpu *pl);
+void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl);
+void prop_fraction_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl,
+		long *numerator, long *denominator);
+
+static inline
+void prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__prop_inc_percpu(pd, pl);
+	local_irq_restore(flags);
+}
+
+/*
+ * ----- SINGLE ------
+ */
+
+struct prop_local_single {
+	/*
+	 * the local events counter
+	 */
+	unsigned long events;
+
+	/*
+	 * snapshot of the last seen global state
+	 * and a lock protecting this state
+	 */
+	int shift;
+	unsigned long period;
+	spinlock_t lock;		/* protect the snapshot state */
+};
+
+#define INIT_PROP_LOCAL_SINGLE(name)	\
+{	.lock = __SPIN_LOCK_UNLOCKED(name.lock),	\
+}
+
+int prop_local_init_single(struct prop_local_single *pl);
+void prop_local_destroy_single(struct prop_local_single *pl);
+void __prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl);
+void prop_fraction_single(struct prop_descriptor *pd, struct prop_local_single *pl,
+		long *numerator, long *denominator);
+
+static inline
+void prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__prop_inc_single(pd, pl);
+	local_irq_restore(flags);
+}
+
+#endif /* _LINUX_PROPORTIONS_H */
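
Taken together, the header gives the intended calling pattern: one
prop_descriptor shared by all competitors, one prop_local_* per
competitor. A sketch of a hypothetical user (the my_* names are
illustrative; the real users in this series are the BDI completion
counter and the per-task dirties member added below):

#include <linux/init.h>
#include <linux/proportions.h>

static struct prop_descriptor my_descriptor;	/* shared 'clock' */
static struct prop_local_percpu my_local;	/* one per competitor */

static int __init my_setup(void)
{
	/* differentiate over a period of 2^10 events */
	int err = prop_descriptor_init(&my_descriptor, 10);

	if (err)
		return err;
	return prop_local_init_percpu(&my_local);
}

static void my_event(void)
{
	/* ++x_{j}, ++t -- the IRQ-safe wrapper from the header */
	prop_inc_percpu(&my_descriptor, &my_local);
}

static void my_share(long *num, long *den)
{
	/* this competitor's recent share: *num / *den */
	prop_fraction_percpu(&my_descriptor, &my_local, num, den);
}
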
Index: linux-2.6/lib/Makefile
===================================================================
--- linux-2.6.orig/lib/Makefile
+++ linux-2.6/lib/Makefile
@@ -5,7 +5,8 @@
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
	 rbtree.o radix-tree.o dump_stack.o \
	 idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \
-	 sha1.o irq_regs.o reciprocal_div.o
+	 sha1.o irq_regs.o reciprocal_div.o \
+	 proportions.o
 
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -160,6 +160,10 @@ extern ctl_table inotify_table[];
 int sysctl_legacy_va_layout;
 #endif
 
+extern int dirty_ratio_handler(ctl_table *table, int write,
+		struct file *filp, void __user *buffer, size_t *lenp,
+		loff_t *ppos);
+
 /* The default sysctl tables: */
 
@@ -675,7 +679,7 @@ static ctl_table vm_table[] = {
		.data		= &vm_dirty_ratio,
		.maxlen		= sizeof(vm_dirty_ratio),
		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= &dirty_ratio_handler,
		.strategy	= &sysctl_intvec,
		.extra1		= &zero,
		.extra2		= &one_hundred,
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -83,6 +83,7 @@ struct sched_param {
 #include
 #include
 #include
+#include <linux/proportions.h>
 #include
 
@@ -1076,6 +1077,7 @@ struct task_struct {
 #ifdef CONFIG_FAULT_INJECTION
	int make_it_fail;
 #endif
+	struct prop_local_single dirties;
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c
+++ linux-2.6/kernel/fork.c
@@ -106,6 +106,7 @@ static struct kmem_cache *mm_cachep;
 
 void free_task(struct task_struct *tsk)
 {
+	prop_local_destroy_single(&tsk->dirties);
	free_thread_info(tsk->stack);
	rt_mutex_debug_task_free(tsk);
	free_task_struct(tsk);
@@ -162,6 +163,7 @@ static struct task_struct *dup_task_stru
 {
	struct task_struct *tsk;
	struct thread_info *ti;
+	int err;
 
	prepare_to_copy(orig);
 
@@ -177,6 +179,14 @@ static struct task_struct *dup_task_stru
	*tsk = *orig;
	tsk->stack = ti;
+
+	err = prop_local_init_single(&tsk->dirties);
+	if (err) {
+		free_thread_info(ti);
+		free_task_struct(tsk);
+		return NULL;
+	}
+
	setup_thread_stack(tsk, orig);
 
 #ifdef CONFIG_CC_STACKPROTECTOR
Index: linux-2.6/include/linux/init_task.h
===================================================================
--- linux-2.6.orig/include/linux/init_task.h
+++ linux-2.6/include/linux/init_task.h
@@ -167,6 +167,7 @@ extern struct group_info init_groups;
		[PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),		\
		[PIDTYPE_SID]  = INIT_PID_LINK(PIDTYPE_SID),		\
	},							\
+	.dirties = INIT_PROP_LOCAL_SINGLE(dirties),		\
	INIT_TRACE_IRQFLAGS					\
	INIT_LOCKDEP						\
 }
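
For completeness: dirty_ratio_handler() is only declared in the sysctl
hunk above; its definition lives outside this excerpt (in
mm/page-writeback.c in this series). A plausible sketch, assuming a
vm_completions prop_descriptor and a calc_period_shift() helper that
derives the new shift from the ratio -- both names are assumptions
here, not confirmed by this excerpt:

/* sketch only: would live in mm/page-writeback.c */
int dirty_ratio_handler(ctl_table *table, int write, struct file *filp,
		void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int old_ratio = vm_dirty_ratio;
	int ret;

	/* keep the existing [0, 100] clamping behaviour */
	ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
	if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
		/* hypothetical: rescale the proportion period to the new ratio */
		prop_change_shift(&vm_completions, calc_period_shift());
	}
	return ret;
}
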