When the threshold is on the order of the per-CPU inaccuracies, we can
deadlock by never observing the updated count. Introduce a more expensive
but more accurate stat read function to use for low thresholds.

(TODO: roll into the bdi_stat patch)
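Illustration only, not part of the patch: a minimal user-space sketch of
the error bound that the bdi_thresh < bdi_stat_delta(bdi) check guards
against. The names NR_CPUS, STAT_THRESHOLD, stat_approx() and
stat_accurate() below are invented for the example and do not come from
the kernel sources.

/* User-space sketch of the per-CPU counter drift; illustrative names only. */
#include <stdio.h>

#define NR_CPUS		4
#define STAT_THRESHOLD	8	/* stands in for 8 * ilog2(num_online_cpus()) */

static long global_count;		/* plays the role of the atomic_long_t */
static signed char cpu_diff[NR_CPUS];	/* per-CPU deltas, each bounded by the threshold */

/* Cheap read: can be off by up to NR_CPUS * STAT_THRESHOLD. */
static long stat_approx(void)
{
	return global_count;
}

/* Expensive read: folds the per-CPU deltas back in, like bdi_stat_accurate(). */
static long stat_accurate(void)
{
	long x = global_count;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		x += cpu_diff[cpu];

	return x < 0 ? 0 : x;
}

int main(void)
{
	int cpu;

	/* Worst case: every CPU holds a delta right at the threshold. */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		cpu_diff[cpu] = STAT_THRESHOLD;

	printf("approx=%ld accurate=%ld max drift=%d\n",
	       stat_approx(), stat_accurate(), NR_CPUS * STAT_THRESHOLD);

	/*
	 * If the dirty threshold is smaller than that drift, the cheap read
	 * can sit at 0 forever while pages are in fact dirty, so the loop in
	 * balance_dirty_pages() never sees the count cross the threshold --
	 * hence the accurate path for low thresholds.
	 */
	return 0;
}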
Signed-off-by: Peter Zijlstra
---
 include/linux/backing-dev.h |   13 ++++++++++++-
 mm/backing-dev.c            |   31 +++++++++++++++++++++++++------
 mm/page-writeback.c         |   19 +++++++++++++++----
 3 files changed, 52 insertions(+), 11 deletions(-)

Index: linux-2.6/include/linux/backing-dev.h
===================================================================
--- linux-2.6.orig/include/linux/backing-dev.h
+++ linux-2.6/include/linux/backing-dev.h
@@ -8,6 +8,7 @@
 #ifndef _LINUX_BACKING_DEV_H
 #define _LINUX_BACKING_DEV_H
 
+#include
 #include
 #include
 
@@ -34,7 +35,6 @@ enum bdi_stat_item {
 
 #ifdef CONFIG_SMP
 struct bdi_per_cpu_data {
-	s8 stat_threshold;
 	s8 bdi_stat_diff[NR_BDI_STAT_ITEMS];
 } ____cacheline_aligned_in_smp;
 #endif
@@ -60,6 +60,7 @@ struct backing_dev_info {
 	atomic_long_t bdi_stats[NR_BDI_STAT_ITEMS];
 #ifdef CONFIG_SMP
+	int stat_threshold;
 	struct bdi_per_cpu_data pcd[NR_CPUS];
 #endif
 };
@@ -109,6 +110,8 @@ static inline unsigned long bdi_stat(str
 }
 
 #ifdef CONFIG_SMP
+unsigned long bdi_stat_accurate(struct backing_dev_info *bdi, enum bdi_stat_item item);
+
 void __mod_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item, int delta);
 void __inc_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item);
 void __dec_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item);
@@ -117,8 +120,14 @@ void mod_bdi_stat(struct backing_dev_inf
 void inc_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item);
 void dec_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item);
 
+static inline unsigned long bdi_stat_delta(struct backing_dev_info *bdi)
+{
+	return num_online_cpus() * bdi->stat_threshold;
+}
+
 #else /* CONFIG_SMP */
 
+#define bdi_stat_accurate bdi_stat
+
 static inline void __mod_bdi_stat(struct backing_dev_info *bdi,
 		enum bdi_stat_item item, int delta)
 {
@@ -142,6 +151,8 @@ static inline void __dec_bdi_stat(struct
 #define mod_bdi_stat __mod_bdi_stat
 #define inc_bdi_stat __inc_bdi_stat
 #define dec_bdi_stat __dec_bdi_stat
+
+#define bdi_stat_delta(bdi) 1UL
 #endif
 
 void bdi_init(struct backing_dev_info *bdi);

Index: linux-2.6/mm/backing-dev.c
===================================================================
--- linux-2.6.orig/mm/backing-dev.c
+++ linux-2.6/mm/backing-dev.c
@@ -98,17 +98,36 @@ void bdi_init(struct backing_dev_in
 		atomic_long_set(&bdi->bdi_stats[i], 0);
 
 #ifdef CONFIG_SMP
+	bdi->stat_threshold = 8 * ilog2(num_online_cpus());
 	for (i = 0; i < NR_CPUS; i++) {
 		int j;
 
 		for (j = 0; j < NR_BDI_STAT_ITEMS; j++)
 			bdi->pcd[i].bdi_stat_diff[j] = 0;
-		bdi->pcd[i].stat_threshold = 8 * ilog2(num_online_cpus());
 	}
 #endif
 }
 EXPORT_SYMBOL(bdi_init);
 
 #ifdef CONFIG_SMP
+unsigned long bdi_stat_accurate(struct backing_dev_info *bdi,
+		enum bdi_stat_item item)
+{
+	long x = atomic_long_read(&bdi->bdi_stats[item]);
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct bdi_per_cpu_data *pcd = &bdi->pcd[cpu];
+		s8 *p = pcd->bdi_stat_diff + item;
+
+		x += *p;
+	}
+
+	if (x < 0)
+		x = 0;
+
+	return x;
+}
+
 void __mod_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item,
 		int delta)
 {
@@ -118,7 +137,7 @@ void __mod_bdi_stat(struct backing_dev_i
 
 	x = delta + *p;
 
-	if (unlikely(x > pcd->stat_threshold || x < -pcd->stat_threshold)) {
+	if (unlikely(x > bdi->stat_threshold || x < -bdi->stat_threshold)) {
 		bdi_stat_add(x, bdi, item);
 		x = 0;
 	}
@@ -144,8 +163,8 @@ void __inc_bdi_stat(struct backing_dev_i
 
 	(*p)++;
 
-	if (unlikely(*p > pcd->stat_threshold)) {
-		int overstep = pcd->stat_threshold / 2;
+	if (unlikely(*p > bdi->stat_threshold)) {
+		int overstep = bdi->stat_threshold / 2;
 
 		bdi_stat_add(*p + overstep, bdi, item);
 		*p = -overstep;
@@ -170,8 +189,8 @@ void __dec_bdi_stat(struct backing_dev_i
 
 	(*p)--;
 
-	if (unlikely(*p < -pcd->stat_threshold)) {
-		int overstep = pcd->stat_threshold / 2;
+	if (unlikely(*p < -bdi->stat_threshold)) {
+		int overstep = bdi->stat_threshold / 2;
 
 		bdi_stat_add(*p - overstep, bdi, item);
 		*p = overstep;

Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c
+++ linux-2.6/mm/page-writeback.c
@@ -341,14 +341,25 @@ static void balance_dirty_pages(struct a
 		 * been flushed to permanent storage.
 		 */
 		if (bdi_nr_reclaimable) {
+			unsigned long bdi_nr_writeback;
 			writeback_inodes(&wbc);
 
 			get_dirty_limits(&background_thresh, &dirty_thresh,
				       &bdi_thresh, bdi);
-			bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) +
-				bdi_stat(bdi, BDI_UNSTABLE);
-			if (bdi_nr_reclaimable + bdi_stat(bdi, BDI_WRITEBACK) <=
-					bdi_thresh)
+
+			if (bdi_thresh < bdi_stat_delta(bdi)) {
+				bdi_nr_reclaimable =
+					bdi_stat_accurate(bdi, BDI_DIRTY) +
+					bdi_stat_accurate(bdi, BDI_UNSTABLE);
+				bdi_nr_writeback =
+					bdi_stat_accurate(bdi, BDI_WRITEBACK);
+			} else {
+				bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) +
+					bdi_stat(bdi, BDI_UNSTABLE);
+				bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+			}
+
+			if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
 				break;
 			pages_written += write_chunk - wbc.nr_to_write;