pull out the floating proportion stuff and make it a lib

Signed-off-by: Peter Zijlstra
---
 include/linux/proportions.h |   82 ++++++++++++
 lib/Makefile                |    2
 lib/proportions.c           |  259 +++++++++++++++++++++++++++++++++++++++
 mm/backing-dev.c            |    5
 mm/page-writeback.c         |  290 ++------------------------------------------
 5 files changed, 364 insertions(+), 274 deletions(-)

Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c	2007-05-10 10:52:24.000000000 +0200
+++ linux-2.6/mm/page-writeback.c	2007-05-10 11:06:12.000000000 +0200
@@ -116,93 +116,8 @@ static void background_writeout(unsigned
  * because demand can/will vary over time. The length of this period itself is
  * measured in page writeback completions.
  *
- * DETAILS:
- *
- * The floating proportion is a time derivative with an exponentially decaying
- * history:
- *
- *   p_{j} = \Sum_{i=0} (dx_{j}/dt_{-i}) / 2^(1+i)
- *
- * Where j is an element from {BDIs}, x_{j} is j's number of completions, and i
- * the time period over which the differential is taken. So d/dt_{-i} is the
- * differential over the i-th last period.
- *
- * The decaying history gives smooth transitions. The time differential carries
- * the notion of speed.
- *
- * The denominator is 2^(1+i) because we want the series to be normalised, ie.
- *
- *   \Sum_{i=0} 1/2^(1+i) = 1
- *
- * Further more, if we measure time (t) in the same events as x; so that:
- *
- *   t = \Sum_{j} x_{j}
- *
- * we get that:
- *
- *   \Sum_{j} p_{j} = 1
- *
- * Writing this in an iterative fashion we get (dropping the 'd's):
- *
- *   if (++x_{j}, ++t > period)
- *     t /= 2;
- *     for_each (j)
- *       x_{j} /= 2;
- *
- * so that:
- *
- *   p_{j} = x_{j} / t;
- *
- * We optimize away the '/= 2' for the global time delta by noting that:
- *
- *   if (++t > period) t /= 2:
- *
- * Can be approximated by:
- *
- *   period/2 + (++t % period/2)
- *
- * [ Furthermore, when we choose period to be 2^n it can be written in terms of
- *   binary operations and wraparound artefacts disappear. ]
- *
- * Also note that this yields a natural counter of the elapsed periods:
- *
- *   c = t / (period/2)
- *
- * [ Its monotonic increasing property can be applied to mitigate the wrap-
- *   around issue. ]
- *
- * This allows us to do away with the loop over all BDIs on each period
- * expiration. By remembering the period count under which it was last
- * accessed as c_{j}, we can obtain the number of 'missed' cycles from:
- *
- *   c - c_{j}
- *
- * We can then lazily catch up to the global period count every time we are
- * going to use x_{j}, by doing:
- *
- *   x_{j} /= 2^(c - c_{j}), c_{j} = c
  */
-
-struct vm_completions_data {
-	/*
-	 * The period over which we differentiate (in pages)
-	 *
-	 *   period = 2^shift
-	 */
-	int shift;
-
-	/*
-	 * The total page writeback completion counter aka 'time'.
-	 *
-	 * Treated as an unsigned long; the lower 'shift - 1' bits are the
-	 * counter bits, the remaining upper bits the period counter.
- */
-	struct percpu_counter completions;
-};
-
-static int vm_completions_index;
-static struct vm_completions_data vm_completions[2];
-static DEFINE_MUTEX(vm_completions_mutex);
+struct prop_descriptor vm_completions;

 static unsigned long determine_dirtyable_memory(void);

@@ -219,85 +134,6 @@ static int calc_period_shift(void)
 	return 2 + ilog2(dirty_total - 1);
 }

-static void vcd_init(void)
-{
-	vm_completions[0].shift = calc_period_shift();
-	percpu_counter_init(&vm_completions[0].completions, 0);
-	percpu_counter_init(&vm_completions[1].completions, 0);
-}
-
-/*
- * We have two copies, and flip between them to make it seem like an atomic
- * update. The update is not really atomic wrt the completions counter, but
- * it is internally consistent with the bit layout depending on shift.
- *
- * We calculate the new shift, copy the completions count, move the bits around
- * and flip the index.
- */
-static void vcd_flip(void)
-{
-	int index;
-	int shift;
-	int offset;
-	u64 completions;
-	unsigned long flags;
-
-	mutex_lock(&vm_completions_mutex);
-
-	index = vm_completions_index ^ 1;
-	shift = calc_period_shift();
-	offset = vm_completions[vm_completions_index].shift - shift;
-	if (!offset)
-		goto out;
-
-	vm_completions[index].shift = shift;
-
-	local_irq_save(flags);
-	completions = percpu_counter_sum_signed(
-			&vm_completions[vm_completions_index].completions);
-
-	if (offset < 0)
-		completions <<= -offset;
-	else
-		completions >>= offset;
-
-	percpu_counter_set(&vm_completions[index].completions, completions);
-
-	/*
-	 * ensure the new vcd is fully written before the switch
-	 */
-	smp_wmb();
-	vm_completions_index = index;
-	local_irq_restore(flags);
-
-	synchronize_rcu();
-
-out:
-	mutex_unlock(&vm_completions_mutex);
-}
-
-/*
- * wrap the access to the data in an rcu_read_lock() section;
- * this is used to track the active references.
- */
-static struct vm_completions_data *get_vcd(void)
-{
-	int index;
-
-	rcu_read_lock();
-	index = vm_completions_index;
-	/*
-	 * match the wmb from vcd_flip()
-	 */
-	smp_rmb();
-	return &vm_completions[index];
-}
-
-static void put_vcd(struct vm_completions_data *vcd)
-{
-	rcu_read_unlock();
-}
-
 /*
  * update the period when the dirty ratio changes.
  */
@@ -307,130 +143,38 @@ int dirty_ratio_handler(ctl_table *table
 {
 	int old_ratio = vm_dirty_ratio;
 	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
-	if (ret == 0 && write && vm_dirty_ratio != old_ratio)
-		vcd_flip();
-	return ret;
-}
-
-/*
- * adjust the bdi local data to changes in the bit layout.
- */
-static void bdi_adjust_shift(struct backing_dev_info *bdi, int shift)
-{
-	int offset = bdi->shift - shift;
-
-	if (!offset)
-		return;
-
-	if (offset < 0)
-		bdi->period <<= -offset;
-	else
-		bdi->period >>= offset;
-
-	bdi->shift = shift;
-}
-
-/*
- * Catch up with missed period expirations.
- *
- *   until (c_{j} == c)
- *     x_{j} -= x_{j}/2;
- *     c_{j}++;
- */
-static void bdi_writeout_norm(struct backing_dev_info *bdi,
-		struct vm_completions_data *vcd)
-{
-	unsigned long period = 1UL << (vcd->shift - 1);
-	unsigned long period_mask = ~(period - 1);
-	unsigned long global_period;
-	unsigned long flags;
-
-	global_period = percpu_counter_read(&vcd->completions);
-	global_period &= period_mask;
-
-	/*
-	 * Fast path - check if the local and global period count still match
-	 * outside of the lock.
-	 */
-	if (bdi->period == global_period)
-		return;
-
-	spin_lock_irqsave(&bdi->lock, flags);
-	bdi_adjust_shift(bdi, vcd->shift);
-	/*
-	 * For each missed period, we half the local counter.
-	 * basically:
-	 *   bdi_stat(bdi, BDI_COMPLETION) >> (global_period - bdi->period);
-	 *
-	 * but since the distributed nature of percpu counters make division
-	 * rather hard, use a regular subtraction loop. This is safe, because
-	 * BDI_COMPLETION will only every be incremented, hence the subtraction
-	 * can never result in a negative number.
-	 */
-	while (bdi->period != global_period) {
-		unsigned long val = bdi_stat_unsigned(bdi, BDI_COMPLETION);
-		unsigned long half = (val + 1) >> 1;
-
-		/*
-		 * Half of zero won't be much less, break out.
-		 * This limits the loop to shift iterations, even
-		 * if we missed a million.
-		 */
-		if (!val)
-			break;
-
-		/*
-		 * Iff shift >32 half might exceed the limits of
-		 * the regular percpu_counter_mod.
-		 */
-		__mod_bdi_stat64(bdi, BDI_COMPLETION, -half);
-		bdi->period += period;
+	if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
+		int shift = calc_period_shift();
+		prop_change_shift(&vm_completions, shift);
 	}
-	bdi->period = global_period;
-	bdi->shift = vcd->shift;
-	spin_unlock_irqrestore(&bdi->lock, flags);
+	return ret;
 }

 /*
  * Increment the BDI's writeout completion count and the global writeout
  * completion count. Called from test_clear_page_writeback().
- *
- * ++x_{j}, ++t
  */
 static void __bdi_writeout_inc(struct backing_dev_info *bdi)
 {
-	struct vm_completions_data *vcd = get_vcd();
-	/* Catch up with missed period expirations before using the counter. */
-	bdi_writeout_norm(bdi, vcd);
-	__inc_bdi_stat(bdi, BDI_COMPLETION);
-
-	percpu_counter_mod(&vcd->completions, 1);
-	put_vcd(vcd);
+	struct prop_global *pg = prop_get_global(&vm_completions);
+	__prop_inc(pg, &bdi->completions);
+	prop_put_global(&vm_completions, pg);
 }

 /*
  * Obtain an accurate fraction of the BDI's portion.
- *
- *   p_{j} = x_{j} / (period/2 + t % period/2)
  */
 void bdi_writeout_fraction(struct backing_dev_info *bdi,
 		long *numerator, long *denominator)
 {
-	struct vm_completions_data *vcd = get_vcd();
-	unsigned long period_2 = 1UL << (vcd->shift - 1);
-	unsigned long counter_mask = period_2 - 1;
-	unsigned long global_count;
-
 	if (bdi_cap_writeback_dirty(bdi)) {
-		/* Catch up with the period expirations before use. */
-		bdi_writeout_norm(bdi, vcd);
-		*numerator = bdi_stat(bdi, BDI_COMPLETION);
-	} else
+		struct prop_global *pg = prop_get_global(&vm_completions);
+		prop_fraction(pg, &bdi->completions, numerator, denominator);
+		prop_put_global(&vm_completions, pg);
+	} else {
 		*numerator = 0;
-
-	global_count = percpu_counter_read(&vcd->completions);
-	*denominator = period_2 + (global_count & counter_mask);
-	put_vcd(vcd);
+		*denominator = 1;
+	}
 }

 /*
@@ -980,10 +724,14 @@ static struct notifier_block __cpuinitda
  */
 void __init page_writeback_init(void)
 {
+	int shift;
+
 	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
-	vcd_init();
+
+	shift = calc_period_shift();
+	prop_descriptor_init(&vm_completions, shift);
 }

 /**
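
The (numerator, denominator) pair that bdi_writeout_fraction() now returns only
becomes useful once it scales something. A rough userspace sketch of that
consumer-side arithmetic follows -- the helper name scale_dirty_limit() and the
numbers are made up for the example, they are not part of this patch:

#include <stdio.h>

static unsigned long scale_dirty_limit(unsigned long dirty_thresh,
				       long numerator, long denominator)
{
	/* 64-bit intermediate so the multiply cannot overflow */
	unsigned long long tmp = (unsigned long long)dirty_thresh * numerator;

	return (unsigned long)(tmp / denominator);
}

int main(void)
{
	/*
	 * With shift = 10, period/2 = 512. A device that owns 128 of the
	 * last ~512 completions gets about a quarter of the global limit:
	 * 1000 * 128 / 512 = 250 pages.
	 */
	printf("%lu\n", scale_dirty_limit(1000, 128, 512));
	return 0;
}

The 64-bit intermediate matters: dirty_thresh * numerator can exceed 32 bits
long before the fraction itself gets anywhere near 1.
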
Index: linux-2.6/lib/proportions.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/lib/proportions.c	2007-05-10 11:03:01.000000000 +0200
@@ -0,0 +1,259 @@
+/*
+ * Floating proportions
+ *
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
+ *
+ * DETAILS:
+ *
+ * The floating proportion is a time derivative with an exponentially decaying
+ * history:
+ *
+ *   p_{j} = \Sum_{i=0} (dx_{j}/dt_{-i}) / 2^(1+i)
+ *
+ * Where j is an element from {BDIs}, x_{j} is j's number of completions, and i
+ * the time period over which the differential is taken. So d/dt_{-i} is the
+ * differential over the i-th last period.
+ *
+ * The decaying history gives smooth transitions. The time differential carries
+ * the notion of speed.
+ *
+ * The denominator is 2^(1+i) because we want the series to be normalised, ie.
+ *
+ *   \Sum_{i=0} 1/2^(1+i) = 1
+ *
+ * Furthermore, if we measure time (t) in the same events as x; so that:
+ *
+ *   t = \Sum_{j} x_{j}
+ *
+ * we get that:
+ *
+ *   \Sum_{j} p_{j} = 1
+ *
+ * Writing this in an iterative fashion we get (dropping the 'd's):
+ *
+ *   if (++x_{j}, ++t > period)
+ *     t /= 2;
+ *     for_each (j)
+ *       x_{j} /= 2;
+ *
+ * so that:
+ *
+ *   p_{j} = x_{j} / t;
+ *
+ * We optimize away the '/= 2' for the global time delta by noting that:
+ *
+ *   if (++t > period) t /= 2:
+ *
+ * can be approximated by:
+ *
+ *   period/2 + (++t % period/2)
+ *
+ * [ Furthermore, when we choose period to be 2^n it can be written in terms of
+ *   binary operations and wraparound artefacts disappear. ]
+ *
+ * Also note that this yields a natural counter of the elapsed periods:
+ *
+ *   c = t / (period/2)
+ *
+ * [ Its monotonic increasing property can be applied to mitigate the wrap-
+ *   around issue. ]
+ *
+ * This allows us to do away with the loop over all BDIs on each period
+ * expiration. By remembering the period count under which it was last
+ * accessed as c_{j}, we can obtain the number of 'missed' cycles from:
+ *
+ *   c - c_{j}
+ *
+ * We can then lazily catch up to the global period count every time we are
+ * going to use x_{j}, by doing:
+ *
+ *   x_{j} /= 2^(c - c_{j}), c_{j} = c
+ */
+
+#include <linux/proportions.h>
+#include <linux/rcupdate.h>
+
+void prop_descriptor_init(struct prop_descriptor *pd, int shift)
+{
+	pd->index = 0;
+	pd->pg[0].shift = shift;
+	percpu_counter_init(&pd->pg[0].events, 0);
+	percpu_counter_init(&pd->pg[1].events, 0);
+	mutex_init(&pd->mutex);
+}
+
+/*
+ * We have two copies, and flip between them to make it seem like an atomic
+ * update. The update is not really atomic wrt the events counter, but
+ * it is internally consistent with the bit layout depending on shift.
+ *
+ * We copy the events count, move the bits around and flip the index.
+ */
+void prop_change_shift(struct prop_descriptor *pd, int shift)
+{
+	int index;
+	int offset;
+	u64 events;
+	unsigned long flags;
+
+	mutex_lock(&pd->mutex);
+
+	index = pd->index ^ 1;
+	offset = pd->pg[pd->index].shift - shift;
+	if (!offset)
+		goto out;
+
+	pd->pg[index].shift = shift;
+
+	local_irq_save(flags);
+	events = percpu_counter_sum_signed(
+			&pd->pg[pd->index].events);
+	if (offset < 0)
+		events <<= -offset;
+	else
+		events >>= offset;
+	percpu_counter_set(&pd->pg[index].events, events);
+
+	/*
+	 * ensure the new pg is fully written before the switch
+	 */
+	smp_wmb();
+	pd->index = index;
+	local_irq_restore(flags);
+
+	synchronize_rcu();
+
+out:
+	mutex_unlock(&pd->mutex);
+}
+
+/*
+ * wrap the access to the data in an rcu_read_lock() section;
+ * this is used to track the active references.
+ */
+struct prop_global *prop_get_global(struct prop_descriptor *pd)
+{
+	int index;
+
+	rcu_read_lock();
+	index = pd->index;
+	/*
+	 * match the wmb from prop_change_shift()
+	 */
+	smp_rmb();
+	return &pd->pg[index];
+}
+
+void prop_put_global(struct prop_descriptor *pd, struct prop_global *pg)
+{
+	rcu_read_unlock();
+}
+
+static void prop_adjust_shift(struct prop_local *pl, int new_shift)
+{
+	int offset = pl->shift - new_shift;
+
+	if (!offset)
+		return;
+
+	if (offset < 0)
+		pl->period <<= -offset;
+	else
+		pl->period >>= offset;
+
+	pl->shift = new_shift;
+}
+
+void prop_local_init(struct prop_local *pl)
+{
+	spin_lock_init(&pl->lock);
+	pl->shift = 0;
+	pl->period = 0;
+	percpu_counter_init(&pl->events, 0);
+}
+
+void prop_local_destroy(struct prop_local *pl)
+{
+	percpu_counter_destroy(&pl->events);
+}
+
+/*
+ * Catch up with missed period expirations.
+ *
+ *   until (c_{j} == c)
+ *     x_{j} -= x_{j}/2;
+ *     c_{j}++;
+ */
+void prop_norm(struct prop_global *pg,
+		struct prop_local *pl)
+{
+	unsigned long period = 1UL << (pg->shift - 1);
+	unsigned long period_mask = ~(period - 1);
+	unsigned long global_period;
+	unsigned long flags;
+
+	global_period = percpu_counter_read(&pg->events);
+	global_period &= period_mask;
+
+	/*
+	 * Fast path - check if the local and global period count still match
+	 * outside of the lock.
+	 */
+	if (pl->period == global_period)
+		return;
+
+	spin_lock_irqsave(&pl->lock, flags);
+	prop_adjust_shift(pl, pg->shift);
+	/*
+	 * For each missed period, we halve the local counter.
+	 * basically:
+	 *   pl->events >> (global_period - pl->period);
+	 *
+	 * but since the distributed nature of percpu counters makes division
+	 * rather hard, use a regular subtraction loop. This is safe, because
+	 * the events will only ever be incremented, hence the subtraction
+	 * can never result in a negative number.
+	 */
+	while (pl->period != global_period) {
+		unsigned long val = percpu_counter_read(&pl->events);
+		unsigned long half = (val + 1) >> 1;
+
+		/*
+		 * Half of zero won't be much less, break out.
+		 * This limits the loop to shift iterations, even
+		 * if we missed a million.
+		 */
+		if (!val)
+			break;
+
+		/*
+		 * If shift > 32, half might exceed the limits of
+		 * the regular percpu_counter_mod.
+		 */
+		percpu_counter_mod64(&pl->events, -half);
+		pl->period += period;
+	}
+	pl->period = global_period;
+	spin_unlock_irqrestore(&pl->lock, flags);
+}
+
+/*
+ * Obtain a fraction of this proportion
+ *
+ *   p_{j} = x_{j} / (period/2 + t % period/2)
+ */
+void prop_fraction(struct prop_global *pg, struct prop_local *pl,
+		long *numerator, long *denominator)
+{
+	unsigned long period_2 = 1UL << (pg->shift - 1);
+	unsigned long counter_mask = period_2 - 1;
+	unsigned long global_count;
+
+	prop_norm(pg, pl);
+	*numerator = percpu_counter_read(&pl->events);
+
+	global_count = percpu_counter_read(&pg->events);
+	*denominator = period_2 + (global_count & counter_mask);
+}
+
+
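
To see the proportion arithmetic run, here is a minimal single-threaded
userspace model of the above: plain unsigned longs stand in for the percpu
counters, and locking, RCU and the shift flip are all omitted. It is a sketch
of the math only, not of the kernel code:

#include <stdio.h>

#define SHIFT		4			/* period = 2^4 = 16 events */
#define HALF_PERIOD	(1UL << (SHIFT - 1))

static unsigned long t;		/* global event count, aka 'time' */

struct local {
	unsigned long events;	/* x_{j} */
	unsigned long period;	/* last seen global period, c_{j} */
};

/* lazy catch-up: halve x_{j} once per missed half-period */
static void norm(struct local *l)
{
	unsigned long global_period = t & ~(HALF_PERIOD - 1);

	while (l->period != global_period && l->events) {
		l->events -= (l->events + 1) >> 1;
		l->period += HALF_PERIOD;
	}
	l->period = global_period;
}

/* ++x_{j}, ++t */
static void event(struct local *l)
{
	norm(l);
	l->events++;
	t++;
}

/* p_{j} = x_{j} / (period/2 + t % period/2) */
static void fraction(struct local *l, long *num, long *den)
{
	norm(l);
	*num = l->events;
	*den = HALF_PERIOD + (t & (HALF_PERIOD - 1));
}

int main(void)
{
	struct local a = { 0, 0 }, b = { 0, 0 };
	long num, den;
	int i;

	for (i = 0; i < 100; i++)
		event(i % 4 ? &a : &b);	/* a takes 3 in 4 events */

	fraction(&a, &num, &den);
	printf("a: %ld/%ld\n", num, den);	/* close to 3/4 */
	fraction(&b, &num, &den);
	printf("b: %ld/%ld\n", num, den);	/* close to 1/4 */
	return 0;
}

Feeding it a 3:1 event mix prints fractions close to 3/4 and 1/4, and the
norm() loop runs at most shift iterations no matter how long a local counter
went unused.
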
Index: linux-2.6/include/linux/proportions.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/include/linux/proportions.h	2007-05-10 11:03:01.000000000 +0200
@@ -0,0 +1,82 @@
+/*
+ * Floating proportions
+ *
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
+ *
+ * This file contains the public data structure and API definitions.
+ */
+
+#ifndef _LINUX_PROPORTIONS_H
+#define _LINUX_PROPORTIONS_H
+
+#include <linux/percpu_counter.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+
+struct prop_global {
+	/*
+	 * The period over which we differentiate (in pages)
+	 *
+	 *   period = 2^shift
+	 */
+	int shift;
+	/*
+	 * The total page writeback completion counter aka 'time'.
+	 *
+	 * Treated as an unsigned long; the lower 'shift - 1' bits are the
+	 * counter bits, the remaining upper bits the period counter.
+	 */
+	struct percpu_counter events;
+};
+
+/*
+ * global proportion descriptor
+ *
+ * this is needed to consistently flip prop_global structures.
+ */
+struct prop_descriptor {
+	int index;
+	struct prop_global pg[2];
+	struct mutex mutex;
+};
+
+void prop_descriptor_init(struct prop_descriptor *pd, int shift);
+void prop_change_shift(struct prop_descriptor *pd, int new_shift);
+struct prop_global *prop_get_global(struct prop_descriptor *pd);
+void prop_put_global(struct prop_descriptor *pd, struct prop_global *pg);
+
+struct prop_local {
+	/*
+	 * the local events counter
+	 */
+	struct percpu_counter events;
+
+	/*
+	 * snapshot of the last seen global state
+	 * and a lock protecting this state
+	 */
+	int shift;
+	unsigned long period;
+	spinlock_t lock;
+};
+
+void prop_local_init(struct prop_local *pl);
+void prop_local_destroy(struct prop_local *pl);
+
+void prop_norm(struct prop_global *pg, struct prop_local *pl);
+
+/*
+ * ++x_{j}, ++t
+ */
+static inline
+void __prop_inc(struct prop_global *pg, struct prop_local *pl)
+{
+	prop_norm(pg, pl);
+	percpu_counter_mod(&pl->events, 1);
+	percpu_counter_mod(&pg->events, 1);
+}
+
+void prop_fraction(struct prop_global *pg, struct prop_local *pl,
+		long *numerator, long *denominator);
+
+#endif /* _LINUX_PROPORTIONS_H */
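
The descriptor's pg[2] pair is a read-mostly double buffer: prop_change_shift()
rewrites the inactive copy and flips the index, prop_get_global() dereferences
the live copy under rcu_read_lock(). Below is a rough userspace model of just
that publish/read pairing, with C11 acquire/release standing in for the
smp_wmb()/smp_rmb() pair; the synchronize_rcu() that keeps the old copy alive
for in-flight readers is not modelled:

#include <stdatomic.h>
#include <stdio.h>

struct global {
	int shift;
	unsigned long events;
};

static struct global pg[2];
static _Atomic int pg_index;

/* reader side: the acquire pairs with the release in change_shift() */
static struct global *get_global(void)
{
	return &pg[atomic_load_explicit(&pg_index, memory_order_acquire)];
}

/* writer side: rewrite the inactive copy, then publish it */
static void change_shift(int new_shift)
{
	int cur = atomic_load_explicit(&pg_index, memory_order_relaxed);
	int idx = cur ^ 1;
	int offset = pg[cur].shift - new_shift;

	pg[idx].shift = new_shift;
	/* rescale the event count to the new bit layout */
	pg[idx].events = offset < 0 ? pg[cur].events << -offset
				    : pg[cur].events >> offset;

	/* everything above must be visible before the index flips */
	atomic_store_explicit(&pg_index, idx, memory_order_release);
}

int main(void)
{
	pg[0].shift = 10;
	pg[0].events = 0x1234;

	change_shift(12);	/* grow the period: events <<= 2 */
	printf("shift=%d events=%#lx\n",
	       get_global()->shift, get_global()->events);
	return 0;
}

Growing shift from 10 to 12 shifts the stored event count left by two, keeping
the value consistent with the new split between counter bits and period bits.
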
Index: linux-2.6/mm/backing-dev.c
===================================================================
--- linux-2.6.orig/mm/backing-dev.c	2007-05-10 10:52:24.000000000 +0200
+++ linux-2.6/mm/backing-dev.c	2007-05-10 11:03:01.000000000 +0200
@@ -12,9 +12,8 @@ void bdi_init(struct backing_dev_info *b
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
 		percpu_counter_init(&bdi->bdi_stat[i], 0);

-	spin_lock_init(&bdi->lock);
-	bdi->period = 0;
 	bdi->dirty_exceeded = 0;
+	prop_local_init(&bdi->completions);
 }
 EXPORT_SYMBOL(bdi_init);

@@ -25,6 +24,8 @@ void bdi_destroy(struct backing_dev_info
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
 		percpu_counter_destroy(&bdi->bdi_stat[i]);
+
+	prop_local_destroy(&bdi->completions);
 }
 EXPORT_SYMBOL(bdi_destroy);

Index: linux-2.6/lib/Makefile
===================================================================
--- linux-2.6.orig/lib/Makefile	2007-05-10 10:51:35.000000000 +0200
+++ linux-2.6/lib/Makefile	2007-05-10 11:03:01.000000000 +0200
@@ -5,7 +5,7 @@
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
	 rbtree.o radix-tree.o dump_stack.o \
	 idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \
-	 sha1.o irq_regs.o reciprocal_div.o
+	 sha1.o irq_regs.o reciprocal_div.o proportions.o
 lib-$(CONFIG_MMU)	+= ioremap.o pagewalk.o
 lib-$(CONFIG_SMP)	+= cpumask.o