Subject: writeback: async write IO controllers
Date: Fri Mar 04 10:38:04 CST 2011

- a bare per-task async write IO controller
- a bare per-cgroup async write IO controller

XXX: the per-task user interface is reusing RLIMIT_RSS for now.
XXX: the per-cgroup user interface is missing

CC: Vivek Goyal
CC: Andrea Righi
Signed-off-by: Wu Fengguang
---
 block/blk-cgroup.c         |    2
 include/linux/blk-cgroup.h |    4 +
 mm/page-writeback.c        |   86 +++++++++++++++++++++++++++++++----
 3 files changed, 84 insertions(+), 8 deletions(-)

--- linux-next.orig/mm/page-writeback.c	2011-04-05 01:26:38.000000000 +0800
+++ linux-next/mm/page-writeback.c	2011-04-05 01:26:53.000000000 +0800
@@ -1117,6 +1117,49 @@ static unsigned long max_pause(struct ba
 	return clamp_val(t, MIN_PAUSE, MAX_PAUSE);
 }
 
+static void blkcg_update_throttle_bandwidth(struct blkio_cgroup *blkcg,
+					    unsigned long dirtied,
+					    unsigned long elapsed)
+{
+	unsigned long bw = blkcg->throttle_bandwidth;
+	unsigned long long ref_bw;
+	unsigned long dirty_bw;
+	/* Move throttle_bandwidth halfway toward bw * (target rate / measured dirty rate). */
+	ref_bw = blkcg->async_write_bps >> (3 + PAGE_SHIFT - RATIO_SHIFT);	/* target scaled up by RATIO_SHIFT; presumably bits/s turned into pages/s, TODO confirm */
+	dirty_bw = ((dirtied - blkcg->dirtied_stamp)*HZ + elapsed/2) / elapsed;	/* amount dirtied per second since the last stamp, rounded to nearest */
+	do_div(ref_bw, dirty_bw | 1);	/* | 1 keeps the divisor non-zero */
+	ref_bw = bw * ref_bw >> RATIO_SHIFT;	/* drop the RATIO_SHIFT scaling again */
+	/* NOTE(review): once bw is 0, ref_bw is 0 as well and the average below stays 0 - confirm this cannot happen */
+	blkcg->throttle_bandwidth = (bw + ref_bw) / 2;
+}
+/* Refresh blkcg->throttle_bandwidth; returns early for a NULL blkcg or when blkcg->lock cannot be taken. */
+void blkcg_update_bandwidth(struct blkio_cgroup *blkcg)
+{
+	unsigned long now = jiffies;
+	unsigned long dirtied;
+	unsigned long elapsed;
+
+	if (!blkcg)
+		return;
+	if (!spin_trylock(&blkcg->lock))	/* do not wait for the lock: skip this update instead */
+		return;
+
+	elapsed = now - blkcg->bw_time_stamp;
+	dirtied = percpu_counter_read(&blkcg->nr_dirtied);
+
+	if (elapsed > MAX_PAUSE * 2)	/* interval too long: only restart the sampling window */
+		goto snapshot;
+	if (elapsed <= MAX_PAUSE)	/* interval too short: leave both stamps untouched */
+		goto unlock;
+
+	blkcg_update_throttle_bandwidth(blkcg, dirtied, elapsed);
+snapshot:
+	blkcg->dirtied_stamp = dirtied;
+	blkcg->bw_time_stamp = now;
+unlock:
+	spin_unlock(&blkcg->lock);
+}
+
 /*
  * balance_dirty_pages() must be called by processes which are generating dirty
  * data.  It looks at the number of dirty pages in the machine and will force
@@ -1139,6 +1182,10 @@ static void balance_dirty_pages(struct a
 	unsigned long pause_max;
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	unsigned long start_time = jiffies;
+	struct blkio_cgroup *blkcg = task_to_blkio_cgroup(current);
+
+	if (blkcg == &blkio_root_cgroup)
+		blkcg = NULL;
 
 	for (;;) {
 		unsigned long now = jiffies;
@@ -1178,6 +1225,15 @@ static void balance_dirty_pages(struct a
 		 * when the bdi limits are ramping up.
 		 */
 		if (nr_dirty <= (background_thresh + dirty_thresh) / 2) {
+			if (blkcg) {	/* the cgroup limit is still enforced below the dirty threshold */
+				pause_max = max_pause(bdi, 0);
+				goto cgroup_ioc;
+			}
+			if (current->signal->rlim[RLIMIT_RSS].rlim_cur !=
+			    RLIM_INFINITY) {	/* XXX: RLIMIT_RSS is reused as the per-task write limit for now */
+				pause_max = max_pause(bdi, 0);
+				goto task_ioc;
+			}
 			current->paused_when = now;
 			current->nr_dirtied = 0;
 			break;
@@ -1190,21 +1246,35 @@ static void balance_dirty_pages(struct a
 			bdi_start_background_writeback(bdi);
 
 		pause_max = max_pause(bdi, bdi_dirty);
-
 		base_bw = bdi->throttle_bandwidth;
-		/*
-		 * Double the bandwidth for PF_LESS_THROTTLE (ie. nfsd) and
-		 * real-time tasks.
-		 */
-		if (current->flags & PF_LESS_THROTTLE || rt_task(current))
-			base_bw *= 2;
 		bw = position_ratio(bdi, dirty_thresh, nr_dirty, bdi_dirty);
 		if (unlikely(bw == 0)) {
 			period = pause_max;
 			pause = pause_max;
 			goto pause;
 		}
-		bw = base_bw * (u64)bw >> RATIO_SHIFT;
+		bw = (u64)base_bw * bw >> RATIO_SHIFT;
+		if (blkcg && bw > blkcg->throttle_bandwidth) {	/* cap bw at the cgroup limit */
+cgroup_ioc:	/* also entered by goto from the nr_dirty <= threshold branch */
+			blkcg_update_bandwidth(blkcg);
+			bw = blkcg->throttle_bandwidth;
+			base_bw = bw;
+		}
+		if (bw > current->signal->rlim[RLIMIT_RSS].rlim_cur >>
+								PAGE_SHIFT) {	/* cap bw at the per-task limit */
+task_ioc:	/* also entered by goto from the nr_dirty <= threshold branch */
+			bw = current->signal->rlim[RLIMIT_RSS].rlim_cur >>
+								PAGE_SHIFT;
+			base_bw = bw;
+		}
+		/*
+		 * Double the bandwidth for PF_LESS_THROTTLE (ie. nfsd) and
+		 * real-time tasks.
+		 */
+		if (current->flags & PF_LESS_THROTTLE || rt_task(current)) {
+			bw *= 2;	/* now applied after the cgroup and task caps */
+			base_bw = bw;
+		}
 		period = (HZ * pages_dirtied + bw / 2) / (bw | 1);
 		pause = current->paused_when + period - now;
 		/*
--- linux-next.orig/block/blk-cgroup.c	2011-04-05 01:26:38.000000000 +0800
+++ linux-next/block/blk-cgroup.c	2011-04-05 01:26:39.000000000 +0800
@@ -1486,6 +1486,8 @@ done:
 
 	INIT_LIST_HEAD(&blkcg->policy_list);
 	percpu_counter_init(&blkcg->nr_dirtied, 0);
+	blkcg->async_write_bps = 16 << 23; /* XXX: tunable interface */
+	blkcg->throttle_bandwidth = 16 << (20 - PAGE_SHIFT);	/* initial value: 16 << 20 shifted down by PAGE_SHIFT */
 	return &blkcg->css;
 }
 
--- linux-next.orig/include/linux/blk-cgroup.h	2011-04-05 01:26:38.000000000 +0800
+++ linux-next/include/linux/blk-cgroup.h	2011-04-05 01:26:39.000000000 +0800
@@ -112,6 +112,10 @@ struct blkio_cgroup {
 	struct hlist_head blkg_list;
 	struct list_head policy_list; /* list of blkio_policy_node */
 	struct percpu_counter nr_dirtied;
+	unsigned long bw_time_stamp;	/* jiffies value stored by blkcg_update_bandwidth() at its last snapshot */
+	unsigned long dirtied_stamp;	/* nr_dirtied reading stored at that same snapshot */
+	unsigned long throttle_bandwidth;	/* limit compared against bw in balance_dirty_pages() */
+	unsigned long async_write_bps;	/* configured target rate; XXX no user interface yet */
 };
 
 struct blkio_group_stats {