[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <48598CB7.2000507@gmail.com>
Date: Thu, 19 Jun 2008 00:31:19 +0200
From: Andrea Righi <righi.andrea@...il.com>
To: Carl Henrik Lunde <chlunde@...g.uio.no>
CC: balbir@...ux.vnet.ibm.com, menage@...gle.com, matt@...ehost.com,
roberto@...it.it, randy.dunlap@...cle.com,
akpm@...ux-foundation.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH 2/3] i/o bandwidth controller infrastructure
Carl Henrik Lunde wrote:
> On Sat, Jun 07, 2008 at 12:27:29AM +0200, Andrea Righi wrote:
>> This is the core io-throttle kernel infrastructure. It creates the basic
>> interfaces to cgroups and implements the I/O measurement and throttling
>> functions.
> [...]
>> +void cgroup_io_account(struct block_device *bdev, size_t bytes)
> [...]
>> + /* Account the I/O activity */
>> + node->req += bytes;
>> +
>> + /* Evaluate if we need to throttle the current process */
>> + delta = (long)jiffies - (long)node->last_request;
>> + if (!delta)
>> + goto out;
>> +
>> + t = msecs_to_jiffies(node->req / node->iorate);
>> + if (!t)
>> + goto out;
>> +
>> + sleep = t - delta;
>> + if (unlikely(sleep > 0)) {
>> + spin_unlock_irq(&iot->lock);
>> + if (__cant_sleep())
>> + return;
>> + pr_debug("io-throttle: task %p (%s) must sleep %lu jiffies\n",
>> + current, current->comm, sleep);
>> + schedule_timeout_killable(sleep);
>> + return;
>> + }
>> +
>> + /* Reset I/O accounting */
>> + node->req = 0;
>> + node->last_request = jiffies;
> [...]
>
> Did you consider using token bucket instead of this (leaky bucket?)?
>
> I've attached a patch which implements token bucket. Although not as
> precise as the leaky bucket the performance is better at high bandwidth
> streaming loads.
Interesting! it could be great to have both available at runtime and
just switch between leaky or token bucket, e.g. by echo-ing "leaky" or
"token" to a file in the cgroup filesystem, ummm, block.limiting-algorithm?
>
> The leaky bucket stops at around 53 MB/s while token bucket works for
> up to 64 MB/s. The baseline (no cgroups) is 66 MB/s.
>
> benchmark:
> two streaming readers (fio) with block size 128k, bucket size 4 MB
> 90% of the bandwidth was allocated to one process, the other gets 10%
Thanks for posting the results, I'll look closely at your patch, test
it as well and try to merge your work.
I also did some improvements in v2 in terms of scalability, in
particular I've replaced the rbtree with a linked list, in order to
remove the spinlocks and replace them by RCU to protect the list
structure. I need to do some stress tests first, but I'll post a v3
soon.
Some minor comments below for now.
> diff --git a/block/blk-io-throttle.c b/block/blk-io-throttle.c
> index 804df88..9ed0c7c 100644
> --- a/block/blk-io-throttle.c
> +++ b/block/blk-io-throttle.c
> @@ -40,7 +40,8 @@ struct iothrottle_node {
> struct rb_node node;
> dev_t dev;
> unsigned long iorate;
> - unsigned long req;
> + long bucket_size; /* Max value for t */
> + long t;
> unsigned long last_request;
> };
>
> @@ -180,18 +181,20 @@ static ssize_t iothrottle_read(struct cgroup *cont,
> iothrottle_for_each(n, &iot->tree) {
> struct iothrottle_node *node =
> rb_entry(n, struct iothrottle_node, node);
> - unsigned long delta = (long)jiffies - (long)node->last_request;
> + unsigned long delta = (((long)jiffies - (long)node->last_request) * 1000) / HZ;
Better to use jiffies_to_msecs() here.
>
> BUG_ON(!node->dev);
> s += snprintf(s, nbytes - (s - buffer),
> "=== device (%u,%u) ===\n"
> "bandwidth-max: %lu KiB/sec\n"
> - " requested: %lu bytes\n"
> - " last request: %lu jiffies\n"
> - " delta: %lu jiffies\n",
> + "bucket size : %ld KiB\n"
> + "bucket fill : %ld KiB (after last request)\n"
> + "last request : %lu ms ago\n",
> MAJOR(node->dev), MINOR(node->dev),
> - node->iorate, node->req,
> - node->last_request, delta);
> + node->iorate,
> + node->bucket_size / 1024,
> + node->t / 1024,
> + delta);
> }
> spin_unlock_irq(&iot->lock);
> buffer[nbytes] = '\0';
> @@ -220,21 +223,33 @@ static inline dev_t devname2dev_t(const char *buf)
> return ret;
> }
>
> -static inline int iothrottle_parse_args(char *buf, size_t nbytes,
> - dev_t *dev, unsigned long *val)
> +static inline int iothrottle_parse_args(char *buf, size_t nbytes, dev_t *dev,
> + unsigned long *iorate,
> + unsigned long *bucket_size)
> {
> - char *p;
> + char *ioratep, *bucket_sizep;
>
> - p = memchr(buf, ':', nbytes);
> - if (!p)
> + ioratep = memchr(buf, ':', nbytes);
> + if (!ioratep)
> return -EINVAL;
> - *p++ = '\0';
> + *ioratep++ = '\0';
> +
> + bucket_sizep = memchr(ioratep, ':', nbytes + ioratep - buf);
> + if (!bucket_sizep)
> + return -EINVAL;
> + *bucket_sizep++ = '\0';
>
> *dev = devname2dev_t(buf);
> if (!*dev)
> return -ENOTBLK;
>
> - return strict_strtoul(p, 10, val);
> + if (strict_strtoul(ioratep, 10, iorate))
> + return -EINVAL;
> +
> + if (strict_strtoul(bucket_sizep, 10, bucket_size))
> + return -EINVAL;
> +
> + return 0;
> }
>
> static ssize_t iothrottle_write(struct cgroup *cont,
> @@ -247,7 +262,7 @@ static ssize_t iothrottle_write(struct cgroup *cont,
> struct iothrottle_node *node, *tmpn = NULL;
> char *buffer, *tmpp;
> dev_t dev;
> - unsigned long val;
> + unsigned long iorate, bucket_size;
> int ret;
>
> if (unlikely(!nbytes))
> @@ -265,7 +280,7 @@ static ssize_t iothrottle_write(struct cgroup *cont,
> buffer[nbytes] = '\0';
> tmpp = strstrip(buffer);
>
> - ret = iothrottle_parse_args(tmpp, nbytes, &dev, &val);
> + ret = iothrottle_parse_args(tmpp, nbytes, &dev, &iorate, &bucket_size);
> if (ret)
> goto out1;
>
> @@ -284,7 +299,7 @@ static ssize_t iothrottle_write(struct cgroup *cont,
> iot = cgroup_to_iothrottle(cont);
>
> spin_lock_irq(&iot->lock);
> - if (!val) {
> + if (!iorate) {
> /* Delete a block device limiting rule */
> iothrottle_delete_node(iot, dev);
> ret = nbytes;
> @@ -293,8 +308,9 @@ static ssize_t iothrottle_write(struct cgroup *cont,
> node = iothrottle_search_node(iot, dev);
> if (node) {
> /* Update a block device limiting rule */
> - node->iorate = val;
> - node->req = 0;
> + node->iorate = iorate;
> + node->bucket_size = bucket_size * 1024;
> + node->t = 0;
> node->last_request = jiffies;
> ret = nbytes;
> goto out3;
> @@ -307,8 +323,9 @@ static ssize_t iothrottle_write(struct cgroup *cont,
> node = tmpn;
> tmpn = NULL;
>
> - node->iorate = val;
> - node->req = 0;
> + node->iorate = iorate;
> + node->bucket_size = bucket_size * 1024;
> + node->t = 0;
> node->last_request = jiffies;
> node->dev = dev;
> ret = iothrottle_insert_node(iot, node);
> @@ -355,7 +372,7 @@ void cgroup_io_account(struct block_device *bdev, size_t bytes)
> {
> struct iothrottle *iot;
> struct iothrottle_node *node;
> - unsigned long delta, t;
> + unsigned long delta;
> long sleep;
>
> if (unlikely(!bdev))
> @@ -370,36 +387,37 @@ void cgroup_io_account(struct block_device *bdev, size_t bytes)
> spin_lock_irq(&iot->lock);
>
> node = iothrottle_search_node(iot, bdev->bd_inode->i_rdev);
> - if (!node || !node->iorate)
> - goto out;
> -
> - /* Account the I/O activity */
> - node->req += bytes;
> + if (!node || !node->iorate) {
> + spin_unlock_irq(&iot->lock);
> + return;
> + }
>
> - /* Evaluate if we need to throttle the current process */
> + /* Add tokens for time elapsed since last read */
> delta = (long)jiffies - (long)node->last_request;
> - if (!delta)
> - goto out;
> + if (delta) {
> + node->last_request = jiffies;
> + node->t += (node->iorate * 1024 * delta) / HZ;
The same here:
	node->t += node->iorate * 1024
		* jiffies_to_msecs(delta) / MSEC_PER_SEC;
>
> - t = msecs_to_jiffies(node->req / node->iorate);
> - if (!t)
> - goto out;
> + if (node->t > node->bucket_size)
> + node->t = node->bucket_size;
> + }
>
> - sleep = t - delta;
> - if (unlikely(sleep > 0)) {
> - spin_unlock_irq(&iot->lock);
> - if (__cant_sleep())
> - return;
> - pr_debug("io-throttle: task %p (%s) must sleep %lu jiffies\n",
> - current, current->comm, sleep);
> - schedule_timeout_killable(sleep);
> - return;
> + /* Account the I/O activity */
> + node->t -= bytes;
> +
> + if (node->t < 0) {
> + sleep = (-node->t) * HZ / (node->iorate * 1024);
And again:
	sleep = msecs_to_jiffies(-node->t / (node->iorate * 1024)
		* MSEC_PER_SEC);
> + } else {
> + sleep = 0;
> }
>
> - /* Reset I/O accounting */
> - node->req = 0;
> - node->last_request = jiffies;
> -out:
> spin_unlock_irq(&iot->lock);
> +
> + if (sleep && !__cant_sleep()) {
> + pr_debug("io-throttle: %s[%d] must sleep %ld jiffies\n",
> + current->comm, current->pid, sleep);
> +
> + schedule_timeout_killable(sleep);
> + }
> }
> EXPORT_SYMBOL(cgroup_io_account);
Thanks,
-Andrea
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists