Message-ID: <6599ad830809151328n98dcffdxee0977e781b138cd@mail.gmail.com>
Date: Mon, 15 Sep 2008 13:28:22 -0700
From: "Paul Menage" <menage@...gle.com>
To: "Lai Jiangshan" <laijs@...fujitsu.com>
Cc: "Andrew Morton" <akpm@...ux-foundation.org>,
"Linux Kernel Mailing List" <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH -mm 2/2] cgroup: use multibuf for tasks file
On Fri, Sep 12, 2008 at 4:55 AM, Lai Jiangshan <laijs@...fujitsu.com> wrote:
>
> When we open a really large cgroup for read, the open may fail,
> because kmalloc() is not reliable for allocating a big buffer.
This still depends on the answer to whether plain vmalloc() adds too
much overhead.
Balbir pointed out to me that most cgroups are likely to be pretty
small - so the solution of just doing a kmalloc() for 8K or less, and
a vmalloc() for more than 8K (which is >2000 threads) will avoid the
vmalloc overhead in almost all cases; the question is whether
eliminating the remaining overhead is worth the extra complexity.
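For concreteness, here's a minimal sketch of that hybrid scheme (the
helper names are hypothetical, not from the patch; the 8K cutoff is
the one discussed above, i.e. 2048 pids with a 4-byte pid_t):

#include <linux/slab.h>
#include <linux/types.h>
#include <linux/vmalloc.h>

/*
 * Illustrative only: kmalloc() the pid array when it fits in 8K and
 * fall back to vmalloc() beyond that, so the vmalloc overhead is
 * paid only by huge cgroups.
 */
static pid_t *pidarray_alloc(int npids)
{
	size_t size = npids * sizeof(pid_t);

	if (size <= 8192)
		return kmalloc(size, GFP_KERNEL);
	return vmalloc(size);
}

static void pidarray_free(pid_t *pids, int npids)
{
	if ((size_t)npids * sizeof(pid_t) <= 8192)
		kfree(pids);
	else
		vfree(pids);
}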
Paul
>
> This patch uses a multi-buffer for the tasks file: each buffer is a
> page, except when we need only a small buffer.
>
> We use obj_sort() to sort the pids, so we don't need to map the
> pages into a contiguous memory region.
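A note on the layout: each page holds pid_per_page pids, so a flat
index i maps to page i / pid_per_page and slot i % pid_per_page,
which is exactly what getpidofmbuf() below computes. obj_sort(),
presumably introduced by patch 1/2 of this series, sorts through
index-based compare/swap callbacks, so the pages never need to be
virtually contiguous. A standalone userspace sketch of the mapping,
illustrative only:

#include <stdio.h>

#define PAGE_SIZE 4096	/* assume 4K pages for the example */

static const int pid_per_page = PAGE_SIZE / sizeof(int); /* 1024 */

int main(void)
{
	int i = 2500;

	/* flat index 2500 -> page 2 (2500 / 1024), slot 452 (2500 % 1024) */
	printf("index %d -> page %d, slot %d\n",
	       i, i / pid_per_page, i % pid_per_page);
	return 0;
}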
>
> Signed-off-by: Lai Jiangshan <laijs@...fujitsu.com>
> ---
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index bb298de..3d3c3bb 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -141,8 +141,8 @@ struct cgroup {
>
> /* pids_mutex protects the fields below */
> struct rw_semaphore pids_mutex;
> - /* Array of process ids in the cgroup */
> - pid_t *tasks_pids;
> + /* Multi-array of process ids in the cgroup */
> + const pid_t *const *tasks_pids;
> /* How many files are using the current tasks_pids array */
> int pids_use_count;
> /* Length of the current tasks_pids array */
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 996865a..f61b152 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -2004,6 +2004,8 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
> *
> */
>
> +static const int pid_per_page = PAGE_SIZE / sizeof(pid_t);
> +
> /*
> * Load into 'pidarray' up to 'npids' of the tasks using cgroup
> * 'cgrp'. Return actual number of pids loaded. No need to
> @@ -2011,16 +2013,22 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
> * read section, so the css_set can't go away, and is
> * immutable after creation.
> */
> -static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
> +static int pid_array_load(pid_t **pidarray, int npids, struct cgroup *cgrp)
> {
> - int n = 0;
> + int n = 0, i = 0, j = 0;
> struct cgroup_iter it;
> struct task_struct *tsk;
> cgroup_iter_start(cgrp, &it);
> while ((tsk = cgroup_iter_next(cgrp, &it))) {
> if (unlikely(n == npids))
> break;
> - pidarray[n++] = task_pid_vnr(tsk);
> + pidarray[i][j] = task_pid_vnr(tsk);
> + n++;
> + j++;
> + if (j == pid_per_page) {
> + i++;
> + j = 0;
> + }
> }
> cgroup_iter_end(cgrp, &it);
> return n;
> @@ -2079,11 +2087,27 @@ err:
> return ret;
> }
>
> -static int cmppid(const void *a, const void *b)
> +static inline pid_t getpidofmbuf(const pid_t *const *multibuf, int index)
> +{
> + return multibuf[index / pid_per_page][index % pid_per_page];
> +}
> +
> +static int cmppid(const void *c, size_t left, size_t right)
> {
> - return *(pid_t *)a - *(pid_t *)b;
> + return getpidofmbuf(c, left) - getpidofmbuf(c, right);
> }
>
> +static inline pid_t *getpidptr(pid_t *const *multibuf, int index)
> +{
> + return &multibuf[index / pid_per_page][index % pid_per_page];
> +}
> +
> +static void swappid(void *c, size_t left, size_t right)
> +{
> + pid_t rpid = getpidofmbuf(c, right);
> + *getpidptr(c, right) = getpidofmbuf(c, left);
> + *getpidptr(c, left) = rpid;
> +}
>
> /*
> * seq_file methods for the "tasks" file. The seq_file position is the
> @@ -2100,19 +2124,19 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
> * next pid to display, if any
> */
> struct cgroup *cgrp = s->private;
> - int index = 0, pid = *pos;
> - int *iter;
> + int index = 0;
> + pid_t pid = *pos;
>
> down_read(&cgrp->pids_mutex);
> if (pid) {
> int end = cgrp->pids_length;
> - int i;
> while (index < end) {
> int mid = (index + end) / 2;
> - if (cgrp->tasks_pids[mid] == pid) {
> + pid_t mpid = getpidofmbuf(cgrp->tasks_pids, mid);
> + if (mpid == pid) {
> index = mid;
> break;
> - } else if (cgrp->tasks_pids[mid] <= pid)
> + } else if (mpid <= pid)
> index = mid + 1;
> else
> end = mid;
> @@ -2122,9 +2146,8 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
> if (index >= cgrp->pids_length)
> return NULL;
> /* Update the abstract position to be the actual pid that we found */
> - iter = cgrp->tasks_pids + index;
> - *pos = *iter;
> - return iter;
> + *pos = getpidofmbuf(cgrp->tasks_pids, index);
> + return (void *)(index ^ -0x10000); /* we cannot return 0 */
> }
>
> static void cgroup_tasks_stop(struct seq_file *s, void *v)
> @@ -2136,25 +2159,26 @@ static void cgroup_tasks_stop(struct seq_file *s, void *v)
> static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
> {
> struct cgroup *cgrp = s->private;
> - int *p = v;
> - int *end = cgrp->tasks_pids + cgrp->pids_length;
> + int index = (int)v ^ -0x10000; /* decode the cookie back to an index */
>
> /*
> * Advance to the next pid in the array. If this goes off the
> * end, we're done
> */
> - p++;
> - if (p >= end) {
> + index++;
> + if (index >= cgrp->pids_length) {
> return NULL;
> } else {
> - *pos = *p;
> - return p;
> + *pos = getpidofmbuf(cgrp->tasks_pids, index);
> + return (void *)(index ^ -0x10000); /* we cannot return 0 */
> }
> }
>
> static int cgroup_tasks_show(struct seq_file *s, void *v)
> {
> - return seq_printf(s, "%d\n", *(int *)v);
> + struct cgroup *cgrp = s->private;
> + int index = (int)v ^ -0x10000; /* decode the cookie back to an index */
> + return seq_printf(s, "%d\n", getpidofmbuf(cgrp->tasks_pids, index));
> }
>
> static struct seq_operations cgroup_tasks_seq_operations = {
> @@ -2164,12 +2188,60 @@ static struct seq_operations cgroup_tasks_seq_operations = {
> .show = cgroup_tasks_show,
> };
>
> +static void *alloc_multibufs(size_t npids)
> +{
> + int i, j, npages = (npids + pid_per_page - 1) / pid_per_page;
> + unsigned long *pages;
> +
> + if (npids <= pid_per_page - sizeof(pid_t *) / sizeof(pid_t)) {
> + void *pids = kmalloc(sizeof(pid_t *) + sizeof(pid_t) * npids,
> + GFP_KERNEL);
> + if (!pids)
> + return NULL;
> + /* make the single buffer masquerade as a multi-buffer */
> + *(void **)pids = pids + sizeof(pid_t *);
> + return pids;
> + }
> +
> + pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
> + if (!pages)
> + return NULL;
> +
> + for (i = 0; i < npages; i++) {
> + pages[i] = __get_free_page(GFP_KERNEL);
> + if (unlikely(!pages[i]))
> + goto depopulate;
> + }
> + return pages;
> +
> +depopulate:
> + for (j = 0; j < i; j++)
> + free_page(pages[j]);
> + kfree(pages);
> + return NULL;
> +}
> +
> +static void free_multibufs(void *ptr, size_t npids)
> +{
> + if (!ptr)
> + return;
> +
> + if (npids > pid_per_page - sizeof(pid_t *) / sizeof(pid_t)) {
> + int i, npages = (npids + pid_per_page - 1) / pid_per_page;
> + unsigned long *pages = ptr;
> + for (i = 0; i < npages; i++)
> + free_page(pages[i]);
> + }
> +
> + kfree(ptr);
> +}
> +
> static void release_cgroup_pid_array(struct cgroup *cgrp)
> {
> down_write(&cgrp->pids_mutex);
> BUG_ON(!cgrp->pids_use_count);
> if (!--cgrp->pids_use_count) {
> - kfree(cgrp->tasks_pids);
> + free_multibufs((void *)cgrp->tasks_pids, cgrp->pids_length);
> cgrp->tasks_pids = NULL;
> cgrp->pids_length = 0;
> }
> @@ -2202,7 +2274,7 @@ static struct file_operations cgroup_tasks_operations = {
> static int cgroup_tasks_open(struct inode *unused, struct file *file)
> {
> struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
> - pid_t *pidarray;
> + pid_t **pidarray;
> int npids;
> int retval;
>
> @@ -2217,19 +2289,19 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
> * show up until sometime later on.
> */
> npids = cgroup_task_count(cgrp);
> - pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
> + pidarray = alloc_multibufs(npids);
> if (!pidarray)
> return -ENOMEM;
> npids = pid_array_load(pidarray, npids, cgrp);
> - sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
> + obj_sort(pidarray, 0, npids, cmppid, swappid);
>
> /*
> * Store the array in the cgroup, freeing the old
> * array if necessary
> */
> down_write(&cgrp->pids_mutex);
> - kfree(cgrp->tasks_pids);
> - cgrp->tasks_pids = pidarray;
> + free_multibufs((void *)cgrp->tasks_pids, cgrp->pids_length);
> + cgrp->tasks_pids = (const pid_t *const *)pidarray;
> cgrp->pids_length = npids;
> cgrp->pids_use_count++;
> up_write(&cgrp->pids_mutex);
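One more detail worth spelling out: the seq_file cookie above relies
on XOR with -0x10000 being its own inverse, and for any non-negative
index the result is negative, hence never NULL. A standalone
userspace check, illustrative only:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	int i;

	for (i = 0; i < 100000; i++) {
		/* encode as in cgroup_tasks_start(): never 0 for i >= 0 */
		void *cookie = (void *)(long)(i ^ -0x10000);
		int back;

		assert(cookie != NULL);
		/* decode as in cgroup_tasks_next(): XOR is involutive */
		back = (int)(long)cookie ^ -0x10000;
		assert(back == i);
	}
	printf("cookie encoding round-trips\n");
	return 0;
}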