Message-ID: <48CF0DC4.8030402@cn.fujitsu.com>
Date: Tue, 16 Sep 2008 09:37:08 +0800
From: Lai Jiangshan <laijs@...fujitsu.com>
To: Paul Menage <menage@...gle.com>
CC: Andrew Morton <akpm@...ux-foundation.org>,
Linux Kernel Mailing List <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH -mm 2/2] cgroup: use multibuf for tasks file
Paul Menage wrote:
> On Fri, Sep 12, 2008 at 4:55 AM, Lai Jiangshan <laijs@...fujitsu.com> wrote:
>> when we open a really large cgroup for read, we may fail, since
>> kmalloc() is not reliable for allocating a big buffer.
>
> This still depends on an answer to whether using plain vmalloc is too
> much overhead.
>
> Balbir pointed out to me that most cgroups are likely to be pretty
> small - so the solution of just doing a kmalloc() for 8K or less, and
> a vmalloc() for more than 8K (which is >2000 threads) will avoid the
> vmalloc overhead in almost all cases; the question is whether
> eliminating the remaining overhead is worth the extra complexity.
>
I think opening cgroup.tasks for read is not a critical path,
so using plain vmalloc() (or even higher-overhead functions) is worth it.
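
For comparison, the hybrid Paul describes could look roughly like the
sketch below. This is illustrative only: the helper names are mine, and
the ~8K cutoff (2 pages on 4K-page systems, i.e. >2000 threads) comes
from the discussion above, not from any posted patch.

	/* Sketch only; needs <linux/slab.h> and <linux/vmalloc.h>.
	 * kmalloc() for small pid arrays, vmalloc() otherwise.
	 */
	static void *pid_array_alloc(size_t bytes)
	{
		if (bytes <= 2 * PAGE_SIZE)
			return kmalloc(bytes, GFP_KERNEL);
		return vmalloc(bytes);
	}

	static void pid_array_free(void *p, size_t bytes)
	{
		if (bytes <= 2 * PAGE_SIZE)
			kfree(p);
		else
			vfree(p);
	}
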
>
>> the patch uses a multibuf for the tasks file; every buf is a page,
>> except when we need only a small buffer.
>>
>> we use obj_sort() to sort these pids, so we don't need to map these
>> pages to a contiguous memory region.
>>
>> Signed-off-by: Lai Jiangshan <laijs@...fujitsu.com>
>> ---
>> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
>> index bb298de..3d3c3bb 100644
>> --- a/include/linux/cgroup.h
>> +++ b/include/linux/cgroup.h
>> @@ -141,8 +141,8 @@ struct cgroup {
>>
>> /* pids_mutex protects the fields below */
>> struct rw_semaphore pids_mutex;
>> - /* Array of process ids in the cgroup */
>> - pid_t *tasks_pids;
>> + /* Multi-array of process ids in the cgroup */
>> + const pid_t *const *tasks_pids;
>> /* How many files are using the current tasks_pids array */
>> int pids_use_count;
>> /* Length of the current tasks_pids array */
>> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
>> index 996865a..f61b152 100644
>> --- a/kernel/cgroup.c
>> +++ b/kernel/cgroup.c
>> @@ -2004,6 +2004,8 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
>> *
>> */
>>
>> +static const int pid_per_page = PAGE_SIZE / sizeof(pid_t);
>> +
>> /*
>> * Load into 'pidarray' up to 'npids' of the tasks using cgroup
>> * 'cgrp'. Return actual number of pids loaded. No need to
>> @@ -2011,16 +2013,22 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
>> * read section, so the css_set can't go away, and is
>> * immutable after creation.
>> */
>> -static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
>> +static int pid_array_load(pid_t **pidarray, int npids, struct cgroup *cgrp)
>> {
>> - int n = 0;
>> + int n = 0, i = 0, j = 0;
>> struct cgroup_iter it;
>> struct task_struct *tsk;
>> cgroup_iter_start(cgrp, &it);
>> while ((tsk = cgroup_iter_next(cgrp, &it))) {
>> if (unlikely(n == npids))
>> break;
>> - pidarray[n++] = task_pid_vnr(tsk);
>> + pidarray[i][j] = task_pid_vnr(tsk);
>> + n++;
>> + j++;
>> + if (j == pid_per_page) {
>> + i++;
>> + j = 0;
>> + }
>> }
>> cgroup_iter_end(cgrp, &it);
>> return n;
>> @@ -2079,11 +2087,27 @@ err:
>> return ret;
>> }
>>
>> -static int cmppid(const void *a, const void *b)
>> +static inline pid_t getpidofmbuf(const pid_t *const *multibuf, int index)
>> +{
>> + return multibuf[index / pid_per_page][index % pid_per_page];
>> +}
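
(getpidofmbuf() flattens the logical index into the two-level layout:
index i lives in page i / pid_per_page, slot i % pid_per_page. With 4K
pages and a 4-byte pid_t, pid_per_page == 1024, so e.g. index 1025 maps
to multibuf[1][1].)
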
>> +
>> +static int cmppid(const void *c, size_t left, size_t right)
>> {
>> - return *(pid_t *)a - *(pid_t *)b;
>> + return getpidofmbuf(c, left) - getpidofmbuf(c, right);
>> }
>>
>> +static inline pid_t *getpidptr(pid_t *const *multibuf, int index)
>> +{
>> + return &multibuf[index / pid_per_page][index % pid_per_page];
>> +}
>> +
>> +static void swappid(void *c, size_t left, size_t right)
>> +{
>> + pid_t rpid = getpidofmbuf(c, right);
>> + *getpidptr(c, right) = getpidofmbuf(c, left);
>> + *getpidptr(c, left) = rpid;
>> +}
>>
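
(For readers who don't have patch 1/2 handy: obj_sort() is an
index-based sort that touches the data only through the caller's
compare and swap callbacks, which is why the pids need not be
contiguous. Its signature, inferred only from the call sites in this
patch, is roughly:

	void obj_sort(void *base, size_t start, size_t end,
		      int (*cmp)(const void *base, size_t l, size_t r),
		      void (*swap)(void *base, size_t l, size_t r));

)
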
>> /*
>> * seq_file methods for the "tasks" file. The seq_file position is the
>> @@ -2100,19 +2124,19 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
>> * next pid to display, if any
>> */
>> struct cgroup *cgrp = s->private;
>> - int index = 0, pid = *pos;
>> - int *iter;
>> + int index = 0;
>> + pid_t pid = *pos;
>>
>> down_read(&cgrp->pids_mutex);
>> if (pid) {
>> int end = cgrp->pids_length;
>> - int i;
>> while (index < end) {
>> int mid = (index + end) / 2;
>> - if (cgrp->tasks_pids[mid] == pid) {
>> + pid_t mpid = getpidofmbuf(cgrp->tasks_pids, mid);
>> + if (mpid == pid) {
>> index = mid;
>> break;
>> - } else if (cgrp->tasks_pids[mid] <= pid)
>> + } else if (mpid <= pid)
>> index = mid + 1;
>> else
>> end = mid;
>> @@ -2122,9 +2146,8 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
>> if (index >= cgrp->pids_length)
>> return NULL;
>> /* Update the abstract position to be the actual pid that we found */
>> - iter = cgrp->tasks_pids + index;
>> - *pos = *iter;
>> - return iter;
>> + *pos = getpidofmbuf(cgrp->tasks_pids, index);
>> + return (void *)(long)(index ^ -0x10000); /* we cannot return 0 */
>> }
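
(The XOR with -0x10000 encodes the integer index as a non-NULL cookie:
seq_file treats a NULL return from ->start/->next as end-of-iteration,
so index 0 cannot be returned as a pointer directly. The mapping is its
own inverse:

	0 ^ -0x10000 == -0x10000     /* non-NULL when cast to void * */
	(-0x10000) ^ -0x10000 == 0   /* decoded back in ->next/->show */

)
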
>>
>> static void cgroup_tasks_stop(struct seq_file *s, void *v)
>> @@ -2136,25 +2159,26 @@ static void cgroup_tasks_stop(struct seq_file *s, void *v)
>> static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
>> {
>> struct cgroup *cgrp = s->private;
>> - int *p = v;
>> - int *end = cgrp->tasks_pids + cgrp->pids_length;
>> + int index = (int)(long)v ^ -0x10000;
>>
>> /*
>> * Advance to the next pid in the array. If this goes off the
>> * end, we're done
>> */
>> - p++;
>> - if (p >= end) {
>> + index++;
>> + if (index >= cgrp->pids_length) {
>> return NULL;
>> } else {
>> - *pos = *p;
>> - return p;
>> + *pos = getpidofmbuf(cgrp->tasks_pids, index);
>> + return (void *)(long)(index ^ -0x10000); /* we cannot return 0 */
>> }
>> }
>>
>> static int cgroup_tasks_show(struct seq_file *s, void *v)
>> {
>> - return seq_printf(s, "%d\n", *(int *)v);
>> + struct cgroup *cgrp = s->private;
>> + int index = (int)(long)v ^ -0x10000;
>> + return seq_printf(s, "%d\n", getpidofmbuf(cgrp->tasks_pids, index));
>> }
>>
>> static struct seq_operations cgroup_tasks_seq_operations = {
>> @@ -2164,12 +2188,60 @@ static struct seq_operations cgroup_tasks_seq_operations = {
>> .show = cgroup_tasks_show,
>> };
>>
>> +static void *alloc_multibufs(size_t npids)
>> +{
>> + int i, j, npages = (npids + pid_per_page - 1) / pid_per_page;
>> + unsigned long *pages;
>> +
>> + if (npids <= pid_per_page - sizeof(pid_t *) / sizeof(pid_t)) {
>> + void *pids = kmalloc(sizeof(pid_t *) + sizeof(pid_t) * npids,
>> + GFP_KERNEL);
>> + if (!pids)
>> + return NULL;
>> + /* make single buf fake multi-buf */
>> + *(void **)pids = pids + sizeof(pid_t *);
>> + return pids;
>> + }
>> +
>> + pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
>> + if (!pages)
>> + return NULL;
>> +
>> + for (i = 0; i < npages; i++) {
>> + pages[i] = __get_free_page(GFP_KERNEL);
>> + if (unlikely(!pages[i]))
>> + goto depopulate;
>> + }
>> + return pages;
>> +
>> +depopulate:
>> + for (j = 0; j < i; j++)
>> + free_page(pages[j]);
>> + kfree(pages);
>> + return NULL;
>> +}
>> +
>> +static void free_multibufs(void *ptr, size_t npids)
>> +{
>> + if (!ptr)
>> + return;
>> +
>> + if (npids > pid_per_page - sizeof(pid_t *) / sizeof(pid_t)) {
>> + int i, npages = (npids + pid_per_page - 1) / pid_per_page;
>> + unsigned long *pages = ptr;
>> + for (i = 0; i < npages; i++)
>> + free_page(pages[i]);
>> + }
>> +
>> + kfree(ptr);
>> +}
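
(In the small case above, the buffer carries a single self-referential
pointer in front of the inline pids, so both layouts can be indexed
uniformly as pids[page][slot]. Roughly:

	kmalloc'd block: [ p | pid0 pid1 pid2 ... ]
	                   '--> points just past itself, so pids[0][j]
	                        hits the inline array

The multi-page case stores one real page pointer per PAGE_SIZE worth of
pids instead.)
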
>> +
>> static void release_cgroup_pid_array(struct cgroup *cgrp)
>> {
>> down_write(&cgrp->pids_mutex);
>> BUG_ON(!cgrp->pids_use_count);
>> if (!--cgrp->pids_use_count) {
>> - kfree(cgrp->tasks_pids);
>> + free_multibufs((void *)cgrp->tasks_pids, cgrp->pids_length);
>> cgrp->tasks_pids = NULL;
>> cgrp->pids_length = 0;
>> }
>> @@ -2202,7 +2274,7 @@ static struct file_operations cgroup_tasks_operations = {
>> static int cgroup_tasks_open(struct inode *unused, struct file *file)
>> {
>> struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
>> - pid_t *pidarray;
>> + pid_t **pidarray;
>> int npids;
>> int retval;
>>
>> @@ -2217,19 +2289,19 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
>> * show up until sometime later on.
>> */
>> npids = cgroup_task_count(cgrp);
>> - pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
>> + pidarray = alloc_multibufs(npids);
>> if (!pidarray)
>> return -ENOMEM;
>> npids = pid_array_load(pidarray, npids, cgrp);
>> - sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
>> + obj_sort(pidarray, 0, npids, cmppid, swappid);
>>
>> /*
>> * Store the array in the cgroup, freeing the old
>> * array if necessary
>> */
>> down_write(&cgrp->pids_mutex);
>> - kfree(cgrp->tasks_pids);
>> - cgrp->tasks_pids = pidarray;
>> + free_multibufs((void *)cgrp->tasks_pids, cgrp->pids_length);
>> + cgrp->tasks_pids = (const pid_t *const *)pidarray;
>> cgrp->pids_length = npids;
>> cgrp->pids_use_count++;
>> up_write(&cgrp->pids_mutex);
>>
>>
>>
>
>
>