[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <6599ad830809110945pb85ec68o16328b31cbb0dc52@mail.gmail.com>
Date: Thu, 11 Sep 2008 09:45:08 -0700
From: "Paul Menage" <menage@...gle.com>
To: "Lai Jiangshan" <laijs@...fujitsu.com>
Cc: "Andrew Morton" <akpm@...ux-foundation.org>,
"Paul Jackson" <pj@....com>,
"Linux Kernel Mailing List" <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH -mm] cgroup,cpuset: use alternative malloc to allocate large memory buf for tasks
On Thu, Sep 11, 2008 at 3:30 AM, Lai Jiangshan <laijs@...fujitsu.com> wrote:
> This new alternative allocation implementation can allocate memory
> of up to 64M on 32-bit systems or 512M on 64-bit systems.
Isn't a lot of this patch just reimplementing vmalloc()?
Paul
>
> This patch fixes the problem for a really large cgroup.
>
> Signed-off-by: Lai Jiangshan <laijs@...fujitsu.com>
> ---
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index bb298de..974e898 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -403,6 +403,18 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
> int cgroup_scan_tasks(struct cgroup_scanner *scan);
> int cgroup_attach_task(struct cgroup *, struct task_struct *);
>
> +/*
> + * Basic struct of cgroup huge memory allocation,
> + * use typedef to hide its implementation.
> + */
> +typedef struct {
> + struct page **page_array;
> + size_t page_count;
> +} cgroup_huge_mem_t;
> +
> +void *cgroup_huge_mem_alloc(size_t size, cgroup_huge_mem_t *huge);
> +void cgroup_huge_mem_free(void *ptr, cgroup_huge_mem_t *huge);
> +
> #else /* !CONFIG_CGROUPS */
>
> static inline int cgroup_init_early(void) { return 0; }
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 996865a..3ad4ff0 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -142,6 +142,55 @@ static int notify_on_release(const struct cgroup *cgrp)
> return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
> }
>
> +#define CGROUP_HUGE_PAGES_THRESHOLD 4
> +
> +void *cgroup_huge_mem_alloc(size_t size, cgroup_huge_mem_t *huge)
> +{
> + unsigned int i, j, n_pages;
> + struct page **pages;
> + void *mem;
> +
> + huge->page_array = NULL;
> + huge->page_count = 0;
> + if (size < PAGE_SIZE * CGROUP_HUGE_PAGES_THRESHOLD)
> + return kmalloc(size, GFP_KERNEL);
> +
> + n_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
> + pages = kmalloc(sizeof(*pages) * n_pages, GFP_KERNEL);
> + if (!pages)
> + return NULL;
> +
> + for (i = 0; i < n_pages; i++) {
> + pages[i] = alloc_page(GFP_KERNEL);
> + if (unlikely(!pages[i]))
> + goto depopulate;
> + }
> + mem = vmap(pages, n_pages, VM_MAP, PAGE_KERNEL);
> + if (mem) {
> + huge->page_array = pages;
> + huge->page_count = n_pages;
> + return mem;
> + }
> +
> +depopulate:
> + for (j = 0; j < i; j++)
> + __free_page(pages[j]);
> + kfree(pages);
> + return NULL;
> +}
> +
> +void cgroup_huge_mem_free(void *ptr, cgroup_huge_mem_t *huge)
> +{
> + if (huge->page_count) {
> + unsigned int i;
> + vunmap(ptr);
> + for (i = 0; i < huge->page_count; i++)
> + __free_page(huge->page_array[i]);
> + kfree(huge->page_array);
> + } else
> + kfree(ptr);
> +}
> +
> /*
> * for_each_subsys() allows you to iterate on each subsystem attached to
> * an active hierarchy
> @@ -2106,7 +2155,6 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
> down_read(&cgrp->pids_mutex);
> if (pid) {
> int end = cgrp->pids_length;
> - int i;
> while (index < end) {
> int mid = (index + end) / 2;
> if (cgrp->tasks_pids[mid] == pid) {
> @@ -2164,12 +2212,35 @@ static struct seq_operations cgroup_tasks_seq_operations = {
> .show = cgroup_tasks_show,
> };
>
> +
> +static void *cgroup_pid_array_alloc(size_t size)
> +{
> + cgroup_huge_mem_t huge;
> + void *mem = cgroup_huge_mem_alloc(size + sizeof(huge), &huge);
> + if (mem) {
> + *(cgroup_huge_mem_t *)mem = huge;
> + return mem + sizeof(huge);
> + }
> + return NULL;
> +}
> +
> +static void cgroup_pid_array_free(void *ptr)
> +{
> + if (ptr) {
> + cgroup_huge_mem_t huge;
> + void *mem = ptr - sizeof(huge);
> +
> + huge = *(cgroup_huge_mem_t *)mem;
> + cgroup_huge_mem_free(mem, &huge);
> + }
> +}
> +
> static void release_cgroup_pid_array(struct cgroup *cgrp)
> {
> down_write(&cgrp->pids_mutex);
> BUG_ON(!cgrp->pids_use_count);
> if (!--cgrp->pids_use_count) {
> - kfree(cgrp->tasks_pids);
> + cgroup_pid_array_free(cgrp->tasks_pids);
> cgrp->tasks_pids = NULL;
> cgrp->pids_length = 0;
> }
> @@ -2217,7 +2288,7 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
> * show up until sometime later on.
> */
> npids = cgroup_task_count(cgrp);
> - pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
> + pidarray = cgroup_pid_array_alloc(npids * sizeof(pid_t));
> if (!pidarray)
> return -ENOMEM;
> npids = pid_array_load(pidarray, npids, cgrp);
> @@ -2228,7 +2299,7 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
> * array if necessary
> */
> down_write(&cgrp->pids_mutex);
> - kfree(cgrp->tasks_pids);
> + cgroup_pid_array_free(cgrp->tasks_pids);
> cgrp->tasks_pids = pidarray;
> cgrp->pids_length = npids;
> cgrp->pids_use_count++;
> diff --git a/kernel/cpuset.c b/kernel/cpuset.c
> index f227bc1..38fde1e 100644
> --- a/kernel/cpuset.c
> +++ b/kernel/cpuset.c
> @@ -999,6 +999,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
> {
> struct task_struct *p;
> struct mm_struct **mmarray;
> + cgroup_huge_mem_t huge;
> int i, n, ntasks;
> int migrate;
> int fudge;
> @@ -1021,14 +1022,15 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
> while (1) {
> ntasks = cgroup_task_count(cs->css.cgroup); /* guess */
> ntasks += fudge;
> - mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
> + mmarray = cgroup_huge_mem_alloc(ntasks * sizeof(*mmarray),
> + &huge);
> if (!mmarray)
> goto done;
> read_lock(&tasklist_lock); /* block fork */
> if (cgroup_task_count(cs->css.cgroup) <= ntasks)
> break; /* got enough */
> read_unlock(&tasklist_lock); /* try again */
> - kfree(mmarray);
> + cgroup_huge_mem_free(mmarray, &huge);
> }
>
> n = 0;
> @@ -1075,7 +1077,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
> }
>
> /* We're done rebinding vmas to this cpuset's new mems_allowed. */
> - kfree(mmarray);
> + cgroup_huge_mem_free(mmarray, &huge);
> cpuset_being_rebound = NULL;
> retval = 0;
> done:
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists