Message-ID: <6599ad830809110945pb85ec68o16328b31cbb0dc52@mail.gmail.com>
Date:	Thu, 11 Sep 2008 09:45:08 -0700
From:	"Paul Menage" <menage@...gle.com>
To:	"Lai Jiangshan" <laijs@...fujitsu.com>
Cc:	"Andrew Morton" <akpm@...ux-foundation.org>,
	"Paul Jackson" <pj@....com>,
	"Linux Kernel Mailing List" <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH -mm] cgroup,cpuset: use alternative malloc to allocate large memory buf for tasks

On Thu, Sep 11, 2008 at 3:30 AM, Lai Jiangshan <laijs@...fujitsu.com> wrote:
> This new alternative allocation implementation can allocate memory
> up to 64M on 32-bit systems or 512M on 64-bit systems.

Isn't a lot of this patch just reimplementing vmalloc()?
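
For comparison, something like the following (an untested sketch with
made-up helper names, assuming <linux/slab.h>, <linux/vmalloc.h> and
<linux/mm.h>) would give the same small/large split directly on top of
vmalloc(), which already does the page allocation and mapping:

	/*
	 * Hypothetical helpers, not from the patch: small buffers come
	 * from kmalloc(); anything at or above the threshold falls back
	 * to vmalloc().
	 */
	static void *cgroup_buf_alloc(size_t size)
	{
		if (size < PAGE_SIZE * CGROUP_HUGE_PAGES_THRESHOLD)
			return kmalloc(size, GFP_KERNEL);
		return vmalloc(size);
	}

	static void cgroup_buf_free(void *ptr)
	{
		if (is_vmalloc_addr(ptr))
			vfree(ptr);
		else
			kfree(ptr);
	}

That would keep the kmalloc() fast path for small cgroups without
duplicating the page-array bookkeeping.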

Paul

>
> This patch fixes the problem for a really large cgroup.
>
> Signed-off-by: Lai Jiangshan <laijs@...fujitsu.com>
> ---
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index bb298de..974e898 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -403,6 +403,18 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
>  int cgroup_scan_tasks(struct cgroup_scanner *scan);
>  int cgroup_attach_task(struct cgroup *, struct task_struct *);
>
> +/*
> + * Bookkeeping struct for cgroup huge memory allocations;
> + * a typedef hides the implementation.
> + */
> +typedef struct {
> +       struct page **page_array;
> +       size_t page_count;
> +} cgroup_huge_mem_t;
> +
> +void *cgroup_huge_mem_alloc(size_t size, cgroup_huge_mem_t *huge);
> +void cgroup_huge_mem_free(void *ptr, cgroup_huge_mem_t *huge);
> +
>  #else /* !CONFIG_CGROUPS */
>
>  static inline int cgroup_init_early(void) { return 0; }
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 996865a..3ad4ff0 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -142,6 +142,55 @@ static int notify_on_release(const struct cgroup *cgrp)
>        return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
>  }
>
> +#define CGROUP_HUGE_PAGES_THRESHOLD 4
> +
> +void *cgroup_huge_mem_alloc(size_t size, cgroup_huge_mem_t *huge)
> +{
> +       unsigned int i, j, n_pages;
> +       struct page **pages;
> +       void *mem;
> +
> +       huge->page_array = NULL;
> +       huge->page_count = 0;
> +       if (size < PAGE_SIZE * CGROUP_HUGE_PAGES_THRESHOLD)
> +               return kmalloc(size, GFP_KERNEL);
> +
> +       n_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
> +       pages = kmalloc(sizeof(*pages) * n_pages, GFP_KERNEL);
> +       if (!pages)
> +               return NULL;
> +
> +       for (i = 0; i < n_pages; i++) {
> +               pages[i] = alloc_page(GFP_KERNEL);
> +               if (unlikely(!pages[i]))
> +                       goto depopulate;
> +       }
> +       mem = vmap(pages, n_pages, VM_MAP, PAGE_KERNEL);
> +       if (mem) {
> +               huge->page_array = pages;
> +               huge->page_count = n_pages;
> +               return mem;
> +       }
> +
> +depopulate:
> +       for (j = 0; j < i; j++)
> +               __free_page(pages[j]);
> +       kfree(pages);
> +       return NULL;
> +}
> +
> +void cgroup_huge_mem_free(void *ptr, cgroup_huge_mem_t *huge)
> +{
> +       if (huge->page_count) {
> +               unsigned int i;
> +               vunmap(ptr);
> +               for (i = 0; i < huge->page_count; i++)
> +                       __free_page(huge->page_array[i]);
> +               kfree(huge->page_array);
> +       } else
> +               kfree(ptr);
> +}
> +
>  /*
>  * for_each_subsys() allows you to iterate on each subsystem attached to
>  * an active hierarchy
> @@ -2106,7 +2155,6 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
>        down_read(&cgrp->pids_mutex);
>        if (pid) {
>                int end = cgrp->pids_length;
> -               int i;
>                while (index < end) {
>                        int mid = (index + end) / 2;
>                        if (cgrp->tasks_pids[mid] == pid) {
> @@ -2164,12 +2212,35 @@ static struct seq_operations cgroup_tasks_seq_operations = {
>        .show = cgroup_tasks_show,
>  };
>
> +
> +static void *cgroup_pid_array_alloc(size_t size)
> +{
> +       cgroup_huge_mem_t huge;
> +       void *mem = cgroup_huge_mem_alloc(size + sizeof(huge), &huge);
> +       if (mem) {
> +               *(cgroup_huge_mem_t *)mem = huge;
> +               return mem + sizeof(huge);
> +       }
> +       return NULL;
> +}
> +
> +static void cgroup_pid_array_free(void *ptr)
> +{
> +       if (ptr) {
> +               cgroup_huge_mem_t huge;
> +               void *mem = ptr - sizeof(huge);
> +
> +               huge = *(cgroup_huge_mem_t *)mem;
> +               cgroup_huge_mem_free(mem, &huge);
> +       }
> +}
> +
>  static void release_cgroup_pid_array(struct cgroup *cgrp)
>  {
>        down_write(&cgrp->pids_mutex);
>        BUG_ON(!cgrp->pids_use_count);
>        if (!--cgrp->pids_use_count) {
> -               kfree(cgrp->tasks_pids);
> +               cgroup_pid_array_free(cgrp->tasks_pids);
>                cgrp->tasks_pids = NULL;
>                cgrp->pids_length = 0;
>        }
> @@ -2217,7 +2288,7 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
>         * show up until sometime later on.
>         */
>        npids = cgroup_task_count(cgrp);
> -       pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
> +       pidarray = cgroup_pid_array_alloc(npids * sizeof(pid_t));
>        if (!pidarray)
>                return -ENOMEM;
>        npids = pid_array_load(pidarray, npids, cgrp);
> @@ -2228,7 +2299,7 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
>         * array if necessary
>         */
>        down_write(&cgrp->pids_mutex);
> -       kfree(cgrp->tasks_pids);
> +       cgroup_pid_array_free(cgrp->tasks_pids);
>        cgrp->tasks_pids = pidarray;
>        cgrp->pids_length = npids;
>        cgrp->pids_use_count++;
> diff --git a/kernel/cpuset.c b/kernel/cpuset.c
> index f227bc1..38fde1e 100644
> --- a/kernel/cpuset.c
> +++ b/kernel/cpuset.c
> @@ -999,6 +999,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
>  {
>        struct task_struct *p;
>        struct mm_struct **mmarray;
> +       cgroup_huge_mem_t huge;
>        int i, n, ntasks;
>        int migrate;
>        int fudge;
> @@ -1021,14 +1022,15 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
>        while (1) {
>                ntasks = cgroup_task_count(cs->css.cgroup);  /* guess */
>                ntasks += fudge;
> -               mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
> +               mmarray = cgroup_huge_mem_alloc(ntasks * sizeof(*mmarray),
> +                               &huge);
>                if (!mmarray)
>                        goto done;
>                read_lock(&tasklist_lock);              /* block fork */
>                if (cgroup_task_count(cs->css.cgroup) <= ntasks)
>                        break;                          /* got enough */
>                read_unlock(&tasklist_lock);            /* try again */
> -               kfree(mmarray);
> +               cgroup_huge_mem_free(mmarray, &huge);
>        }
>
>        n = 0;
> @@ -1075,7 +1077,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
>        }
>
>        /* We're done rebinding vmas to this cpuset's new mems_allowed. */
> -       kfree(mmarray);
> +       cgroup_huge_mem_free(mmarray, &huge);
>        cpuset_being_rebound = NULL;
>        retval = 0;
>  done:
>
>
