unchanged: --- linux-2.6.22-rc6-balbir/include/linux/res_counter.h 2007-07-05 13:45:17.000000000 -0700 +++ linux-2.6.22-rc6-balbir/include/linux/res_counter.h 2007-07-05 18:28:12.000000000 -0700 @@ -0,0 +1,125 @@ +#ifndef __RES_COUNTER_H__ +#define __RES_COUNTER_H__ + +/* + * resource counters + * contain common data types and routines for resource accounting + * + * Copyright 2007 OpenVZ SWsoft Inc + * + * Author: Pavel Emelianov + * + */ + +#include + +/* + * the core object. the container that wishes to account for some + * resource may include this counter into its structures and use + * the helpers described below + */ + +struct res_counter { + /* + * the current resource consumption level + */ + unsigned long usage; + /* + * the limit that usage cannot exceed + */ + unsigned long limit; + /* + * the number of unsuccessful attempts to consume the resource + */ + unsigned long failcnt; + /* + * the lock to protect all of the above. + * the routines below consider this to be IRQ-safe + */ + spinlock_t lock; +}; + +/* + * helpers to interact with userspace + * res_counter_read/_write - put/get the specified fields from the + * res_counter struct to/from the user + * + * @cnt: the counter in question + * @member: the field to work with (see RES_xxx below) + * @buf: the buffer to operate on,... + * @nbytes: its size... + * @pos: and the offset. + */ + +ssize_t res_counter_read(struct res_counter *cnt, int member, + const char __user *buf, size_t nbytes, loff_t *pos); +ssize_t res_counter_write(struct res_counter *cnt, int member, + const char __user *buf, size_t nbytes, loff_t *pos); + +/* + * the field descriptors. one for each member of res_counter + */ + +enum { + RES_USAGE, + RES_LIMIT, + RES_FAILCNT, +}; + +/* + * helpers for accounting + */ + +void res_counter_init(struct res_counter *cnt); + +/* + * charge - try to consume more resource. + * + * @cnt: the counter + * @val: the amount of the resource. each controller defines its own + * units, e.g. numbers, bytes, Kbytes, etc + * + * returns 0 on success and <0 if the cnt->usage will exceed the cnt->limit + * _locked call expects the cnt->lock to be taken + */ + +int res_counter_charge_locked(struct res_counter *cnt, unsigned long val); +int res_counter_charge(struct res_counter *cnt, unsigned long val); + +/* + * uncharge - tell that some portion of the resource is released + * + * @cnt: the counter + * @val: the amount of the resource + * + * these calls check for usage underflow and show a warning on the console + * _locked call expects the cnt->lock to be taken + */ + +void res_counter_uncharge_locked(struct res_counter *cnt, unsigned long val); +void res_counter_uncharge(struct res_counter *cnt, unsigned long val); + +static inline bool res_counter_limit_check_locked(struct res_counter *cnt) +{ + if (cnt->usage < cnt->limit) + return true; + + return false; +} + +/* + * Helper function to detect if the container is within its limit or + * not. It's currently called from container_rss_prepare() + */ +static inline bool res_counter_check_under_limit(struct res_counter *cnt) +{ + bool ret; + unsigned long flags; + + spin_lock_irqsave(&cnt->lock, flags); + ret = res_counter_limit_check_locked(cnt); + spin_unlock_irqrestore(&cnt->lock, flags); + return ret; +} + +#endif unchanged: --- linux-2.6.22-rc6-balbir/init/Kconfig 2007-07-05 13:45:17.000000000 -0700 +++ linux-2.6.22-rc6-balbir/init/Kconfig 2007-07-05 13:45:17.000000000 -0700 @@ -320,6 +320,10 @@ Say N if unsure.
+config RESOURCE_COUNTERS + bool + select CONTAINERS + config SYSFS_DEPRECATED bool "Create deprecated sysfs files" default y @@ -356,6 +360,14 @@ for instance virtual servers and checkpoint/restart jobs. +config CONTAINER_MEM_CONT + bool "Memory controller for containers" + select CONTAINERS + select RESOURCE_COUNTERS + help + Provides a memory controller that manages both page cache and + RSS memory. + config PROC_PID_CPUSET bool "Include legacy /proc//cpuset file" depends on CPUSETS unchanged: --- linux-2.6.22-rc6/kernel/Makefile~res_counters_infra 2007-07-05 13:45:17.000000000 -0700 +++ linux-2.6.22-rc6-balbir/kernel/Makefile 2007-07-05 13:45:17.000000000 -0700 @@ -58,6 +58,7 @@ obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o +obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is unchanged: --- /dev/null 2007-06-01 08:12:04.000000000 -0700 +++ linux-2.6.22-rc6-balbir/kernel/res_counter.c 2007-07-05 13:45:17.000000000 -0700 @@ -0,0 +1,121 @@ +/* + * resource containers + * + * Copyright 2007 OpenVZ SWsoft Inc + * + * Author: Pavel Emelianov + * + */ + +#include +#include +#include +#include +#include + +void res_counter_init(struct res_counter *cnt) +{ + spin_lock_init(&cnt->lock); + cnt->limit = (unsigned long)LONG_MAX; +} + +int res_counter_charge_locked(struct res_counter *cnt, unsigned long val) +{ + if (cnt->usage <= cnt->limit - val) { + cnt->usage += val; + return 0; + } + + cnt->failcnt++; + return -ENOMEM; +} + +int res_counter_charge(struct res_counter *cnt, unsigned long val) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&cnt->lock, flags); + ret = res_counter_charge_locked(cnt, val); + spin_unlock_irqrestore(&cnt->lock, flags); + return ret; +} + +void res_counter_uncharge_locked(struct res_counter *cnt, unsigned long val) +{ + if (unlikely(cnt->usage < val)) { + WARN_ON(1); + val = cnt->usage; + } + + cnt->usage -= val; +} + +void res_counter_uncharge(struct res_counter *cnt, unsigned long val) +{ + unsigned long flags; + + spin_lock_irqsave(&cnt->lock, flags); + res_counter_uncharge_locked(cnt, val); + spin_unlock_irqrestore(&cnt->lock, flags); +} + + +static inline unsigned long *res_counter_member(struct res_counter *cnt, int member) +{ + switch (member) { + case RES_USAGE: + return &cnt->usage; + case RES_LIMIT: + return &cnt->limit; + case RES_FAILCNT: + return &cnt->failcnt; + }; + + BUG(); + return NULL; +} + +ssize_t res_counter_read(struct res_counter *cnt, int member, + const char __user *userbuf, size_t nbytes, loff_t *pos) +{ + unsigned long *val; + char buf[64], *s; + + s = buf; + val = res_counter_member(cnt, member); + s += sprintf(s, "%lu\n", *val); + return simple_read_from_buffer((void __user *)userbuf, nbytes, + pos, buf, s - buf); +} + +ssize_t res_counter_write(struct res_counter *cnt, int member, + const char __user *userbuf, size_t nbytes, loff_t *pos) +{ + int ret; + char *buf, *end; + unsigned long tmp, *val; + + buf = kmalloc(nbytes + 1, GFP_KERNEL); + ret = -ENOMEM; + if (buf == NULL) + goto out; + + buf[nbytes] = 0; + ret = -EFAULT; + if (copy_from_user(buf, userbuf, nbytes)) + goto out_free; + + ret = -EINVAL; + tmp = simple_strtoul(buf, &end, 10); + if (*end != '\0') + goto out_free; + + val = res_counter_member(cnt, member); + *val = tmp; + ret = nbytes; +out_free: + kfree(buf); +out: + return ret; +} unchanged: 
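To illustrate how the res_counter interface above is meant to be consumed, here is a minimal sketch of a hypothetical "widget" controller that charges one unit per tracked object. Only the res_counter_* calls come from this patch; the widget_* names are made up for illustration.

#include <linux/res_counter.h>
#include <linux/slab.h>

struct widget_container {
	struct res_counter res;		/* accounts the number of widgets */
};

static struct widget_container *widget_container_create(void)
{
	struct widget_container *wc = kzalloc(sizeof(*wc), GFP_KERNEL);

	if (wc)
		res_counter_init(&wc->res);	/* lock init, limit = LONG_MAX */
	return wc;
}

static void *widget_alloc(struct widget_container *wc, size_t size)
{
	void *obj;

	/* fails with -ENOMEM and bumps failcnt once the limit is hit */
	if (res_counter_charge(&wc->res, 1))
		return NULL;

	obj = kzalloc(size, GFP_KERNEL);
	if (!obj)
		res_counter_uncharge(&wc->res, 1);	/* roll the charge back */
	return obj;
}

static void widget_free(struct widget_container *wc, void *obj)
{
	kfree(obj);
	/* warns on the console if this would underflow usage */
	res_counter_uncharge(&wc->res, 1);
}

The memory controller below follows the same charge/uncharge pattern, except that it retries reclaim between failed charges before giving up.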
--- linux-2.6.22-rc6/include/linux/container_subsys.h~mem-control-setup 2007-07-05 13:45:17.000000000 -0700 +++ linux-2.6.22-rc6-balbir/include/linux/container_subsys.h 2007-07-05 13:45:17.000000000 -0700 @@ -30,3 +30,9 @@ SUBSYS(ns) #endif /* */ + +#ifdef CONFIG_CONTAINER_MEM_CONT +SUBSYS(mem_container) +#endif + +/* */ unchanged: --- linux-2.6.22-rc6/mm/Makefile~mem-control-setup 2007-07-05 13:45:17.000000000 -0700 +++ linux-2.6.22-rc6-balbir/mm/Makefile 2007-07-05 13:45:17.000000000 -0700 @@ -30,4 +30,5 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_SMP) += allocpercpu.o obj-$(CONFIG_QUICKLIST) += quicklist.o +obj-$(CONFIG_CONTAINER_MEM_CONT) += memcontrol.o diff -u linux-2.6.22-rc6-balbir/mm/memcontrol.c linux-2.6.22-rc6-balbir/mm/memcontrol.c --- linux-2.6.22-rc6-balbir/mm/memcontrol.c 2007-07-05 18:49:35.000000000 -0700 +++ linux-2.6.22-rc6-balbir/mm/memcontrol.c 2007-07-05 20:09:40.000000000 -0700 @@ -0,0 +1,560 @@ +/* memcontrol.c - Memory Controller + * + * Copyright IBM Corporation, 2007 + * Author Balbir Singh + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct container_subsys mem_container_subsys; + +/* + * The memory controller data structure. The memory controller controls both + * page cache and RSS per container. We would eventually like to provide + * statistics based on the statistics developed by Rik van Riel for clock-pro, + * to help the administrator determine what knobs to tune. + * + * TODO: Add a water mark for the memory controller. Reclaim will begin when + * we hit the water mark. Maybe even add a low water mark, such that + * no reclaim occurs from a container at its low water mark; this is + * a feature that will be implemented much later in the future. + */ +struct mem_container { + struct container_subsys_state css; + /* + * the counter to account for memory usage + */ + struct res_counter res; + /* + * Per container active and inactive list, similar to the + * per zone LRU lists. + * TODO: Consider making these lists per zone + */ + struct list_head active_list; + struct list_head inactive_list; + /* + * spin_lock to protect the per container LRU + */ + spinlock_t lru_lock; + unsigned long control_type; /* control RSS or RSS+Pagecache */ +}; + +/* + * A meta page is associated with every page descriptor.
The meta page + * helps us identify information about the container + */ +struct meta_page { + struct list_head list; /* per container LRU list */ + struct page *page; + struct mem_container *mem_container; + atomic_t ref_cnt; +}; + +enum { + MEM_CONTAINER_TYPE_UNSPEC = 0, + MEM_CONTAINER_TYPE_MAPPED, + MEM_CONTAINER_TYPE_ALL, + MEM_CONTAINER_TYPE_MAX, +} mem_control_type; + +static struct mem_container init_mem_container; + +static inline struct mem_container *mem_container_from_cont(struct container + *cnt) +{ + return container_of(container_subsys_state(cnt, + mem_container_subsys_id), struct mem_container, + css); +} + +void mm_init_container(struct mm_struct *mm, struct task_struct *p) +{ + struct mem_container *mem; + + mem = mem_container_from_cont(task_container(p, + mem_container_subsys_id)); + css_get(&mem->css); + mm->mem_container = mem; +} + +void mm_free_container(struct mm_struct *mm) +{ + css_put(&mm->mem_container->css); +} + +void page_assign_meta_page(struct page *page, struct meta_page *mp) +{ + page->meta_page = mp; +} + +struct meta_page *page_get_meta_page(struct page *page) +{ + return page->meta_page; +} + +void __always_inline lock_meta_page(struct page *page) +{ + bit_spin_lock(PG_metapage, &page->flags); +} + +void __always_inline unlock_meta_page(struct page *page) +{ + bit_spin_unlock(PG_metapage, &page->flags); +} + +unsigned long mem_container_isolate_pages(unsigned long nr_to_scan, + struct list_head *dst, + unsigned long *scanned, int order, + int mode, struct zone *z, + struct mem_container *mem_cont, + int active) +{ + unsigned long nr_taken = 0; + struct page *page; + unsigned long scan; + LIST_HEAD(mp_list); + struct list_head *src; + struct meta_page *mp; + + if (active) + src = &mem_cont->active_list; + else + src = &mem_cont->inactive_list; + + for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { + mp = list_entry(src->prev, struct meta_page, list); + page = mp->page; + + if (PageActive(page) && !active) { + mem_container_move_lists(mp, true); + scan--; + continue; + } + if (!PageActive(page) && active) { + mem_container_move_lists(mp, false); + scan--; + continue; + } + + /* + * Reclaim, per zone + * TODO: make the active/inactive lists per zone + */ + if (page_zone(page) != z) + continue; + + spin_lock(&mem_cont->lru_lock); + /* + * Check if the meta page went away from under us + */ + if (!list_empty(&mp->list)) + list_move(&mp->list, &mp_list); + else { + spin_unlock(&mem_cont->lru_lock); + continue; + } + spin_unlock(&mem_cont->lru_lock); + + if (__isolate_lru_page(page, mode) == 0) { + list_move(&page->lru, dst); + nr_taken++; + } + } + + spin_lock(&mem_cont->lru_lock); + list_splice(&mp_list, src); + spin_unlock(&mem_cont->lru_lock); + + *scanned = scan; + return nr_taken; +} + +/* + * This routine assumes that the appropriate zone's lru lock is already held + */ +void mem_container_move_lists(struct meta_page *mp, bool active) +{ + struct mem_container *mem; + if (!mp) + return; + + mem = mp->mem_container; + + spin_lock(&mem->lru_lock); + if (active) + list_move(&mp->list, &mp->mem_container->active_list); + else + list_move(&mp->list, &mp->mem_container->inactive_list); + spin_unlock(&mem->lru_lock); +} + +/* + * Charge the memory controller for page usage.
+ * Return + * 0 if the charge was successful + * < 0 if the container is over its limit + */ +int mem_container_charge(struct page *page, struct mm_struct *mm) +{ + struct mem_container *mem; + struct meta_page *mp; + unsigned long flags; + + /* + * Should meta_page's go to their own slab? + * One could optimize the performance of the charging routine + * by saving a bit in the page_flags and using it as a lock + * to see if the container page already has a meta_page associated + * with it + */ + lock_meta_page(page); + mp = page_get_meta_page(page); + /* + * The meta_page exists and the page has already been accounted + */ + if (mp) { + atomic_inc(&mp->ref_cnt); + goto done; + } + + unlock_meta_page(page); + + mp = kzalloc(sizeof(struct meta_page), GFP_KERNEL); + if (mp == NULL) + goto err; + + rcu_read_lock(); + /* + * We always charge the container the mm_struct belongs to + * the mm_struct's mem_container changes on task migration if the + * thread group leader migrates. It's possible that mm is not + * set, if so charge the init_mm (happens for pagecache usage). + */ + if (!mm) + mm = &init_mm; + + mem = rcu_dereference(mm->mem_container); + /* + * For every charge from the container, increment reference + * count + */ + css_get(&mem->css); + rcu_read_unlock(); + + /* + * If we created the meta_page, we should free it on exceeding + * the container limit. + */ + while (res_counter_charge(&mem->res, 1)) { + if (try_to_free_mem_container_pages(mem)) + continue; + + /* + * try_to_free_mem_container_pages() might not give us a full + * picture of reclaim. Some pages are reclaimed and might be + * moved to swap cache or just unmapped from the container. + * Check the limit again to see if the reclaim reduced the + * current usage of the container before giving up + */ + if (res_counter_check_under_limit(&mem->res)) + continue; + + mem_container_out_of_memory(mem); + goto free_mp; + } + + lock_meta_page(page); + /* + * Check if somebody else beat us to allocating the meta_page + */ + if (page_get_meta_page(page)) { + atomic_inc(&mp->ref_cnt); + res_counter_uncharge(&mem->res, 1); + goto done; + } + + atomic_set(&mp->ref_cnt, 1); + mp->mem_container = mem; + mp->page = page; + page_assign_meta_page(page, mp); + + spin_lock_irqsave(&mem->lru_lock, flags); + list_add(&mp->list, &mem->active_list); + spin_unlock_irqrestore(&mem->lru_lock, flags); + +done: + unlock_meta_page(page); + return 0; +free_mp: + kfree(mp); + return -ENOMEM; +err: + unlock_meta_page(page); + return -ENOMEM; +} + +/* + * See if the cached pages should be charged at all? + */ +int mem_container_cache_charge(struct page *page, struct mm_struct *mm) +{ + struct mem_container *mem; + if (!mm) + mm = &init_mm; + + mem = rcu_dereference(mm->mem_container); + if (mem->control_type & MEM_CONTAINER_TYPE_ALL) + return mem_container_charge(page, mm); + else + return 0; +} + +/* + * Uncharging is always a welcome operation, we never complain, simply + * uncharge. + */ +void mem_container_uncharge(struct meta_page *mp) +{ + struct mem_container *mem; + struct page *page; + unsigned long flags; + + /* + * This can happen for PAGE_ZERO. This can also handle cases when + * a page is not charged at all and we are switching between + * handling the control_type. 
+ */ + if (!mp) + return; + + if (atomic_dec_and_test(&mp->ref_cnt)) { + page = mp->page; + + lock_meta_page(page); + mem = mp->mem_container; + css_put(&mem->css); + page_assign_meta_page(page, NULL); + unlock_meta_page(page); + + res_counter_uncharge(&mem->res, 1); + + spin_lock_irqsave(&mem->lru_lock, flags); + list_del_init(&mp->list); + spin_unlock_irqrestore(&mem->lru_lock, flags); + kfree(mp); + } +} + +static ssize_t mem_container_read(struct container *cont, struct cftype *cft, + struct file *file, char __user *userbuf, size_t nbytes, + loff_t *ppos) +{ + return res_counter_read(&mem_container_from_cont(cont)->res, + cft->private, userbuf, nbytes, ppos); +} + +static ssize_t mem_container_write(struct container *cont, struct cftype *cft, + struct file *file, const char __user *userbuf, + size_t nbytes, loff_t *ppos) +{ + return res_counter_write(&mem_container_from_cont(cont)->res, + cft->private, userbuf, nbytes, ppos); +} + +static ssize_t mem_control_type_write(struct container *cont, + struct cftype *cft, struct file *file, + const char __user *userbuf, + size_t nbytes, loff_t *pos) +{ + int ret; + char *buf, *end; + unsigned long tmp; + struct mem_container *mem; + + mem = mem_container_from_cont(cont); + buf = kmalloc(nbytes + 1, GFP_KERNEL); + ret = -ENOMEM; + if (buf == NULL) + goto out; + + buf[nbytes] = 0; + ret = -EFAULT; + if (copy_from_user(buf, userbuf, nbytes)) + goto out_free; + + ret = -EINVAL; + tmp = simple_strtoul(buf, &end, 10); + if (*end != '\0') + goto out_free; + + if (tmp <= MEM_CONTAINER_TYPE_UNSPEC || tmp >= MEM_CONTAINER_TYPE_MAX) + goto out_free; + + mem->control_type = tmp; + ret = nbytes; +out_free: + kfree(buf); +out: + return ret; +} + +static ssize_t mem_control_type_read(struct container *cont, + struct cftype *cft, + struct file *file, char __user *userbuf, + size_t nbytes, loff_t *ppos) +{ + unsigned long val; + char buf[64], *s; + struct mem_container *mem; + + mem = mem_container_from_cont(cont); + s = buf; + val = mem->control_type; + s += sprintf(s, "%lu\n", val); + return simple_read_from_buffer((void __user *)userbuf, nbytes, + ppos, buf, s - buf); +} + +static int mem_container_create(struct container_subsys *ss, + struct container *cont) +{ + struct mem_container *mem; + + if (unlikely((cont->parent) == NULL)) { + mem = &init_mem_container; + css_get(&mem->css); + init_mm.mem_container = mem; + } else + mem = kzalloc(sizeof(struct mem_container), GFP_KERNEL); + + if (mem == NULL) + return -ENOMEM; + + res_counter_init(&mem->res); + cont->subsys[mem_container_subsys_id] = &mem->css; + mem->css.container = cont; + INIT_LIST_HEAD(&mem->active_list); + INIT_LIST_HEAD(&mem->inactive_list); + spin_lock_init(&mem->lru_lock); + mem->control_type = MEM_CONTAINER_TYPE_ALL; + return 0; +} + +static struct cftype mem_container_usage = { + .name = "mem_usage", + .private = RES_USAGE, + .read = mem_container_read, +}; + +static struct cftype mem_container_limit = { + .name = "mem_limit", + .private = RES_LIMIT, + .write = mem_container_write, + .read = mem_container_read, +}; + +static struct cftype mem_container_control_type = { + .name = "mem_control_type", + .write = mem_control_type_write, + .read = mem_control_type_read, +}; + +static struct cftype mem_container_failcnt = { + .name = "mem_failcnt", + .private = RES_FAILCNT, + .read = mem_container_read, +}; + + +static void mem_container_destroy(struct container_subsys *ss, + struct container *cont) +{ + kfree(mem_container_from_cont(cont)); +} + +static int mem_container_populate(struct 
container_subsys *ss, + struct container *cont) +{ + int rc = 0; + + rc = container_add_file(cont, &mem_container_usage); + if (rc < 0) + goto err; + + rc = container_add_file(cont, &mem_container_limit); + if (rc < 0) + goto err; + + rc = container_add_file(cont, &mem_container_failcnt); + if (rc < 0) + goto err; + + rc = container_add_file(cont, &mem_container_control_type); + if (rc < 0) + goto err; + +err: + return rc; +} + +static void mem_container_move_task(struct container_subsys *ss, + struct container *cont, + struct container *old_cont, + struct task_struct *p) +{ + struct mm_struct *mm; + struct mem_container *mem, *old_mem; + + mm = get_task_mm(p); + if (mm == NULL) + return; + + mem = mem_container_from_cont(cont); + old_mem = mem_container_from_cont(old_cont); + + if (mem == old_mem) + goto out; + + /* + * Only thread group leaders are allowed to migrate, the mm_struct is + * in effect owned by the leader + */ + if (p->tgid != p->pid) + goto out; + + css_get(&mem->css); + rcu_assign_pointer(mm->mem_container, mem); + css_put(&old_mem->css); + +out: + mmput(mm); + return; +} + +struct container_subsys mem_container_subsys = { + .name = "mem_container", + .subsys_id = mem_container_subsys_id, + .create = mem_container_create, + .destroy = mem_container_destroy, + .populate = mem_container_populate, + .attach = mem_container_move_task, + .early_init = 1, +}; diff -u linux-2.6.22-rc6-balbir/include/linux/memcontrol.h linux-2.6.22-rc6-balbir/include/linux/memcontrol.h --- linux-2.6.22-rc6-balbir/include/linux/memcontrol.h 2007-07-05 18:49:35.000000000 -0700 +++ linux-2.6.22-rc6-balbir/include/linux/memcontrol.h 2007-07-05 20:00:07.000000000 -0700 @@ -0,0 +1,83 @@ +/* memcontrol.h - Memory Controller + * + * Copyright IBM Corporation, 2007 + * Author Balbir Singh + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ + +#ifndef _LINUX_MEMCONTROL_H +#define _LINUX_MEMCONTROL_H + +#include + +struct mem_container; +struct meta_page; + +#ifdef CONFIG_CONTAINER_MEM_CONT + +extern void mm_init_container(struct mm_struct *mm, struct task_struct *p); +extern void mm_free_container(struct mm_struct *mm); +extern void page_assign_meta_page(struct page *page, struct meta_page *mp); +extern struct meta_page *page_get_meta_page(struct page *page); +extern int mem_container_charge(struct page *page, struct mm_struct *mm); +extern void mem_container_uncharge(struct meta_page *mp); +extern void mem_container_move_lists(struct meta_page *mp, bool active); +extern unsigned long mem_container_isolate_pages(unsigned long nr_to_scan, + struct list_head *dst, + unsigned long *scanned, int order, + int mode, struct zone *z, + struct mem_container *mem_cont, + int active); +extern void mem_container_out_of_memory(struct mem_container *mem); +extern int mem_container_cache_charge(struct page *page, struct mm_struct *mm); + +#else /* CONFIG_CONTAINER_MEM_CONT */ +static inline void mm_init_container(struct mm_struct *mm, + struct task_struct *p) +{ +} + +static inline void mm_free_container(struct mm_struct *mm) +{ +} + +static inline void page_assign_meta_page(struct page *page, + struct meta_page *mp) +{ +} + +static inline struct meta_page *page_get_meta_page(struct page *page) +{ + return NULL; +} + +static inline int mem_container_charge(struct page *page, struct mm_struct *mm) +{ + return 0; +} + +static inline void mem_container_uncharge(struct meta_page *mp) +{ +} + +static inline void mem_container_move_lists(struct meta_page *mp, bool active) +{ +} + +static inline int mem_container_cache_charge(struct page *page, + struct mm_struct *mm) +{ + return 0; +} + +#endif /* CONFIG_CONTAINER_MEM_CONT */ + +#endif /* _LINUX_MEMCONTROL_H */ + unchanged: --- linux-2.6.22-rc6/include/linux/mm_types.h~mem-control-accounting-setup 2007-07-05 13:45:17.000000000 -0700 +++ linux-2.6.22-rc6-balbir/include/linux/mm_types.h 2007-07-05 13:45:17.000000000 -0700 @@ -5,6 +5,7 @@ #include #include #include +#include struct address_space; @@ -83,6 +84,9 @@ struct page { unsigned int gfp_mask; unsigned long trace[8]; #endif +#ifdef CONFIG_CONTAINER_MEM_CONT + struct meta_page *meta_page; +#endif }; #endif /* _LINUX_MM_TYPES_H */ unchanged: --- linux-2.6.22-rc6/include/linux/sched.h~mem-control-accounting-setup 2007-07-05 13:45:17.000000000 -0700 +++ linux-2.6.22-rc6-balbir/include/linux/sched.h 2007-07-05 13:45:17.000000000 -0700 @@ -87,6 +87,7 @@ struct sched_param { #include #include #include +#include #include @@ -416,6 +417,9 @@ struct mm_struct { /* aio bits */ rwlock_t ioctx_list_lock; struct kioctx *ioctx_list; +#ifdef CONFIG_CONTAINER_MEM_CONT + struct mem_container *mem_container; +#endif }; struct sighand_struct { unchanged: --- linux-2.6.22-rc6/kernel/fork.c~mem-control-accounting-setup 2007-07-05 13:45:17.000000000 -0700 +++ linux-2.6.22-rc6-balbir/kernel/fork.c 2007-07-05 13:45:17.000000000 -0700 @@ -330,7 +330,7 @@ static inline void mm_free_pgd(struct mm #include -static struct mm_struct * mm_init(struct mm_struct * mm) +static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) { atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); @@ -347,11 +347,14 @@ static struct mm_struct * mm_init(struct mm->ioctx_list = NULL; mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; + mm_init_container(mm, p); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; return mm; } + + 
mm_free_container(mm); free_mm(mm); return NULL; } @@ -366,7 +369,7 @@ struct mm_struct * mm_alloc(void) mm = allocate_mm(); if (mm) { memset(mm, 0, sizeof(*mm)); - mm = mm_init(mm); + mm = mm_init(mm, current); } return mm; } @@ -380,6 +383,7 @@ void fastcall __mmdrop(struct mm_struct { BUG_ON(mm == &init_mm); mm_free_pgd(mm); + mm_free_container(mm); destroy_context(mm); free_mm(mm); } @@ -500,7 +504,7 @@ static struct mm_struct *dup_mm(struct t mm->token_priority = 0; mm->last_interval = 0; - if (!mm_init(mm)) + if (!mm_init(mm, tsk)) goto fail_nomem; if (init_new_context(tsk, mm)) unchanged: --- linux-2.6.22-rc6/fs/exec.c~mem-control-accounting 2007-07-05 13:45:18.000000000 -0700 +++ linux-2.6.22-rc6-balbir/fs/exec.c 2007-07-05 13:45:18.000000000 -0700 @@ -51,6 +51,7 @@ #include #include #include +#include #include #include unchanged: --- linux-2.6.22-rc6/include/linux/page-flags.h~mem-control-accounting 2007-07-05 13:45:18.000000000 -0700 +++ linux-2.6.22-rc6-balbir/include/linux/page-flags.h 2007-07-05 13:45:18.000000000 -0700 @@ -98,6 +98,9 @@ #define PG_checked PG_owner_priv_1 /* Used by some filesystems */ #define PG_pinned PG_owner_priv_1 /* Xen pinned pagetable */ +#define PG_metapage 21 /* Used for checking if a meta_page */ + /* is associated with a page */ + #if (BITS_PER_LONG > 32) /* * 64-bit-only flags build down from bit 31 diff -u linux-2.6.22-rc6-balbir/mm/filemap.c linux-2.6.22-rc6-balbir/mm/filemap.c --- linux-2.6.22-rc6-balbir/mm/filemap.c 2007-07-05 18:26:29.000000000 -0700 +++ linux-2.6.22-rc6-balbir/mm/filemap.c 2007-07-05 20:00:07.000000000 -0700 @@ -31,6 +31,7 @@ #include #include #include /* for BUG_ON(!in_atomic()) only */ +#include #include "internal.h" /* @@ -116,6 +117,7 @@ { struct address_space *mapping = page->mapping; + mem_container_uncharge(page_get_meta_page(page)); radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; mapping->nrpages--; @@ -442,6 +444,11 @@ int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { + + error = mem_container_cache_charge(page, current->mm); + if (error) + goto out; + write_lock_irq(&mapping->tree_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); if (!error) { @@ -455,6 +462,7 @@ write_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); } +out: return error; } EXPORT_SYMBOL(add_to_page_cache); unchanged: --- linux-2.6.22-rc6/mm/memory.c~mem-control-accounting 2007-07-05 13:45:18.000000000 -0700 +++ linux-2.6.22-rc6-balbir/mm/memory.c 2007-07-05 13:45:18.000000000 -0700 @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -1226,6 +1227,10 @@ static int insert_page(struct mm_struct pte_t *pte; spinlock_t *ptl; + retval = mem_container_charge(page, mm); + if (retval) + goto out; + retval = -EINVAL; if (PageAnon(page)) goto out; @@ -1731,6 +1736,9 @@ gotten: cow_user_page(new_page, old_page, address, vma); } + if (mem_container_charge(new_page, mm)) + goto oom; + /* * Re-check the pte - we dropped the lock */ @@ -2188,6 +2196,11 @@ static int do_swap_page(struct mm_struct } delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + if (mem_container_charge(page, mm)) { + ret = VM_FAULT_OOM; + goto out; + } + mark_page_accessed(page); lock_page(page); @@ -2255,6 +2268,7 @@ static int do_anonymous_page(struct mm_s pte_t entry; if (write_access) { + /* Allocate our own private page. 
*/ pte_unmap(page_table); @@ -2264,6 +2278,9 @@ static int do_anonymous_page(struct mm_s if (!page) goto oom; + if (mem_container_charge(page, mm)) + goto oom; + entry = mk_pte(page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -2397,6 +2414,11 @@ static int __do_fault(struct mm_struct * } + if (mem_container_charge(page, mm)) { + fdata.type = VM_FAULT_OOM; + goto out; + } + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); /* unchanged: --- linux-2.6.22-rc6/mm/migrate.c~mem-control-accounting 2007-07-05 13:45:18.000000000 -0700 +++ linux-2.6.22-rc6-balbir/mm/migrate.c 2007-07-05 13:45:18.000000000 -0700 @@ -28,6 +28,7 @@ #include #include #include +#include #include "internal.h" @@ -157,6 +158,11 @@ static void remove_migration_pte(struct return; } + if (mem_container_charge(new, mm)) { + pte_unmap(ptep); + return; + } + ptl = pte_lockptr(mm, pmd); spin_lock(ptl); pte = *ptep; unchanged: --- linux-2.6.22-rc6/mm/page_alloc.c~mem-control-accounting 2007-07-05 13:45:18.000000000 -0700 +++ linux-2.6.22-rc6-balbir/mm/page_alloc.c 2007-07-05 13:45:18.000000000 -0700 @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -1015,6 +1016,7 @@ static void fastcall free_hot_cold_page( if (!PageHighMem(page)) debug_check_no_locks_freed(page_address(page), PAGE_SIZE); + page_assign_meta_page(page, NULL); arch_free_page(page, 0); kernel_map_pages(page, 1, 0); @@ -2576,6 +2578,7 @@ void __meminit memmap_init_zone(unsigned set_page_links(page, zone, nid, pfn); init_page_count(page); reset_page_mapcount(page); + page_assign_meta_page(page, NULL); SetPageReserved(page); /* unchanged: --- linux-2.6.22-rc6/mm/rmap.c~mem-control-accounting 2007-07-05 13:45:18.000000000 -0700 +++ linux-2.6.22-rc6-balbir/mm/rmap.c 2007-07-05 13:45:18.000000000 -0700 @@ -643,6 +643,8 @@ void page_remove_rmap(struct page *page, page_clear_dirty(page); set_page_dirty(page); } + + mem_container_uncharge(page_get_meta_page(page)); __dec_zone_page_state(page, PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); } unchanged: --- linux-2.6.22-rc6/mm/swapfile.c~mem-control-accounting 2007-07-05 13:45:18.000000000 -0700 +++ linux-2.6.22-rc6-balbir/mm/swapfile.c 2007-07-05 13:45:18.000000000 -0700 @@ -506,9 +506,12 @@ unsigned int count_swap_pages(int type, * just let do_wp_page work it out if a write is requested later - to * force COW, vm_page_prot omits write permission from any private vma. */ -static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, +static int unuse_pte(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, swp_entry_t entry, struct page *page) { + if (mem_container_charge(page, vma->vm_mm)) + return -ENOMEM; + inc_mm_counter(vma->vm_mm, anon_rss); get_page(page); set_pte_at(vma->vm_mm, addr, pte, @@ -520,6 +523,7 @@ static void unuse_pte(struct vm_area_str * immediately swapped out again after swapon. */ activate_page(page); + return 1; } static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, @@ -529,7 +533,7 @@ static int unuse_pte_range(struct vm_are pte_t swp_pte = swp_entry_to_pte(entry); pte_t *pte; spinlock_t *ptl; - int found = 0; + int ret = 0; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { @@ -538,13 +542,12 @@ static int unuse_pte_range(struct vm_are * Test inline before going to call unuse_pte. 
*/ if (unlikely(pte_same(*pte, swp_pte))) { - unuse_pte(vma, pte++, addr, entry, page); - found = 1; + ret = unuse_pte(vma, pte++, addr, entry, page); break; } } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(pte - 1, ptl); - return found; + return ret; } static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, @@ -553,14 +556,16 @@ static inline int unuse_pmd_range(struct { pmd_t *pmd; unsigned long next; + int ret; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - if (unuse_pte_range(vma, pmd, addr, next, entry, page)) - return 1; + ret = unuse_pte_range(vma, pmd, addr, next, entry, page); + if (ret) + return ret; } while (pmd++, addr = next, addr != end); return 0; } @@ -571,14 +576,16 @@ static inline int unuse_pud_range(struct { pud_t *pud; unsigned long next; + int ret; pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - if (unuse_pmd_range(vma, pud, addr, next, entry, page)) - return 1; + ret = unuse_pmd_range(vma, pud, addr, next, entry, page); + if (ret) + return ret; } while (pud++, addr = next, addr != end); return 0; } @@ -588,6 +595,7 @@ static int unuse_vma(struct vm_area_stru { pgd_t *pgd; unsigned long addr, end, next; + int ret; if (page->mapping) { addr = page_address_in_vma(page, vma); @@ -605,8 +613,9 @@ static int unuse_vma(struct vm_area_stru next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - if (unuse_pud_range(vma, pgd, addr, next, entry, page)) - return 1; + ret = unuse_pud_range(vma, pgd, addr, next, entry, page); + if (ret) + return ret; } while (pgd++, addr = next, addr != end); return 0; } @@ -615,6 +624,7 @@ static int unuse_mm(struct mm_struct *mm swp_entry_t entry, struct page *page) { struct vm_area_struct *vma; + int ret = 0; if (!down_read_trylock(&mm->mmap_sem)) { /* @@ -627,15 +637,11 @@ static int unuse_mm(struct mm_struct *mm lock_page(page); } for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->anon_vma && unuse_vma(vma, entry, page)) + if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) break; } up_read(&mm->mmap_sem); - /* - * Currently unuse_mm cannot fail, but leave error handling - * at call sites for now, since we change it from time to time. 
- */ - return 0; + return ret; } /* diff -u linux-2.6.22-rc6-balbir/mm/swap_state.c linux-2.6.22-rc6-balbir/mm/swap_state.c --- linux-2.6.22-rc6-balbir/mm/swap_state.c 2007-07-05 18:28:08.000000000 -0700 +++ linux-2.6.22-rc6-balbir/mm/swap_state.c 2007-07-05 20:00:07.000000000 -0700 @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -79,6 +80,11 @@ BUG_ON(PagePrivate(page)); error = radix_tree_preload(gfp_mask); if (!error) { + + error = mem_container_cache_charge(page, current->mm); + if (error) + goto out; + write_lock_irq(&swapper_space.tree_lock); error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); @@ -93,6 +99,7 @@ write_unlock_irq(&swapper_space.tree_lock); radix_tree_preload_end(); } +out: return error; } @@ -129,6 +136,7 @@ BUG_ON(PageWriteback(page)); BUG_ON(PagePrivate(page)); + mem_container_uncharge(page_get_meta_page(page)); radix_tree_delete(&swapper_space.page_tree, page_private(page)); set_page_private(page, 0); ClearPageSwapCache(page); unchanged: --- linux-2.6.22-rc6/include/linux/swap.h~mem-control-lru-and-reclaim 2007-07-05 18:28:12.000000000 -0700 +++ linux-2.6.22-rc6-balbir/include/linux/swap.h 2007-07-05 18:28:12.000000000 -0700 @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -191,6 +192,8 @@ extern void swap_setup(void); /* linux/mm/vmscan.c */ extern unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask); +extern unsigned long try_to_free_mem_container_pages(struct mem_container *mem); +extern int __isolate_lru_page(struct page *page, int mode); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; extern int remove_mapping(struct address_space *mapping, struct page *page); unchanged: --- linux-2.6.22-rc6/mm/swap.c~mem-control-lru-and-reclaim 2007-07-05 18:28:12.000000000 -0700 +++ linux-2.6.22-rc6-balbir/mm/swap.c 2007-07-05 18:28:12.000000000 -0700 @@ -31,6 +31,7 @@ #include #include #include +#include /* How many pages do we try to swap or page in/out together? */ int page_cluster; @@ -148,6 +149,7 @@ void fastcall activate_page(struct page SetPageActive(page); add_page_to_active_list(zone, page); __count_vm_event(PGACTIVATE); + mem_container_move_lists(page_get_meta_page(page), true); } spin_unlock_irq(&zone->lru_lock); } unchanged: --- linux-2.6.22-rc6/mm/vmscan.c~mem-control-lru-and-reclaim 2007-07-05 18:28:12.000000000 -0700 +++ linux-2.6.22-rc6-balbir/mm/vmscan.c 2007-07-05 18:28:12.000000000 -0700 @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -70,6 +71,15 @@ struct scan_control { int all_unreclaimable; int order; + + /* Which container do we reclaim from */ + struct mem_container *mem_container; + + /* Pluggable isolate pages callback */ + unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, + unsigned long *scanned, int order, int mode, + struct zone *z, struct mem_container *mem_cont, + int active); }; #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -604,7 +614,7 @@ keep: * * returns 0 on success, -ve errno on failure. 
*/ -static int __isolate_lru_page(struct page *page, int mode) +int __isolate_lru_page(struct page *page, int mode) { int ret = -EINVAL; @@ -738,6 +748,21 @@ static unsigned long isolate_lru_pages(u return nr_taken; } +static unsigned long isolate_pages_global(unsigned long nr, + struct list_head *dst, + unsigned long *scanned, int order, + int mode, struct zone *z, + struct mem_container *mem_cont, + int active) +{ + if (active) + return isolate_lru_pages(nr, &z->active_list, dst, + scanned, order, mode); + else + return isolate_lru_pages(nr, &z->inactive_list, dst, + scanned, order, mode); +} + /* * clear_active_flags() is a helper for shrink_active_list(), clearing * any active bits from the pages in the list. @@ -779,11 +804,11 @@ static unsigned long shrink_inactive_lis unsigned long nr_freed; unsigned long nr_active; - nr_taken = isolate_lru_pages(sc->swap_cluster_max, - &zone->inactive_list, + nr_taken = sc->isolate_pages(sc->swap_cluster_max, &page_list, &nr_scan, sc->order, (sc->order > PAGE_ALLOC_COSTLY_ORDER)? - ISOLATE_BOTH : ISOLATE_INACTIVE); + ISOLATE_BOTH : ISOLATE_INACTIVE, + zone, sc->mem_container, 0); nr_active = clear_active_flags(&page_list); __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); @@ -932,8 +957,9 @@ force_reclaim_mapped: lru_add_drain(); spin_lock_irq(&zone->lru_lock); - pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, - &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE); + pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, + ISOLATE_ACTIVE, zone, + sc->mem_container, 1); zone->pages_scanned += pgscanned; __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); spin_unlock_irq(&zone->lru_lock); @@ -968,6 +994,7 @@ force_reclaim_mapped: ClearPageActive(page); list_move(&page->lru, &zone->inactive_list); + mem_container_move_lists(page_get_meta_page(page), false); pgmoved++; if (!pagevec_add(&pvec, page)) { __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); @@ -996,6 +1023,7 @@ force_reclaim_mapped: SetPageLRU(page); VM_BUG_ON(!PageActive(page)); list_move(&page->lru, &zone->active_list); + mem_container_move_lists(page_get_meta_page(page), true); pgmoved++; if (!pagevec_add(&pvec, page)) { __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); @@ -1127,7 +1155,8 @@ static unsigned long shrink_zones(int pr * holds filesystem locks which prevent writeout this might not work, and the * allocation attempt will fail. 
*/ -unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) +unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, + struct scan_control *sc) { int priority; int ret = 0; @@ -1136,14 +1165,6 @@ unsigned long try_to_free_pages(struct z struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long lru_pages = 0; int i; - struct scan_control sc = { - .gfp_mask = gfp_mask, - .may_writepage = !laptop_mode, - .swap_cluster_max = SWAP_CLUSTER_MAX, - .may_swap = 1, - .swappiness = vm_swappiness, - .order = order, - }; delay_swap_prefetch(); count_vm_event(ALLOCSTALL); @@ -1159,17 +1180,22 @@ unsigned long try_to_free_pages(struct z } for (priority = DEF_PRIORITY; priority >= 0; priority--) { - sc.nr_scanned = 0; + sc->nr_scanned = 0; if (!priority) disable_swap_token(); - nr_reclaimed += shrink_zones(priority, zones, &sc); - shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); + nr_reclaimed += shrink_zones(priority, zones, sc); + /* + * Don't shrink slabs when reclaiming memory from + * over limit containers + */ + if (sc->mem_container == NULL) + shrink_slab(sc->nr_scanned, gfp_mask, lru_pages); if (reclaim_state) { nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; } - total_scanned += sc.nr_scanned; - if (nr_reclaimed >= sc.swap_cluster_max) { + total_scanned += sc->nr_scanned; + if (nr_reclaimed >= sc->swap_cluster_max) { ret = 1; goto out; } @@ -1181,18 +1207,18 @@ unsigned long try_to_free_pages(struct z * that's undesirable in laptop mode, where we *want* lumpy * writeout. So in laptop mode, write out the whole world. */ - if (total_scanned > sc.swap_cluster_max + - sc.swap_cluster_max / 2) { + if (total_scanned > sc->swap_cluster_max + + sc->swap_cluster_max / 2) { wakeup_pdflush(laptop_mode ? 0 : total_scanned); - sc.may_writepage = 1; + sc->may_writepage = 1; } /* Take a nap, wait for some writeback to complete */ - if (sc.nr_scanned && priority < DEF_PRIORITY - 2) + if (sc->nr_scanned && priority < DEF_PRIORITY - 2) congestion_wait(WRITE, HZ/10); } /* top priority shrink_caches still had more to do? don't OOM, then */ - if (!sc.all_unreclaimable) + if (!sc->all_unreclaimable && sc->mem_container == NULL) ret = 1; out: /* @@ -1215,6 +1241,53 @@ out: return ret; } +unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) +{ + struct scan_control sc = { + .gfp_mask = gfp_mask, + .may_writepage = !laptop_mode, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .may_swap = 1, + .swappiness = vm_swappiness, + .order = order, + .mem_container = NULL, + .isolate_pages = isolate_pages_global, + }; + + return do_try_to_free_pages(zones, gfp_mask, &sc); +} + +#ifdef CONFIG_CONTAINER_MEM_CONT + +#ifdef CONFIG_HIGHMEM +#define ZONE_USERPAGES ZONE_HIGHMEM +#else +#define ZONE_USERPAGES ZONE_NORMAL +#endif + +unsigned long try_to_free_mem_container_pages(struct mem_container *mem_cont) +{ + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .may_swap = 1, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .swappiness = vm_swappiness, + .order = 1, + .mem_container = mem_cont, + .isolate_pages = mem_container_isolate_pages, + }; + int node; + struct zone **zones; + + for_each_online_node(node) { + zones = NODE_DATA(node)->node_zonelists[ZONE_USERPAGES].zones; + if (do_try_to_free_pages(zones, sc.gfp_mask, &sc)) + return 1; + } + return 0; +} +#endif + /* * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at pages_high. 
@@ -1250,6 +1323,8 @@ static unsigned long balance_pgdat(pg_da .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = vm_swappiness, .order = order, + .mem_container = NULL, + .isolate_pages = isolate_pages_global, }; /* * temp_priority is used to remember the scanning priority at which unchanged: --- linux-2.6.22-rc6/mm/oom_kill.c~mem-control-out-of-memory 2007-07-05 18:49:35.000000000 -0700 +++ linux-2.6.22-rc6-balbir/mm/oom_kill.c 2007-07-05 18:49:35.000000000 -0700 @@ -24,6 +24,7 @@ #include #include #include +#include int sysctl_panic_on_oom; /* #define DEBUG */ @@ -47,7 +48,8 @@ int sysctl_panic_on_oom; * of least surprise ... (be careful when you change it) */ -unsigned long badness(struct task_struct *p, unsigned long uptime) +unsigned long badness(struct task_struct *p, unsigned long uptime, + struct mem_container *mem) { unsigned long points, cpu_time, run_time, s; struct mm_struct *mm; @@ -60,6 +62,13 @@ unsigned long badness(struct task_struct return 0; } +#ifdef CONFIG_CONTAINER_MEM_CONT + if (mem != NULL && mm->mem_container != mem) { + task_unlock(p); + return 0; + } +#endif + /* * The memory size of the process is the basis for the badness. */ @@ -204,7 +213,8 @@ static inline int constrained_alloc(stru * * (not docbooked, we don't want this one cluttering up the manual) */ -static struct task_struct *select_bad_process(unsigned long *ppoints) +static struct task_struct *select_bad_process(unsigned long *ppoints, + struct mem_container *mem) { struct task_struct *g, *p; struct task_struct *chosen = NULL; @@ -258,7 +268,7 @@ static struct task_struct *select_bad_pr if (p->oomkilladj == OOM_DISABLE) continue; - points = badness(p, uptime.tv_sec); + points = badness(p, uptime.tv_sec, mem); if (points > *ppoints || !chosen) { chosen = p; *ppoints = points; @@ -372,6 +382,30 @@ static int oom_kill_process(struct task_ return oom_kill_task(p); } +#ifdef CONFIG_CONTAINER_MEM_CONT +void mem_container_out_of_memory(struct mem_container *mem) +{ + unsigned long points = 0; + struct task_struct *p; + + container_lock(); + rcu_read_lock(); +retry: + p = select_bad_process(&points, mem); + if (PTR_ERR(p) == -1UL) + goto out; + + if (!p) + p = current; + + if (oom_kill_process(p, points, "Memory container out of memory")) + goto retry; +out: + rcu_read_unlock(); + container_unlock(); +} +#endif + static BLOCKING_NOTIFIER_HEAD(oom_notify_list); int register_oom_notifier(struct notifier_block *nb) @@ -444,7 +478,7 @@ retry: * Rambo mode: Shoot down a process and hope it solves whatever * issues we may have. */ - p = select_bad_process(&points); + p = select_bad_process(&points, NULL); if (PTR_ERR(p) == -1UL) goto out;
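To round this off, here is an illustrative user-space snippet driving the control files added by mem_container_populate() above. The "/containers/box0" path is an assumption (it depends on where the container filesystem is mounted and what the group is called); the file names, the page-sized unit, and the write format come from the patch itself. Note that res_counter_write() rejects trailing characters, so the limit is written without a newline.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[64];
	ssize_t n;
	int fd;

	/* cap the container at 4096 pages; no trailing '\n' in the value */
	fd = open("/containers/box0/mem_limit", O_WRONLY);
	if (fd < 0)
		return 1;
	if (write(fd, "4096", strlen("4096")) < 0)
		perror("write mem_limit");
	close(fd);

	/* read back the current usage, reported in pages */
	fd = open("/containers/box0/mem_usage", O_RDONLY);
	if (fd < 0)
		return 1;
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("usage: %s", buf);
	}
	close(fd);
	return 0;
}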