Message-Id: <20090312095720.0dc397dc.kamezawa.hiroyu@jp.fujitsu.com>
Date: Thu, 12 Mar 2009 09:57:20 +0900
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
Cc: "linux-mm@...ck.org" <linux-mm@...ck.org>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
"balbir@...ux.vnet.ibm.com" <balbir@...ux.vnet.ibm.com>,
"nishimura@....nes.nec.co.jp" <nishimura@....nes.nec.co.jp>,
"kosaki.motohiro@...fujitsu.com" <kosaki.motohiro@...fujitsu.com>
Subject: [RFC][PATCH 3/5] memcg per zone softlimit scheduler core
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
This patch implements a per-zone queue for softlimit handling and adds some
members to struct mem_cgroup.
(This patch adds softlimit_priority, but the interface to modify it is in
another patch.)
There are the following requirements for implementing softlimit:
- The whole usage of a memcg has to be checked against its softlimit.
- Hierarchy should be handled.
- Per-zone usage must be known to select a cgroup as a victim.
- Behavior should remain predictable for users.
- Too much scanning and global locks should be avoided.
Considering the above, this patch's softlimit handling concept is:
- Handle softlimit with a priority queue.
- Use per-zone priority queues.
- The victim selection algorithm is static-priority round robin.
- Prepare two queues, an Active queue and an Inactive queue.
  If an entry on the Active queue doesn't meet the condition for softlimit
  reclaim, it's moved to the Inactive queue.
- When reschedule_all() is called, the Inactive queues are merged back into
  the Active queues so that all entries are checked again.
For easier review, the user interface etc. is in other patches. A compact
sketch of the per-zone selection loop follows below.
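To illustrate the concept (not part of the patch itself): a simplified sketch
of the per-zone victim selection loop, with locking and css refcounting
omitted. active[]/inactive[] stand in for sqz->queue[SLQ_ACTIVE]/[SLQ_INACTIVE]
and over_softlimit() is a placeholder for the usage-vs-softlimit check; see
mem_cgroup_schedule() in the patch for the real code.

	/* Walk priorities from the lowest number (lowest priority) upward
	 * and check the head of each ACTIVE list; at most one entry per
	 * priority level is examined per call. */
	for (prio = 0; prio < SOFTLIMIT_MAXPRI; prio++) {
		if (list_empty(&active[prio]))
			continue;
		mz = list_first_entry(&active[prio],
				      struct mem_cgroup_per_zone, sl_queue);
		list_del_init(&mz->sl_queue);
		if (over_softlimit(mz))
			return mz;	/* this mz becomes sqz->victim */
		/* not a candidate now: park it until reschedule_all() */
		list_add_tail(&mz->sl_queue, &inactive[prio]);
	}
	return NULL;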
Changelog v2->v3:
- removed the global rwsem.
- renamed some definitions.
- fixed a problem in the memory-cgroup-disabled case.
- rewrote almost all comments.
- removed sl_state from the per-zone struct; added queue->victim.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
---
include/linux/memcontrol.h | 20 +++
mm/memcontrol.c | 232 ++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 250 insertions(+), 2 deletions(-)
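For context (also not part of this patch): the expected call pattern from a
per-node reclaim path, e.g. kswapd as wired up in a later patch of this
series, would look roughly like the sketch below. shrink_zone_memcg() is a
hypothetical per-memcg reclaim helper used only for illustration.

	static void softlimit_reclaim_zone(int nid, int zid, struct zone *zone)
	{
		struct mem_cgroup *mem;
		unsigned long nr_reclaimed;

		/* Pick the next softlimit victim for this zone, if any. */
		mem = mem_cgroup_schedule(nid, zid);
		if (!mem)
			return;

		nr_reclaimed = shrink_zone_memcg(zone, mem); /* hypothetical */

		/*
		 * Requeue the victim: hint == true keeps it on the ACTIVE
		 * queue (still worth scanning), false parks it on INACTIVE.
		 */
		mem_cgroup_schedule_end(nid, zid, mem, nr_reclaimed > 0);
	}

mem_cgroup_reschedule_all(nid) would then be called periodically (e.g. after a
node is balanced) to splice the INACTIVE queues back and rescan everything.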
Index: mmotm-2.6.29-Mar10/mm/memcontrol.c
===================================================================
--- mmotm-2.6.29-Mar10.orig/mm/memcontrol.c
+++ mmotm-2.6.29-Mar10/mm/memcontrol.c
@@ -116,6 +116,9 @@ struct mem_cgroup_per_zone {
unsigned long count[NR_LRU_LISTS];
struct zone_reclaim_stat reclaim_stat;
+ /* For softlimit per-zone queue. See softlimit handling code. */
+ struct mem_cgroup *mem;
+ struct list_head sl_queue;
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
@@ -175,7 +178,11 @@ struct mem_cgroup {
atomic_t refcnt;
unsigned int swappiness;
-
+ /*
+ * softlimit priority of this memcg (0 ... SOFTLIMIT_MAXPRI; MAXPRI means never a victim).
+ */
+ int softlimit_priority;
+ struct mutex softlimit_mutex;
/*
* statistics. This must be placed at the end of memcg.
*/
@@ -1916,6 +1923,221 @@ int mem_cgroup_force_empty_write(struct
return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
}
+/*
+ * SoftLimit
+ */
+/*
+ * Priority of softlimit. This is a scheduling parameter for the softlimit
+ * victim selection logic. A lower number means a lower priority (picked as a
+ * victim earlier). A cgroup at the maximum priority is never a victim.
+ */
+#define SOFTLIMIT_MAXPRI (8)
+
+/* Names of the softlimit queues */
+enum {
+ SLQ_ACTIVE, /* queue for candidates for softlimit victim */
+ SLQ_INACTIVE, /* queue for non-candidates for softlimit victim */
+ SLQ_NUM,
+};
+/*
+ * mem_cgroup_per_zone entries are enqueued on this queue (via sl_queue).
+ * An mz can be in one of the following 4 states:
+ *   softlimitq_zone->victim == mz (selected by kswapd), or
+ *   on the ACTIVE queue (a candidate for victim), or
+ *   on the INACTIVE queue (not a candidate, but priority is not the maximum), or
+ *   out of queue (has the maximum priority or is in some transition state).
+ */
+struct softlimitq_zone {
+ spinlock_t lock;
+ struct mem_cgroup_per_zone *victim;
+ struct list_head queue[SLQ_NUM][SOFTLIMIT_MAXPRI];
+};
+
+struct softlimitq_node {
+ struct softlimitq_zone zone[MAX_NR_ZONES];
+};
+
+static struct softlimitq_node *softlimitq[MAX_NUMNODES];
+
+/* Return queue head for zone */
+static inline struct softlimitq_zone *softlimit_queue(int nid, int zid)
+{
+ return &softlimitq[nid]->zone[zid];
+}
+
+static void __init softlimitq_init(void)
+{
+ struct softlimitq_node *sqn;
+ struct softlimitq_zone *sqz;
+ int nid, zid, i;
+
+ for_each_node_state(nid, N_POSSIBLE) {
+ int tmp = nid;
+
+ if (!node_state(tmp, N_NORMAL_MEMORY))
+ tmp = -1;
+ sqn = kmalloc_node(sizeof(*sqn), GFP_KERNEL, tmp);
+ BUG_ON(!sqn);
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ sqz = &sqn->zone[zid];
+ spin_lock_init(&sqz->lock);
+ sqz->victim = NULL;
+ for (i = 0; i < SOFTLIMIT_MAXPRI; i++) {
+ INIT_LIST_HEAD(&sqz->queue[SLQ_ACTIVE][i]);
+ INIT_LIST_HEAD(&sqz->queue[SLQ_INACTIVE][i]);
+ }
+ }
+ softlimitq[nid] = sqn;
+ }
+}
+
+/*
+ * Add (or remove) all mz of a mem_cgroup to/from the queue. Open-coded to
+ * handle a racy corner case. Called from the softlimit_priority user interface.
+ */
+static void memcg_softlimit_requeue(struct mem_cgroup *mem, int prio)
+{
+ int nid, zid;
+
+ /*
+ * This mutex serializes multiple writers to the softlimit file...
+ * pessimistic, but necessary for sanity.
+ */
+ mutex_lock(&mem->softlimit_mutex);
+ mem->softlimit_priority = prio;
+
+ for_each_node_state(nid, N_POSSIBLE) {
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ struct softlimitq_zone *sqz;
+ struct mem_cgroup_per_zone *mz;
+
+ sqz = softlimit_queue(nid, zid);
+ mz = mem_cgroup_zoneinfo(mem, nid, zid);
+ spin_lock(&sqz->lock);
+ /* If now grabbed by kswapd(), nothing to do */
+ if (sqz->victim != mz) {
+ list_del_init(&mz->sl_queue);
+ if (prio < SOFTLIMIT_MAXPRI)
+ list_add_tail(&mz->sl_queue,
+ &sqz->queue[SLQ_ACTIVE][prio]);
+ }
+ spin_unlock(&sqz->lock);
+ }
+ }
+ mutex_unlock(&mem->softlimit_mutex);
+}
+
+/*
+ * Splice the inactive lists back onto the active lists to restart
+ * scheduling with refreshed queue information.
+ */
+static void __softlimit_join_queue(int nid, int zid)
+{
+ struct softlimitq_zone *sqz = softlimit_queue(nid, zid);
+ int i;
+
+ spin_lock(&sqz->lock);
+ for (i = 0; i < SOFTLIMIT_MAXPRI; i++)
+ list_splice_tail_init(&sqz->queue[SLQ_INACTIVE][i],
+ &sqz->queue[SLQ_ACTIVE][i]);
+ spin_unlock(&sqz->lock);
+}
+
+/* Return the number of evictable pages in the zone */
+static long mz_evictable_usage(struct mem_cgroup_per_zone *mz)
+{
+ long usage = 0;
+
+ if (nr_swap_pages) {
+ usage += MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_ANON);
+ usage += MEM_CGROUP_ZSTAT(mz, LRU_INACTIVE_ANON);
+ }
+ usage += MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_FILE);
+ usage += MEM_CGROUP_ZSTAT(mz, LRU_INACTIVE_FILE);
+
+ return usage;
+}
+
+struct mem_cgroup *mem_cgroup_schedule(int nid, int zid)
+{
+ struct softlimitq_zone *sqz;
+ struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup *mem, *ret;
+ int prio;
+
+ if (mem_cgroup_disabled())
+ return NULL;
+ sqz = softlimit_queue(nid, zid);
+ ret = NULL;
+ spin_lock(&sqz->lock);
+ for (prio = 0; prio < SOFTLIMIT_MAXPRI; prio++) {
+ if (list_empty(&sqz->queue[SLQ_ACTIVE][prio]))
+ continue;
+ mz = list_first_entry(&sqz->queue[SLQ_ACTIVE][prio],
+ struct mem_cgroup_per_zone, sl_queue);
+ list_del_init(&mz->sl_queue);
+ /*
+ * The victim is selected if
+ * 1. it has evictable memory in this zone,
+ * 2. its usage is above the softlimit, and
+ * 3. it's not obsolete (css_tryget() succeeds).
+ */
+ if (mz_evictable_usage(mz)) {
+ mem = mz->mem;
+ if (!res_counter_check_under_softlimit(&mem->res)
+ && css_tryget(&mem->css)) {
+ sqz->victim = mz;
+ ret = mem;
+ break;
+ }
+ }
+ /* Not a candidate now: enqueue it on the INACTIVE list */
+ list_add_tail(&mz->sl_queue, &sqz->queue[SLQ_INACTIVE][prio]);
+ }
+ spin_unlock(&sqz->lock);
+ return ret;
+}
+
+/* requeue selected victim */
+void
+mem_cgroup_schedule_end(int nid, int zid, struct mem_cgroup *mem, bool hint)
+{
+ struct mem_cgroup_per_zone *mz;
+ struct softlimitq_zone *sqz;
+ long usage;
+ int prio;
+
+ if (!mem)
+ return;
+
+ sqz = softlimit_queue(nid, zid);
+ mz = mem_cgroup_zoneinfo(mem, nid, zid);
+ spin_lock(&sqz->lock);
+ /* clear victim information */
+ sqz->victim = NULL;
+ prio = mem->softlimit_priority;
+ /* the priority may have changed while this mz was the victim */
+ if (prio == SOFTLIMIT_MAXPRI)
+ goto out;
+
+ usage = mz_evictable_usage(mz);
+ /* worth being requeued as a candidate ? */
+ if (hint && usage)
+ list_add_tail(&mz->sl_queue, &sqz->queue[SLQ_ACTIVE][prio]);
+ else
+ list_add_tail(&mz->sl_queue, &sqz->queue[SLQ_INACTIVE][prio]);
+out:
+ spin_unlock(&sqz->lock);
+ css_put(&mem->css);
+}
+
+void mem_cgroup_reschedule_all(int nid)
+{
+ int zid;
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++)
+ __softlimit_join_queue(nid, zid);
+}
static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
{
@@ -2356,6 +2578,8 @@ static int alloc_mem_cgroup_per_zone_inf
mz = &pn->zoneinfo[zone];
for_each_lru(l)
INIT_LIST_HEAD(&mz->lists[l]);
+ INIT_LIST_HEAD(&mz->sl_queue);
+ mz->mem = mem;
}
return 0;
}
@@ -2466,6 +2690,7 @@ mem_cgroup_create(struct cgroup_subsys *
/* root ? */
if (cont->parent == NULL) {
enable_swap_cgroup();
+ softlimitq_init();
parent = NULL;
} else {
parent = mem_cgroup_from_cont(cont->parent);
@@ -2487,6 +2712,8 @@ mem_cgroup_create(struct cgroup_subsys *
res_counter_init(&mem->memsw, NULL);
}
mem->last_scanned_child = 0;
+ mem->softlimit_priority = SOFTLIMIT_MAXPRI;
+ mutex_init(&mem->softlimit_mutex);
spin_lock_init(&mem->reclaim_param_lock);
if (parent)
@@ -2510,7 +2737,8 @@ static void mem_cgroup_destroy(struct cg
struct cgroup *cont)
{
struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
-
+ /* Calling this with MAXPRI removes all of this memcg's mz->sl_queue entries */
+ memcg_softlimit_requeue(mem, SOFTLIMIT_MAXPRI);
mem_cgroup_put(mem);
}
Index: mmotm-2.6.29-Mar10/include/linux/memcontrol.h
===================================================================
--- mmotm-2.6.29-Mar10.orig/include/linux/memcontrol.h
+++ mmotm-2.6.29-Mar10/include/linux/memcontrol.h
@@ -117,6 +117,12 @@ static inline bool mem_cgroup_disabled(v
extern bool mem_cgroup_oom_called(struct task_struct *task);
+/* softlimit */
+struct mem_cgroup *mem_cgroup_schedule(int nid, int zid);
+void mem_cgroup_schedule_end(int nid, int zid,
+ struct mem_cgroup *mem, bool hint);
+void mem_cgroup_reschedule_all(int nid);
+
#else /* CONFIG_CGROUP_MEM_RES_CTLR */
struct mem_cgroup;
@@ -264,6 +270,20 @@ mem_cgroup_print_oom_info(struct mem_cgr
{
}
+static inline struct mem_cgroup *mem_cgroup_schedule(int nid, int zid)
+{
+ return NULL;
+}
+
+static inline void mem_cgroup_schedule_end(int nid, int zid,
+ struct mem_cgroup *mem, bool hint)
+{
+}
+
+static inline void mem_cgroup_reschedule_all(int nid)
+{
+}
+
#endif /* CONFIG_CGROUP_MEM_CONT */
#endif /* _LINUX_MEMCONTROL_H */
--