linux-kernel - [PATCH 7/7] blkio-cgroup: Fast page tracking

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20090428.214740.183024301.ryov@valinux.co.jp>
Date:	Tue, 28 Apr 2009 21:47:40 +0900 (JST)
From:	Ryo Tsuruta <ryov@...inux.co.jp>
To:	linux-kernel@...r.kernel.org, dm-devel@...hat.com,
	containers@...ts.linux-foundation.org,
	virtualization@...ts.linux-foundation.org,
	xen-devel@...ts.xensource.com
Subject: [PATCH 7/7] blkio-cgroup: Fast page tracking

This is an extra patch which reduces the overhead of IO tracking but
increases the size of struct page_cgroup.

Based on 2.6.30-rc3-git3
Signed-off-by: Hirokazu Takahashi <taka@...inux.co.jp>
Signed-off-by: Ryo Tsuruta <ryov@...inux.co.jp>

---
 include/linux/biotrack.h    |    5 -
 include/linux/page_cgroup.h |   26 --------
 mm/biotrack.c               |  134 ++++++++++++++++++++++++++------------------
 3 files changed, 87 insertions(+), 78 deletions(-)

Index: linux-2.6.30-rc3-git3/mm/biotrack.c
===================================================================
--- linux-2.6.30-rc3-git3.orig/mm/biotrack.c
+++ linux-2.6.30-rc3-git3/mm/biotrack.c
@@ -3,9 +3,6 @@
  * Copyright (C) VA Linux Systems Japan, 2008-2009
  * Developed by Hirokazu Takahashi <taka@...inux.co.jp>
  *
- * Copyright (C) 2008 Andrea Righi <righi.andrea@...il.com>
- * Use part of page_cgroup->flags to store blkio-cgroup ID.
- *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -20,6 +17,7 @@
 #include <linux/module.h>
 #include <linux/smp.h>
 #include <linux/bit_spinlock.h>
+#include <linux/idr.h>
 #include <linux/blkdev.h>
 #include <linux/biotrack.h>
 #include <linux/mm_inline.h>
@@ -45,8 +43,11 @@ static inline struct blkio_cgroup *blkio
 					struct blkio_cgroup, css);
 }
 
+static struct idr blkio_cgroup_id;
+static DEFINE_SPINLOCK(blkio_cgroup_idr_lock);
 static struct io_context default_blkio_io_context;
 static struct blkio_cgroup default_blkio_cgroup = {
+	.id		= 0,
 	.io_context	= &default_blkio_io_context,
 };
 
@@ -61,7 +62,6 @@ void blkio_cgroup_set_owner(struct page 
 {
 	struct blkio_cgroup *biog;
 	struct page_cgroup *pc;
-	unsigned long id;
 
 	if (blkio_cgroup_disabled())
 		return;
@@ -69,29 +69,27 @@ void blkio_cgroup_set_owner(struct page 
 	if (unlikely(!pc))
 		return;
 
-	lock_page_cgroup(pc);
-	page_cgroup_set_id(pc, 0);	/* 0: default blkio_cgroup id */
-	unlock_page_cgroup(pc);
+	pc->blkio_cgroup_id = 0;	/* 0: default blkio_cgroup id */
 	if (!mm)
 		return;
 
+	/*
+	 * Locking "pc" isn't necessary here since the current process is
+	 * the only one that can access the members related to blkio_cgroup.
+	 */
 	rcu_read_lock();
 	biog = blkio_cgroup_from_task(rcu_dereference(mm->owner));
-	if (unlikely(!biog)) {
-		rcu_read_unlock();
-		return;
-	}
+	if (unlikely(!biog))
+		goto out;
 	/*
 	 * css_get(&bio->css) isn't called to increment the reference
 	 * count of this blkio_cgroup "biog" so the css_id might turn
 	 * invalid even if this page is still active.
 	 * This approach is chosen to minimize the overhead.
 	 */
-	id = css_id(&biog->css);
+	pc->blkio_cgroup_id = biog->id;
+out:
 	rcu_read_unlock();
-	lock_page_cgroup(pc);
-	page_cgroup_set_id(pc, id);
-	unlock_page_cgroup(pc);
 }
 
 /**
@@ -103,6 +101,13 @@ void blkio_cgroup_set_owner(struct page 
  */
 void blkio_cgroup_reset_owner(struct page *page, struct mm_struct *mm)
 {
+	/*
+	 * A little trick:
+	 * Just call blkio_cgroup_set_owner() for pages which are already
+	 * active since the blkio_cgroup_id member of page_cgroup can be
+	 * updated without any locks. This is because an integer-typed
+	 * variable can be written in a single atomic store on modern CPUs.
+	 */
 	blkio_cgroup_set_owner(page, mm);
 }
 
@@ -133,7 +138,6 @@ void blkio_cgroup_reset_owner_pagedirty(
 void blkio_cgroup_copy_owner(struct page *npage, struct page *opage)
 {
 	struct page_cgroup *npc, *opc;
-	unsigned long id;
 
 	if (blkio_cgroup_disabled())
 		return;
@@ -144,12 +148,11 @@ void blkio_cgroup_copy_owner(struct page
 	if (unlikely(!opc))
 		return;
 
-	lock_page_cgroup(opc);
-	lock_page_cgroup(npc);
-	id = page_cgroup_get_id(opc);
-	page_cgroup_set_id(npc, id);
-	unlock_page_cgroup(npc);
-	unlock_page_cgroup(opc);
+	/*
+	 * Do this without any locks. The reason is the same as
+	 * blkio_cgroup_reset_owner().
+	 */
+	npc->blkio_cgroup_id = opc->blkio_cgroup_id;
 }
 
 /* Create a new blkio-cgroup. */
@@ -158,25 +161,44 @@ blkio_cgroup_create(struct cgroup_subsys
 {
 	struct blkio_cgroup *biog;
 	struct io_context *ioc;
+	int ret;
 
 	if (!cgrp->parent) {
 		biog = &default_blkio_cgroup;
 		init_io_context(biog->io_context);
 		/* Increment the referrence count not to be released ever. */
 		atomic_inc(&biog->io_context->refcount);
+		idr_init(&blkio_cgroup_id);
 		return &biog->css;
 	}
 
 	biog = kzalloc(sizeof(*biog), GFP_KERNEL);
-	if (!biog)
-		return ERR_PTR(-ENOMEM);
 	ioc = alloc_io_context(GFP_KERNEL, -1);
-	if (!ioc) {
-		kfree(biog);
-		return ERR_PTR(-ENOMEM);
+	if (!ioc || !biog) {
+		ret = -ENOMEM;
+		goto out_err;
 	}
 	biog->io_context = ioc;
+retry:
+	if (!idr_pre_get(&blkio_cgroup_id, GFP_KERNEL)) {
+		ret = -EAGAIN;
+		goto out_err;
+	}
+	spin_lock_irq(&blkio_cgroup_idr_lock);
+	ret = idr_get_new_above(&blkio_cgroup_id, (void *)biog, 1, &biog->id);
+	spin_unlock_irq(&blkio_cgroup_idr_lock);
+	if (ret == -EAGAIN)
+		goto retry;
+	else if (ret)
+		goto out_err;
+
 	return &biog->css;
+out_err:
+	if (biog)
+		kfree(biog);
+	if (ioc)
+		put_io_context(ioc);
+	return ERR_PTR(ret);
 }
 
 /* Delete the blkio-cgroup. */
@@ -185,10 +207,28 @@ static void blkio_cgroup_destroy(struct 
 	struct blkio_cgroup *biog = cgroup_blkio(cgrp);
 
 	put_io_context(biog->io_context);
-	free_css_id(&blkio_cgroup_subsys, &biog->css);
+
+	spin_lock_irq(&blkio_cgroup_idr_lock);
+	idr_remove(&blkio_cgroup_id, biog->id);
+	spin_unlock_irq(&blkio_cgroup_idr_lock);
+
 	kfree(biog);
 }
 
+static struct blkio_cgroup *find_blkio_cgroup(int id)
+{
+	struct blkio_cgroup *biog;
+	spin_lock_irq(&blkio_cgroup_idr_lock);
+	/*
+	 * The lookup may fail to find a blkio-cgroup associated with "id",
+	 * since a blkio-cgroup may be removed even while some of the I/O
+	 * requests it issued have not completed yet.
+	 */
+	biog = (struct blkio_cgroup *)idr_find(&blkio_cgroup_id, id);
+	spin_unlock_irq(&blkio_cgroup_idr_lock);
+	return biog;
+}
+
 /**
  * get_blkio_cgroup_id() - determine the blkio-cgroup ID
  * @bio:	the &struct bio which describes the I/O
@@ -200,14 +240,11 @@ unsigned long get_blkio_cgroup_id(struct
 {
 	struct page_cgroup *pc;
 	struct page *page = bio_iovec_idx(bio, 0)->bv_page;
-	unsigned long id = 0;
+	int	id = 0;
 
 	pc = lookup_page_cgroup(page);
-	if (pc) {
-		lock_page_cgroup(pc);
-		id = page_cgroup_get_id(pc);
-		unlock_page_cgroup(pc);
-	}
+	if (pc)
+		id = pc->blkio_cgroup_id;
 	return id;
 }
 
@@ -219,21 +256,17 @@ unsigned long get_blkio_cgroup_id(struct
  */
 struct io_context *get_blkio_cgroup_iocontext(struct bio *bio)
 {
-	struct cgroup_subsys_state *css;
-	struct blkio_cgroup *biog;
+	struct blkio_cgroup *biog = NULL;
 	struct io_context *ioc;
-	unsigned long id;
+	int	id = 0;
 
 	id = get_blkio_cgroup_id(bio);
-	rcu_read_lock();
-	css = css_lookup(&blkio_cgroup_subsys, id);
-	if (css)
-		biog = container_of(css, struct blkio_cgroup, css);
-	else
+	if (id)
+		biog = find_blkio_cgroup(id);
+	if (!biog)
 		biog = &default_blkio_cgroup;
 	ioc = biog->io_context;	/* default io_context for this cgroup */
 	atomic_inc(&ioc->refcount);
-	rcu_read_unlock();
 	return ioc;
 }
 
@@ -249,17 +282,15 @@ struct io_context *get_blkio_cgroup_ioco
  */
 struct cgroup *blkio_cgroup_lookup(int id)
 {
-	struct cgroup *cgrp;
-	struct cgroup_subsys_state *css;
+	struct blkio_cgroup *biog = NULL;
 
 	if (blkio_cgroup_disabled())
 		return NULL;
-
-	css = css_lookup(&blkio_cgroup_subsys, id);
-	if (!css)
+	if (id)
+		biog = find_blkio_cgroup(id);
+	if (!biog)
 		return NULL;
-	cgrp = css->cgroup;
-	return cgrp;
+	return biog->css.cgroup;
 }
 EXPORT_SYMBOL(get_blkio_cgroup_iocontext);
 EXPORT_SYMBOL(get_blkio_cgroup_id);
@@ -273,7 +304,7 @@ static u64 blkio_id_read(struct cgroup *
 	rcu_read_lock();
 	id = css_id(&biog->css);
 	rcu_read_unlock();
-	return (u64)id;
+	return (u64) biog->id;
 }
 
 
@@ -296,5 +327,4 @@ struct cgroup_subsys blkio_cgroup_subsys
 	.destroy	= blkio_cgroup_destroy,
 	.populate	= blkio_cgroup_populate,
 	.subsys_id	= blkio_cgroup_subsys_id,
-	.use_id		= 1,
 };
Index: linux-2.6.30-rc3-git3/include/linux/biotrack.h
===================================================================
--- linux-2.6.30-rc3-git3.orig/include/linux/biotrack.h
+++ linux-2.6.30-rc3-git3/include/linux/biotrack.h
@@ -12,6 +12,7 @@ struct block_device;
 
 struct blkio_cgroup {
 	struct cgroup_subsys_state css;
+	int id;
 	struct io_context *io_context;	/* default io_context */
 /*	struct radix_tree_root io_context_root; per device io_context */
 };
@@ -24,9 +25,7 @@ struct blkio_cgroup {
  */
 static inline void __init_blkio_page_cgroup(struct page_cgroup *pc)
 {
-	lock_page_cgroup(pc);
-	page_cgroup_set_id(pc, 0);
-	unlock_page_cgroup(pc);
+	pc->blkio_cgroup_id = 0;
 }
 
 /**
Index: linux-2.6.30-rc3-git3/include/linux/page_cgroup.h
===================================================================
--- linux-2.6.30-rc3-git3.orig/include/linux/page_cgroup.h
+++ linux-2.6.30-rc3-git3/include/linux/page_cgroup.h
@@ -17,6 +17,9 @@ struct page_cgroup {
 	struct mem_cgroup *mem_cgroup;
 	struct list_head lru;		/* per cgroup LRU list */
 #endif
+#ifdef CONFIG_CGROUP_BLKIO
+	int blkio_cgroup_id;
+#endif
 };
 
 void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat);
@@ -124,27 +127,4 @@ static inline void swap_cgroup_swapoff(i
 }
 
 #endif
-
-#ifdef CONFIG_CGROUP_BLKIO
-/*
- * use lower 16 bits for flags and reserve the rest for the page tracking id
- */
-#define PCG_TRACKING_ID_SHIFT	(16)
-#define PCG_TRACKING_ID_BITS \
-	(8 * sizeof(unsigned long) - PCG_TRACKING_ID_SHIFT)
-
-/* NOTE: must be called with page_cgroup() held */
-static inline unsigned long page_cgroup_get_id(struct page_cgroup *pc)
-{
-	return pc->flags >> PCG_TRACKING_ID_SHIFT;
-}
-
-/* NOTE: must be called with page_cgroup() held */
-static inline void page_cgroup_set_id(struct page_cgroup *pc, unsigned long id)
-{
-	WARN_ON(id >= (1UL << PCG_TRACKING_ID_BITS));
-	pc->flags &= (1UL << PCG_TRACKING_ID_SHIFT) - 1;
-	pc->flags |= (unsigned long)(id << PCG_TRACKING_ID_SHIFT);
-}
-#endif
 #endif
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/