linux-kernel - [PATCH RT v2] kernel/res_counter.c: Change lock of struct res_counter to raw_spinlock

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Fri, 30 Jan 2015 11:59:49 -0700
From:	Thavatchai Makphaibulchoke <tmac@...com>
To:	rostedt@...dmis.org
Cc:	linux-kernel@...r.kernel.org, mingo@...hat.com, tglx@...utronix.de,
	linux-rt-users@...r.kernel.org,
	Thavatchai Makphaibulchoke <tmac@...com>
Subject: [PATCH RT v2] kernel/res_counter.c: Change lock of struct res_counter to raw_spinlock_t

Since memory cgroups can be called from a page fault handler as shown
by the stack dump here,

[12679.513255] BUG: scheduling while atomic: ssh/10621/0x00000002
[12679.513305] Preemption disabled at:[<ffffffff811a20f7>] mem_cgroup_charge_common+0x37/0x60
[12679.513305]
[12679.513322] Call Trace:
[12679.513331]  [<ffffffff81512f62>] dump_stack+0x4f/0x7c
[12679.513333]  [<ffffffff8150f4f1>] __schedule_bug+0x9f/0xad
[12679.513338]  [<ffffffff815155f3>] __schedule+0x653/0x720
[12679.513340]  [<ffffffff815180ce>] ? _raw_spin_unlock_irqrestore+0x2e/0x70
[12679.513343]  [<ffffffff81515784>] schedule+0x34/0xa0
[12679.513345]  [<ffffffff81516fdb>] rt_spin_lock_slowlock+0x10b/0x250
[12679.513348]  [<ffffffff815183a5>] rt_spin_lock+0x35/0x40
[12679.513352]  [<ffffffff810ec1d9>] res_counter_uncharge_until+0x69/0xb0
[12679.513354]  [<ffffffff810ec233>] res_counter_uncharge+0x13/0x20
[12679.513358]  [<ffffffff8119c0be>] drain_stock.isra.38+0x5e/0x90
[12679.513360]  [<ffffffff811a16a2>] __mem_cgroup_try_charge+0x3f2/0x8a0
[12679.513363]  [<ffffffff811a20f7>] mem_cgroup_charge_common+0x37/0x60
[12679.513365]  [<ffffffff811a3b06>] mem_cgroup_newpage_charge+0x26/0x30
[12679.513369]  [<ffffffff8116c8d2>] handle_mm_fault+0x9b2/0xdb0
[12679.513374]  [<ffffffff81400474>] ? sock_aio_read.part.11+0x104/0x130
[12679.513379]  [<ffffffff8151c072>] __do_page_fault+0x182/0x4f0
[12679.513381]  [<ffffffff814004c1>] ? sock_aio_read+0x21/0x30
[12679.513385]  [<ffffffff811ab25a>] ? do_sync_read+0x5a/0x90
[12679.513390]  [<ffffffff8108c981>] ? get_parent_ip+0x11/0x50
[12679.513392]  [<ffffffff8151c41e>] do_page_fault+0x3e/0x80
[12679.513395]  [<ffffffff81518e68>] page_fault+0x28/0x30

the lock member of struct res_counter should be of type raw_spinlock_t,
not spinlock_t which can go to sleep.

Tested on a 2 node, 32 thread, plaform with cyclictest.

Kernel version 3.14.25 + patch-3.14.25-rt22

Signed-off-by: T Makphaibulchoke <tmac@...com>
---

Changed in v2:
	- Fixed Signed-off-by tag.

 include/linux/res_counter.h | 26 +++++++++++++-------------
 kernel/res_counter.c        | 18 +++++++++---------
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index 201a697..61d94a4 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -47,7 +47,7 @@ struct res_counter {
 	 * the lock to protect all of the above.
 	 * the routines below consider this to be IRQ-safe
 	 */
-	spinlock_t lock;
+	raw_spinlock_t lock;
 	/*
 	 * Parent counter, used for hierarchial resource accounting
 	 */
@@ -148,12 +148,12 @@ static inline unsigned long long res_counter_margin(struct res_counter *cnt)
 	unsigned long long margin;
 	unsigned long flags;
 
-	spin_lock_irqsave(&cnt->lock, flags);
+	raw_spin_lock_irqsave(&cnt->lock, flags);
 	if (cnt->limit > cnt->usage)
 		margin = cnt->limit - cnt->usage;
 	else
 		margin = 0;
-	spin_unlock_irqrestore(&cnt->lock, flags);
+	raw_spin_unlock_irqrestore(&cnt->lock, flags);
 	return margin;
 }
 
@@ -170,12 +170,12 @@ res_counter_soft_limit_excess(struct res_counter *cnt)
 	unsigned long long excess;
 	unsigned long flags;
 
-	spin_lock_irqsave(&cnt->lock, flags);
+	raw_spin_lock_irqsave(&cnt->lock, flags);
 	if (cnt->usage <= cnt->soft_limit)
 		excess = 0;
 	else
 		excess = cnt->usage - cnt->soft_limit;
-	spin_unlock_irqrestore(&cnt->lock, flags);
+	raw_spin_unlock_irqrestore(&cnt->lock, flags);
 	return excess;
 }
 
@@ -183,18 +183,18 @@ static inline void res_counter_reset_max(struct res_counter *cnt)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&cnt->lock, flags);
+	raw_spin_lock_irqsave(&cnt->lock, flags);
 	cnt->max_usage = cnt->usage;
-	spin_unlock_irqrestore(&cnt->lock, flags);
+	raw_spin_unlock_irqrestore(&cnt->lock, flags);
 }
 
 static inline void res_counter_reset_failcnt(struct res_counter *cnt)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&cnt->lock, flags);
+	raw_spin_lock_irqsave(&cnt->lock, flags);
 	cnt->failcnt = 0;
-	spin_unlock_irqrestore(&cnt->lock, flags);
+	raw_spin_unlock_irqrestore(&cnt->lock, flags);
 }
 
 static inline int res_counter_set_limit(struct res_counter *cnt,
@@ -203,12 +203,12 @@ static inline int res_counter_set_limit(struct res_counter *cnt,
 	unsigned long flags;
 	int ret = -EBUSY;
 
-	spin_lock_irqsave(&cnt->lock, flags);
+	raw_spin_lock_irqsave(&cnt->lock, flags);
 	if (cnt->usage <= limit) {
 		cnt->limit = limit;
 		ret = 0;
 	}
-	spin_unlock_irqrestore(&cnt->lock, flags);
+	raw_spin_unlock_irqrestore(&cnt->lock, flags);
 	return ret;
 }
 
@@ -218,9 +218,9 @@ res_counter_set_soft_limit(struct res_counter *cnt,
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&cnt->lock, flags);
+	raw_spin_lock_irqsave(&cnt->lock, flags);
 	cnt->soft_limit = soft_limit;
-	spin_unlock_irqrestore(&cnt->lock, flags);
+	raw_spin_unlock_irqrestore(&cnt->lock, flags);
 	return 0;
 }
 
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 3fbcb0d..59a7a62 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -16,7 +16,7 @@
 
 void res_counter_init(struct res_counter *counter, struct res_counter *parent)
 {
-	spin_lock_init(&counter->lock);
+	raw_spin_lock_init(&counter->lock);
 	counter->limit = RES_COUNTER_MAX;
 	counter->soft_limit = RES_COUNTER_MAX;
 	counter->parent = parent;
@@ -51,9 +51,9 @@ static int __res_counter_charge(struct res_counter *counter, unsigned long val,
 	*limit_fail_at = NULL;
 	local_irq_save_nort(flags);
 	for (c = counter; c != NULL; c = c->parent) {
-		spin_lock(&c->lock);
+		raw_spin_lock(&c->lock);
 		r = res_counter_charge_locked(c, val, force);
-		spin_unlock(&c->lock);
+		raw_spin_unlock(&c->lock);
 		if (r < 0 && !ret) {
 			ret = r;
 			*limit_fail_at = c;
@@ -64,9 +64,9 @@ static int __res_counter_charge(struct res_counter *counter, unsigned long val,
 
 	if (ret < 0 && !force) {
 		for (u = counter; u != c; u = u->parent) {
-			spin_lock(&u->lock);
+			raw_spin_lock(&u->lock);
 			res_counter_uncharge_locked(u, val);
-			spin_unlock(&u->lock);
+			raw_spin_unlock(&u->lock);
 		}
 	}
 	local_irq_restore_nort(flags);
@@ -106,11 +106,11 @@ u64 res_counter_uncharge_until(struct res_counter *counter,
 	local_irq_save_nort(flags);
 	for (c = counter; c != top; c = c->parent) {
 		u64 r;
-		spin_lock(&c->lock);
+		raw_spin_lock(&c->lock);
 		r = res_counter_uncharge_locked(c, val);
 		if (c == counter)
 			ret = r;
-		spin_unlock(&c->lock);
+		raw_spin_unlock(&c->lock);
 	}
 	local_irq_restore_nort(flags);
 	return ret;
@@ -164,9 +164,9 @@ u64 res_counter_read_u64(struct res_counter *counter, int member)
 	unsigned long flags;
 	u64 ret;
 
-	spin_lock_irqsave(&counter->lock, flags);
+	raw_spin_lock_irqsave(&counter->lock, flags);
 	ret = *res_counter_member(counter, member);
-	spin_unlock_irqrestore(&counter->lock, flags);
+	raw_spin_unlock_irqrestore(&counter->lock, flags);
 
 	return ret;
 }
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/