[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20111216202907.GH7586@redhat.com>
Date: Fri, 16 Dec 2011 15:29:07 -0500
From: Vivek Goyal <vgoyal@...hat.com>
To: Nate Custer <nate@...nel.net>
Cc: Jens Axboe <axboe@...nel.dk>, Avi Kivity <avi@...hat.com>,
Marcelo Tosatti <mtosatti@...hat.com>, kvm@...r.kernel.org,
linux-kernel <linux-kernel@...r.kernel.org>
Subject: Re: [RFT PATCH] blkio: alloc per cpu data from worker thread
context( Re: kvm deadlock)
On Fri, Dec 16, 2011 at 12:43:52PM -0600, Nate Custer wrote:
>
> On Dec 15, 2011, at 1:47 PM, Vivek Goyal wrote:
> > Ok, I continued to develop on the patch which tries to allocate per cpu
> > stats from worker thread context. Here is the patch.
> >
> > Can the reporter please try out the patch and see if it helps. I am not
> > sure if deadlock was because of mutex issue or not, but it should help
> > get rid of lockdep warning.
> >
> > This patch is on top of for-3.3/core branch of jens's linux-block tree.
> > If it does not apply on your kernel version, do let me know the version
> > you are testing with and I will generate another version of patch.
> >
> > If testing results are good, I will break down the patch in small pieces
> > and post as a series separately.
> >
> > Thanks
> > Vivek
>
> Running on a fedora-16 box with the patch applied to the linux-block tree I still had deadlocks. In fact it seemed to happen much faster and with ligher workloads.
>
> I was able to get netconsole setup and a full stacktrace is posted here:
>
> http://pastebin.com/9Rq68exU
Thanks for testing it Nate. I did some debugging and found out that patch
is doing double free on per cpu pointer hence the crash you are running
into. I could reproduce this problem on my box. It is just a matter of
doing rmdir on the blkio cgroup.
I understood the cmpxchg() semantics wrong. I have fixed it now and
no crashes on directory removal. Can you please give this version a
try.
Thanks
Vivek
Alloc per cpu data from a worker thread context to avoid possibility of a
deadlock under memory pressure.
Only side affect of delayed allocation is that we will lose the blkio cgroup
stats for that group a small duration.
This patch is generated on top of "for-3.3/core" branch of linux-block tree.
-v2:
Fixed the issue of double free on percpu pointer.
---
block/blk-cgroup.c | 31 ++++++++++++-
block/blk-cgroup.h | 9 +++
block/blk-throttle.c | 116 ++++++++++++++++++++++++++++++++-----------------
block/cfq-iosched.c | 119 ++++++++++++++++++++++++++++++---------------------
4 files changed, 180 insertions(+), 95 deletions(-)
Index: block/blk-throttle.c
===================================================================
--- block/blk-throttle.c.orig 2011-12-17 01:49:34.000000000 -0500
+++ block/blk-throttle.c 2011-12-17 02:21:50.000000000 -0500
@@ -81,6 +81,7 @@ struct throtl_grp {
int limits_changed;
struct rcu_head rcu_head;
+ struct work_struct stat_alloc_work;
};
struct throtl_data
@@ -159,6 +160,13 @@ static void throtl_free_tg(struct rcu_he
struct throtl_grp *tg;
tg = container_of(head, struct throtl_grp, rcu_head);
+
+ /*
+ * Will delayed allocation be visible here for sure? I am relying
+ * on the fact that after blkg.stats_cpu assignment, we drop
+ * reference to group using atomic_dec() which should imply
+ * barrier
+ */
free_percpu(tg->blkg.stats_cpu);
kfree(tg);
}
@@ -181,6 +189,48 @@ static void throtl_put_tg(struct throtl_
call_rcu(&tg->rcu_head, throtl_free_tg);
}
+static inline void throtl_check_schedule_pcpu_stat_alloc(struct throtl_grp *tg) {
+ if (tg->blkg.stats_cpu != NULL)
+ return;
+ /*
+ * Schedule the group per cpu stat allocation through worker
+ * thread
+ */
+ throtl_ref_get_tg(tg);
+ if (!schedule_work(&tg->stat_alloc_work)) {
+ /* Work is already scheduled by somebody */
+ throtl_put_tg(tg);
+ }
+}
+
+/* No locks taken. A reference to throtl_grp taken before invocation */
+static void tg_stat_alloc_fn(struct work_struct *work)
+{
+ struct throtl_grp *tg = container_of(work, struct throtl_grp,
+ stat_alloc_work);
+ void *stat_ptr = NULL;
+ unsigned long ret;
+
+ stat_ptr = blkio_alloc_blkg_stats_pcpu();
+
+ if (stat_ptr == NULL) {
+ /* If memory allocation failed, try again */
+ throtl_check_schedule_pcpu_stat_alloc(tg);
+ throtl_put_tg(tg);
+ return;
+ }
+
+ ret = blkio_cmpxchg_blkg_stats(&tg->blkg, 0,
+ (unsigned long)stat_ptr);
+
+ if (ret != 0) {
+ /* Somebody else got to it first */
+ free_percpu(stat_ptr);
+ }
+
+ throtl_put_tg(tg);
+}
+
static void throtl_init_group(struct throtl_grp *tg)
{
INIT_HLIST_NODE(&tg->tg_node);
@@ -188,6 +238,7 @@ static void throtl_init_group(struct thr
bio_list_init(&tg->bio_lists[0]);
bio_list_init(&tg->bio_lists[1]);
tg->limits_changed = false;
+ INIT_WORK(&tg->stat_alloc_work, tg_stat_alloc_fn);
/* Practically unlimited BW */
tg->bps[0] = tg->bps[1] = -1;
@@ -263,8 +314,25 @@ static void throtl_init_add_tg_lists(str
throtl_add_group_to_td_list(td, tg);
}
-/* Should be called without queue lock and outside of rcu period */
-static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td)
+/* Allocates per cpu stats asynchronously with the help of worker thread */
+static struct throtl_grp *throtl_alloc_tg_async(struct throtl_data *td)
+{
+ struct throtl_grp *tg = NULL;
+
+ tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
+ if (!tg)
+ return NULL;
+
+ throtl_init_group(tg);
+ throtl_check_schedule_pcpu_stat_alloc(tg);
+ return tg;
+}
+
+/*
+ * Should be called without queue lock and outside of rcu period as per
+ * cpu alloc will block
+ */
+static struct throtl_grp *throtl_alloc_tg_sync(struct throtl_data *td)
{
struct throtl_grp *tg = NULL;
int ret;
@@ -273,7 +341,7 @@ static struct throtl_grp *throtl_alloc_t
if (!tg)
return NULL;
- ret = blkio_alloc_blkg_stats(&tg->blkg);
+ ret = blkio_alloc_set_blkg_stats(&tg->blkg);
if (ret) {
kfree(tg);
@@ -305,7 +373,7 @@ throtl_grp *throtl_find_tg(struct throtl
static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
{
- struct throtl_grp *tg = NULL, *__tg = NULL;
+ struct throtl_grp *tg = NULL;
struct blkio_cgroup *blkcg;
struct request_queue *q = td->queue;
@@ -321,46 +389,12 @@ static struct throtl_grp * throtl_get_tg
return tg;
}
- /*
- * Need to allocate a group. Allocation of group also needs allocation
- * of per cpu stats which in-turn takes a mutex() and can block. Hence
- * we need to drop rcu lock and queue_lock before we call alloc.
- */
- rcu_read_unlock();
- spin_unlock_irq(q->queue_lock);
-
- tg = throtl_alloc_tg(td);
-
- /* Group allocated and queue is still alive. take the lock */
- spin_lock_irq(q->queue_lock);
-
- /* Make sure @q is still alive */
- if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
- kfree(tg);
- return NULL;
- }
-
- /*
- * Initialize the new group. After sleeping, read the blkcg again.
- */
- rcu_read_lock();
- blkcg = task_blkio_cgroup(current);
-
- /*
- * If some other thread already allocated the group while we were
- * not holding queue lock, free up the group
- */
- __tg = throtl_find_tg(td, blkcg);
-
- if (__tg) {
- kfree(tg);
- rcu_read_unlock();
- return __tg;
- }
+ tg = throtl_alloc_tg_async(td);
/* Group allocation failed. Account the IO to root group */
if (!tg) {
tg = td->root_tg;
+ rcu_read_unlock();
return tg;
}
@@ -1254,7 +1288,7 @@ int blk_throtl_init(struct request_queue
/* alloc and Init root group. */
td->queue = q;
- tg = throtl_alloc_tg(td);
+ tg = throtl_alloc_tg_sync(td);
if (!tg) {
kfree(td);
Index: block/blk-cgroup.c
===================================================================
--- block/blk-cgroup.c.orig 2011-12-17 01:49:34.000000000 -0500
+++ block/blk-cgroup.c 2011-12-17 01:49:35.000000000 -0500
@@ -400,6 +400,9 @@ void blkiocg_update_dispatch_stats(struc
struct blkio_group_stats_cpu *stats_cpu;
unsigned long flags;
+ if (blkg->stats_cpu == NULL)
+ return;
+
/*
* Disabling interrupts to provide mutual exclusion between two
* writes on same cpu. It probably is not needed for 64bit. Not
@@ -446,6 +449,9 @@ void blkiocg_update_io_merged_stats(stru
struct blkio_group_stats_cpu *stats_cpu;
unsigned long flags;
+ if (blkg->stats_cpu == NULL)
+ return;
+
/*
* Disabling interrupts to provide mutual exclusion between two
* writes on same cpu. It probably is not needed for 64bit. Not
@@ -465,9 +471,11 @@ EXPORT_SYMBOL_GPL(blkiocg_update_io_merg
/*
* This function allocates the per cpu stats for blkio_group. Should be called
- * from sleepable context as alloc_per_cpu() requires that.
+ * from sleepable context as alloc_per_cpu() requires that. percpu alloc does
+ * not take any flags and does GFP_KERNEL allocations. Don't use it from
+ * IO submission path which usually might require GFP_NOIO.
*/
-int blkio_alloc_blkg_stats(struct blkio_group *blkg)
+int blkio_alloc_set_blkg_stats(struct blkio_group *blkg)
{
/* Allocate memory for per cpu stats */
blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
@@ -475,7 +483,14 @@ int blkio_alloc_blkg_stats(struct blkio_
return -ENOMEM;
return 0;
}
-EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats);
+EXPORT_SYMBOL_GPL(blkio_alloc_set_blkg_stats);
+
+void* blkio_alloc_blkg_stats_pcpu(void)
+{
+ /* Allocate memory for per cpu stats */
+ return alloc_percpu(struct blkio_group_stats_cpu);
+}
+EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats_pcpu);
void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
struct blkio_group *blkg, void *key, dev_t dev,
@@ -551,6 +566,12 @@ static void blkio_reset_stats_cpu(struct
{
struct blkio_group_stats_cpu *stats_cpu;
int i, j, k;
+
+ /* blkg->stats_cpu can have delayed allocation */
+
+ if (!blkg->stats_cpu)
+ return;
+
/*
* Note: On 64 bit arch this should not be an issue. This has the
* possibility of returning some inconsistent value on 32bit arch
@@ -673,6 +694,10 @@ static uint64_t blkio_read_stat_cpu(stru
struct blkio_group_stats_cpu *stats_cpu;
u64 val = 0, tval;
+ /* blkg->stats_cpu might not have been allocated yet */
+ if (blkg->stats_cpu == NULL)
+ return 0;
+
for_each_possible_cpu(cpu) {
unsigned int start;
stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu);
Index: block/blk-cgroup.h
===================================================================
--- block/blk-cgroup.h.orig 2011-12-17 01:49:34.000000000 -0500
+++ block/blk-cgroup.h 2011-12-17 01:49:35.000000000 -0500
@@ -310,7 +310,12 @@ extern struct blkio_cgroup *task_blkio_c
extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
struct blkio_group *blkg, void *key, dev_t dev,
enum blkio_policy_id plid);
-extern int blkio_alloc_blkg_stats(struct blkio_group *blkg);
+extern int blkio_alloc_set_blkg_stats(struct blkio_group *blkg);
+extern void* blkio_alloc_blkg_stats_pcpu(void);
+
+#define blkio_cmpxchg_blkg_stats(blkg, oldval, newval) \
+ cmpxchg((unsigned long *)&(blkg)->stats_cpu, oldval, newval);
+
extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
void *key);
@@ -338,7 +343,7 @@ static inline void blkiocg_add_blkio_gro
struct blkio_group *blkg, void *key, dev_t dev,
enum blkio_policy_id plid) {}
-static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; }
+static inline int blkio_alloc_set_blkg_stats(struct blkio_group *blkg) { return 0; }
static inline int
blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
Index: block/cfq-iosched.c
===================================================================
--- block/cfq-iosched.c.orig 2011-12-17 01:49:34.000000000 -0500
+++ block/cfq-iosched.c 2011-12-17 02:21:50.000000000 -0500
@@ -209,7 +209,12 @@ struct cfq_group {
struct blkio_group blkg;
#ifdef CONFIG_CFQ_GROUP_IOSCHED
struct hlist_node cfqd_node;
- int ref;
+ /*
+ * blkg per cpu stat allocation code will need to put reference
+ * without having queue lock. Hence keep it atomic.
+ */
+ atomic_t ref;
+ struct work_struct stat_alloc_work;
#endif
/* number of requests that are on the dispatch list or inside driver */
int dispatched;
@@ -1012,6 +1017,9 @@ static void cfq_group_served(struct cfq_
}
#ifdef CONFIG_CFQ_GROUP_IOSCHED
+static void cfq_put_cfqg(struct cfq_group *cfqg);
+static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg);
+
static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
{
if (blkg)
@@ -1054,6 +1062,52 @@ static void cfq_init_add_cfqg_lists(stru
hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
}
+static inline void cfq_check_schedule_pcpu_stat_alloc(struct cfq_group *cfqg) {
+ if (cfqg->blkg.stats_cpu != NULL)
+ return;
+
+ /*
+ * Schedule the group per cpu stat allocation through worker
+ * thread
+ */
+ cfq_ref_get_cfqg(cfqg);
+ if (!schedule_work(&cfqg->stat_alloc_work)) {
+ /* Work is already scheduled by somebody */
+ cfq_put_cfqg(cfqg);
+ }
+}
+
+/* No locks taken. A reference to cfq_group taken before invocation */
+static void cfqg_stat_alloc_fn(struct work_struct *work)
+{
+ struct cfq_group *cfqg = container_of(work, struct cfq_group,
+ stat_alloc_work);
+ void *stat_ptr = NULL;
+ unsigned long ret;
+
+ stat_ptr = blkio_alloc_blkg_stats_pcpu();
+
+ if (stat_ptr == NULL) {
+ /*
+ * Memory allocation failed. Try again. Should an upper limit
+ * be put on number of retries?
+ */
+ cfq_check_schedule_pcpu_stat_alloc(cfqg);
+ cfq_put_cfqg(cfqg);
+ return;
+ }
+
+ ret = blkio_cmpxchg_blkg_stats(&cfqg->blkg, 0,
+ (unsigned long)stat_ptr);
+
+ if (ret != 0) {
+ /* Somebody else got to it first */
+ free_percpu(stat_ptr);
+ }
+
+ cfq_put_cfqg(cfqg);
+}
+
/*
* Should be called from sleepable context. No request queue lock as per
* cpu stats are allocated dynamically and alloc_percpu needs to be called
@@ -1062,7 +1116,7 @@ static void cfq_init_add_cfqg_lists(stru
static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd)
{
struct cfq_group *cfqg = NULL;
- int i, j, ret;
+ int i, j;
struct cfq_rb_root *st;
cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
@@ -1072,6 +1126,7 @@ static struct cfq_group * cfq_alloc_cfqg
for_each_cfqg_st(cfqg, i, j, st)
*st = CFQ_RB_ROOT;
RB_CLEAR_NODE(&cfqg->rb_node);
+ INIT_WORK(&cfqg->stat_alloc_work, cfqg_stat_alloc_fn);
cfqg->ttime.last_end_request = jiffies;
@@ -1081,14 +1136,9 @@ static struct cfq_group * cfq_alloc_cfqg
* elevator which will be dropped by either elevator exit
* or cgroup deletion path depending on who is exiting first.
*/
- cfqg->ref = 1;
-
- ret = blkio_alloc_blkg_stats(&cfqg->blkg);
- if (ret) {
- kfree(cfqg);
- return NULL;
- }
+ atomic_set(&cfqg->ref, 1);
+ cfq_check_schedule_pcpu_stat_alloc(cfqg);
return cfqg;
}
@@ -1124,8 +1174,7 @@ cfq_find_cfqg(struct cfq_data *cfqd, str
static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
{
struct blkio_cgroup *blkcg;
- struct cfq_group *cfqg = NULL, *__cfqg = NULL;
- struct request_queue *q = cfqd->queue;
+ struct cfq_group *cfqg = NULL;
rcu_read_lock();
blkcg = task_blkio_cgroup(current);
@@ -1135,41 +1184,13 @@ static struct cfq_group *cfq_get_cfqg(st
return cfqg;
}
- /*
- * Need to allocate a group. Allocation of group also needs allocation
- * of per cpu stats which in-turn takes a mutex() and can block. Hence
- * we need to drop rcu lock and queue_lock before we call alloc.
- *
- * Not taking any queue reference here and assuming that queue is
- * around by the time we return. CFQ queue allocation code does
- * the same. It might be racy though.
- */
-
- rcu_read_unlock();
- spin_unlock_irq(q->queue_lock);
-
cfqg = cfq_alloc_cfqg(cfqd);
-
- spin_lock_irq(q->queue_lock);
-
- rcu_read_lock();
- blkcg = task_blkio_cgroup(current);
-
- /*
- * If some other thread already allocated the group while we were
- * not holding queue lock, free up the group
- */
- __cfqg = cfq_find_cfqg(cfqd, blkcg);
-
- if (__cfqg) {
- kfree(cfqg);
+ if (!cfqg) {
+ cfqg = &cfqd->root_group;
rcu_read_unlock();
- return __cfqg;
+ return cfqg;
}
- if (!cfqg)
- cfqg = &cfqd->root_group;
-
cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg);
rcu_read_unlock();
return cfqg;
@@ -1177,7 +1198,7 @@ static struct cfq_group *cfq_get_cfqg(st
static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
{
- cfqg->ref++;
+ atomic_inc(&cfqg->ref);
return cfqg;
}
@@ -1189,7 +1210,7 @@ static void cfq_link_cfqq_cfqg(struct cf
cfqq->cfqg = cfqg;
/* cfqq reference on cfqg */
- cfqq->cfqg->ref++;
+ atomic_inc(&cfqq->cfqg->ref);
}
static void cfq_put_cfqg(struct cfq_group *cfqg)
@@ -1197,9 +1218,8 @@ static void cfq_put_cfqg(struct cfq_grou
struct cfq_rb_root *st;
int i, j;
- BUG_ON(cfqg->ref <= 0);
- cfqg->ref--;
- if (cfqg->ref)
+ BUG_ON(atomic_read(&cfqg->ref) <= 0);
+ if (!atomic_dec_and_test(&cfqg->ref))
return;
for_each_cfqg_st(cfqg, i, j, st)
BUG_ON(!RB_EMPTY_ROOT(&st->rb));
@@ -4025,6 +4045,7 @@ static void *cfq_init_queue(struct reque
cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
#ifdef CONFIG_CFQ_GROUP_IOSCHED
+ INIT_WORK(&cfqg->stat_alloc_work, cfqg_stat_alloc_fn);
/*
* Set root group reference to 2. One reference will be dropped when
* all groups on cfqd->cfqg_list are being deleted during queue exit.
@@ -4032,9 +4053,9 @@ static void *cfq_init_queue(struct reque
* group as it is statically allocated and gets destroyed when
* throtl_data goes away.
*/
- cfqg->ref = 2;
+ atomic_set(&cfqg->ref, 2);
- if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
+ if (blkio_alloc_set_blkg_stats(&cfqg->blkg)) {
kfree(cfqg);
kfree(cfqd);
return NULL;
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists