[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20070921230343.GE9059@linux.vnet.ibm.com>
Date: Fri, 21 Sep 2007 16:03:43 -0700
From: "Paul E. McKenney" <paulmck@...ux.vnet.ibm.com>
To: Steven Rostedt <rostedt@...dmis.org>
Cc: linux-kernel@...r.kernel.org, linux-rt-users@...r.kernel.org,
mingo@...e.hu, akpm@...ux-foundation.org, dipankar@...ibm.com,
josht@...ux.vnet.ibm.com, tytso@...ibm.com, dvhltc@...ibm.com,
tglx@...utronix.de, a.p.zijlstra@...llo.nl, bunk@...nel.org,
ego@...ibm.com, oleg@...sign.ru, srostedt@...hat.com
Subject: Re: [PATCH RFC 3/9] RCU: Preemptible RCU
On Fri, Sep 21, 2007 at 11:20:48AM -0400, Steven Rostedt wrote:
> On Mon, Sep 10, 2007 at 11:34:12AM -0700, Paul E. McKenney wrote:
> > +
> > +/*
> > + * PREEMPT_RCU data structures.
> > + */
> > +
> > +#define GP_STAGES 4
> > +struct rcu_data {
> > + spinlock_t lock; /* Protect rcu_data fields. */
> > + long completed; /* Number of last completed batch. */
> > + int waitlistcount;
> > + struct tasklet_struct rcu_tasklet;
> > + struct rcu_head *nextlist;
> > + struct rcu_head **nexttail;
> > + struct rcu_head *waitlist[GP_STAGES];
> > + struct rcu_head **waittail[GP_STAGES];
> > + struct rcu_head *donelist;
> > + struct rcu_head **donetail;
> > +#ifdef CONFIG_RCU_TRACE
> > + struct rcupreempt_trace trace;
> > +#endif /* #ifdef CONFIG_RCU_TRACE */
> > +};
> > +struct rcu_ctrlblk {
> > + spinlock_t fliplock; /* Protect state-machine transitions. */
> > + long completed; /* Number of last completed batch. */
> > +};
> > +static DEFINE_PER_CPU(struct rcu_data, rcu_data);
> > +static struct rcu_ctrlblk rcu_ctrlblk = {
> > + .fliplock = SPIN_LOCK_UNLOCKED,
> > + .completed = 0,
> > +};
> > +static DEFINE_PER_CPU(int [2], rcu_flipctr) = { 0, 0 };
> > +
> > +/*
> > + * States for rcu_try_flip() and friends.
> > + */
> > +
> > +enum rcu_try_flip_states {
> > + rcu_try_flip_idle_state, /* "I" */
> > + rcu_try_flip_waitack_state, /* "A" */
> > + rcu_try_flip_waitzero_state, /* "Z" */
> > + rcu_try_flip_waitmb_state /* "M" */
> > +};
> > +static enum rcu_try_flip_states rcu_try_flip_state = rcu_try_flip_idle_state;
> > +#ifdef CONFIG_RCU_TRACE
> > +static char *rcu_try_flip_state_names[] =
> > + { "idle", "waitack", "waitzero", "waitmb" };
> > +#endif /* #ifdef CONFIG_RCU_TRACE */
>
> [snip]
>
> > +/*
> > + * If a global counter flip has occurred since the last time that we
> > + * advanced callbacks, advance them. Hardware interrupts must be
> > + * disabled when calling this function.
> > + */
> > +static void __rcu_advance_callbacks(struct rcu_data *rdp)
> > +{
> > + int cpu;
> > + int i;
> > + int wlc = 0;
> > +
> > + if (rdp->completed != rcu_ctrlblk.completed) {
> > + if (rdp->waitlist[GP_STAGES - 1] != NULL) {
> > + *rdp->donetail = rdp->waitlist[GP_STAGES - 1];
> > + rdp->donetail = rdp->waittail[GP_STAGES - 1];
> > + RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
> > + }
> > + for (i = GP_STAGES - 2; i >= 0; i--) {
> > + if (rdp->waitlist[i] != NULL) {
> > + rdp->waitlist[i + 1] = rdp->waitlist[i];
> > + rdp->waittail[i + 1] = rdp->waittail[i];
> > + wlc++;
> > + } else {
> > + rdp->waitlist[i + 1] = NULL;
> > + rdp->waittail[i + 1] =
> > + &rdp->waitlist[i + 1];
> > + }
> > + }
> > + if (rdp->nextlist != NULL) {
> > + rdp->waitlist[0] = rdp->nextlist;
> > + rdp->waittail[0] = rdp->nexttail;
> > + wlc++;
> > + rdp->nextlist = NULL;
> > + rdp->nexttail = &rdp->nextlist;
> > + RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
> > + } else {
> > + rdp->waitlist[0] = NULL;
> > + rdp->waittail[0] = &rdp->waitlist[0];
> > + }
> > + rdp->waitlistcount = wlc;
> > + rdp->completed = rcu_ctrlblk.completed;
> > + }
> > +
> > + /*
> > + * Check to see if this CPU needs to report that it has seen
> > + * the most recent counter flip, thereby declaring that all
> > + * subsequent rcu_read_lock() invocations will respect this flip.
> > + */
> > +
> > + cpu = raw_smp_processor_id();
> > + if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
> > + smp_mb(); /* Subsequent counter accesses must see new value */
> > + per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
> > + smp_mb(); /* Subsequent RCU read-side critical sections */
> > + /* seen -after- acknowledgement. */
> > + }
> > +}
>
> [snip]
>
> > +/*
> > + * Attempt a single flip of the counters. Remember, a single flip does
> > + * -not- constitute a grace period. Instead, the interval between
> > + * at least three consecutive flips is a grace period.
> > + *
> > + * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
> > + * on a large SMP, they might want to use a hierarchical organization of
> > + * the per-CPU-counter pairs.
> > + */
> > +static void rcu_try_flip(void)
> > +{
> > + unsigned long oldirq;
> > +
> > + RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
> > + if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, oldirq))) {
> > + RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
> > + return;
> > + }
> > +
> > + /*
> > + * Take the next transition(s) through the RCU grace-period
> > + * flip-counter state machine.
> > + */
> > +
> > + switch (rcu_try_flip_state) {
> > + case rcu_try_flip_idle_state:
> > + if (rcu_try_flip_idle())
> > + rcu_try_flip_state = rcu_try_flip_waitack_state;
> > + break;
> > + case rcu_try_flip_waitack_state:
> > + if (rcu_try_flip_waitack())
> > + rcu_try_flip_state = rcu_try_flip_waitzero_state;
> > + break;
> > + case rcu_try_flip_waitzero_state:
> > + if (rcu_try_flip_waitzero())
> > + rcu_try_flip_state = rcu_try_flip_waitmb_state;
> > + break;
> > + case rcu_try_flip_waitmb_state:
> > + if (rcu_try_flip_waitmb())
> > + rcu_try_flip_state = rcu_try_flip_idle_state;
> > + }
> > + spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, oldirq);
> > +}
>
> Paul,
>
> Looking further into this, I still think this is a bit of overkill. We
> go through 20 states from call_rcu to list->func().
>
> On call_rcu we put our stuff on the next list. Before we move stuff from
> next to wait, we need to go through 4 states. So we have
>
> next -> 4 states -> wait[0] -> 4 states -> wait[1] -> 4 states ->
> wait[2] -> 4 states -> wait[3] -> 4 states -> done.
>
> That's 20 states that we go through from the time we add our function to
> the list to the time it actually gets called. Do we really need the 4
> wait lists?
>
> Seems a bit overkill to me.
>
> What am I missing?
"Nothing kills like overkill!!!" ;-)
Seriously, I do expect to be able to squeeze this down over time, but
feel the need to be a bit on the cowardly side at the moment.
In any case, I will be looking at the scenarios more carefully. If
it turns out that GP_STAGES can indeed be cranked down a bit, well,
that is an easy change! I just fired off a POWER run with GP_STAGES
set to 3, will let you know how it goes.
Thanx, Paul
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists