Message-ID: <45B87873.3080207@freedesktop.org>
Date: Thu, 25 Jan 2007 01:29:23 -0800
From: Josh Triplett <josh@...edesktop.org>
To: paulmck@...ux.vnet.ibm.com
CC: linux-kernel@...r.kernel.org, mingo@...e.hu, tglx@...uxtronix.de,
dipankar@...ibm.com, tytso@...ibm.com, dvhltc@...ibm.com,
oleg@...sign.ru, twoerner.k@...il.com, billh@...ppy.monkey.org,
nielsen.esben@...glemail.com, corbet@....net
Subject: Re: [RFC PATCH -rt 1/2] RCU priority boosting that survives semi-vicious
testing
Overall, this code looks sensible to me. Some comments on the patch below.
Paul E. McKenney wrote:
> --- linux-2.6.20-rc4-rt1/include/linux/rcupreempt.h 2007-01-09 10:59:54.000000000 -0800
> +++ linux-2.6.20-rc4-rt1-rcub/include/linux/rcupreempt.h 2007-01-09 11:01:12.000000000 -0800
[...]
> +enum rcu_boost_state {
> + RCU_BOOST_IDLE = 0, /* Not yet blocked if in RCU read-side. */
> + RCU_BOOST_BLOCKED = 1, /* Blocked from RCU read-side. */
> + RCU_BOOSTED = 2, /* Boosting complete. */
> +};
> +
> +#define N_RCU_BOOST_STATE (RCU_BOOSTED + 1)
Here, you define the three possible states and the number of states.  But
later...
> diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/rcupreempt.c linux-2.6.20-rc4-rt1-rcub/kernel/rcupreempt.c
> --- linux-2.6.20-rc4-rt1/kernel/rcupreempt.c 2007-01-09 10:59:54.000000000 -0800
> +++ linux-2.6.20-rc4-rt1-rcub/kernel/rcupreempt.c 2007-01-23 15:41:33.000000000 -0800
> @@ -49,6 +49,7 @@
[...]
> +#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS
> + long rbs_stats[N_RCU_BOOST_DAT_EVENTS][N_RCU_BOOST_STATE + 1];
> +#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */
...you declare this array to have space for one more than that, and then...
> +#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS
> +
> +/*
> + * Function to increment indicated ->rbs_stats[] element.
> + */
> +static inline void rcu_boost_dat_stat(struct rcu_boost_dat *rbdp,
> + int event,
> + enum rcu_boost_state oldstate)
> +{
> + if (oldstate >= RCU_BOOST_IDLE &&
> + oldstate <= RCU_BOOSTED) {
> + rbdp->rbs_stats[event][oldstate]++;
> + } else {
> + rbdp->rbs_stats[event][N_RCU_BOOST_STATE]++;
...you use the last element as what looks like an "unknown" counter. How
about instead defining the enum to have an "unknown" state, defining
N_RCU_BOOST_STATE accordingly, and using N_RCU_BOOST_STATE without the +1 in
rbs_stats? I think that would make the code much more self-documenting, and
less error-prone.
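Concretely, I'm thinking of something along these lines (just a sketch;
RCU_BOOST_INVALID is a name I made up for the catch-all bucket):

enum rcu_boost_state {
	RCU_BOOST_IDLE = 0,	/* Not yet blocked if in RCU read-side. */
	RCU_BOOST_BLOCKED = 1,	/* Blocked from RCU read-side. */
	RCU_BOOSTED = 2,	/* Boosting complete. */
	RCU_BOOST_INVALID = 3,	/* Catch-all for states that should not occur. */
};

#define N_RCU_BOOST_STATE (RCU_BOOST_INVALID + 1)

rbs_stats then becomes [N_RCU_BOOST_DAT_EVENTS][N_RCU_BOOST_STATE], the
various "<= N_RCU_BOOST_STATE" loops become "< N_RCU_BOOST_STATE", and the
stat function just folds unexpected states into the catch-all bucket:

static inline void rcu_boost_dat_stat(struct rcu_boost_dat *rbdp,
				      int event,
				      enum rcu_boost_state oldstate)
{
	/* Anything outside the known states lands in the catch-all. */
	if (oldstate < RCU_BOOST_IDLE || oldstate > RCU_BOOSTED)
		oldstate = RCU_BOOST_INVALID;
	rbdp->rbs_stats[event][oldstate]++;
}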
> +#define rcu_boost_dat_stat_block(rbdp, oldstate) \
> + rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_BLOCK, oldstate)
> +#define rcu_boost_dat_stat_boost(rbdp, oldstate) \
> + rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_BOOST, oldstate)
> +#define rcu_boost_dat_stat_unlock(rbdp, oldstate) \
> + rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_UNLOCK, oldstate)
> +
> +/*
> + * Prefix for kprint() strings for periodic statistics messages.
> + */
> +static char *rcu_boost_state_event[] = {
> + "block: ",
> + "boost: ",
> + "unlock: ",
> +};
I don't know for sure, but I *think* that GCC may sometimes generate more
efficient code if you use a char [][], rather than a char *[].
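For instance (sizing the inner dimension for the longest string; making them
const probably wouldn't hurt either):

static const char rcu_boost_state_event[][9] = {
	"block: ",
	"boost: ",
	"unlock: ",
};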
> +
> +/*
> + * Indicators for numbers in kprint() strings. "!" indicates a state-event
> + * pair that should not happen, while "?" indicates a state that should
> + * not happen.
> + */
> +static char *rcu_boost_state_error[] = {
> + /*ibBe*/
> + " ?", /* block */
> + "! ?", /* boost */
> + "? ?", /* unlock */
> +};
Likewise. Also, the columns here don't seem to line up, probably because the
comment uses spaces and the other lines use a tab.
> +static void rcu_boost_dat_stat_print(void)
> +{
> + char buf[N_RCU_BOOST_STATE * (sizeof(long) * 3 + 2) + 2];
This expression really begs for some constants in place of the magic numbers,
or failing that, a comment.
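Something along these lines, perhaps (the names are mine, and I'm guessing
the trailing "+ 2" is the terminating NUL plus a little slack):

/* Upper bound on decimal digits in a long: at most 3 per byte. */
#define RCU_BOOST_LONG_DIGITS	(sizeof(long) * 3)
/* Each state prints as " %ld%c": a space, the digits, and a flag character. */
#define RCU_BOOST_FIELD_CHARS	(RCU_BOOST_LONG_DIGITS + 2)

	char buf[N_RCU_BOOST_STATE * RCU_BOOST_FIELD_CHARS + 2];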
> + int cpu;
> + int event;
> + int i;
> + static time_t lastprint = 0;
> + struct rcu_boost_dat *rbdp;
> + int state;
> + struct rcu_boost_dat sum;
> +
> + /* Wait a graceful interval between printk spamming. */
> +
> + if (xtime.tv_sec - lastprint <
> + CONFIG_PREEMPT_RCU_BOOST_STATS_INTERVAL)
> + return;
How about printk_ratelimit, or __printk_ratelimit if you want to pass your own
interval rather than the sysctl-controlled default?  (On the other hand, those
take a spinlock internally, which you might not want here, even in the stats
code.)
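Roughly (just a sketch, going from memory of the __printk_ratelimit
signature, and note that this rate-limits the whole stats dump as a single
event):

	if (!__printk_ratelimit(CONFIG_PREEMPT_RCU_BOOST_STATS_INTERVAL * HZ, 1))
		return;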
> + /* Print them out! */
> +
> + printk(KERN_ALERT
> + "rcu_boost_dat: idx=%d "
> + "b=%ld ul=%ld ub=%ld boost: a=%ld b=%ld\n",
> + rcu_boost_idx,
> + sum.rbs_blocked, sum.rbs_unlock, sum.rbs_unboosted,
> + sum.rbs_boost_attempt, sum.rbs_boost);
> + for (event = 0; event < N_RCU_BOOST_DAT_EVENTS; event++) {
> + i = 0;
> + for (state = 0; state <= N_RCU_BOOST_STATE; state++) {
> + i += sprintf(&buf[i], " %ld%c",
> + sum.rbs_stats[event][state],
> + rcu_boost_state_error[event][state]);
> + }
> + printk(KERN_ALERT "rcu_boost_dat %s %s\n",
> + rcu_boost_state_event[event], buf);
> + }
Most or all of these counters could likely become unsigned, since negative
values don't make sense for them.
> + /* Go away and don't come back for awhile. */
> +
> + lastprint = xtime.tv_sec;
> +}
> +
> +/*
> + * Return the current boost index for boosting target tasks.
> + * May only be invoked by the booster task, so guaranteed to
> + * already be initialized.
> + */
> +static inline int rcu_boost_idx_boosting(void)
> +{
> + return (rcu_boost_idx + 1) & (RCU_BOOST_ELEMENTS - 1);
> +}
Quoted for later reference.
> +/*
> + * Initialize RCU-boost state. This happens early in the boot process,
> + * when the scheduler does not yet exist. So don't try to use it.
> + */
> +static void init_rcu_boost_early(void)
> +{
> + struct rcu_boost_dat *rbdp;
> + int cpu;
> + int i;
> +
> + for_each_possible_cpu(cpu) {
> + rbdp = per_cpu(rcu_boost_dat, cpu);
> + for (i = 0; i < RCU_BOOST_ELEMENTS; i++) {
> + rbdp[i].rbs_mutex =
> + RAW_SPIN_LOCK_UNLOCKED(rbdp[i].rbs_mutex);
> + INIT_LIST_HEAD(&rbdp[i].rbs_toboost);
> + INIT_LIST_HEAD(&rbdp[i].rbs_boosted);
> + rbdp[i].rbs_blocked = 0;
> + rbdp[i].rbs_boost_attempt = 0;
> + rbdp[i].rbs_boost = 0;
> + rbdp[i].rbs_unlock = 0;
> + rbdp[i].rbs_unboosted = 0;
> +#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS
> + {
> + int j, k;
> +
> + for (j = 0; j < N_RCU_BOOST_DAT_EVENTS; j++)
> + for (k = 0; k <= N_RCU_BOOST_STATE; k++)
> + rbdp[i].rbs_stats[j][k] = 0;
> + }
> +#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */
> + }
> + smp_wmb();
Barrier without an accompanying comment. You want to make sure the structure
gets initialized before setting rcu_boost_idx and thus allowing boosting,
right?
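If so, a comment at the barrier itself would save the next reader the same
head-scratching, something like:

	/*
	 * Make sure the rcu_boost_dat structures are fully initialized
	 * before rcu_boost_idx goes non-negative and thereby enables
	 * boosting.
	 */
	smp_wmb();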
> + rcu_boost_idx = 0;
> + }
> +}
> +/*
> + * Return the list on which the calling task should add itself, or
> + * NULL if too early during initialization.
> + */
> +static inline struct rcu_boost_dat *rcu_rbd_new(void)
> +{
> + int cpu = raw_smp_processor_id(); /* locks used, so preemption OK. */
> + int idx = rcu_boost_idx;
> +
> + smp_read_barrier_depends(); barrier();
Barriers without accompanying comments, though in this case they seem fairly
obvious.
> + if (unlikely(idx < 0))
> + return (NULL);
> + return &per_cpu(rcu_boost_dat, cpu)[idx];
> +}
> +
> +/*
> + * Return the list from which to boost target tasks.
> + * May only be invoked by the booster task, so guaranteed to
> + * already be initialized.
> + */
> +static inline struct rcu_boost_dat *rcu_rbd_boosting(int cpu)
> +{
> + int idx = (rcu_boost_idx + 1) & (RCU_BOOST_ELEMENTS - 1);
You defined rcu_boost_idx_boosting for exactly this expression.
> + return &per_cpu(rcu_boost_dat, cpu)[idx];
For that matter, since you wrapped it in the function, you could just call it
directly in this array index operation.
> +}
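That is, something like:

static inline struct rcu_boost_dat *rcu_rbd_boosting(int cpu)
{
	return &per_cpu(rcu_boost_dat, cpu)[rcu_boost_idx_boosting()];
}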
> +/*
> + * Priority-boost tasks stuck in RCU read-side critical sections as
> + * needed (presumably rarely).
> + */
> +static int rcu_booster(void *arg)
> +{
> + int cpu;
> + struct sched_param sp;
> +
> + sp.sched_priority = PREEMPT_RCU_BOOSTER_PRIO;
> + sched_setscheduler(current, SCHED_RR, &sp);
> + current->flags |= PF_NOFREEZE;
> +
> + do {
> +
> + /* Advance the lists of tasks. */
> +
> + rcu_boost_idx = (rcu_boost_idx + 1) % RCU_BOOST_ELEMENTS;
> + for_each_possible_cpu(cpu) {
> +
> + /*
> + * Boost all sufficiently aged readers.
> + * Readers must first be preempted or block
> + * on a mutex in an RCU read-side critical section,
> + * then remain in that critical section for
> + * RCU_BOOST_ELEMENTS-1 time intervals.
> + * So most of the time we should end up doing
> + * nothing.
> + */
> +
> + rcu_boost_one_reader_list(rcu_rbd_boosting(cpu));
> +
> + /*
> + * Large SMP systems may need to sleep sometimes
> + * in this loop. Or have multiple RCU-boost tasks.
> + */
The idea of multiple RCU-boost tasks gives me an idea: what about having
per-CPU boost queues, and using that to eliminate the need to lock the queues?
It might not do quite as good a job of scheduling, but it would eliminate
locking overhead *and* solve the problem you describe here.
> + }
> +
> + /*
> + * Sleep to allow any unstalled RCU read-side critical
> + * sections to age out of the list. @@@ FIXME: reduce,
> + * adjust, or eliminate in case of OOM.
> + */
> +
> + schedule_timeout_uninterruptible(HZ / 100);
> +
> + /* Print stats if enough time has passed. */
> +
> + rcu_boost_dat_stat_print();
> +
> + } while (!kthread_should_stop());
> +
> + return 0;
> +}
> +
> +/*
> + * Perform the portions of RCU-boost initialization that require the
> + * scheduler to be up and running.
> + */
> +void init_rcu_boost_late(void)
> +{
> + int i;
> +
> + /* Spawn RCU-boost task. */
> +
> + printk(KERN_ALERT "Starting RCU priority booster\n");
Judging by other initialization code, I think this belongs at KERN_INFO or
similar.
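That is:

	printk(KERN_INFO "Starting RCU priority booster\n");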
> + rcu_boost_task = kthread_run(rcu_booster, NULL, "RCU Prio Booster");
> + if (IS_ERR(rcu_boost_task)) {
> + i = PTR_ERR(rcu_boost_task);
> + printk(KERN_ALERT
> + "Unable to create RCU Priority Booster, errno %d\n", -i);
No need for a temporary here.
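For example (keeping in mind that PTR_ERR returns a long, hence %ld):

		printk(KERN_ALERT
		       "Unable to create RCU Priority Booster, errno %ld\n",
		       -PTR_ERR(rcu_boost_task));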
> +
> + /*
> + * Continue running, but tasks permanently blocked
> + * in RCU read-side critical sections will be able
> + * to stall grace-period processing, potentially
> + * OOMing the machine.
> + */
> +
> + rcu_boost_task = NULL;
> + }
> +}
> +
> +/*
> + * Update task's RCU-boost state to reflect blocking in RCU read-side
> + * critical section, so that the RCU-boost task can find it in case it
> + * later needs its priority boosted.
> + */
> +void __rcu_preempt_boost(void)
> +{
> + struct rcu_boost_dat *rbdp;
> + unsigned long oldirq;
> +
> + /* Identify list to place task on for possible later boosting. */
> +
> + local_irq_save(oldirq);
> + rbdp = rcu_rbd_new();
> + if (rbdp == NULL) {
> + local_irq_restore(oldirq);
> + printk("Preempted RCU read-side critical section too early.\n");
This printk lacks a priority.
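Presumably KERN_ERR or thereabouts:

		printk(KERN_ERR
		       "Preempted RCU read-side critical section too early.\n");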
> + return;
> + }
> + spin_lock(&rbdp->rbs_mutex);
> + rbdp->rbs_blocked++;
> +
> + /*
> + * Update state. We hold the lock and aren't yet on the list,
> + * so the booster cannot mess with us yet.
> + */
> +
> + rcu_boost_dat_stat_block(rbdp, current->rcub_state);
> + if (current->rcub_state != RCU_BOOST_IDLE) {
> +
> + /*
> + * We have been here before, so just update stats.
> + * It may seem strange to do all this work just to
> + * accumulate statistics, but this is such a
> + * low-probability code path that we shouldn't care.
> + * If it becomes a problem, it can be fixed.
> + */
> +
> + spin_unlock_irqrestore(&rbdp->rbs_mutex, oldirq);
> + return;
> + }
> + current->rcub_state = RCU_BOOST_BLOCKED;
> +
> + /* Now add ourselves to the list so that the booster can find use. */
Typo: s/use/us/
> +
> + list_add_tail(&current->rcub_entry, &rbdp->rbs_toboost);
> + current->rcub_rbdp = rbdp;
> + spin_unlock_irqrestore(&rbdp->rbs_mutex, oldirq);
> +}
> diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/rtmutex.c linux-2.6.20-rc4-rt1-rcub/kernel/rtmutex.c
> --- linux-2.6.20-rc4-rt1/kernel/rtmutex.c 2007-01-09 10:59:54.000000000 -0800
> +++ linux-2.6.20-rc4-rt1-rcub/kernel/rtmutex.c 2007-01-09 11:01:12.000000000 -0800
> @@ -105,11 +105,14 @@ static inline void init_lists(struct rt_
> */
> int rt_mutex_getprio(struct task_struct *task)
> {
> + int prio = task->normal_prio;
> +
> + if (get_rcu_prio(task) < prio)
> + prio = get_rcu_prio(task);
int prio = min(task->normal_prio, get_rcu_prio(task)); ?
> if (likely(!task_has_pi_waiters(task)))
> - return task->normal_prio;
> + return prio;
>
> - return min(task_top_pi_waiter(task)->pi_list_entry.prio,
> - task->normal_prio);
> + return min(task_top_pi_waiter(task)->pi_list_entry.prio, prio);
> }
- Josh Triplett