Message-ID: <20110616212250.GA22923@tsunami.ccur.com>
Date: Thu, 16 Jun 2011 17:22:50 -0400
From: Joe Korty <joe.korty@...r.com>
To: Daniel Goller <morfic@...il.com>
Cc: linux-kernel@...r.kernel.org
Subject: [PATCH] JRCU 2.6.32.41, consolidated
Hi Daniel,

Here is JRCU, backported to 2.6.32.41. There were more
changes than I expected, so hopefully it is all good.

My testing of this backport is minimal: an x86_64 compile
and boot. On the booted kernel I ran another kernel compile
while watching the JRCU stats via:

	watch -n1 -d /sys/kernel/debug/rcu/rcudata
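
For reference, the rcudata file also accepts tuning writes; the
tokens below are the ones parsed by rcu_debugfs_write() in the
patch (hz must lie in [2,1000], wdog in [3,1000] seconds):

	echo "hz=50 precise=1" > /sys/kernel/debug/rcu/rcudata
	echo "wdog=30" > /sys/kernel/debug/rcu/rcudata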
Signed-off-by: Joe Korty <joe.korty@...r.com>
Index: 2.6.32.41/kernel/jrcu.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ 2.6.32.41/kernel/jrcu.c 2011-06-16 16:35:56.000000000 -0400
@@ -0,0 +1,775 @@
+/*
+ * Joe's tiny single-cpu RCU, for small SMP systems.
+ *
+ * Running RCU end-of-batch operations from a single cpu relieves the
+ * other CPUs from this periodic responsibility. This will eventually
+ * be important for those realtime applications requiring full use of
+ * dedicated cpus. JRCU is also a lockless implementation, currently,
+ * although some anticipated features will eventually require a per
+ * cpu rcu_lock along some minimal-contention paths.
+ *
+ * Author: Joe Korty <joe.korty@...r.com>
+ *
+ * Acknowledgements: Paul E. McKenney's 'TinyRCU for uniprocessors' inspired
+ * the thought that there could be something similarly simple for SMP.
+ * The rcu_list chain operators are from Jim Houston's Alternative RCU.
+ *
+ * Copyright Concurrent Computer Corporation, 2011
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+/*
+ * This RCU maintains three callback lists: the current batch (per cpu),
+ * the previous batch (also per cpu), and the pending list (global).
+ */
+
+#include <linux/bug.h>
+#include <linux/smp.h>
+#include <linux/ctype.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/preempt.h>
+#include <linux/compiler.h>
+#include <linux/irqflags.h>
+#include <linux/rcupdate.h>
+
+#include <asm/system.h>
+
+/*
+ * JRCU backport: also backport usleep_range.
+ */
+static void usleep_range(unsigned long min, unsigned long max)
+{
+ ktime_t kmin;
+ unsigned long delta;
+
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+
+ kmin = ktime_set(0, min * NSEC_PER_USEC);
+ delta = (max - min) * NSEC_PER_USEC;
+ schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
+}
+
+/*
+ * Define an rcu list type and operators. An rcu list has only ->next
+ * pointers for the chain nodes; the list head however is special and
+ * has pointers to both the first and last nodes of the chain. Tweaked
+ * so that null head, tail pointers can be used to signify an empty list.
+ */
+struct rcu_list {
+ struct rcu_head *head;
+ struct rcu_head **tail;
+ int count; /* stats-n-debug */
+};
+
+static inline void rcu_list_init(struct rcu_list *l)
+{
+ l->head = NULL;
+ l->tail = NULL;
+ l->count = 0;
+}
+
+/*
+ * Add an element to the tail of an rcu list
+ */
+static inline void rcu_list_add(struct rcu_list *l, struct rcu_head *h)
+{
+ if (unlikely(l->tail == NULL))
+ l->tail = &l->head;
+ *l->tail = h;
+ l->tail = &h->next;
+ l->count++;
+ h->next = NULL;
+}
+
+/*
+ * Append the contents of one rcu list to another. The 'from' list is left
+ * corrupted on exit; the caller must re-initialize it before it can be used
+ * again.
+ */
+static inline void rcu_list_join(struct rcu_list *to, struct rcu_list *from)
+{
+ if (from->head) {
+ if (unlikely(to->tail == NULL)) {
+ to->tail = &to->head;
+ to->count = 0;
+ }
+ *to->tail = from->head;
+ to->tail = from->tail;
+ to->count += from->count;
+ }
+}
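+
+/*
+ * Typical use, as in __rcu_delimit_batches() below: splice a per-cpu
+ * previous-batch list onto a local pending list, then reset it:
+ *
+ *	rcu_list_join(&pending, plist);
+ *	rcu_list_init(plist);
+ */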
+
+/*
+ * selects, in ->cblist[] below, which is the current callback list and which
+ * is the previous.
+ */
+static u8 rcu_which ____cacheline_aligned_in_smp;
+
+struct rcu_data {
+ u8 wait; /* goes false when this cpu consents to
+ * the retirement of the current batch */
+ struct rcu_list cblist[2]; /* current & previous callback lists */
+ s64 nqueued; /* #callbacks queued (stats-n-debug) */
+} ____cacheline_aligned_in_smp;
+
+static struct rcu_data rcu_data[NR_CPUS];
+
+/* debug & statistics stuff */
+static struct rcu_stats {
+ unsigned npasses; /* #passes made */
+ unsigned nlast; /* #passes since last end-of-batch */
+ unsigned nbatches; /* #end-of-batches (eobs) seen */
+ unsigned nmis; /* #passes discarded due to NMI */
+ atomic_t nbarriers; /* #rcu barriers processed */
+ atomic_t nsyncs; /* #rcu syncs processed */
+ s64 ninvoked; /* #invoked (ie, finished) callbacks */
+ unsigned nforced; /* #forced eobs (should be zero) */
+} rcu_stats;
+
+#define RCU_HZ (20)
+#define RCU_HZ_PERIOD_US (USEC_PER_SEC / RCU_HZ)
+#define RCU_HZ_DELTA_US (USEC_PER_SEC / HZ)
+
+static int rcu_hz = RCU_HZ;
+static int rcu_hz_period_us = RCU_HZ_PERIOD_US;
+static int rcu_hz_delta_us = RCU_HZ_DELTA_US;
+
+static int rcu_hz_precise;
+
+int rcu_scheduler_active __read_mostly;
+int rcu_nmi_seen __read_mostly;
+
+static int rcu_wdog_ctr; /* time since last end-of-batch, in usecs */
+static int rcu_wdog_lim = 10 * USEC_PER_SEC; /* rcu watchdog interval */
+
+/*
+ * Return our CPU id or zero if we are too early in the boot process to
+ * know what that is. For RCU to work correctly, a cpu named '0' must
+ * eventually be present (but need not ever be online).
+ */
+#ifdef HAVE_THREAD_INFO_CPU
+static inline int rcu_cpu(void)
+{
+ return current_thread_info()->cpu;
+}
+
+#else
+
+static unsigned rcu_cpu_early_flag __read_mostly = 1;
+
+static inline int rcu_cpu(void)
+{
+ if (unlikely(rcu_cpu_early_flag)) {
+ if (!(rcu_scheduler_active && nr_cpu_ids > 1))
+ return 0;
+ rcu_cpu_early_flag = 0;
+ }
+ return raw_smp_processor_id();
+}
+#endif /* HAVE_THREAD_INFO_CPU */
+
+/*
+ * Invoke whenever the calling CPU consents to end-of-batch. All CPUs
+ * must so consent before the batch is truly ended.
+ */
+static inline void rcu_eob(int cpu)
+{
+ struct rcu_data *rd = &rcu_data[cpu];
+ if (unlikely(rd->wait)) {
+ rd->wait = 0;
+#ifndef CONFIG_JRCU_LAZY
+ smp_wmb();
+#endif
+ }
+}
+
+void __rcu_read_unlock(void)
+{
+ if (preempt_count() == 1)
+ rcu_eob(rcu_cpu());
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(__rcu_read_unlock);
+
+void rcu_note_context_switch(int cpu)
+{
+ rcu_eob(cpu);
+}
+
+void rcu_note_might_resched(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ rcu_eob(rcu_cpu());
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(rcu_note_might_resched);
+
+void synchronize_sched(void)
+{
+ struct rcu_synchronize rcu;
+
+ if (!rcu_scheduler_active)
+ return;
+
+ init_completion(&rcu.completion);
+ call_rcu(&rcu.head, wakeme_after_rcu);
+ wait_for_completion(&rcu.completion);
+ atomic_inc(&rcu_stats.nsyncs);
+}
+EXPORT_SYMBOL_GPL(synchronize_sched);
+
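+/*
+ * Note the two grace periods: a single synchronize_sched() guarantees
+ * only that its own callback has been invoked; a callback queued just
+ * before it may legally be invoked later in the same end-of-batch
+ * pass, and a cpu can keep appending to the old current list for up
+ * to one RCU_HZ period after a swap (see __rcu_delimit_batches).
+ * The second grace period conservatively covers both windows.
+ */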
+void rcu_barrier(void)
+{
+ synchronize_sched();
+ synchronize_sched();
+ atomic_inc(&rcu_stats.nbarriers);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier);
+
+void rcu_force_quiescent_state(void)
+{
+}
+EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
+
+/*
+ * Insert an RCU callback onto the calling CPUs list of 'current batch'
+ * callbacks. Lockless version, can be invoked anywhere except under NMI.
+ */
+void call_rcu(struct rcu_head *cb, void (*func)(struct rcu_head *rcu))
+{
+ unsigned long flags;
+ struct rcu_data *rd;
+ struct rcu_list *cblist;
+ int which;
+
+ cb->func = func;
+ cb->next = NULL;
+
+ local_irq_save(flags);
+ smp_rmb();
+
+ rd = &rcu_data[rcu_cpu()];
+ which = ACCESS_ONCE(rcu_which);
+ cblist = &rd->cblist[which];
+
+ /* The following is not NMI-safe, therefore call_rcu()
+ * cannot be invoked under NMI. */
+ rcu_list_add(cblist, cb);
+ rd->nqueued++;
+ smp_wmb();
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+/*
+ * Invoke all callbacks on the passed-in list.
+ */
+static void rcu_invoke_callbacks(struct rcu_list *pending)
+{
+ struct rcu_head *curr, *next;
+
+ for (curr = pending->head; curr;) {
+ next = curr->next;
+ curr->func(curr);
+ curr = next;
+ rcu_stats.ninvoked++;
+ }
+}
+
+/*
+ * Check if the conditions for ending the current batch are true. If
+ * so then end it.
+ *
+ * Must be invoked periodically, and the periodic invocations must be
+ * far enough apart in time for the previous batch to become quiescent.
+ * This is a few tens of microseconds unless NMIs are involved; an NMI
+ * stretches out the requirement by the duration of the NMI.
+ *
+ * "Quiescent" means the owning cpu is no longer appending callbacks
+ * and has completed execution of a trailing write-memory-barrier insn.
+ */
+static void __rcu_delimit_batches(struct rcu_list *pending)
+{
+ struct rcu_data *rd;
+ struct rcu_list *plist;
+ int cpu, eob, prev;
+
+ if (!rcu_scheduler_active)
+ return;
+
+ rcu_stats.nlast++;
+
+ /* If an NMI occurred then the previous batch may not yet be
+ * quiescent. Let's wait till it is.
+ */
+ if (rcu_nmi_seen) {
+ rcu_nmi_seen = 0;
+ rcu_stats.nmis++;
+ return;
+ }
+
+ /*
+ * Find out if the current batch has ended
+ * (end-of-batch).
+ */
+ eob = 1;
+ for_each_online_cpu(cpu) {
+ rd = &rcu_data[cpu];
+ if (rd->wait) {
+ rd->wait = preempt_count_cpu(cpu) > idle_cpu(cpu);
+ if (rd->wait) {
+ eob = 0;
+ break;
+ }
+ }
+ }
+
+ /*
+ * Exit if batch has not ended. But first, tickle all non-cooperating
+ * CPUs if enough time has passed.
+ */
+ if (eob == 0) {
+ if (rcu_wdog_ctr >= rcu_wdog_lim) {
+ rcu_wdog_ctr = 0;
+ rcu_stats.nforced++;
+ for_each_online_cpu(cpu) {
+ if (rcu_data[cpu].wait)
+ force_cpu_resched(cpu);
+ }
+ }
+ rcu_wdog_ctr += rcu_hz_period_us;
+ return;
+ }
+
+ /*
+ * End the current RCU batch and start a new one.
+ *
+ * This is a two-step operation: move every cpu's previous list
+ * to the global pending list, then tell every cpu to swap its
+ * current and pending lists (ie, toggle rcu_which).
+ *
+ * We tolerate the cpus taking a bit of time noticing this swap;
+ * we expect them to continue to put callbacks on the old current
+ * list (which is now the previous list) for a while. That time,
+ * however, cannot exceed one RCU_HZ period.
+ */
+ prev = ACCESS_ONCE(rcu_which) ^ 1;
+
+ for_each_present_cpu(cpu) {
+ rd = &rcu_data[cpu];
+ plist = &rd->cblist[prev];
+ /* Chain previous batch of callbacks, if any, to the pending list */
+ if (plist->head) {
+ rcu_list_join(pending, plist);
+ rcu_list_init(plist);
+ }
+ if (cpu_online(cpu)) /* wins race with offlining every time */
+ rd->wait = preempt_count_cpu(cpu) > idle_cpu(cpu);
+ else
+ rd->wait = 0;
+ }
+ smp_wmb(); /* just paranoia, the below xchg should do this on all archs */
+
+ /*
+ * Swap current and previous lists. The other cpus must not
+ * see this out-of-order w.r.t. the above emptying of each cpu's
+ * previous list. The xchg accomplishes that and, as a side (but
+ * seemingly unneeded) bonus, keeps this cpu from advancing its insn
+ * counter until the results of that xchg are visible on other cpus.
+ */
+ (void)xchg(&rcu_which, prev); /* only place where rcu_which is written to */
+
+ rcu_stats.nbatches++;
+ rcu_stats.nlast = 0;
+ rcu_wdog_ctr = 0;
+}
+
+static void rcu_delimit_batches(void)
+{
+ unsigned long flags;
+ struct rcu_list pending;
+
+ rcu_list_init(&pending);
+ rcu_stats.npasses++;
+
+ local_irq_save(flags);
+ smp_rmb();
+ __rcu_delimit_batches(&pending);
+ smp_wmb();
+ local_irq_restore(flags);
+
+ if (pending.head)
+ rcu_invoke_callbacks(&pending);
+}
+
+/* ------------------ interrupt driver section ------------------ */
+
+/*
+ * We drive RCU from a periodic interrupt during most of boot. Once boot
+ * is complete we (optionally) transition to a daemon.
+ */
+
+#include <linux/time.h>
+#include <linux/delay.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+
+#define rcu_hz_period_ns (rcu_hz_period_us * NSEC_PER_USEC)
+#define rcu_hz_delta_ns (rcu_hz_delta_us * NSEC_PER_USEC)
+
+static struct hrtimer rcu_timer;
+
+static void rcu_softirq_func(struct softirq_action *h)
+{
+ rcu_delimit_batches();
+}
+
+static enum hrtimer_restart rcu_timer_func(struct hrtimer *t)
+{
+ ktime_t next;
+
+ raise_softirq(RCU_SOFTIRQ);
+
+ next = ktime_add_ns(ktime_get(), rcu_hz_period_ns);
+ hrtimer_set_expires_range_ns(&rcu_timer, next,
+ rcu_hz_precise ? 0 : rcu_hz_delta_ns);
+ return HRTIMER_RESTART;
+}
+
+static void rcu_timer_restart(void)
+{
+ pr_info("JRCU: starting timer. rate is %d Hz\n", RCU_HZ);
+ hrtimer_forward_now(&rcu_timer, ns_to_ktime(rcu_hz_period_ns));
+ hrtimer_start_expires(&rcu_timer, HRTIMER_MODE_ABS);
+}
+
+static __init int rcu_timer_start(void)
+{
+ open_softirq(RCU_SOFTIRQ, rcu_softirq_func);
+
+ hrtimer_init(&rcu_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ rcu_timer.function = rcu_timer_func;
+ rcu_timer_restart();
+
+ return 0;
+}
+
+#ifdef CONFIG_JRCU_DAEMON
+static void rcu_timer_stop(void)
+{
+ int stat;
+
+ stat = hrtimer_cancel(&rcu_timer);
+ if (stat)
+ pr_info("JRCU: timer canceled.\n");
+}
+#endif
+
+/*
+ * Transition from a simple to a full featured, interrupt driven RCU.
+ *
+ * This is to protect us against RCU being used very very early in the boot
+ * process, where ideas like 'tasks' and 'cpus' and 'timers' and such are
+ * not yet fully formed. During this very early time, we use a simple,
+ * not-fully-functional braindead version of RCU.
+ *
+ * Invoked from main() at the earliest point where scheduling and timers
+ * are functional.
+ */
+void __init rcu_scheduler_starting(void)
+{
+ int stat;
+
+ stat = rcu_timer_start();
+ if (stat) {
+ pr_err("JRCU: failed to start. This is fatal.\n");
+ return;
+ }
+
+ rcu_scheduler_active = 1;
+ smp_wmb();
+
+ pr_info("JRCU: started\n");
+}
+
+#ifdef CONFIG_JRCU_DAEMON
+
+/* ------------------ daemon driver section --------------------- */
+
+/*
+ * Once the system is fully up, we will drive the periodic-polling part
+ * of JRCU from a kernel daemon, jrcud. Until then it is driven by
+ * an interrupt.
+ */
+#include <linux/err.h>
+#include <linux/param.h>
+#include <linux/kthread.h>
+
+static int rcu_priority;
+static struct task_struct *rcu_daemon;
+
+static int jrcu_set_priority(int priority)
+{
+ struct sched_param param;
+
+ if (priority == 0) {
+ set_user_nice(current, -19);
+ return 0;
+ }
+
+ if (priority < 0)
+ param.sched_priority = MAX_USER_RT_PRIO + priority;
+ else
+ param.sched_priority = priority;
+
+ sched_setscheduler_nocheck(current, SCHED_RR, &param);
+ return param.sched_priority;
+}
+
+static int jrcud_func(void *arg)
+{
+ current->flags |= PF_NOFREEZE;
+ rcu_priority = jrcu_set_priority(CONFIG_JRCU_DAEMON_PRIO);
+ rcu_timer_stop();
+
+ pr_info("JRCU: daemon started. Will operate at ~%d Hz.\n", rcu_hz);
+
+ while (!kthread_should_stop()) {
+ if (rcu_hz_precise) {
+ usleep_range(rcu_hz_period_us,
+ rcu_hz_period_us);
+ } else {
+ usleep_range(rcu_hz_period_us,
+ rcu_hz_period_us + rcu_hz_delta_us);
+ }
+ rcu_delimit_batches();
+ }
+
+ pr_info("JRCU: daemon exiting\n");
+ rcu_daemon = NULL;
+ rcu_timer_restart();
+ return 0;
+}
+
+static __init int jrcud_start(void)
+{
+ struct task_struct *p;
+
+ p = kthread_run(jrcud_func, NULL, "jrcud");
+ if (IS_ERR(p)) {
+ pr_warning("JRCU: daemon not started\n");
+ return -ENODEV;
+ }
+ rcu_daemon = p;
+ return 0;
+}
+late_initcall(jrcud_start);
+
+#endif /* CONFIG_JRCU_DAEMON */
+
+/* ------------------ debug and statistics section -------------- */
+
+#ifdef CONFIG_DEBUG_FS
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <asm/uaccess.h>
+
+static int rcu_debugfs_show(struct seq_file *m, void *unused)
+{
+ int cpu, q;
+ s64 nqueued;
+
+ nqueued = 0;
+ for_each_present_cpu(cpu)
+ nqueued += rcu_data[cpu].nqueued;
+
+ seq_printf(m, "%14u: hz, %s\n",
+ rcu_hz,
+ rcu_hz_precise ? "precise" : "sloppy");
+
+ seq_printf(m, "%14u: watchdog (secs)\n", rcu_wdog_lim / (int)USEC_PER_SEC);
+ seq_printf(m, "%14d: #secs left on watchdog\n",
+ (rcu_wdog_lim - rcu_wdog_ctr) / (int)USEC_PER_SEC);
+
+#ifdef CONFIG_JRCU_DAEMON
+ if (rcu_daemon)
+ seq_printf(m, "%14u: daemon priority\n", rcu_priority);
+ else
+ seq_printf(m, "%14s: daemon priority\n", "none, no daemon");
+#endif
+
+ seq_printf(m, "\n");
+ seq_printf(m, "%14u: #passes\n",
+ rcu_stats.npasses);
+ seq_printf(m, "%14u: #passes discarded due to NMI\n",
+ rcu_stats.nmis);
+ seq_printf(m, "%14u: #passes resulting in end-of-batch\n",
+ rcu_stats.nbatches);
+ seq_printf(m, "%14u: #passes not resulting in end-of-batch\n",
+ rcu_stats.npasses - rcu_stats.nbatches);
+ seq_printf(m, "%14u: #passes since last end-of-batch\n",
+ rcu_stats.nlast);
+ seq_printf(m, "%14u: #passes forced (0 is best)\n",
+ rcu_stats.nforced);
+
+ seq_printf(m, "\n");
+ seq_printf(m, "%14u: #barriers\n",
+ atomic_read(&rcu_stats.nbarriers));
+ seq_printf(m, "%14u: #syncs\n",
+ atomic_read(&rcu_stats.nsyncs));
+ seq_printf(m, "%14llu: #callbacks invoked\n",
+ rcu_stats.ninvoked);
+ seq_printf(m, "%14d: #callbacks left to invoke\n",
+ (int)(nqueued - rcu_stats.ninvoked));
+ seq_printf(m, "\n");
+
+ for_each_online_cpu(cpu)
+ seq_printf(m, "%4d ", cpu);
+ seq_printf(m, " CPU\n");
+
+ for_each_online_cpu(cpu) {
+ struct rcu_data *rd = &rcu_data[cpu];
+ seq_printf(m, "--%c%c ",
+ idle_cpu(cpu) ? 'I' : '-',
+ rd->wait ? 'W' : '-');
+ }
+ seq_printf(m, " FLAGS\n");
+
+ for (q = 0; q < 2; q++) {
+ int w = ACCESS_ONCE(rcu_which);
+ for_each_online_cpu(cpu) {
+ struct rcu_data *rd = &rcu_data[cpu];
+ struct rcu_list *l = &rd->cblist[q];
+ seq_printf(m, "%4d ", l->count);
+ }
+ seq_printf(m, " Q%d%c\n", q, " *"[q == w]);
+ }
+ seq_printf(m, "\nFLAGS:\n");
+ seq_printf(m, " I - cpu idle, W - cpu waiting for end-of-batch,\n");
+ seq_printf(m, " * - the current Q, other is the previous Q.\n");
+
+ return 0;
+}
+
+static ssize_t rcu_debugfs_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *ppos)
+{
+ int i, j, c;
+ char token[32];
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (count <= 0)
+ return count;
+
+ if (!access_ok(VERIFY_READ, buffer, count))
+ return -EFAULT;
+
+ i = 0;
+ if (__get_user(c, &buffer[i++]))
+ return -EFAULT;
+
+next:
+ /* Token extractor -- first, skip leading whitespace */
+ while (c && isspace(c) && i < count) {
+ if (__get_user(c, &buffer[i++]))
+ return -EFAULT;
+ }
+
+ if (i >= count || c == 0)
+ return count; /* all done, no more tokens */
+
+ j = 0;
+ do {
+ if (j == (sizeof(token) - 1))
+ return -EINVAL;
+ token[j++] = c;
+ if (__get_user(c, &buffer[i++]))
+ return -EFAULT;
+ } while (c && !isspace(c) && i < count); /* extract next token */
+ token[j++] = 0;
+
+ if (!strncmp(token, "hz=", 3)) {
+ int rcu_hz_wanted = -1;
+ sscanf(&token[3], "%d", &rcu_hz_wanted);
+ if (rcu_hz_wanted < 2 || rcu_hz_wanted > 1000)
+ return -EINVAL;
+ rcu_hz = rcu_hz_wanted;
+ rcu_hz_period_us = USEC_PER_SEC / rcu_hz;
+ } else if (!strncmp(token, "precise=", 8)) {
+ sscanf(&token[8], "%d", &rcu_hz_precise);
+ } else if (!strncmp(token, "wdog=", 5)) {
+ int wdog = -1;
+ sscanf(&token[5], "%d", &wdog);
+ if (wdog < 3 || wdog > 1000)
+ return -EINVAL;
+ rcu_wdog_lim = wdog * USEC_PER_SEC;
+ } else
+ return -EINVAL;
+ goto next;
+}
+
+static int rcu_debugfs_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rcu_debugfs_show, NULL);
+}
+
+static const struct file_operations rcu_debugfs_fops = {
+ .owner = THIS_MODULE,
+ .open = rcu_debugfs_open,
+ .read = seq_read,
+ .write = rcu_debugfs_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *rcudir;
+
+static int __init rcu_debugfs_init(void)
+{
+ struct dentry *retval;
+
+ rcudir = debugfs_create_dir("rcu", NULL);
+ if (!rcudir)
+ goto error;
+
+ retval = debugfs_create_file("rcudata", 0644, rcudir,
+ NULL, &rcu_debugfs_fops);
+ if (!retval)
+ goto error;
+
+ pr_info("JRCU: Created debugfs files\n");
+ return 0;
+
+error:
+ debugfs_remove_recursive(rcudir);
+ pr_warning("JRCU: Could not create debugfs files\n");
+ return -ENOSYS;
+}
+late_initcall(rcu_debugfs_init);
+#endif /* CONFIG_DEBUG_FS */
Index: 2.6.32.41/include/linux/jrcu.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ 2.6.32.41/include/linux/jrcu.h 2011-06-16 16:01:25.000000000 -0400
@@ -0,0 +1,77 @@
+/*
+ * JRCU - A tiny single-cpu RCU for small SMP systems.
+ *
+ * Author: Joe Korty <joe.korty@...r.com>
+ * Copyright Concurrent Computer Corporation, 2011
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#ifndef __LINUX_JRCU_H
+#define __LINUX_JRCU_H
+
+#define __rcu_read_lock() preempt_disable()
+extern void __rcu_read_unlock(void);
+
+#define __rcu_read_lock_bh() __rcu_read_lock()
+#define __rcu_read_unlock_bh() __rcu_read_unlock()
+
+extern void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
+
+#define call_rcu_sched call_rcu
+#define call_rcu_bh call_rcu
+
+extern void rcu_barrier(void);
+
+#define rcu_barrier_sched rcu_barrier
+#define rcu_barrier_bh rcu_barrier
+
+extern void synchronize_sched(void);
+
+#define synchronize_rcu synchronize_sched
+#define synchronize_rcu_bh synchronize_sched
+#define synchronize_rcu_expedited synchronize_sched
+#define synchronize_rcu_bh_expedited synchronize_sched
+
+#define rcu_init(cpu) do { } while (0)
+#define rcu_init_sched() do { } while (0)
+#define exit_rcu() do { } while (0)
+
+static inline void __rcu_check_callbacks(int cpu, int user) { }
+#define rcu_check_callbacks __rcu_check_callbacks
+
+#define rcu_needs_cpu(cpu) (0)
+#define rcu_batches_completed() (0)
+#define rcu_batches_completed_bh() (0)
+#define rcu_preempt_depth() (0)
+
+extern void rcu_force_quiescent_state(void);
+
+#define rcu_sched_force_quiescent_state rcu_force_quiescent_state
+#define rcu_bh_force_quiescent_state rcu_force_quiescent_state
+
+#define rcu_enter_nohz() do { } while (0)
+#define rcu_exit_nohz() do { } while (0)
+
+extern void rcu_note_context_switch(int cpu);
+
+#define rcu_sched_qs rcu_note_context_switch
+#define rcu_bh_qs rcu_note_context_switch
+
+extern void rcu_note_might_resched(void);
+
+extern void rcu_scheduler_starting(void);
+extern int rcu_scheduler_active __read_mostly;
+
+#endif /* __LINUX_JRCU_H */
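
For completeness, the interface above is call-compatible with the
stock RCU flavors, so client code does not change. Below is a
minimal, illustrative usage sketch -- 'struct foo', 'global_foo',
and the helpers are hypothetical and not part of this patch, and
updaters are assumed to be serialized externally:

	#include <linux/slab.h>
	#include <linux/rcupdate.h>

	struct foo {
		int val;
		struct rcu_head rcu;
	};

	static struct foo *global_foo;

	static void foo_free_rcu(struct rcu_head *head)
	{
		kfree(container_of(head, struct foo, rcu));
	}

	/* Updater: publish a new version, defer freeing the old one. */
	static void foo_replace(struct foo *newp)
	{
		struct foo *old = global_foo;

		rcu_assign_pointer(global_foo, newp);
		if (old)
			call_rcu(&old->rcu, foo_free_rcu);
	}

	/* Reader: the usual rcu_read_lock()/rcu_read_unlock() pairing. */
	static int foo_read_val(void)
	{
		struct foo *p;
		int val = -1;

		rcu_read_lock();
		p = rcu_dereference(global_foo);
		if (p)
			val = p->val;
		rcu_read_unlock();
		return val;
	}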
Index: 2.6.32.41/include/linux/rcupdate.h
===================================================================
--- 2.6.32.41.orig/include/linux/rcupdate.h 2011-06-16 16:00:40.000000000 -0400
+++ 2.6.32.41/include/linux/rcupdate.h 2011-06-16 16:01:25.000000000 -0400
@@ -73,6 +73,8 @@
#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
#include <linux/rcutree.h>
+#elif defined(CONFIG_JRCU)
+#include <linux/jrcu.h>
#else
#error "Unknown RCU implementation specified to kernel configuration"
#endif
Index: 2.6.32.41/init/Kconfig
===================================================================
--- 2.6.32.41.orig/init/Kconfig 2011-06-16 16:00:40.000000000 -0400
+++ 2.6.32.41/init/Kconfig 2011-06-16 16:01:25.000000000 -0400
@@ -334,8 +334,71 @@
is also required. It also scales down nicely to
smaller systems.
+config JRCU
+ bool "A tiny single-CPU RCU for small SMP systems"
+ depends on PREEMPT
+ depends on SMP
+ select PREEMPT_COUNT_CPU
+ help
+ This option selects a minimal-footprint RCU suitable for small SMP
+ systems -- that is, those with fewer than 16 or perhaps 32, and
+ certainly fewer than 64 processors.
+
+ This RCU variant may be a good choice for systems with low latency
+ requirements. It does RCU garbage collection from a single CPU
+ rather than have each CPU do its own. This frees up all but one
+ CPU from interference by this periodic requirement.
+
+ Most users should say N here.
+
endchoice
+config JRCU_DAEMON
+ bool
+ depends on JRCU
+ default y
+ help
+ Required. The context switch when leaving the daemon is needed
+ to get the CPU to reliably participate in end-of-batch processing.
+
+config JRCU_DAEMON_PRIO
+ int "JRCU Daemon priority"
+ depends on JRCU_DAEMON
+ default 0
+ help
+ The JRCU daemon priority. If 0 then the daemon runs SCHED_OTHER.
+ If >0 then the daemon runs SCHED_RR and its priority will be
+ the value selected. If <0 then SCHED_RR is again selected, but
+ now its priority will be biased downwards from the maximum
+ possible POSIX priority.
+
+config JRCU_LAZY
+ bool "Should JRCU be lazy recognizing end-of-batch"
+ depends on JRCU
+ default n
+ help
+ If you say Y here, JRCU will on occasion fail to recognize
+ end-of-batch for an rcu period or two.
+
+ If you say N here, JRCU will be more aggressive; in fact it
+ will always recognize end-of-batch at the earliest possible time.
+
+ Being lazy should be fractionally more efficient in that JRCU
+ inserts fewer memory barriers along some high performance kernel
+ code paths.
+
+ If unsure, say N.
+
+config PREEMPT_COUNT_CPU
+ # bool "Let one CPU look at another CPU's preemption count"
+ bool
+ default n
+ help
+ If Y then the preempt_count_cpu() function will be compiled into
+ the kernel. Its existence impacts kernel performance slightly,
+ so this option should be selected only if other kernel features
+ that use preempt_count_cpu() are also selected.
+
config RCU_TRACE
bool "Enable tracing for RCU"
depends on TREE_RCU || TREE_PREEMPT_RCU
Index: 2.6.32.41/kernel/Makefile
===================================================================
--- 2.6.32.41.orig/kernel/Makefile 2011-06-16 16:00:40.000000000 -0400
+++ 2.6.32.41/kernel/Makefile 2011-06-16 16:01:25.000000000 -0400
@@ -80,6 +80,7 @@
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_TREE_RCU) += rcutree.o
+obj-$(CONFIG_JRCU) += jrcu.o
obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
obj-$(CONFIG_RELAY) += relay.o
Index: 2.6.32.41/include/linux/hardirq.h
===================================================================
--- 2.6.32.41.orig/include/linux/hardirq.h 2011-06-16 16:00:40.000000000 -0400
+++ 2.6.32.41/include/linux/hardirq.h 2011-06-16 16:01:25.000000000 -0400
@@ -145,7 +145,13 @@
extern void account_system_vtime(struct task_struct *tsk);
#endif
-#if defined(CONFIG_NO_HZ)
+#if defined(CONFIG_JRCU)
+extern int rcu_nmi_seen;
+# define rcu_irq_enter() do { } while (0)
+# define rcu_irq_exit() do { } while (0)
+# define rcu_nmi_enter() do { rcu_nmi_seen = 1; } while (0)
+# define rcu_nmi_exit() do { } while (0)
+#elif defined(CONFIG_NO_HZ)
extern void rcu_irq_enter(void);
extern void rcu_irq_exit(void);
extern void rcu_nmi_enter(void);
Index: 2.6.32.41/kernel/sched.c
===================================================================
--- 2.6.32.41.orig/kernel/sched.c 2011-06-16 16:00:40.000000000 -0400
+++ 2.6.32.41/kernel/sched.c 2011-06-16 16:01:25.000000000 -0400
@@ -1190,6 +1190,16 @@
spin_unlock_irqrestore(&rq->lock, flags);
}
+void force_cpu_resched(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ spin_lock_irqsave(&rq->lock, flags);
+ resched_task(cpu_curr(cpu));
+ spin_unlock_irqrestore(&rq->lock, flags);
+}
+
#ifdef CONFIG_NO_HZ
/*
* When add_timer_on() enqueues a timer into the timer wheel of an
@@ -1273,6 +1283,11 @@
static void sched_avg_update(struct rq *rq)
{
}
+
+void force_cpu_resched(int cpu)
+{
+ set_need_resched();
+}
#endif /* CONFIG_SMP */
#if BITS_PER_LONG == 32
@@ -2753,6 +2768,24 @@
put_cpu();
}
+#ifdef CONFIG_PREEMPT_COUNT_CPU
+
+/*
+ * Fetch the preempt count of some cpu's current task. Must be called
+ * with interrupts blocked. Stale return value.
+ *
+ * No locking needed as this always wins the race with context-switch-out
+ * + task destruction, since that is so heavyweight. The smp_rmb() is
+ * to protect the pointers in that race, not the data being pointed to
+ * (which, being guaranteed stale, can stand a bit of fuzziness).
+ */
+int preempt_count_cpu(int cpu)
+{
+ smp_rmb(); /* stop data prefetch until program ctr gets here */
+ return task_thread_info(cpu_curr(cpu))->preempt_count;
+}
+#endif
+
/*
* wake_up_new_task - wake up a newly created task for the first time.
*
@@ -5552,7 +5585,7 @@
if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
return;
#endif
- preempt_count() += val;
+ __add_preempt_count(val);
#ifdef CONFIG_DEBUG_PREEMPT
/*
* Spinlock count overflowing soon?
@@ -5583,7 +5616,7 @@
if (preempt_count() == val)
trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
- preempt_count() -= val;
+ __sub_preempt_count(val);
}
EXPORT_SYMBOL(sub_preempt_count);
@@ -5742,6 +5775,9 @@
rq->nr_switches++;
rq->curr = next;
+#ifdef CONFIG_PREEMPT_COUNT_CPU
+ smp_wmb();
+#endif
++*switch_count;
context_switch(rq, prev, next); /* unlocks the rq */
@@ -9983,6 +10019,9 @@
void set_curr_task(int cpu, struct task_struct *p)
{
cpu_curr(cpu) = p;
+#ifdef CONFIG_PREEMPT_COUNT_CPU
+ smp_wmb();
+#endif
}
#endif
Index: 2.6.32.41/include/linux/preempt.h
===================================================================
--- 2.6.32.41.orig/include/linux/preempt.h 2011-06-16 16:00:40.000000000 -0400
+++ 2.6.32.41/include/linux/preempt.h 2011-06-16 16:01:25.000000000 -0400
@@ -10,18 +10,45 @@
#include <linux/linkage.h>
#include <linux/list.h>
+/* cannot include rcupdate.h here, so open-code this */
+
+#if defined(CONFIG_JRCU)
+# define __add_preempt_count(val) do { \
+ int newval = (preempt_count() += (val)); \
+ if (newval == (val)) \
+ smp_wmb(); \
+} while (0)
+#else
+# define __add_preempt_count(val) do { preempt_count() += (val); } while (0)
+#endif
+
+#if defined(CONFIG_JRCU_LAZY) || !defined(CONFIG_JRCU)
+# define __sub_preempt_count(val) do { preempt_count() -= (val); } while (0)
+#else
+# define __sub_preempt_count(val) do { \
+ int newval = (preempt_count() -= (val)); \
+ if (newval == 0) { \
+ /* race with preemption OK, preempt will do the mb for us */ \
+ smp_wmb(); \
+ } \
+} while (0)
+#endif
+
#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
extern void add_preempt_count(int val);
extern void sub_preempt_count(int val);
#else
-# define add_preempt_count(val) do { preempt_count() += (val); } while (0)
-# define sub_preempt_count(val) do { preempt_count() -= (val); } while (0)
+# define add_preempt_count(val) __add_preempt_count(val)
+# define sub_preempt_count(val) __sub_preempt_count(val)
#endif
#define inc_preempt_count() add_preempt_count(1)
#define dec_preempt_count() sub_preempt_count(1)
#define preempt_count() (current_thread_info()->preempt_count)
+#ifdef CONFIG_PREEMPT_COUNT_CPU
+extern int preempt_count_cpu(int cpu);
+#endif
#ifdef CONFIG_PREEMPT
Index: 2.6.32.41/include/linux/kernel.h
===================================================================
--- 2.6.32.41.orig/include/linux/kernel.h 2011-06-16 16:00:40.000000000 -0400
+++ 2.6.32.41/include/linux/kernel.h 2011-06-16 16:01:25.000000000 -0400
@@ -117,11 +117,18 @@
struct pt_regs;
struct user;
+/* cannot bring in linux/rcupdate.h at this point */
+#ifdef CONFIG_JRCU
+extern void rcu_note_might_resched(void);
+#else
+#define rcu_note_might_resched()
+#endif /* JRCU */
+
#ifdef CONFIG_PREEMPT_VOLUNTARY
extern int _cond_resched(void);
-# define might_resched() _cond_resched()
+# define might_resched() do { _cond_resched(); rcu_note_might_resched(); } while (0)
#else
-# define might_resched() do { } while (0)
+# define might_resched() do { rcu_note_might_resched(); } while (0)
#endif
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
Index: 2.6.32.41/arch/x86/include/asm/thread_info.h
===================================================================
--- 2.6.32.41.orig/arch/x86/include/asm/thread_info.h 2011-06-16 16:00:40.000000000 -0400
+++ 2.6.32.41/arch/x86/include/asm/thread_info.h 2011-06-16 16:01:25.000000000 -0400
@@ -29,6 +29,7 @@
__u32 flags; /* low level flags */
__u32 status; /* thread synchronous flags */
__u32 cpu; /* current CPU */
+#define HAVE_THREAD_INFO_CPU 1
int preempt_count; /* 0 => preemptable,
<0 => BUG */
mm_segment_t addr_limit;
Index: 2.6.32.41/include/linux/sched.h
===================================================================
--- 2.6.32.41.orig/include/linux/sched.h 2011-06-16 16:00:40.000000000 -0400
+++ 2.6.32.41/include/linux/sched.h 2011-06-16 16:01:25.000000000 -0400
@@ -1908,6 +1908,8 @@
static inline void wake_up_idle_cpu(int cpu) { }
#endif
+extern void force_cpu_resched(int cpu);
+
extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity;
Index: 2.6.32.41/kernel/rcupdate.c
===================================================================
--- 2.6.32.41.orig/kernel/rcupdate.c 2009-12-02 22:51:21.000000000 -0500
+++ 2.6.32.41/kernel/rcupdate.c 2011-06-16 16:20:37.000000000 -0400
@@ -53,7 +53,9 @@
EXPORT_SYMBOL_GPL(rcu_lock_map);
#endif
+#ifndef CONFIG_JRCU
int rcu_scheduler_active __read_mostly;
+#endif
/*
* Awaken the corresponding synchronize_rcu() instance now that a
@@ -95,6 +97,8 @@
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifndef CONFIG_JRCU
+
/**
* synchronize_sched - wait until an rcu-sched grace period has elapsed.
*
@@ -185,3 +189,5 @@
WARN_ON(nr_context_switches() > 0);
rcu_scheduler_active = 1;
}
+
+#endif /* CONFIG_JRCU */