[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20101105210059.GA27317@tsunami.ccur.com>
Date: Fri, 5 Nov 2010 17:00:59 -0400
From: Joe Korty <joe.korty@...r.com>
To: "Paul E. McKenney" <paulmck@...ux.vnet.ibm.com>
Cc: fweisbec@...il.com, mathieu.desnoyers@...icios.com,
dhowells@...hat.com, loic.minier@...aro.org,
dhaval.giani@...il.com, tglx@...utronix.de, peterz@...radead.org,
linux-kernel@...r.kernel.org, josh@...htriplett.org
Subject: [PATCH] a local-timer-free version of RCU
On Thu, Nov 04, 2010 at 04:21:48PM -0700, Paul E. McKenney wrote:
> Just wanted some written record of our discussion this Wednesday.
> I don't have an email address for Jim Houston, and I am not sure I have
> all of the attendees, but here goes anyway. Please don't hesitate to
> reply with any corrections!
>
> The goal is to be able to turn off scheduling-clock interrupts for
> long-running user-mode execution when there is but one runnable task
> on a given CPU, but while still allowing RCU to function correctly.
> In particular, we need to minimize (or better, eliminate) any source
> of interruption to such a CPU. We discussed these approaches, along
> with their advantages and disadvantages:
Jim Houston's timer-less version of RCU.
This rather ancient version of RCU handles RCU garbage
collection in the absence of a per-cpu local timer
interrupt.
This is a minimal forward port to 2.6.36. It works,
but it is not yet a complete implementation of RCU.
Developed-by: Jim Houston <jim.houston@...r.com>
Signed-off-by: Joe Korty <joe.korty@...r.com>
Index: b/arch/x86/kernel/cpu/mcheck/mce.c
===================================================================
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -167,7 +167,8 @@ void mce_log(struct mce *mce)
mce->finished = 0;
wmb();
for (;;) {
- entry = rcu_dereference_check_mce(mcelog.next);
+ entry = mcelog.next;
+ smp_read_barrier_depends();
for (;;) {
/*
* If edac_mce is enabled, it will check the error type
@@ -1558,7 +1559,8 @@ static ssize_t mce_read(struct file *fil
goto out;
}
- next = rcu_dereference_check_mce(mcelog.next);
+ next = mcelog.next;
+ smp_read_barrier_depends();
/* Only supports full reads right now */
err = -EINVAL;
Index: b/include/linux/rcushield.h
===================================================================
--- /dev/null
+++ b/include/linux/rcushield.h
@@ -0,0 +1,361 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2001
+ *
+ * Author: Dipankar Sarma <dipankar@...ibm.com>
+ *
+ * Based on the original work by Paul McKenney <paul.mckenney@...ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ * Papers:
+ * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
+ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * http://lse.sourceforge.net/locking/rcupdate.html
+ *
+ */
+
+#ifndef __LINUX_RCUPDATE_H
+#define __LINUX_RCUPDATE_H
+
+#ifdef __KERNEL__
+
+#include <linux/cache.h>
+#include <linux/spinlock.h>
+#include <linux/threads.h>
+#include <linux/smp.h>
+#include <linux/cpumask.h>
+
+/*
+ * These #includes are not used by shielded RCUs; they are here
+ * to match the #includes made by the other rcu implementations.
+ */
+#include <linux/seqlock.h>
+#include <linux/lockdep.h>
+#include <linux/completion.h>
+
+/**
+ * struct rcu_head - callback structure for use with RCU
+ * @next: next update requests in a list
+ * @func: actual update function to call after the grace period.
+ */
+struct rcu_head {
+ struct rcu_head *next;
+ void (*func)(struct rcu_head *head);
+};
+
+#define RCU_HEAD_INIT { .next = NULL, .func = NULL }
+#define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT
+#define INIT_RCU_HEAD(ptr) do { \
+ (ptr)->next = NULL; (ptr)->func = NULL; \
+} while (0)
+
+/*
+ * The rcu_batch variable contains the current batch number
+ * and the following flags. The RCU_NEXT_PENDING bit requests that
+ * a new batch should start when the current batch completes. The
+ * RCU_COMPLETE bit indicates that the most recent batch has completed
+ * and RCU processing has stopped.
+ */
+extern long rcu_batch;
+#define RCU_BATCH_MASK (~3)
+#define RCU_INCREMENT 4
+#define RCU_COMPLETE 2
+#define RCU_NEXT_PENDING 1
+
+/* Is batch a before batch b ? */
+static inline int rcu_batch_before(long a, long b)
+{
+ return (a - b) < 0;
+}
+
+/* Is batch a after batch b ? */
+static inline int rcu_batch_after(long a, long b)
+{
+ return (a - b) > 0;
+}
+
+static inline int rcu_batch_complete(long batch)
+{
+ return !rcu_batch_before((rcu_batch & ~RCU_NEXT_PENDING), batch);
+}
+
+struct rcu_list {
+ struct rcu_head *head;
+ struct rcu_head **tail;
+};
+
+static inline void rcu_list_init(struct rcu_list *l)
+{
+ l->head = NULL;
+ l->tail = &l->head;
+}
+
+static inline void rcu_list_add(struct rcu_list *l, struct rcu_head *h)
+{
+ *l->tail = h;
+ l->tail = &h->next;
+}
+
+static inline void rcu_list_move(struct rcu_list *to, struct rcu_list *from)
+{
+ if (from->head) {
+ *to->tail = from->head;
+ to->tail = from->tail;
+ rcu_list_init(from);
+ }
+}
+
+/*
+ * Per-CPU data for Read-Copy Update.
+ * nxtlist - new callbacks are added here
+ * curlist - current batch for which quiescent cycle started if any
+ */
+struct rcu_data {
+ /* 1) batch handling */
+ long batch; /* batch # for current RCU batch */
+ unsigned long nxtbatch; /* batch # for next queue */
+ struct rcu_list nxt;
+ struct rcu_list cur;
+ struct rcu_list done;
+ long nxtcount; /* number of callbacks queued */
+ struct task_struct *krcud;
+ struct rcu_head barrier;
+
+ /* 2) synchronization between rcu_read_lock and rcu_start_batch. */
+ int nest_count; /* count of rcu_read_lock nesting */
+ unsigned int flags;
+ unsigned int sequence; /* count of read locks. */
+};
+
+/*
+ * Flags values used to synchronize between rcu_read_lock/rcu_read_unlock
+ * and the rcu_start_batch. Only processors executing rcu_read_lock
+ * protected code get invited to the rendezvous.
+ */
+#define IN_RCU_READ_LOCK 1
+#define DO_RCU_COMPLETION 2
+
+DECLARE_PER_CPU(struct rcu_data, rcu_data);
+
+/**
+ * rcu_assign_pointer - assign (publicize) a pointer to a newly
+ * initialized structure that will be dereferenced by RCU read-side
+ * critical sections. Returns the value assigned.
+ *
+ * Inserts memory barriers on architectures that require them
+ * (pretty much all of them other than x86), and also prevents
+ * the compiler from reordering the code that initializes the
+ * structure after the pointer assignment. More importantly, this
+ * call documents which pointers will be dereferenced by RCU read-side
+ * code.
+ */
+
+#define rcu_assign_pointer(p, v) ({ \
+ smp_wmb(); \
+ (p) = (v); \
+ })
+
+extern void rcu_init(void);
+extern void rcu_restart_cpu(int cpu);
+extern void rcu_quiescent(int cpu);
+extern void rcu_poll(int cpu);
+
+/* stubs for mainline rcu features we do not need */
+static inline void rcu_sched_qs(int cpu) { }
+static inline void rcu_bh_qs(int cpu) { }
+static inline int rcu_needs_cpu(int cpu) { return 0; }
+static inline void rcu_enter_nohz(void) { }
+static inline void rcu_exit_nohz(void) { }
+static inline void rcu_init_sched(void) { }
+
+extern void __rcu_read_lock(void);
+extern void __rcu_read_unlock(void);
+
+static inline void rcu_read_lock(void)
+{
+ preempt_disable();
+ __rcu_read_lock();
+}
+
+static inline void rcu_read_unlock(void)
+{
+ __rcu_read_unlock();
+ preempt_enable();
+}
+
+#define rcu_read_lock_sched(void) rcu_read_lock()
+#define rcu_read_unlock_sched(void) rcu_read_unlock()
+
+static inline void rcu_read_lock_sched_notrace(void)
+{
+ preempt_disable_notrace();
+ __rcu_read_lock();
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#error need DEBUG_LOCK_ALLOC definitions for rcu_read_lock_*_held
+#else
+static inline int rcu_read_lock_held(void)
+{
+ return 1;
+}
+
+static inline int rcu_read_lock_bh_held(void)
+{
+ return 1;
+}
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+
+static inline int rcu_preempt_depth(void)
+{
+ return 0;
+}
+
+static inline void exit_rcu(void)
+{
+}
+
+static inline void rcu_read_unlock_sched_notrace(void)
+{
+ __rcu_read_unlock();
+ preempt_enable_notrace();
+}
+
+#ifdef CONFIG_DEBUG_KERNEL
+/*
+ * Try to catch code which depends on RCU but doesn't
+ * hold the rcu_read_lock.
+ */
+static inline void rcu_read_lock_assert(void)
+{
+#ifdef NOTYET
+ /* 2.6.13 has _lots_ of panics here. Must fix up. */
+ struct rcu_data *r;
+
+ r = &per_cpu(rcu_data, smp_processor_id());
+ BUG_ON(r->nest_count == 0);
+#endif
+}
+#else
+static inline void rcu_read_lock_assert(void) {}
+#endif
+
+/*
+ * So where is rcu_write_lock()? It does not exist, as there is no
+ * way for writers to lock out RCU readers. This is a feature, not
+ * a bug -- this property is what provides RCU's performance benefits.
+ * Of course, writers must coordinate with each other. The normal
+ * spinlock primitives work well for this, but any other technique may be
+ * used as well. RCU does not care how the writers keep out of each
+ * others' way, as long as they do so.
+ */
+
+/**
+ * rcu_read_lock_bh - mark the beginning of a softirq-only RCU critical section
+ *
+ * This is equivalent of rcu_read_lock(), but to be used when updates
+ * are being done using call_rcu_bh(). Since call_rcu_bh() callbacks
+ * consider completion of a softirq handler to be a quiescent state,
+ * a process in RCU read-side critical section must be protected by
+ * disabling softirqs. Read-side critical sections in interrupt context
+ * can use just rcu_read_lock().
+ *
+ * Hack alert. I'm not sure if I understand the reason this interface
+ * is needed and if it is still needed with my implementation of RCU.
+ */
+static inline void rcu_read_lock_bh(void)
+{
+ local_bh_disable();
+ rcu_read_lock();
+}
+
+/*
+ * rcu_read_unlock_bh - marks the end of a softirq-only RCU critical section
+ *
+ * See rcu_read_lock_bh() for more information.
+ */
+static inline void rcu_read_unlock_bh(void)
+{
+ rcu_read_unlock();
+ local_bh_enable();
+}
+
+/**
+ * rcu_dereference - fetch an RCU-protected pointer in an
+ * RCU read-side critical section. This pointer may later
+ * be safely dereferenced.
+ *
+ * Inserts memory barriers on architectures that require them
+ * (currently only the Alpha), and, more importantly, documents
+ * exactly which pointers are protected by RCU.
+ */
+
+#define rcu_dereference(p) ({ \
+ typeof(p) _________p1 = p; \
+ rcu_read_lock_assert(); \
+ smp_read_barrier_depends(); \
+ (_________p1); \
+ })
+
+#define rcu_dereference_raw(p) ({ \
+ typeof(p) _________p1 = p; \
+ smp_read_barrier_depends(); \
+ (_________p1); \
+ })
+
+#define rcu_dereference_sched(p) rcu_dereference(p)
+#define rcu_dereference_check(p, c) rcu_dereference(p)
+#define rcu_dereference_index_check(p, c) rcu_dereference(p)
+#define rcu_dereference_protected(p, c) rcu_dereference(p)
+#define rcu_dereference_bh(p) rcu_dereference(p)
+
+static inline void rcu_note_context_switch(int cpu) {}
+
+/**
+ * synchronize_sched - block until all CPUs have exited any non-preemptive
+ * kernel code sequences.
+ *
+ * This means that all preempt_disable code sequences, including NMI and
+ * hardware-interrupt handlers, in progress on entry will have completed
+ * before this primitive returns. However, this does not guarantee that
+ * softirq handlers will have completed, since they may run in process context in some kernels.
+ *
+ * This primitive provides the guarantees made by the (deprecated)
+ * synchronize_kernel() API. In contrast, synchronize_rcu() only
+ * guarantees that rcu_read_lock() sections will have completed.
+ */
+#define synchronize_sched synchronize_rcu
+#define synchronize_sched_expedited synchronize_rcu
+
+/* Exported interfaces */
+#define call_rcu_sched(head, func) call_rcu(head, func)
+extern void call_rcu(struct rcu_head *head,
+ void (*func)(struct rcu_head *head));
+extern void call_rcu_bh(struct rcu_head *head,
+ void (*func)(struct rcu_head *head));
+extern __deprecated_for_modules void synchronize_kernel(void);
+extern void synchronize_rcu(void);
+extern void rcu_barrier(void);
+#define rcu_barrier_sched rcu_barrier
+#define rcu_barrier_bh rcu_barrier
+static inline void rcu_scheduler_starting(void) {}
+extern void do_delayed_rcu_daemon_wakeups(void);
+
+#endif /* __KERNEL__ */
+#endif /* __LINUX_RCUPDATE_H */
Index: b/include/linux/rcupdate.h
===================================================================
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -30,6 +30,10 @@
*
*/
+#ifdef CONFIG_SHIELDING_RCU
+#include <linux/rcushield.h>
+#else
+
#ifndef __LINUX_RCUPDATE_H
#define __LINUX_RCUPDATE_H
@@ -600,3 +604,4 @@ static inline void debug_rcu_head_unqueu
__rcu_dereference_index_check((p), (c))
#endif /* __LINUX_RCUPDATE_H */
+#endif /* CONFIG_SHIELDING_RCU */
Index: b/include/linux/sysctl.h
===================================================================
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -153,6 +153,7 @@ enum
KERN_MAX_LOCK_DEPTH=74, /* int: rtmutex's maximum lock depth */
KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */
KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */
+ KERN_RCU=77, /* make rcu variables available for debug */
};
@@ -235,6 +236,11 @@ enum
RANDOM_UUID=6
};
+/* /proc/sys/kernel/rcu */
+enum {
+ RCU_BATCH=1
+};
+
/* /proc/sys/kernel/pty */
enum
{
Index: b/init/main.c
===================================================================
--- a/init/main.c
+++ b/init/main.c
@@ -606,13 +606,13 @@ asmlinkage void __init start_kernel(void
"enabled *very* early, fixing it\n");
local_irq_disable();
}
- rcu_init();
radix_tree_init();
/* init some links before init_ISA_irqs() */
early_irq_init();
init_IRQ();
prio_tree_init();
init_timers();
+ rcu_init(); /* must appear after init_timers for shielded rcu */
hrtimers_init();
softirq_init();
timekeeping_init();
Index: b/kernel/Makefile
===================================================================
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -6,13 +6,16 @@ obj-y = sched.o fork.o exec_domain.o
cpu.o exit.o itimer.o time.o softirq.o resource.o \
sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
signal.o sys.o kmod.o workqueue.o pid.o \
- rcupdate.o extable.o params.o posix-timers.o \
+ extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
async.o range.o
obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
obj-y += groups.o
+ifndef CONFIG_SHIELDING_RCU
+obj-y += rcupdate.o
+endif
ifdef CONFIG_FUNCTION_TRACER
# Do not trace debug files and internal ftrace files
@@ -81,6 +84,7 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_t
obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
+obj-$(CONFIG_SHIELDING_RCU) += rcushield.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_TREE_RCU) += rcutree.o
obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
Index: b/kernel/rcushield.c
===================================================================
--- /dev/null
+++ b/kernel/rcushield.c
@@ -0,0 +1,812 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2001
+ *
+ * Authors: Dipankar Sarma <dipankar@...ibm.com>
+ * Manfred Spraul <manfred@...orfullife.com>
+ *
+ * Based on the original work by Paul McKenney <paulmck@...ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ * Papers:
+ * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
+ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * http://lse.sourceforge.net/locking/rcupdate.html
+ *
+ * Modified by: Jim Houston <jim.houston@...r.com>
+ * This is an experimental version which uses explicit synchronization
+ * between rcu_read_lock/rcu_read_unlock and rcu_poll_other_cpus()
+ * to complete RCU batches without relying on timer based polling.
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/rcupdate.h>
+#include <linux/cpu.h>
+#include <linux/jiffies.h>
+#include <linux/kthread.h>
+#include <linux/sysctl.h>
+
+/*
+ * Definition for rcu_batch. This variable includes the flags:
+ * RCU_NEXT_PENDING
+ * used to request that another batch should be
+ * started when the current batch completes.
+ * RCU_COMPLETE
+ * which indicates that the last batch completed and
+ * that rcu callback processing is stopped.
+ *
+ * Combining this state in a single word allows the flags to be maintained
+ * using an atomic exchange.
+ */
+long rcu_batch = (-300*RCU_INCREMENT)+RCU_COMPLETE;
+unsigned long rcu_timestamp;
+
+/* Bookkeeping of the progress of the grace period */
+struct {
+ cpumask_t rcu_cpu_mask; /* CPUs that need to switch in order */
+ /* for current batch to proceed. */
+} rcu_state ____cacheline_internodealigned_in_smp =
+ { .rcu_cpu_mask = CPU_MASK_NONE };
+
+
+DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
+
+/*
+ * Limits to control when new batches of RCU callbacks are started.
+ */
+long rcu_max_count = 256;
+unsigned long rcu_max_time = HZ/10;
+
+static void rcu_start_batch(void);
+
+/*
+ * Make the rcu_batch available for debug.
+ */
+ctl_table rcu_table[] = {
+ {
+ .procname = "batch",
+ .data = &rcu_batch,
+ .maxlen = sizeof(rcu_batch),
+ .mode = 0444,
+ .proc_handler = &proc_doulongvec_minmax,
+ },
+ {}
+};
+
+/*
+ * rcu_set_state maintains the RCU_COMPLETE and RCU_NEXT_PENDING
+ * bits in rcu_batch. Multiple processors might try to mark the
+ * current batch as complete, or start a new batch at the same time.
+ * The cmpxchg() makes the state transition atomic. rcu_set_state()
+ * returns the previous state. This allows the caller to tell if
+ * it caused the state transition.
+ */
+
+int rcu_set_state(long state)
+{
+ long batch, new, last;
+ do {
+ batch = rcu_batch;
+ if (batch & state)
+ return batch & (RCU_COMPLETE | RCU_NEXT_PENDING);
+ new = batch | state;
+ last = cmpxchg(&rcu_batch, batch, new);
+ } while (unlikely(last != batch));
+ return last & (RCU_COMPLETE | RCU_NEXT_PENDING);
+}
+
+
+static atomic_t rcu_barrier_cpu_count;
+static struct mutex rcu_barrier_mutex;
+static struct completion rcu_barrier_completion;
+
+/*
+ * If the batch in the nxt list or cur list has completed move it to the
+ * done list. If its grace period for the nxt list has begun
+ * move the contents to the cur list.
+ */
+static int rcu_move_if_done(struct rcu_data *r)
+{
+ int done = 0;
+
+ if (r->cur.head && rcu_batch_complete(r->batch)) {
+ rcu_list_move(&r->done, &r->cur);
+ done = 1;
+ }
+ if (r->nxt.head) {
+ if (rcu_batch_complete(r->nxtbatch)) {
+ rcu_list_move(&r->done, &r->nxt);
+ r->nxtcount = 0;
+ done = 1;
+ } else if (r->nxtbatch == rcu_batch) {
+ /*
+ * The grace period for the nxt list has started
+ * move its content to the cur list.
+ */
+ rcu_list_move(&r->cur, &r->nxt);
+ r->batch = r->nxtbatch;
+ r->nxtcount = 0;
+ }
+ }
+ return done;
+}
+
+/*
+ * support delayed krcud wakeups. Needed whenever we
+ * cannot wake up krcud directly, this happens whenever
+ * rcu_read_lock ... rcu_read_unlock is used under
+ * rq->lock.
+ */
+static cpumask_t rcu_wake_mask = CPU_MASK_NONE;
+static cpumask_t rcu_wake_mask_copy;
+static DEFINE_RAW_SPINLOCK(rcu_wake_lock);
+static int rcu_delayed_wake_count;
+
+void do_delayed_rcu_daemon_wakeups(void)
+{
+ int cpu;
+ unsigned long flags;
+ struct rcu_data *r;
+ struct task_struct *p;
+
+ if (likely(cpumask_empty(&rcu_wake_mask)))
+ return;
+
+ raw_spin_lock_irqsave(&rcu_wake_lock, flags);
+ cpumask_copy(&rcu_wake_mask_copy, &rcu_wake_mask);
+ cpumask_clear(&rcu_wake_mask);
+ raw_spin_unlock_irqrestore(&rcu_wake_lock, flags);
+
+ for_each_cpu(cpu, &rcu_wake_mask_copy) {
+ r = &per_cpu(rcu_data, cpu);
+ p = r->krcud;
+ if (p && p->state != TASK_RUNNING) {
+ wake_up_process(p);
+ rcu_delayed_wake_count++;
+ }
+ }
+}
+
+void rcu_wake_daemon_delayed(struct rcu_data *r)
+{
+ unsigned long flags;
+ raw_spin_lock_irqsave(&rcu_wake_lock, flags);
+ cpumask_set_cpu(task_cpu(r->krcud), &rcu_wake_mask);
+ raw_spin_unlock_irqrestore(&rcu_wake_lock, flags);
+}
+
+/*
+ * Wake rcu daemon if it is not already running. Note that
+ * we avoid invoking wake_up_process if RCU is being used under
+ * the rq lock.
+ */
+void rcu_wake_daemon(struct rcu_data *r)
+{
+ struct task_struct *p = r->krcud;
+
+ if (p && p->state != TASK_RUNNING) {
+#ifdef BROKEN
+ /* runqueue_is_locked is racy, let us use only
+ * the delayed approach.
+ */
+ if (unlikely(runqueue_is_locked(smp_processor_id())))
+ rcu_wake_daemon_delayed(r);
+ else
+ wake_up_process(p);
+#else
+ rcu_wake_daemon_delayed(r);
+#endif
+ }
+}
+
+/**
+ * rcu_read_lock - mark the beginning of an RCU read-side critical section.
+ *
+ * When synchronize_rcu() is invoked on one CPU while other CPUs
+ * are within RCU read-side critical sections, then the
+ * synchronize_rcu() is guaranteed to block until after all the other
+ * CPUs exit their critical sections. Similarly, if call_rcu() is invoked
+ * on one CPU while other CPUs are within RCU read-side critical
+ * sections, invocation of the corresponding RCU callback is deferred
+ * until after all the other CPUs exit their critical sections.
+ *
+ * Note, however, that RCU callbacks are permitted to run concurrently
+ * with RCU read-side critical sections. One way that this can happen
+ * is via the following sequence of events: (1) CPU 0 enters an RCU
+ * read-side critical section, (2) CPU 1 invokes call_rcu() to register
+ * an RCU callback, (3) CPU 0 exits the RCU read-side critical section,
+ * (4) CPU 2 enters a RCU read-side critical section, (5) the RCU
+ * callback is invoked. This is legal, because the RCU read-side critical
+ * section that was running concurrently with the call_rcu() (and which
+ * therefore might be referencing something that the corresponding RCU
+ * callback would free up) has completed before the corresponding
+ * RCU callback is invoked.
+ *
+ * RCU read-side critical sections may be nested. Any deferred actions
+ * will be deferred until the outermost RCU read-side critical section
+ * completes.
+ *
+ * It is illegal to block while in an RCU read-side critical section.
+ */
+void __rcu_read_lock(void)
+{
+ struct rcu_data *r;
+
+ r = &per_cpu(rcu_data, smp_processor_id());
+ if (r->nest_count++ == 0)
+ /*
+ * Set the flags value to show that we are in
+ * a read side critical section. The code starting
+ * a batch uses this to determine if a processor
+ * needs to participate in the batch. Including
+ * a sequence allows the remote processor to tell
+ * that a critical section has completed and another
+ * has begun.
+ */
+ r->flags = IN_RCU_READ_LOCK | (r->sequence++ << 2);
+}
+EXPORT_SYMBOL(__rcu_read_lock);
+
+/**
+ * rcu_read_unlock - marks the end of an RCU read-side critical section.
+ * Check if a RCU batch was started while we were in the critical
+ * section. If so, call rcu_quiescent() join the rendezvous.
+ *
+ * See rcu_read_lock() for more information.
+ */
+void __rcu_read_unlock(void)
+{
+ struct rcu_data *r;
+ int cpu, flags;
+
+ cpu = smp_processor_id();
+ r = &per_cpu(rcu_data, cpu);
+ if (--r->nest_count == 0) {
+ flags = xchg(&r->flags, 0);
+ if (flags & DO_RCU_COMPLETION)
+ rcu_quiescent(cpu);
+ }
+}
+EXPORT_SYMBOL(__rcu_read_unlock);
+
+/**
+ * call_rcu - Queue an RCU callback for invocation after a grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual update function to be invoked after the grace period
+ *
+ * The update function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+ struct rcu_data *r;
+ unsigned long flags;
+ int cpu;
+
+ head->func = func;
+ head->next = NULL;
+ local_irq_save(flags);
+ cpu = smp_processor_id();
+ r = &per_cpu(rcu_data, cpu);
+ /*
+ * Avoid mixing new entries with batches which have already
+ * completed or have a grace period in progress.
+ */
+ if (r->nxt.head && rcu_move_if_done(r))
+ rcu_wake_daemon(r);
+
+ rcu_list_add(&r->nxt, head);
+ if (r->nxtcount++ == 0) {
+ r->nxtbatch = (rcu_batch & RCU_BATCH_MASK) + RCU_INCREMENT;
+ barrier();
+ if (!rcu_timestamp)
+ rcu_timestamp = jiffies ?: 1;
+ }
+ /* If we reach the limit start a batch. */
+ if (r->nxtcount > rcu_max_count) {
+ if (rcu_set_state(RCU_NEXT_PENDING) == RCU_COMPLETE)
+ rcu_start_batch();
+ }
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+/*
+ * Revisit - my patch treats any code not protected by rcu_read_lock(),
+ * rcu_read_unlock() as a quiescent state. I suspect that the call_rcu_bh()
+ * interface is not needed.
+ */
+void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+ call_rcu(head, func);
+}
+EXPORT_SYMBOL_GPL(call_rcu_bh);
+
+static void rcu_barrier_callback(struct rcu_head *notused)
+{
+ if (atomic_dec_and_test(&rcu_barrier_cpu_count))
+ complete(&rcu_barrier_completion);
+}
+
+/*
+ * Called with preemption disabled, and from cross-cpu IRQ context.
+ */
+static void rcu_barrier_func(void *notused)
+{
+ int cpu = smp_processor_id();
+ struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+ struct rcu_head *head;
+
+ head = &rdp->barrier;
+ atomic_inc(&rcu_barrier_cpu_count);
+ call_rcu(head, rcu_barrier_callback);
+}
+
+/**
+ * rcu_barrier - Wait until all the in-flight RCUs are complete.
+ */
+void rcu_barrier(void)
+{
+ BUG_ON(in_interrupt());
+ /* Take cpucontrol semaphore to protect against CPU hotplug */
+ mutex_lock(&rcu_barrier_mutex);
+ init_completion(&rcu_barrier_completion);
+ atomic_set(&rcu_barrier_cpu_count, 0);
+ on_each_cpu(rcu_barrier_func, NULL, 1);
+ wait_for_completion(&rcu_barrier_completion);
+ mutex_unlock(&rcu_barrier_mutex);
+}
+EXPORT_SYMBOL(rcu_barrier);
+
+
+/*
+ * cpu went through a quiescent state since the beginning of the grace period.
+ * Clear it from the cpu mask and complete the grace period if it was the last
+ * cpu. Start another grace period if someone has further entries pending
+ */
+
+static void rcu_grace_period_complete(void)
+{
+ struct rcu_data *r;
+ int cpu, last;
+
+ /*
+ * Mark the batch as complete. If RCU_COMPLETE was
+ * already set we raced with another processor
+ * and it will finish the completion processing.
+ */
+ last = rcu_set_state(RCU_COMPLETE);
+ if (last & RCU_COMPLETE)
+ return;
+ /*
+ * If RCU_NEXT_PENDING is set, start the new batch.
+ */
+ if (last & RCU_NEXT_PENDING)
+ rcu_start_batch();
+ /*
+ * Wake the krcud for any cpu which has requests queued.
+ */
+ for_each_online_cpu(cpu) {
+ r = &per_cpu(rcu_data, cpu);
+ if (r->nxt.head || r->cur.head || r->done.head)
+ rcu_wake_daemon(r);
+ }
+}
+
+/*
+ * rcu_quiescent() is called from rcu_read_unlock() when a
+ * RCU batch was started while the rcu_read_lock/rcu_read_unlock
+ * critical section was executing.
+ */
+
+void rcu_quiescent(int cpu)
+{
+ cpu_clear(cpu, rcu_state.rcu_cpu_mask);
+ if (cpus_empty(rcu_state.rcu_cpu_mask))
+ rcu_grace_period_complete();
+}
+
+/*
+ * Check if the other cpus are in rcu_read_lock/rcu_read_unlock protected code.
+ * If not they are assumed to be quiescent and we can clear the bit in
+ * bitmap. If not set DO_RCU_COMPLETION to request a quiescent point on
+ * the rcu_read_unlock.
+ *
+ * Do this in two passes. On the first pass we sample the flags value.
+ * The second pass only looks at processors which were found in the read
+ * side critical section on the first pass. The flags value contains
+ * a sequence value so we can tell if the processor has completed a
+ * critical section even if it has started another.
+ */
+long rcu_grace_periods;
+long rcu_count1;
+long rcu_count2;
+long rcu_count3;
+
+void rcu_poll_other_cpus(void)
+{
+ struct rcu_data *r;
+ int cpu;
+ cpumask_t mask;
+ unsigned int f, flags[NR_CPUS];
+
+ rcu_grace_periods++;
+ for_each_online_cpu(cpu) {
+ r = &per_cpu(rcu_data, cpu);
+ f = flags[cpu] = r->flags;
+ if (f == 0) {
+ cpu_clear(cpu, rcu_state.rcu_cpu_mask);
+ rcu_count1++;
+ }
+ }
+ mask = rcu_state.rcu_cpu_mask;
+ for_each_cpu_mask(cpu, mask) {
+ r = &per_cpu(rcu_data, cpu);
+ /*
+ * If the remote processor is still in the same read-side
+ * critical section set DO_RCU_COMPLETION to request that
+ * the cpu participate in the grace period.
+ */
+ f = r->flags;
+ if (f == flags[cpu])
+ f = cmpxchg(&r->flags, f, f | DO_RCU_COMPLETION);
+ /*
+ * If the other processors flags value changes before
+ * the cmpxchg() that processor is no longer in the
+ * read-side critical section so we clear its bit.
+ */
+ if (f != flags[cpu]) {
+ cpu_clear(cpu, rcu_state.rcu_cpu_mask);
+ rcu_count2++;
+ } else
+ rcu_count3++;
+
+ }
+ if (cpus_empty(rcu_state.rcu_cpu_mask))
+ rcu_grace_period_complete();
+}
+
+/*
+ * Grace period handling:
+ * The grace period handling consists of the following steps:
+ * - A new grace period is started.
+ *   This is done by rcu_start_batch.  The rcu_poll_other_cpus()
+ *   call drives the synchronization.  It loops checking whether each
+ *   of the other cpus is executing in a rcu_read_lock/rcu_read_unlock
+ *   critical section.  The flags word of each cpu it finds in a
+ *   rcu_read_lock/rcu_read_unlock critical section will be updated to
+ *   request a rcu_quiescent() call.
+ * - Each of the cpus which were in a rcu_read_lock/rcu_read_unlock
+ *   critical section will eventually call rcu_quiescent() and clear
+ *   the bit corresponding to their cpu in rcu_state.rcu_cpu_mask.
+ * - The processor which clears the last bit wakes the krcud for
+ *   the cpus which have rcu callback requests queued.
+ *
+ * The process of starting a batch is arbitrated with the RCU_COMPLETE &
+ * RCU_NEXT_PENDING bits.  These bits can be set in either order, but the
+ * thread which sets the second bit must call rcu_start_batch().
+ * Multiple processors might try to set these bits at the same time.
+ * By using cmpxchg() we can determine which processor actually set
+ * the bit and be sure that only a single thread tries to start the batch.
+ */
+static void rcu_start_batch(void)
+{
+	long batch, new;
+
+	batch = rcu_batch;
+	/*
+	 * Caller must have observed both arbitration bits set; the thread
+	 * that set the second of the two bits owns starting this batch
+	 * (see the arbitration comment above).
+	 */
+	BUG_ON((batch & (RCU_COMPLETE|RCU_NEXT_PENDING)) !=
+		(RCU_COMPLETE|RCU_NEXT_PENDING));
+	/* New batch: no callback has been waiting yet. */
+	rcu_timestamp = 0;
+	smp_mb();
+	/*
+	 * nohz_cpu_mask can go away because only cpus executing
+	 * rcu_read_lock/rcu_read_unlock critical sections need to
+	 * participate in the rendezvous.
+	 */
+	cpumask_andnot(&rcu_state.rcu_cpu_mask, cpu_online_mask, nohz_cpu_mask);
+	/* Advance the batch number; this clears COMPLETE/NEXT_PENDING. */
+	new = (batch & RCU_BATCH_MASK) + RCU_INCREMENT;
+	smp_mb();
+	/*
+	 * Publish the new batch before polling so remote cpus observe
+	 * the cpu mask and batch number in a consistent order.
+	 */
+	rcu_batch = new;
+	smp_mb();
+	rcu_poll_other_cpus();
+}
+
+
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static void rcu_offline_cpu(int cpu)
+{
+	/* Adopt the dead cpu's pending callbacks onto the current cpu. */
+	struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
+	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+
+#if 0
+	/*
+	 * The cpu should not have been in a read side critical
+	 * section when it was removed. So this code is not needed.
+	 */
+	/* if the cpu going offline owns the grace period
+	 * we can block indefinitely waiting for it, so flush
+	 * it here
+	 */
+	if (!(rcu_batch & RCU_COMPLETE))
+		rcu_quiescent(cpu);
+#endif
+	local_irq_disable();
+	/*
+	 * The rcu lists are per-cpu private data only protected by
+	 * disabling interrupts. Since we know the other cpu is dead
+	 * it should not be manipulating these lists.
+	 */
+	rcu_list_move(&this_rdp->cur, &rdp->cur);
+	rcu_list_move(&this_rdp->nxt, &rdp->nxt);
+	/* Re-home the adopted entries into this cpu's next batch. */
+	this_rdp->nxtbatch = (rcu_batch & RCU_BATCH_MASK) + RCU_INCREMENT;
+	local_irq_enable();
+	put_cpu_var(rcu_data);
+}
+
+#else
+
+/* Without CONFIG_HOTPLUG_CPU, cpus never go away: nothing to migrate. */
+static inline void rcu_offline_cpu(int cpu)
+{
+}
+
+#endif
+
+/*
+ * Invoke the RCU callbacks whose grace period has completed.
+ * The per-cpu lists are protected only by disabled interrupts,
+ * so detach the done list atomically first, then invoke the
+ * callbacks with interrupts enabled.
+ */
+static void rcu_process_callbacks(struct rcu_data *r)
+{
+	struct rcu_head *head, *tmp;
+
+	local_irq_disable();
+	rcu_move_if_done(r);
+	head = r->done.head;
+	rcu_list_init(&r->done);
+	local_irq_enable();
+
+	/* Save ->next first: func() may free the rcu_head itself. */
+	for (; head; head = tmp) {
+		tmp = head->next;
+		head->func(head);
+	}
+}
+
+/*
+ * Poll rcu_timestamp to start a RCU batch if there are any
+ * pending requests that have been waiting longer than
+ * rcu_max_time.
+ */
+struct timer_list rcu_timer;
+
+void rcu_timeout(unsigned long unused)
+{
+	do_delayed_rcu_daemon_wakeups();
+
+	/* rcu_timestamp != 0 means callbacks are waiting for a batch. */
+	if (rcu_timestamp
+	&& time_after(jiffies, (rcu_timestamp + rcu_max_time))) {
+		if (rcu_set_state(RCU_NEXT_PENDING) == RCU_COMPLETE)
+			rcu_start_batch();
+	}
+	/*
+	 * Re-arm for half the batch timeout (at least one jiffy).
+	 * Use mod_timer() rather than re-running init_timer() plus
+	 * add_timer(): init_timer() re-initializes the internal state
+	 * of a timer the timer core already knows about, and it also
+	 * clobbers the ->function field set up in rcu_init().
+	 */
+	mod_timer(&rcu_timer, jiffies + (rcu_max_time/2 ?: 1));
+}
+
+/* Reset the given cpu's RCU state to an empty, pristine configuration. */
+static void __devinit rcu_online_cpu(int cpu)
+{
+	struct rcu_data *r = &per_cpu(rcu_data, cpu);
+
+	memset(r, 0, sizeof(*r));
+	rcu_list_init(&r->nxt);
+	rcu_list_init(&r->cur);
+	rcu_list_init(&r->done);
+}
+
+/*
+ * True when this cpu has callbacks ready to invoke: either entries
+ * already on the done list, or entries on cur/nxt whose batch has
+ * completed.
+ */
+int rcu_pending(struct rcu_data *r)
+{
+	if (r->done.head)
+		return 1;
+	if (r->cur.head && rcu_batch_complete(r->batch))
+		return 1;
+	return r->nxt.head && rcu_batch_complete(r->nxtbatch);
+}
+
+/*
+ * krcud - per-cpu daemon that invokes completed RCU callbacks.
+ * Woken when a grace period completes (the cpu clearing the last
+ * rcu_cpu_mask bit wakes the krcuds -- see the comment above
+ * rcu_start_batch) or by the rcu_timeout timer.
+ */
+static int krcud(void *__bind_cpu)
+{
+	int cpu = (int)(long) __bind_cpu;
+	struct rcu_data *r = &per_cpu(rcu_data, cpu);
+
+	/* Lowest priority; callback processing is background work. */
+	set_user_nice(current, 19);
+	current->flags |= PF_NOFREEZE;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	while (!kthread_should_stop()) {
+		/* Sleep until there is work (or we are told to stop). */
+		if (!rcu_pending(r))
+			schedule();
+
+		__set_current_state(TASK_RUNNING);
+
+		while (rcu_pending(r)) {
+			/* Preempt disable stops cpu going offline.
+			   If already offline, we'll be on wrong CPU:
+			   don't process */
+			preempt_disable();
+			if (cpu_is_offline((long)__bind_cpu))
+				goto wait_to_die;
+			preempt_enable();
+			rcu_process_callbacks(r);
+			cond_resched();
+		}
+
+		/* Must set state before the final rcu_pending() check to
+		   avoid missing a wakeup between check and schedule(). */
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+
+wait_to_die:
+	preempt_enable();
+	/* Wait for kthread_stop */
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop()) {
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+/*
+ * CPU hotplug callback: manage the per-cpu krcud daemon and the
+ * per-cpu RCU state across cpu up/down transitions.
+ */
+static int __devinit rcu_cpu_notify(struct notifier_block *nfb,
+				unsigned long action,
+				void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+	struct rcu_data *r = &per_cpu(rcu_data, cpu);
+	struct task_struct *p;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		/* Initialize per-cpu state and create (but do not yet
+		 * start) the cpu's callback-invoking daemon. */
+		rcu_online_cpu(cpu);
+		p = kthread_create(krcud, hcpu, "krcud/%d", cpu);
+		if (IS_ERR(p)) {
+			printk(KERN_INFO "krcud for %i failed\n", cpu);
+			return NOTIFY_BAD;
+		}
+		kthread_bind(p, cpu);
+		r->krcud = p;
+		break;
+	case CPU_ONLINE:
+		/* The cpu is up: let its krcud start running on it. */
+		wake_up_process(r->krcud);
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_UP_CANCELED:
+		/* Unbind so it can run. Fall thru. */
+		kthread_bind(r->krcud, smp_processor_id());
+	case CPU_DEAD:
+		/* Stop the daemon, then migrate any callbacks left on
+		 * the dead cpu over to the current one. */
+		p = r->krcud;
+		r->krcud = NULL;
+		kthread_stop(p);
+		rcu_offline_cpu(cpu);
+		break;
+#endif /* CONFIG_HOTPLUG_CPU */
+	}
+	return NOTIFY_OK;
+}
+
+/* Hotplug notifier: creates/stops a krcud as cpus come and go. */
+static struct notifier_block __devinitdata rcu_nb = {
+	.notifier_call = rcu_cpu_notify,
+};
+
+/*
+ * Boot-time setup: bring up krcud for the boot cpu by replaying the
+ * hotplug notifications by hand, then register for future cpu events.
+ */
+static __init int spawn_krcud(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+	rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, cpu);
+	rcu_cpu_notify(&rcu_nb, CPU_ONLINE, cpu);
+	register_cpu_notifier(&rcu_nb);
+	return 0;
+}
+early_initcall(spawn_krcud);
+/*
+ * Initializes rcu mechanism. Assumed to be called early.
+ * That is before local timer(SMP) or jiffie timer (uniproc) is setup.
+ * NOTE(review): the original comment here also claimed rcu_qsctr and
+ * friends are implicitly initialized via RCU_CTR_INVALID == 0; that
+ * text predates this port -- confirm it still applies.
+ */
+void __init rcu_init(void)
+{
+	mutex_init(&rcu_barrier_mutex);
+	rcu_online_cpu(smp_processor_id());
+	/*
+	 * Use a timer to catch the elephants which would otherwise
+	 * fall through the cracks on local timer shielded cpus.
+	 * setup_timer() binds handler and timer in one step instead of
+	 * the open-coded init_timer()/field-assignment sequence, and
+	 * mod_timer() arms it for half the batch timeout (min 1 jiffy).
+	 */
+	setup_timer(&rcu_timer, rcu_timeout, 0);
+	mod_timer(&rcu_timer, jiffies + (rcu_max_time/2 ?: 1));
+}
+
+
+/* Tracks completion of one synchronize_rcu() grace-period wait. */
+struct rcu_synchronize {
+	struct rcu_head head;		/* linkage handed to call_rcu() */
+	struct completion completion;	/* completed by wakeme_after_rcu() */
+};
+
+/* RCU callback: awaken the task sleeping in synchronize_rcu(). */
+static void wakeme_after_rcu(struct rcu_head *head)
+{
+	struct rcu_synchronize *rs;
+
+	rs = container_of(head, struct rcu_synchronize, head);
+	complete(&rs->completion);
+}
+
+/**
+ * synchronize_rcu - wait until a grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full grace
+ * period has elapsed, in other words after all currently executing RCU
+ * read-side critical sections have completed.  RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ *
+ * If your read-side code is not protected by rcu_read_lock(), do -not-
+ * use synchronize_rcu().
+ */
+void synchronize_rcu(void)
+{
+	struct rcu_synchronize sync;
+
+	init_completion(&sync.completion);
+	/* Queue a callback that will complete us after a grace period. */
+	call_rcu(&sync.head, wakeme_after_rcu);
+	/* Block until that callback has run. */
+	wait_for_completion(&sync.completion);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu);
+
+/*
+ * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
+ * Kept only so legacy callers continue to work; it simply forwards
+ * to synchronize_rcu().
+ */
+void synchronize_kernel(void)
+{
+	synchronize_rcu();
+}
+EXPORT_SYMBOL(synchronize_kernel);
+
+/*
+ * Runtime tunables.  rcu_max_time bounds how long a callback may wait
+ * before the rcu_timeout timer forces a batch (see rcu_timeout above).
+ * NOTE(review): rcu_max_count's exact semantics are defined earlier in
+ * the file, outside this hunk -- presumably a callback-count threshold;
+ * confirm against its definition.
+ */
+module_param(rcu_max_count, long, 0644);
+module_param(rcu_max_time, long, 0644);
Index: b/kernel/sysctl.c
===================================================================
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -215,6 +215,10 @@ extern struct ctl_table random_table[];
extern struct ctl_table epoll_table[];
#endif
+#ifdef CONFIG_SHIELDING_RCU
+extern ctl_table rcu_table[];
+#endif
+
#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
int sysctl_legacy_va_layout;
#endif
@@ -808,6 +812,13 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
+#ifdef CONFIG_SHIELDING_RCU
+ {
+ .procname = "rcu",
+ .mode = 0555,
+ .child = rcu_table,
+ },
+#endif
#if defined(CONFIG_S390) && defined(CONFIG_SMP)
{
.procname = "spin_retry",
Index: b/kernel/timer.c
===================================================================
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1272,12 +1272,15 @@ unsigned long get_next_timer_interrupt(u
void update_process_times(int user_tick)
{
struct task_struct *p = current;
- int cpu = smp_processor_id();
/* Note: this timer irq context must be accounted for as well. */
account_process_tick(p, user_tick);
run_local_timers();
- rcu_check_callbacks(cpu, user_tick);
+#ifndef CONFIG_SHIELDING_RCU
+ rcu_check_callbacks(smp_processor_id(), user_tick);
+#else
+ do_delayed_rcu_daemon_wakeups();
+#endif
printk_tick();
perf_event_do_pending();
scheduler_tick();
Index: b/lib/Kconfig.debug
===================================================================
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -791,6 +791,7 @@ config BOOT_PRINTK_DELAY
config RCU_TORTURE_TEST
tristate "torture tests for RCU"
depends on DEBUG_KERNEL
+ depends on !SHIELDING_RCU
default n
help
This option provides a kernel module that runs torture tests
Index: b/init/Kconfig
===================================================================
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -365,6 +365,13 @@ config TINY_RCU
is not required. This option greatly reduces the
memory footprint of RCU.
+config SHIELDING_RCU
+ bool "Shielding RCU"
+ help
+ This option selects the RCU implementation that does not
+ depend on a per-cpu periodic interrupt to do garbage
+ collection. This is good when one is trying to shield
+ some set of CPUs from as much system activity as possible.
endchoice
config RCU_TRACE
Index: b/include/linux/hardirq.h
===================================================================
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -138,7 +138,12 @@ static inline void account_system_vtime(
}
#endif
-#if defined(CONFIG_NO_HZ)
+#if defined(CONFIG_SHIELDING_RCU)
+# define rcu_irq_enter() do { } while (0)
+# define rcu_irq_exit() do { } while (0)
+# define rcu_nmi_enter() do { } while (0)
+# define rcu_nmi_exit() do { } while (0)
+#elif defined(CONFIG_NO_HZ)
#if defined(CONFIG_TINY_RCU)
extern void rcu_enter_nohz(void);
extern void rcu_exit_nohz(void);
@@ -161,13 +166,13 @@ static inline void rcu_nmi_exit(void)
{
}
-#else
+#else /* !CONFIG_TINY_RCU */
extern void rcu_irq_enter(void);
extern void rcu_irq_exit(void);
extern void rcu_nmi_enter(void);
extern void rcu_nmi_exit(void);
#endif
-#else
+#else /* !CONFIG_NO_HZ */
# define rcu_irq_enter() do { } while (0)
# define rcu_irq_exit() do { } while (0)
# define rcu_nmi_enter() do { } while (0)
Index: b/kernel/sysctl_binary.c
===================================================================
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -61,6 +61,11 @@ static const struct bin_table bin_pty_ta
{}
};
+static const struct bin_table bin_rcu_table[] = {
+ { CTL_INT, RCU_BATCH, "batch" },
+ {}
+};
+
static const struct bin_table bin_kern_table[] = {
{ CTL_STR, KERN_OSTYPE, "ostype" },
{ CTL_STR, KERN_OSRELEASE, "osrelease" },
@@ -138,6 +143,7 @@ static const struct bin_table bin_kern_t
{ CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
{ CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" },
{ CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
+ { CTL_DIR, KERN_RCU, "rcu", bin_rcu_table },
{}
};
Index: b/kernel/sched.c
===================================================================
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9119,6 +9119,7 @@ struct cgroup_subsys cpuacct_subsys = {
};
#endif /* CONFIG_CGROUP_CPUACCT */
+#ifndef CONFIG_SHIELDING_RCU
#ifndef CONFIG_SMP
void synchronize_sched_expedited(void)
@@ -9188,3 +9189,4 @@ void synchronize_sched_expedited(void)
EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
#endif /* #else #ifndef CONFIG_SMP */
+#endif /* CONFIG_SHIELDING_RCU */
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists