From: "Paul E. McKenney" <paulmck@us.ibm.com>

Updated patch adding a variant of RCU that permits sleeping in read-side
critical sections.  SRCU is as follows:

o	Each use of SRCU creates its own srcu_struct, and each
	srcu_struct has its own set of grace periods.  This is
	critical, as it prevents one subsystem with a blocking
	reader from holding up SRCU grace periods for other
	subsystems.

o	The SRCU primitives (srcu_read_lock(), srcu_read_unlock(),
	and synchronize_srcu()) all take a pointer to a srcu_struct.

o	The SRCU primitives must be called from process context.

o	srcu_read_lock() returns an int that must be passed to
	the matching srcu_read_unlock().  Realtime RCU avoids the
	need for this by storing the state in the task struct,
	but SRCU needs to allow a given code path to pass through
	multiple SRCU domains -- storing state in the task struct
	would therefore require either arbitrary space in the
	task struct or arbitrary limits on SRCU nesting.  So I
	kicked the state-storage problem up to the caller.

o	There is no call_srcu().  It would not be hard to implement
	one, but it seems like too easy a way to OOM the system.
	(Hey, we have enough trouble with call_rcu(), which does
	-not- permit readers to sleep!!!)  So, if you want it,
	please tell me why...

Signed-off-by: Paul E. McKenney <paulmck@us.ibm.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 RCU/checklist.txt |    0 
 RCU/rcu.txt       |    0 
 RCU/whatisRCU.txt |    0 
 kernel/srcu.c     |   68 +++++++++++++++++++++++++++-----------------
 linux/srcu.h      |    0 
 Makefile          |    0 
 6 files changed, 43 insertions(+), 25 deletions(-)

diff -puN Documentation/RCU/checklist.txt~srcu-2-rcu-variant-permitting-read-side-blocking Documentation/RCU/checklist.txt
diff -puN Documentation/RCU/rcu.txt~srcu-2-rcu-variant-permitting-read-side-blocking Documentation/RCU/rcu.txt
diff -puN Documentation/RCU/whatisRCU.txt~srcu-2-rcu-variant-permitting-read-side-blocking Documentation/RCU/whatisRCU.txt
diff -puN include/linux/srcu.h~srcu-2-rcu-variant-permitting-read-side-blocking include/linux/srcu.h
diff -puN kernel/Makefile~srcu-2-rcu-variant-permitting-read-side-blocking kernel/Makefile
diff -puN kernel/srcu.c~srcu-2-rcu-variant-permitting-read-side-blocking kernel/srcu.c
--- a/kernel/srcu.c~srcu-2-rcu-variant-permitting-read-side-blocking
+++ a/kernel/srcu.c
@@ -26,6 +26,7 @@
 
 #include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/percpu.h>
 #include <linux/preempt.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
@@ -43,16 +44,8 @@
  */
 void init_srcu_struct(struct srcu_struct *sp)
 {
-	int cpu;
-
 	sp->completed = 0;
-	sp->per_cpu_ref = (struct srcu_struct_array *)
-			  kmalloc(NR_CPUS * sizeof(*sp->per_cpu_ref),
-				  GFP_KERNEL);
-	for_each_possible_cpu(cpu) {
-		sp->per_cpu_ref[cpu].c[0] = 0;
-		sp->per_cpu_ref[cpu].c[1] = 0;
-	}
+	sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
 	mutex_init(&sp->mutex);
 }
 
@@ -65,7 +58,7 @@ void init_srcu_struct(struct srcu_struct
  */
 void cleanup_srcu_struct(struct srcu_struct *sp)
 {
-	kfree(sp->per_cpu_ref);
+	free_percpu(sp->per_cpu_ref);
 	sp->per_cpu_ref = NULL;
 }
 
@@ -84,7 +77,7 @@ int srcu_read_lock(struct srcu_struct *s
 	preempt_disable();
 	idx = sp->completed & 0x1;
 	barrier();
-	sp->per_cpu_ref[smp_processor_id()].c[idx]++;
+	per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++;
 	preempt_enable();
 	return idx;
 }
@@ -102,28 +95,30 @@ int srcu_read_lock(struct srcu_struct *s
 void srcu_read_unlock(struct srcu_struct *sp, int idx)
 {
 	preempt_disable();
-	sp->per_cpu_ref[smp_processor_id()].c[idx]--;
+	per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
 	preempt_enable();
 }
 
-/**
- * synchronize_srcu - wait for prior SRCU read-side critical-section completion
- * @sp: srcu_struct with which to synchronize.
+/*
+ * Do a single flip of the SRCU counters -- a pair of flips is required
+ * to force a grace period.  The reason that a single flip is insufficient
+ * is that there are no memory barriers in srcu_read_lock(), which
+ * means that the corresponding critical section can "bleed out", so
+ * that the old pointer is fetched, but the new counter is incremented.
+ * Doing a pair of flips guarantees that -all- read-side critical sections
+ * are accounted for.
  *
- * Flip the completed counter, and wait for the old count to drain to zero.
- * As with classic RCU, the updater must use some separate means of
- * synchronizing concurrent updates.  Can block; must be called from
- * process context.
+ * We still need to flip the counters, since we want the grace period
+ * to end even when there might be at least one read-side critical section
+ * in progress at all times.
  */
-void synchronize_srcu(struct srcu_struct *sp)
+static void synchronize_srcu_flip(struct srcu_struct *sp)
 {
 	int cpu;
 	int idx;
 	int sum;
 
-	mutex_lock(&sp->mutex);
-
-	smp_mb();  /* Prevent operations from leaking in. */
+	/* Do the flip. */
 
 	idx = sp->completed & 0x1;
 	sp->completed++;
@@ -138,13 +133,36 @@ void synchronize_srcu(struct srcu_struct
 	for (;;) {
 		sum = 0;
 		for_each_possible_cpu(cpu)
-			sum += sp->per_cpu_ref[cpu].c[idx];
+			sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx];
 		if (sum == 0)
 			break;
 		schedule_timeout_interruptible(1);
 	}
+}
 
-	synchronize_sched();  /* forces memory barriers all around. */
+/**
+ * synchronize_srcu - wait for prior SRCU read-side critical-section completion
+ * @sp: srcu_struct with which to synchronize.
+ *
+ * Flip the completed counter twice, each time waiting for the old count to
+ * drain to zero.  As with classic RCU, the updater must use some separate
+ * means of synchronizing concurrent updates.  Can block; must be called
+ * from process context.
+ */
+void synchronize_srcu(struct srcu_struct *sp)
+{
+	mutex_lock(&sp->mutex);
+
+	smp_mb();  /* Prevent operations from leaking in. */
+
+	/* Do a pair of flips to ensure that all prior readers complete. */
+
+	synchronize_srcu_flip(sp);
+	synchronize_srcu_flip(sp);
+
+	/* Force all concurrent srcu_read_unlock() calls to finish cleanly. */
+
+	synchronize_sched();
 
 	mutex_unlock(&sp->mutex);
 }
_