Index: linux.prev/Documentation/DocBook/Makefile =================================================================== --- linux.prev.orig/Documentation/DocBook/Makefile +++ linux.prev/Documentation/DocBook/Makefile @@ -10,7 +10,8 @@ DOCBOOKS := wanbook.xml z8530book.xml mc kernel-hacking.xml kernel-locking.xml deviceiobook.xml \ procfs-guide.xml writing_usb_driver.xml \ sis900.xml kernel-api.xml journal-api.xml lsm.xml usb.xml \ - gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml + gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \ + genericirq.xml ### # The build process is as follows (targets): Index: linux.prev/Documentation/DocBook/genericirq.tmpl =================================================================== --- /dev/null +++ linux.prev/Documentation/DocBook/genericirq.tmpl @@ -0,0 +1,560 @@ + + + + + + Linux generic IRQ handling + + + + Thomas + Gleixner + +
+ tglx@linutronix.de +
+
+
+ + Ingo + Molnar + +
+ mingo@elte.hu +
+
+
+
+ + + 2005 + Thomas Gleixner + + + 2005 + Ingo Molnar + + + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License version 2 as published by the Free Software Foundation. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + +
+
+
+  Introduction
+
+  The generic interrupt handling layer is designed to provide a
+  complete abstraction of interrupt handling for device drivers
+  and is able to handle all different types of interrupt controller
+  hardware. Device drivers use the generic API functions to request,
+  enable, disable and free interrupts. The drivers do not have to
+  know anything about interrupt hardware, so they can be used on
+  different hardware platforms without code changes.
+
+  This documentation is provided for developers who want to implement
+  an architecture's interrupt support based on the generic IRQ
+  handling layer.
+
+  Rationale
+
+  The original implementation of interrupt handling in Linux uses
+  the __do_IRQ() super-handler, which must be able to deal with every
+  type of interrupt logic. This is achieved by an 'interrupt type'
+  structure and runtime flags to handle special cases.
+  Furthermore, the super-handler assumed a certain type of interrupt
+  handling hardware and turned out not to be capable of handling all
+  kinds of interrupt controller hardware found across the
+  architectures. The all-in-one approach also adds unnecessary
+  complexity for every user.
+
+  Originally, Russell King identified different types of handlers to
+  build a quite universal set for the ARM interrupt handler
+  implementation in Linux 2.5/2.6. He distinguished between:
+	- Level type
+	- Edge type
+	- Simple type
+  In the SMP world of the __do_IRQ() super-handler another type
+  was identified:
+	- Per CPU type
+
+  This split implementation of handlers makes it possible to optimize
+  the flow of interrupt handling for each specific interrupt type.
+  This reduces complexity in that particular code path and allows
+  the optimized handling of a given type.
+
+  The original general implementation uses interrupt_type structures
+  to differentiate the flow control in the super-handler. This
+  leads to a mix of flow logic and code related to hardware details.
+  Russell King's ARM implementation, which replaced the type by a
+  chip abstraction, mixed the two the other way around.
+
+  The natural conclusion was a clean separation of the 'type flow'
+  and the 'chip'. Analysing a couple of architecture implementations
+  reveals that many of them can use a generic set of 'type flow'
+  implementations and only need to add the chip-level specific code.
+  The separation is also valuable for the (sub)architectures
+  which need specific quirks in the type flow itself, because it
+  provides a more transparent design.
+
+  Each interrupt type implementation has its own flow handler
+  assigned, which should normally be one of the generic
+  implementations. The flow handler implementation makes it
+  simple to provide demultiplexing handlers, which can be found in
+  embedded platforms on various architectures.
+
+  The separation makes the generic interrupt handling more flexible
+  and extensible. A (sub)architecture can use a generic type flow
+  implementation for e.g. 'level type' interrupts and add a
+  (sub)architecture specific 'edge type' implementation.
+
+  To make the transition to the new model easier and to prevent the
+  breakage of existing implementations, the __do_IRQ() super-handler
+  is still available. This leads to a kind of duality for the time
+  being. Over time the new model should achieve a homogeneous
+  implementation scheme across all architectures, with enhanced
+  maintainability and cleanliness.
+
+  Known Bugs And Assumptions
+
+  None (hopefully).
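+
+  Before descending into the abstraction layers, a concrete
+  illustration of the driver-facing side may help. This is a minimal,
+  hedged sketch of the usage described in the introduction, written
+  against the 2.6.15-era prototypes this patch applies to; the device
+  name, IRQ number and handler body are made up:
+
+#include <linux/interrupt.h>
+
+#define EXAMPLE_IRQ	9	/* hypothetical interrupt line */
+
+static irqreturn_t example_interrupt(int irq, void *dev_id,
+				     struct pt_regs *regs)
+{
+	/* quiesce the device, then tell the core we handled it */
+	return IRQ_HANDLED;
+}
+
+static int example_open(void *dev)
+{
+	/* The driver neither knows nor cares whether the line is
+	 * edge or level triggered; the flow and chip layers described
+	 * below handle that. */
+	return request_irq(EXAMPLE_IRQ, example_interrupt, 0,
+			   "example", dev);
+}
+
+static void example_close(void *dev)
+{
+	free_irq(EXAMPLE_IRQ, dev);	/* dev_id must match request_irq() */
+}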
+
+
+  Abstraction layers
+
+  There are three main levels of abstraction in the interrupt code:
+	- Highlevel driver API
+	- Abstract interrupt type
+	- Chiplevel hardware encapsulation
+
+  The separation of interrupt type and chip-level functionality
+  provides the most flexible design. This implementation can handle
+  all kinds of interrupt hardware and the necessary workarounds for
+  the interrupt types without the need for redundant implementations.
+  The separation also handles edge and level type interrupts on the
+  same hardware chip.
+
+  Interrupt control flow
+
+  Each interrupt is described by an interrupt description structure
+  irq_desc. The interrupt is referenced by an 'unsigned int' numeric
+  value which selects the corresponding interrupt description
+  structure in the description structures array.
+  The description structure contains status information and pointers
+  to the interrupt type structure and the interrupt chip structure
+  which are assigned to this interrupt.
+
+  Whenever an interrupt triggers, the lowlevel arch code calls into
+  the generic interrupt code by calling desc->handler->handle_irq().
+  This highlevel IRQ handling function only uses other
+  desc->handler primitives which describe the control flow operations
+  necessary for the interrupt type. These operations call the chip
+  primitives referenced by the assigned chip description structure.
+
+  Highlevel Driver API
+
+  The highlevel driver API consists of the following functions:
+	- request_irq()
+	- free_irq()
+	- disable_irq()
+	- enable_irq()
+	- disable_irq_nosync() (SMP only)
+	- synchronize_irq() (SMP only)
+	- set_irq_type()
+	- set_irq_wake()
+	- set_irq_data()
+	- set_irq_chip()
+	- set_irq_chip_data()
+  See the autogenerated function documentation for details.
+
+  Abstract interrupt type
+
+  The 'interrupt type' (struct irq_type) abstraction mainly consists
+  of methods which implement the 'interrupt handling flow'. The
+  generic layer provides a set of pre-defined types:
+	- default_level_type
+	- default_edge_type
+	- default_simple_type
+	- default_percpu_type
+  The default type implementations use the generic type handlers:
+	- handle_level_type
+	- handle_edge_type
+	- handle_simple_type
+	- handle_percpu_type
+  The interrupt types (either predefined or architecture specific)
+  are assigned to specific interrupts by the architecture either
+  during bootup or during device initialization.
+
+  Default type implementations
+
+  Helper functions
+
+  The helper functions call the chip primitives and
+  are used by the default type implementations.
The following helper functions are implemented (simplified excerpt):
+
+default_enable(irq)
+{
+	desc->chip->unmask(irq);
+}
+
+default_disable(irq)
+{
+	desc->chip->mask(irq);
+}
+
+default_ack(irq)
+{
+	desc->chip->ack(irq);
+}
+
+default_mask_ack(irq)
+{
+	if (desc->chip->mask_ack) {
+		desc->chip->mask_ack(irq);
+	} else {
+		desc->chip->mask(irq);
+		desc->chip->ack(irq);
+	}
+}
+
+noop(irq)
+{
+}
+
+default_set_type(irq, type)
+{
+	if (desc->chip->set_type) {
+		if (desc->chip->set_type(irq, type))
+			return NULL;
+	}
+
+	return default_handler for type;
+}
+
+  Default Level IRQ type
+
+  The default Level IRQ type implements the functions:
+	enable		default_enable
+	disable		default_disable
+	start		default_mask_ack
+	end		default_enable
+	handle_irq	handle_level_irq
+	set_type	default_set_type
+
+  Default Edge IRQ type
+
+  The default Edge IRQ type implements the functions:
+	enable		default_enable
+	disable		default_disable
+	start		default_ack
+	hold		default_mask_ack
+	end		noop
+	handle_irq	handle_edge_irq
+	set_type	default_set_type
+
+  Default simple IRQ type
+
+  The default simple IRQ type implements the functions:
+	enable		noop
+	disable		noop
+	handle_irq	handle_simple_irq
+
+  Default per CPU IRQ type
+
+  The default per CPU IRQ type implements the functions:
+	enable		default_enable
+	disable		default_disable
+	start		default_ack
+	end		default_enable
+	handle_irq	handle_percpu_irq
+
+  Default type handler implementations
+
+  Default Level IRQ type handler
+
+  handle_level_type provides a generic implementation
+  for level type interrupts.
+
+  The following control flow is implemented (simplified excerpt):
+
+desc->handler->start();
+handle_IRQ_event(desc->action);
+desc->handler->end();
+
+  Default Edge IRQ type handler
+
+  handle_edge_type provides a generic implementation
+  for edge type interrupts.
+
+  The following control flow is implemented (simplified excerpt):
+
+if (desc->status & running) {
+	desc->handler->hold();
+	desc->status |= pending | masked;
+	return;
+}
+desc->handler->start();
+desc->status |= running;
+do {
+	if (desc->status & masked)
+		desc->handler->enable();
+	desc->status &= ~pending;
+	handle_IRQ_event(desc->action);
+} while (desc->status & pending);
+desc->status &= ~running;
+desc->handler->end();
+
+  Default simple IRQ type handler
+
+  handle_simple_type provides a generic implementation
+  for simple type interrupts.
+
+  Note: The simple type handler does not call any
+  handler/chip primitives.
+
+  The following control flow is implemented (simplified excerpt):
+
+handle_IRQ_event(desc->action);
+
+  Default per CPU type handler
+
+  handle_percpu_type provides a generic implementation
+  for per CPU type interrupts.
+
+  Per CPU interrupts are only available on SMP, and the handler
+  provides a simplified version without locking.
+
+  The following control flow is implemented (simplified excerpt):
+
+desc->handler->start();
+handle_IRQ_event(desc->action);
+desc->handler->end();
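+
+  The rationale chapter mentioned demultiplexing handlers for
+  cascaded interrupt controllers. As a hedged sketch (the secondary
+  controller, its status register and its IRQ numbering are all
+  invented here), such a chained handler reads the child controller's
+  pending status and hands each child interrupt to its flow handler,
+  mirroring the handle_edge_irq() calls visible in the SA1111 patch
+  later in this series:
+
+static void __iomem *sub_base;	/* hypothetical child controller base */
+#define SUB_STATUS	0x04	/* hypothetical pending-status register */
+#define SUB_IRQ_BASE	64	/* hypothetical base of child IRQ numbers */
+
+static void demux_cascade_irq(unsigned int irq, struct irqdesc *desc,
+			      struct pt_regs *regs)
+{
+	u32 pending = readl(sub_base + SUB_STATUS);
+
+	/* Ack the parent line, then hand each pending child
+	 * interrupt to its (here: level type) flow handler. */
+	desc->chip->ack(irq);
+	while (pending) {
+		unsigned int i = SUB_IRQ_BASE + __ffs(pending);
+
+		pending &= pending - 1;	/* clear the lowest set bit */
+		handle_level_irq(i, irq_desc + i, regs);
+	}
+	desc->chip->unmask(irq);
+}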
+
+  Architecture specific type implementation
+
+  If an architecture needs to implement its own type structures, then
+  the following primitives have to be implemented:
+	handle_irq()	- The handle_irq function pointer should
+			  preferably point to one of the generic type
+			  handler functions
+	startup()	- Optional
+	shutdown()	- Optional
+	enable()
+	disable()
+	start()
+	hold()		- For edge type interrupts only
+	end()
+	set_type()	- Optional
+	set_affinity()	- SMP only
+
+  Quirks and optimizations
+
+  The generic functions are intended for 'clean' architectures and
+  chips, which have no platform-specific IRQ handling quirks. If an
+  architecture needs to implement quirks on the 'flow' level then it
+  can do so by overriding the irqtype. This is also done for
+  compatibility reasons, as most architectures only use irqtypes at
+  the moment.
+
+  An architecture could implement all of its IRQ logic by pushing
+  chip handling details into the irqtype's ->start()/->end()/->hold()
+  functions. This is only recommended when the underlying primitives
+  are pure chip primitives without additional quirks. The direct
+  pointer to the chip functions reduces the indirection level by one.
+
+  Chiplevel hardware encapsulation
+
+  The chip-level hardware description structure irq_chip contains all
+  the direct chip relevant functions, which can be utilized by the
+  irq_type implementations:
+	ack()
+	mask_ack()	- Optional, recommended for performance
+	mask()
+	unmask()
+	retrigger()	- Optional
+	set_type()	- Optional
+	set_wake()	- Optional
+  These primitives are strictly intended to mean what they say: ack
+  means ACK, masking means masking of an IRQ line, etc. It is up to
+  the flow handler(s) to use these basic units of lowlevel
+  functionality. (A short illustrative sketch follows at the end of
+  this document.)
+
+  __do_IRQ entry point
+
+  The original implementation __do_IRQ() is an alternative entry
+  point for all types of interrupts.
+
+  This handler turned out not to be suitable for all interrupt
+  hardware and was therefore reimplemented with split functionality
+  for edge/level/simple/percpu interrupts. This is not only a
+  functional optimization; it also shortens code paths for
+  interrupts.
+
+  To make use of the split implementation, replace the call to
+  __do_IRQ() by a call to desc->handler->handle_irq() and associate
+  the appropriate handler function with desc->handler->handle_irq.
+  In most cases the generic type and handler implementations should
+  be sufficient.
+
+  Locking on SMP
+
+  The locking of chip registers is up to the architecture that
+  defines the chip primitives. There is a chip->lock field that can
+  be used for serialization, but the generic layer does not touch it.
+  The per-irq structure is protected via desc->lock by the generic
+  layer.
+
+  Structures
+
+  This chapter contains the autogenerated documentation of the
+  structures which are used in the generic IRQ layer.
+
+!Iinclude/linux/irq.h
+
+  Public Functions Provided
+
+  This chapter contains the autogenerated documentation of the kernel
+  API functions which are exported.
+
+!Ekernel/irq/manage.c
+
+  Internal Functions Provided
+
+  This chapter contains the autogenerated documentation of the
+  internal functions.
+
+!Ikernel/irq/handle.c
+
+  Credits
+
+  The following people have contributed to this document:
+	Thomas Gleixner <tglx@linutronix.de>
+	Ingo Molnar <mingo@elte.hu>
+
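+
+  As promised in the chip-level chapter above, here is a sketch of an
+  irq_chip for an imaginary memory-mapped controller with one enable
+  register and one write-one-to-clear ack register, one bit per line.
+  The register layout, base address and IRQ numbering are invented;
+  only the irq_chip members and the set_irq_chip() call come from the
+  chapters above:
+
+static void __iomem *fancy_base;	/* hypothetical controller base */
+#define FANCY_ENABLE	0x00		/* hypothetical register offsets */
+#define FANCY_ACK	0x04
+#define FANCY_IRQ_BASE	32		/* first IRQ number on this chip */
+
+static void fancy_mask(unsigned int irq)
+{
+	u32 mask = 1 << (irq - FANCY_IRQ_BASE);
+
+	writel(readl(fancy_base + FANCY_ENABLE) & ~mask,
+	       fancy_base + FANCY_ENABLE);
+}
+
+static void fancy_unmask(unsigned int irq)
+{
+	u32 mask = 1 << (irq - FANCY_IRQ_BASE);
+
+	writel(readl(fancy_base + FANCY_ENABLE) | mask,
+	       fancy_base + FANCY_ENABLE);
+}
+
+static void fancy_ack(unsigned int irq)
+{
+	/* clear the latched pending bit */
+	writel(1 << (irq - FANCY_IRQ_BASE), fancy_base + FANCY_ACK);
+}
+
+static struct irq_chip fancy_chip = {
+	.ack	= fancy_ack,
+	.mask	= fancy_mask,
+	.unmask	= fancy_unmask,
+};
+
+  Each interrupt of this chip would then be associated with the
+  structure via set_irq_chip(irq, &fancy_chip); the flow details
+  (level vs. edge) stay entirely in the generic type implementations.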
Index: linux.prev/Documentation/DocBook/kernel-api.tmpl
===================================================================
--- linux.prev.orig/Documentation/DocBook/kernel-api.tmpl
+++ linux.prev/Documentation/DocBook/kernel-api.tmpl
@@ -54,6 +54,11 @@
 !Ekernel/sched.c
 !Ekernel/timer.c
 
+     High-resolution timers
+!Iinclude/linux/ktime.h
+!Iinclude/linux/hrtimer.h
+!Ekernel/hrtimer.c
+
      Internal Functions
 !Ikernel/exit.c
 !Ikernel/signal.c
Index: linux.prev/Documentation/RCU/proc.txt
===================================================================
--- /dev/null
+++ linux.prev/Documentation/RCU/proc.txt
@@ -0,0 +1,207 @@
+/proc Filesystem Entries for RCU
+
+
+CONFIG_RCU_STATS
+
+The CONFIG_RCU_STATS config option is available only in conjunction with
+CONFIG_PREEMPT_RCU. It makes four /proc entries available, namely:
+rcuctrs, rcuptrs, rcugp, and rcustats.
+
+/proc/rcuctrs
+
+	CPU last cur
+	  0    1   1
+	  1    1   1
+	  2    1   1
+	  3    0   2
+	ggp = 230725
+
+This displays the number of processes that started RCU read-side
+critical sections on each CPU. In the absence of preemption, the "last"
+and "cur" counts for a given CPU will always sum to one. Therefore, in
+the example output above, each CPU has started one RCU read-side
+critical section that was later preempted. The "last" column counts RCU
+read-side critical sections that started prior to the last counter
+flip, while the "cur" column counts critical sections that started
+after the last counter flip.
+
+The "ggp" count is a count of the number of counter flips since boot.
+Since this is shown as an odd number, the "cur" counts are stored in
+the zero-th element of each of the per-CPU arrays, and the "last"
+counts are stored in the first element of each of the per-CPU arrays.
+
+
+/proc/rcuptrs
+
+	nl=c04c7160/c04c7960 nt=c04c72d0
+	wl=c04c7168/c04c794c wt=c04c72bc dl=c04c7170/00000000 dt=c04c7170
+
+This displays the head and tail of each of CONFIG_PREEMPT_RCU's three
+callback lists. This will soon change to a per-CPU display, since each
+CPU will soon have its own set of callback lists. In the example above,
+the "next" list header is located at hex address 0xc04c7160, the first
+element on the list at hex address 0xc04c7960, and the last element on
+the list at hex address 0xc04c72d0. The "wl=" and "wt=" output is
+similar for the "wait" list, and the "dl=" and "dt=" output for the
+"done" list. The "done" list is normally emptied very quickly after
+being filled, so it will usually be empty as shown above. Note that the
+tail pointer points into the list header in this case.
+
+Callbacks are placed in the "next" list by call_rcu(), moved to the
+"wait" list after the next counter flip, and moved to the "done" list
+on the counter flip after that. Once on the "done" list, the callbacks
+are invoked.
+
+
+/proc/rcugp
+
+	oldggp=241419 newggp=241421
+
+This entry invokes synchronize_rcu() and prints out the number of
+counter flips since boot before and after the synchronize_rcu(). These
+two numbers will always differ by at least two. Unless RCU is broken.
+;-)
+
+
+/proc/rcustats
+
+	ggp=242416 lgp=242416 sr=0 rcc=396233
+	na=2090938 nl=9 wa=2090929 wl=9 dl=0 dr=2090920 di=2090920
+	rtf1=22230730 rtf2=20139162 rtf3=242416 rtfe1=2085911 rtfe2=5657 rtfe3=19896746
+
+The quantities printed are as follows:
+
+o	"ggp=": The number of flips since boot.
+
+o	"lgp=": The number of flips sensed by the local structure since
+	boot. This will soon be per-CPU.
+
+o	"sr=": The number of explicit calls to synchronize_rcu().
Except that this is currently broken, so it always reads as
+	zero. It is likely to be removed...
+
+o	"rcc=": The number of calls to rcu_check_callbacks().
+
+o	"na=": The number of callbacks that call_rcu() has registered
+	since boot.
+
+o	"nl=": The number of callbacks currently on the "next" list.
+
+o	"wa=": The number of callbacks that have moved to the "wait"
+	list since boot.
+
+o	"wl=": The number of callbacks currently on the "wait" list.
+
+o	"da=": The number of callbacks that have been moved to the
+	"done" list since boot.
+
+o	"dl=": The number of callbacks currently on the "done" list.
+
+o	"dr=": The number of callbacks that have been removed from the
+	"done" list since boot.
+
+o	"di=": The number of callbacks that have been invoked after
+	being removed from the "done" list.
+
+o	"rtf1=": The number of attempts to flip the counters.
+
+o	"rtf2=": The number of attempts to flip the counters that
+	successfully acquired the fliplock.
+
+o	"rtf3=": The number of successful counter flips.
+
+o	"rtfe1=": The number of attempts to flip the counters that
+	failed due to the lock being held by someone else.
+
+o	"rtfe2=": The number of attempts to flip the counters that were
+	abandoned due to someone else doing the job for us.
+
+o	"rtfe3=": The number of attempts to flip the counters that
+	failed due to some task still being in an RCU read-side
+	critical section starting from before the last successful
+	counter flip.
+
+
+CONFIG_RCU_TORTURE_TEST
+
+The CONFIG_RCU_TORTURE_TEST config option is available for all RCU
+implementations. It makes three /proc entries available, namely:
+rcutw, rcutr, and rcuts.
+
+
+/proc/rcutw
+
+Reading this entry starts a new torture test, or ends an earlier one
+if one is already in progress (in other words, there can be only one
+writer at a time). This sleeps uninterruptibly, so be sure to run it
+in the background. One could argue that it would be good to have
+multiple writers, but Linux uses RCU heavily enough that you will get
+write-side contention whether you want it or not. If you want
+additional write-side contention, repeatedly create and destroy
+several large file trees in parallel. Or use some other RCU-protected
+update.
+
+
+/proc/rcutr
+
+Reading this entry starts a new torture reader, which runs until sent
+a signal (e.g., control-C). If testing an RCU implementation with
+preemptible read-side critical sections, make sure to spawn at least
+two /proc/rcutr instances for each CPU.
+
+
+/proc/rcuts
+
+Displays the current state of the torture test:
+
+	ggp = 20961
+	rtc: c04496f4 ver: 8734 tfle: 0 rta: 8734 rtaf: 0 rtf: 8715
+	Reader Pipe: 88024120 63914 0 0 0 0 0 0 0 0 0
+	Reader Batch: 88024097 63937 0 0 0 0 0 0 0 0
+	Free-Block Circulation: 8733 8731 8729 8727 8725 8723 8721 8719 8717 8715 0
+
+The entries are as follows:
+
+o	"ggp": The number of counter flips (or batches) since boot.
+
+o	"rtc": The hexadecimal address of the structure currently
+	visible to readers.
+
+o	"ver": The number of times since boot that the rcutw writer
+	task has changed the structure visible to readers.
+
+o	"tfle": If non-zero, indicates that the "torture freelist"
+	containing structures to be placed into the "rtc" area is
+	empty. This condition is important, since it can fool you into
+	thinking that RCU is working when it is not. :-/
+
+o	"rta": Number of structures allocated from the torture
+	freelist.
+
+o	"rtaf": Number of allocations from the torture freelist that
+	have failed due to the list being empty.
+
+o	"rtf": Number of frees into the torture freelist.
+
+o	"Reader Pipe": Histogram of "ages" of structures seen by
+	readers. If any entries past the first two are non-zero, RCU
+	is broken. And /proc/rcuts prints "!!!" to make sure you
+	notice. The age of a newly allocated structure is zero; it
+	becomes one when removed from reader visibility, and is
+	incremented once per grace period subsequently -- and is freed
+	after passing through (RCU_TORTURE_PIPE_LEN-2) grace periods.
+
+	The output displayed above was taken from a correctly working
+	RCU. If you want to see what it looks like when broken, break
+	it yourself. ;-)
+
+o	"Reader Batch": Another histogram of "ages" of structures seen
+	by readers, but in terms of counter flips (or batches) rather
+	than in terms of grace periods. The legal number of non-zero
+	entries is again two. The reason for this separate view is
+	that it is easier to get the third entry to show up in the
+	"Reader Batch" list than in the "Reader Pipe" list.
+
+o	"Free-Block Circulation": Shows the number of torture
+	structures that have reached a given point in the pipeline.
+	The first element should closely correspond to the number of
+	structures allocated, the second to the number that have been
+	removed from reader view, and all but the last remaining to
+	the corresponding number of passes through a grace period.
+	The last entry should be zero, as it is only incremented if a
+	torture structure's counter somehow gets incremented farther
+	than it should.
Index: linux.prev/Documentation/hrtimers.txt
===================================================================
--- /dev/null
+++ linux.prev/Documentation/hrtimers.txt
@@ -0,0 +1,178 @@
+
+hrtimers - subsystem for high-resolution kernel timers
+------------------------------------------------------
+
+This patch introduces a new subsystem for high-resolution kernel timers.
+
+One might ask the question: we already have a timer subsystem
+(kernel/timers.c), why do we need two timer subsystems? After a lot of
+back and forth trying to integrate high-resolution and high-precision
+features into the existing timer framework, and after testing various
+such high-resolution timer implementations in practice, we came to the
+conclusion that the timer wheel code is fundamentally not suitable for
+such an approach. We initially didn't believe this ('there must be a
+way to solve this'), and spent a considerable effort trying to
+integrate things into the timer wheel, but we failed. In hindsight,
+there are several reasons why such integration is hard/impossible:
+
+- the forced handling of low-resolution and high-resolution timers in
+  the same way leads to a lot of compromises, macro magic and #ifdef
+  mess. The timers.c code is very "tightly coded" around jiffies and
+  32-bitness assumptions, and has been honed and micro-optimized for a
+  relatively narrow use case (jiffies in a relatively narrow HZ range)
+  for many years - and thus even small extensions to it easily break
+  the wheel concept, leading to even worse compromises. The timer
+  wheel code is very good and tight code, and there are zero problems
+  with it in its current usage - but it is simply not suitable to be
+  extended for high-res timers.
+
+- the unpredictable [O(N)] overhead of cascading leads to delays which
+  necessitate a more complex handling of high resolution timers, which
+  in turn decreases robustness. Such a design still led to rather
+  large timing inaccuracies.
Cascading is a fundamental property of the timer
+  wheel concept; it cannot be 'designed out' without inevitably
+  degrading other portions of the timers.c code in an unacceptable
+  way.
+
+- the implementation of the current posix-timer subsystem on top of
+  the timer wheel has already introduced a quite complex handling of
+  the required readjusting of absolute CLOCK_REALTIME timers at
+  settimeofday or NTP time - further underlining our experience by
+  example: that the timer wheel data structure is too rigid for
+  high-res timers.
+
+- the timer wheel code is most optimal for use cases which can be
+  identified as "timeouts". Such timeouts are usually set up to cover
+  error conditions in various I/O paths, such as networking and block
+  I/O. The vast majority of those timers never expire and are rarely
+  recascaded because the expected correct event arrives in time so
+  they can be removed from the timer wheel before any further
+  processing of them becomes necessary. Thus the users of these
+  timeouts can accept the granularity and precision tradeoffs of the
+  timer wheel, and largely expect the timer subsystem to have
+  near-zero overhead. Accurate timing for them is not a core purpose -
+  in fact most of the timeout values used are ad-hoc. For them it is
+  at most a necessary evil to guarantee the processing of actual
+  timeout completions (because most of the timeouts are deleted
+  before completion), which should thus be as cheap and unintrusive
+  as possible.
+
+The primary users of precision timers are user-space applications that
+utilize nanosleep, posix-timers and itimer interfaces. Also, in-kernel
+users like drivers and subsystems which require precise timed events
+(e.g. multimedia) can benefit from the availability of a separate
+high-resolution timer subsystem as well.
+
+While this subsystem does not offer high-resolution clock sources just
+yet, the hrtimer subsystem can be easily extended with high-resolution
+clock capabilities, and patches for that exist and are maturing
+quickly. The increasing demand for realtime and multimedia
+applications along with other potential users for precise timers gives
+another reason to separate the "timeout" and "precise timer"
+subsystems.
+
+Another potential benefit is that such a separation allows even more
+special-purpose optimization of the existing timer wheel for the low
+resolution and low precision use cases - once the precision-sensitive
+APIs are separated from the timer wheel and are migrated over to
+hrtimers. E.g. we could decrease the frequency of the timeout
+subsystem from 250 Hz to 100 Hz (or even smaller).
+
+hrtimer subsystem implementation details
+----------------------------------------
+
+the basic design considerations were:
+
+- simplicity
+
+- data structure not bound to jiffies or any other granularity. All
+  the kernel logic works at 64-bit nanoseconds resolution - no
+  compromises.
+
+- simplification of existing, timing related kernel code
+
+another basic requirement was the immediate enqueueing and ordering of
+timers at activation time. After looking at several possible solutions
+such as radix trees and hashes, we chose the red black tree as the
+basic data structure. Rbtrees are available as a library in the kernel
+and are used in various performance-critical areas of e.g. memory
+management and file systems. The rbtree is solely used for time sorted
+ordering, while a separate list is used to give the expiry code fast
+access to the queued timers, without having to walk the rbtree.
+
+(This separate list is also useful for later when we'll introduce
+high-resolution clocks, where we need separate pending and expired
+queues while keeping the time-order intact.)
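+
+As a concrete illustration of the data-structure choice, time-ordered
+insertion with the kernel's rbtree library looks roughly like the
+sketch below. The structure and field names are simplified stand-ins
+for this document, not the actual hrtimer ones:
+
+#include <linux/rbtree.h>
+
+struct my_timer {
+	struct rb_node node;
+	u64 expires;		/* absolute expiry time, nanoseconds */
+};
+
+/* Walk down the tree comparing expiry times, link the new node at
+ * the found leaf position, then rebalance. */
+static void my_timer_enqueue(struct rb_root *root, struct my_timer *timer)
+{
+	struct rb_node **link = &root->rb_node, *parent = NULL;
+	struct my_timer *entry;
+
+	while (*link) {
+		parent = *link;
+		entry = rb_entry(parent, struct my_timer, node);
+		if (timer->expires < entry->expires)
+			link = &(*link)->rb_left;
+		else
+			link = &(*link)->rb_right;
+	}
+	rb_link_node(&timer->node, parent, link);
+	rb_insert_color(&timer->node, root);
+}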
+
+Time-ordered enqueueing is not purely for the purposes of
+high-resolution clocks though, it also simplifies the handling of
+absolute timers based on a low-resolution CLOCK_REALTIME. The existing
+implementation needed to keep an extra list of all armed absolute
+CLOCK_REALTIME timers along with complex locking. In the case of
+settimeofday and NTP, all the timers (!) had to be dequeued, the
+time-changing code had to fix them up one by one, and all of them had
+to be enqueued again. The time-ordered enqueueing and the storage of
+the expiry time in absolute time units removes all this complex and
+poorly scaling code from the posix-timer implementation - the clock
+can simply be set without having to touch the rbtree. This also makes
+the handling of posix-timers simpler in general.
+
+The locking and per-CPU behavior of hrtimers was mostly taken from the
+existing timer wheel code, as it is mature and well suited. Sharing
+code was not really a win, due to the different data structures. Also,
+the hrtimer functions now have clearer behavior and clearer names -
+such as hrtimer_try_to_cancel() and hrtimer_cancel() [which are
+roughly equivalent to del_timer() and del_timer_sync()] - so there's
+no direct 1:1 mapping between them on the algorithmic level, and thus
+no real potential for code sharing either.
+
+Basic data types: every time value, absolute or relative, is in a
+special nanosecond-resolution type: ktime_t. The kernel-internal
+representation of ktime_t values and operations is implemented via
+macros and inline functions, and can be switched between a "hybrid
+union" type and a plain "scalar" 64bit nanoseconds representation (at
+compile time). The hybrid union type optimizes time conversions on
+32bit CPUs. This build-time-selectable ktime_t storage format was
+implemented to avoid the performance impact of 64-bit multiplications
+and divisions on 32bit CPUs. Such operations are frequently necessary
+to convert between the storage formats provided by kernel and
+userspace interfaces and the internal time format. (See
+include/linux/ktime.h for further details.)
+
+hrtimers - rounding of timer values
+-----------------------------------
+
+the hrtimer code will round timer events to lower-resolution clocks
+because it has to. Otherwise it will do no artificial rounding at all.
+
+one question is what resolution value should be returned to the user
+by the clock_getres() interface. This will return whatever real
+resolution a given clock has - be it low-res, high-res, or
+artificially-low-res.
+
+hrtimers - testing and verification
+-----------------------------------
+
+We used the high-resolution clock subsystem on top of hrtimers to
+verify the hrtimer implementation details in practice, and we also ran
+the posix timer tests in order to ensure specification compliance. We
+also ran tests on low-resolution clocks.
+
+The hrtimer patch converts the following kernel functionality to use
+hrtimers:
+
+ - nanosleep
+ - itimers
+ - posix-timers
+
+The conversion of nanosleep and posix-timers enabled the unification
+of nanosleep and clock_nanosleep; a small userspace illustration of
+these interfaces follows.
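+
+As a hedged sketch of the user-visible side (ordinary POSIX code, not
+part of this patch), an application wanting a precise absolute sleep
+would do something like this - with plain timer-wheel timers the
+wakeup is rounded up to the next jiffy, while hrtimers can honor the
+request exactly once high-resolution clock support is added:
+
+#include <stdio.h>
+#include <time.h>
+
+int main(void)
+{
+	struct timespec t;
+
+	/* sleep until "now + 250 usec" on CLOCK_MONOTONIC */
+	clock_gettime(CLOCK_MONOTONIC, &t);
+	t.tv_nsec += 250 * 1000;
+	if (t.tv_nsec >= 1000000000L) {
+		t.tv_nsec -= 1000000000L;
+		t.tv_sec++;
+	}
+	clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &t, NULL);
+	printf("woke up\n");
+	return 0;
+}
+
+(Link with -lrt on the glibc of this era.)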
+
+The code was successfully compiled for the following platforms:
+
+ i386, x86_64, ARM, PPC, PPC64, IA64
+
+The code was run-tested on the following platforms:
+
+ i386(UP/SMP), x86_64(UP/SMP), ARM, PPC
+
+hrtimers were also integrated into the -rt tree, along with a
+hrtimers-based high-resolution clock implementation, so the hrtimers
+code got a healthy amount of testing and use in practice.
+
+	Thomas Gleixner, Ingo Molnar
Index: linux.prev/Documentation/kernel-parameters.txt
===================================================================
--- linux.prev.orig/Documentation/kernel-parameters.txt
+++ linux.prev/Documentation/kernel-parameters.txt
@@ -52,6 +52,7 @@ restrictions referred to are that the re
 	MTD	MTD support is enabled.
 	NET	Appropriate network support is enabled.
 	NUMA	NUMA support is enabled.
+	GENERIC_TIME The generic timeofday code is enabled.
 	NFS	Appropriate NFS support is enabled.
 	OSS	OSS sound support is enabled.
 	PARIDE	The ParIDE subsystem is enabled.
@@ -329,10 +330,11 @@ running once the system is up.
 			Value can be changed at runtime via
 			/selinux/checkreqprot.
 
-	clock=		[BUGS=IA-32,HW] gettimeofday timesource override.
-			Forces specified timesource (if avaliable) to be used
-			when calculating gettimeofday(). If specicified
-			timesource is not avalible, it defaults to PIT.
+	clock=		[BUGS=IA-32, HW] gettimeofday clocksource override.
+			[Deprecated]
+			Forces the specified clocksource (if available) to be
+			used when calculating gettimeofday(). If the specified
+			clocksource is not available, it defaults to PIT.
 			Format: { pit | tsc | cyclone | pmtmr }
 
 	hpet=		[IA-32,HPET] option to disable HPET and use PIT.
@@ -1477,6 +1479,10 @@ running once the system is up.
 			time	Show timing data prefixed to each
 				printk message line
 
+	clocksource=	[GENERIC_TIME] Override the default clocksource
+			Override the default clocksource and use the
+			clocksource with the name specified.
+
 	tipar.timeout=	[HW,PPT] Set communications timeout in tenths of a
 			second (default 15).
Index: linux.prev/Documentation/timekeeping.txt
===================================================================
--- /dev/null
+++ linux.prev/Documentation/timekeeping.txt
@@ -0,0 +1,350 @@
+How timekeeping works with CONFIG_GENERIC_TIME
+========================================================================
+
+The generic timekeeping code maintains and allows access to the
+system's understanding of how much time has passed from a certain
+point. However, in order to measure the passing of time, the generic
+timekeeping code relies on the clocksource abstraction. A clocksource
+abstracts a free running counter whose value increases at a known
+frequency.
+
+In the generic timekeeping code, we use a pointer to a selected
+clocksource to measure the passing of time.
+
+struct clocksource *clock
+
+The clocksource has some limitations, however. Since it is likely of
+fixed width, it will not increment forever and will overflow. In order
+to still properly keep time, we must occasionally accumulate an
+interval of time. In the generic timekeeping code, we accumulate the
+amount of time since the system booted into the value system_time,
+which keeps nanosecond resolution in a ktime_t storage.
+
+ktime_t system_time
+
+Since it's likely your system has not been running continually since
+midnight on the 1st of January in 1970, we must provide an offset from
+that time in accordance with conventions. This offset, which changes
+only occasionally (via settimeofday()), is the wall_time_offset value,
+which is also stored as a ktime_t.
+
+ktime_t wall_time_offset
+
+Since we accumulate time in intervals, we need a base cycle value that
+we can use to generate an offset from the time value kept in
+system_time. We store this value in cycle_last.
+
+cycle_t cycle_last;
+
+Further, since all clocks drift somewhat from each other, we use the
+adjustment values provided via adjtimex() to correct our clocksource
+frequency for each interval. This frequency adjustment value is stored
+in ntp_adj.
+
+long ntp_adj;
+
+Now that we've covered the core global variables for timekeeping,
+let's look at how we maintain these values.
+
+As stated above, we want to keep the clocksource from overflowing on
+us, so we accumulate a time interval periodically. This periodic
+accumulation function is called timeofday_periodic_hook(). In
+simplified pseudo code, it logically is presented as:
+
+timeofday_periodic_hook():
+	cycle_now = read_clocksource(clock)
+	cycle_delta = (cycle_now - cycle_last) & clock->mask
+	nsec = cyc2ns(clock, cycle_delta, ntp_adj)
+	system_time += nsec
+	cycle_last = cycle_now
+
+	/* do other stuff */
+
+You can see we read the cycle value from the clocksource, calculate a
+cycle delta for the interval since we last called
+timeofday_periodic_hook(), convert that cycle delta to a nanosecond
+interval (for now ignore ntp_adj), add it to the system time and
+finally set our cycle_last value to cycle_now for the next interval.
+Using this simple algorithm we can correctly measure and record the
+passing of time.
+
+But just storing this info isn't very useful; we also want to make it
+available to be used elsewhere. So how do we provide a notion of how
+much time has passed in between calls to timeofday_periodic_hook()?
+
+First, let's create a function that calculates the time since the last
+call to timeofday_periodic_hook().
+
+get_nsec_offset():
+	cycle_now = read_clocksource(clock)
+	cycle_delta = (cycle_now - cycle_last) & clock->mask
+	nsec = cyc2ns(clock, cycle_delta, ntp_adj)
+	return nsec
+
+Here you can see that we read the clocksource, calculate a cycle
+interval, and convert that to a nanosecond interval, just as is done
+in timeofday_periodic_hook().
+
+Now let's use this function to provide the number of nanoseconds that
+the system has been running:
+
+do_monotonic_clock():
+	return system_time + get_nsec_offset()
+
+Here we trivially add the nanosecond offset since the last
+timeofday_periodic_hook() to the value of system_time, which was
+stored at the last timeofday_periodic_hook().
+
+Note that since we use the same method to calculate time intervals,
+assuming each function is atomic and the clocksource functions as it
+should, time cannot go backward!
+
+Now to get the time of day using the standard convention:
+
+do_gettimeofday():
+	return do_monotonic_clock() + wall_time_offset
+
+We simply add the wall_time_offset, and we have the number of
+nanoseconds since 1970 began!
+
+
+Of course, in real life, things are not so static. We have to handle a
+number of dynamic values that may change and affect timekeeping. In
+order to handle these safely, we must only change values in between
+intervals. This means the periodic_hook call must handle these
+changes.
+
+Since clocksources can be changed while the system is running, we need
+to check for and possibly switch to using new clocksources in the
+periodic_hook call. Further, clocksources may change their frequency.
+Since this must be done only at a safe point, we use the
+update_callback function pointer (for more details, see "How to write
+a clocksource driver" below); this too must be done in between
+intervals in the periodic_hook call. Finally, since the ntp adjustment
+made in the cyc2ns conversion is not static, we need to update the ntp
+state machine and calculate a new adjustment value.
+
+This adds some extra pseudo code to the timeofday_periodic_hook
+function:
+
+timeofday_periodic_hook():
+	cycle_now = read_clocksource(clock)
+	cycle_delta = (cycle_now - cycle_last) & clock->mask
+	nsec = cyc2ns(clock, cycle_delta, ntp_adj)
+	system_time += nsec
+	cycle_last = cycle_now
+
+	next = get_next_clocksource()
+	if (next != clock):
+		cycle_last = read_clocksource(next)
+		clock = next
+
+	if (clock->update_callback):
+		clock->update_callback()
+
+	ntp_advance(nsec)
+	ppm = ntp_get_ppm_adjustment()
+	ntp_adj = ppm_to_mult_adj(clock, ppm)
+
+Unfortunately, the actual timeofday_periodic_hook code is not as
+simple as this pseudo code. For performance concerns, much has been
+done to pre-calculate values and use them repeatedly. Thus be aware
+that the code in timeofday.c is more complex; however, the functional
+logic is the same.
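+
+To connect the pseudo code to real types, here is a hedged C sketch of
+the two helpers, using the clocksource fields described in "How to
+write a clocksource driver" below. For simplicity it ignores ntp_adj,
+just as the first pseudo code version does; this is an illustration,
+not the actual timeofday.c code:
+
+/* nsec = cycles * mult >> shift; the mult/shift pair approximates
+ * the clocksource's nanoseconds-per-cycle value */
+static inline u64 cyc2ns_simple(struct clocksource *cs, cycle_t cycles)
+{
+	return ((u64) cycles * cs->mult) >> cs->shift;
+}
+
+/* nanoseconds since the last periodic hook; the masked subtraction
+ * handles counter wrap, as explained in the mask discussion below */
+static u64 get_nsec_offset(struct clocksource *cs)
+{
+	cycle_t cycle_now = cs->read();
+	cycle_t cycle_delta = (cycle_now - cycle_last) & cs->mask;
+
+	return cyc2ns_simple(cs, cycle_delta);
+}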
How to port an architecture to GENERIC_TIME
+========================================================================
+Porting an architecture to the GENERIC_TIME timekeeping code consists
+of moving a little bit of code around and then deleting a fair amount.
+It is my hope that this will reduce the arch-specific maintenance work
+around timekeeping.
+
+Porting an arch usually requires the following steps.
+
+1. Define CONFIG_GENERIC_TIME in the arch's Kconfig
+2. Implement the following functions (a sketch follows after this
+   list)
+	nsec_t read_persistent_clock(void)
+	void sync_persistent_clock(struct timespec ts)
+3. Remove all of the arch-specific timekeeping code
+	do_gettimeofday()
+	do_settimeofday()
+	etc
+4. Implement clocksource drivers
+	See "How to write a clocksource driver" for more details
+
+The exceptions to the above are:
+
+5. If the arch has no continuous clocksource
+	A) Implement 1-3 in the above list.
+	B) Define CONFIG_IS_TICK_BASED in the arch's Kconfig
+	C) Implement the "long arch_getoffset(void)" function
+
+6. If the arch supports vsyscall gettimeofday (see x86_64 for
+   reference)
+	A) Implement 1-4 in the above list
+	B) Define GENERIC_TIME_VSYSCALL
+	C) Implement arch_update_vsyscall_gtod()
+	D) Implement vsyscall gettimeofday (similar to
+	   __get_realtime_clock_ts)
+	E) Implement vread functions for supported clocksources
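+
+As a hedged sketch of step 2, here are the two persistent-clock hooks
+for an imaginary platform whose battery-backed RTC exposes a seconds
+counter in a memory-mapped register. Everything about the hardware
+(MY_RTC_SECONDS and its semantics) is invented; only the two function
+signatures come from the list above:
+
+#define MY_RTC_SECONDS	0xFEED0000	/* hypothetical RTC register */
+
+/* wall time at boot, read once during timekeeping init */
+nsec_t read_persistent_clock(void)
+{
+	u32 secs = readl((void __iomem *) MY_RTC_SECONDS);
+
+	return (nsec_t) secs * NSEC_PER_SEC;
+}
+
+/* periodically write the current wall time back to the RTC;
+ * sub-second precision is simply dropped here */
+void sync_persistent_clock(struct timespec ts)
+{
+	writel(ts.tv_sec, (void __iomem *) MY_RTC_SECONDS);
+}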
How to write a clocksource driver
+========================================================================
+First, a quick summary of what a clocksource driver provides.
+
+Simply put, a clocksource is an abstraction of a free running,
+increasing counter. The abstraction provides the minimal amount of
+info for that counter to be usable for timekeeping. Those required
+values are:
+	1. Its name
+	2. A rating value for selection priority
+	3. A read function pointer
+	4. A mask value for correct twos-complement subtraction
+	5. A mult and shift pair that approximate the counter frequency
+	   mult/(2^shift) ~= nanoseconds per cycle
+
+Additionally, there are other optionally set values that allow for
+advanced functionality. Those values are:
+	6. The update_callback function
+	7. The is_continuous flag
+	8. The vread function pointer
+	9. The vdata pointer value
+
+
+Now let's go over these values in detail.
+
+1. Name
+	The clocksource's name should be unique since it is used both
+for identification and for manually overriding the default clocksource
+selection. The name length must be shorter than 32 characters in order
+for it to be properly overridden.
+
+2. Rating value
+	This rating value is used as a priority value for clocksource
+selection. It has no direct connection to quality or physical
+properties of the clocksource, but is to be set and manipulated to
+guarantee that the best (by no specific metric) clocksource that will
+provide correct timekeeping is automatically selected. Rating
+suggestions can be found in include/linux/clocksource.h
+
+3. Read function pointer
+	This pointer should point to a function that returns an
+unsigned increasing cycle value from the clocksource. The value should
+have a coverage from zero to the maximum cycle value the clocksource
+can provide. This does not have to be a direct hardware value and can
+also be a software counter. An example of a software counter is the
+jiffies clocksource.
+
+4. The mask value
+	This value should have all bits set that the counter can
+represent, i.e. one less than the smallest power of two larger than
+the maximum cycle value. This allows twos-complement subtraction to
+work on overflow boundary conditions if the max value is less than
+(cycle_t)-1. So, for example, if we have a 16 bit counter (i.e. one
+that wraps to zero after 0x0000FFFF), the mask would be 0xFFFF. So
+then when finding the cycle difference around an overflow, where
+now = 0x0013 and then = 0xFFEE, we can compute the cycle delta
+properly using the equation:
+	delta = (now - then) & mask
+	delta = (0x0013 - 0xFFEE) & 0xFFFF
+	delta = 0xFFFF0025 & 0xFFFF	/* note the unmasked negative value */
+	delta = 0x25
+
+5. The mult and shift pair
+	These 32bit values approximate the nanoseconds-per-cycle value
+of the clocksource using the equation: mult/(2^shift). If you have a
+kHz or Hz frequency value, the mult value for a given shift value can
+be easily calculated using the clocksource_hz2mult() and
+clocksource_khz2mult() helper functions. When selecting a shift value,
+it is important to be careful. Larger shift values give a finer
+precision in the cycle to nanosecond conversion and allow for more
+exact NTP adjustments. However, if you select too large a shift value,
+the resulting mult value might overflow a cycle_t * mult computation.
+
+
+So if you have a simple hardware counter that does not change
+frequency, filling in the above should be sufficient for a functional
+clocksource. But read on for details on implementing a more complex
+clocksource.
+
+6. The update_callback function pointer
+	If this function pointer is non-NULL, it will be called at
+every periodic hook, when it is safe for the clocksource to change its
+state. This would be necessary in the case where the counter frequency
+changes, for example. One user of this function pointer is the TSC
+clocksource. When the TSC frequency changes (which may occur if the
+CPU changes frequency) we need to notify the clocksource at a safe
+point where that state may change. Thus, if the TSC has changed
+frequency we set the new mult/shift values in the update_callback
+function.
+
+7. The is_continuous flag
+	This flag variable (0 if false, 1 if true) denotes that the
+clocksource is continuous. This means that it is a purely hardware
+driven clocksource and is not dependent on any software code to run
+for it to increment properly. This distinction will be useful in the
+future when timer ticks may be disabled for long periods of time.
Doing so using software clocksources, like the jiffies clocksource,
+would cause timekeeping problems.
+
+8. The vread function pointer
+	This function pointer points to a user-space accessible
+function that reads the clocksource. This is used in userspace
+gettimeofday implementations to improve performance. See the x86-64
+TSC clocksource implementation for an example.
+
+9. The vdata pointer
+	This pointer is passed to the vread function pointer in a
+userspace gettimeofday implementation. Its usage is dependent on the
+vread implementation, but if the pointer points to data, that data
+must be readable from userspace.
+
+
+Now let's write a quick clocksource for an imaginary bit of hardware.
+Here are the specs:
+
+	A 32bit counter can be found at the MMIO address 0xFEEDF000.
+It runs at 100 MHz. To enable it, the low bit of the address
+0xFEEDF0F0 must be set to one.
+
+So let's start with an empty cool-counter.c file and define the
+clocksource.
+
+#include <linux/clocksource.h>
+#include <linux/init.h>
+#include <asm/io.h>
+
+#define COOL_READ_PTR	0xFEEDF000
+#define COOL_START_PTR	0xFEEDF0F0
+
+static void __iomem *cool_ptr = (void __iomem *) COOL_READ_PTR;
+
+struct clocksource clocksource_cool = {
+	.name	= "cool",
+	.rating	= 200,		/* it's a pretty decent clock */
+	.mask	= 0xFFFFFFFF,	/* 32 bits */
+	.mult	= 0,		/* to be computed */
+	.shift	= 10,
+};
+
+
+Now let's write the read function:
+
+cycle_t cool_counter_read(void)
+{
+	cycle_t ret = readl(cool_ptr);
+	return ret;
+}
+
+Finally, let's write the init function:
+
+int __init cool_counter_init(void)
+{
+	void __iomem *ptr = (void __iomem *) COOL_START_PTR;
+	u32 val;
+
+	/* start the counter */
+	val = readl(ptr);
+	val |= 0x1;
+	writel(val, ptr);
+
+	/* finish initializing the clocksource */
+	clocksource_cool.read = cool_counter_read;
+	clocksource_cool.mult = clocksource_khz2mult(100000,
+					clocksource_cool.shift);
+
+	/* register the clocksource */
+	register_clocksource(&clocksource_cool);
+
+	return 0;
+}
+module_init(cool_counter_init);
+
+
+Now wasn't that easy!
Index: linux.prev/Makefile
===================================================================
--- linux.prev.orig/Makefile
+++ linux.prev/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 15
-EXTRAVERSION =
+EXTRAVERSION =-rt21
 NAME=Sliding Snow Leopard
 
 # *DOCUMENTATION*
@@ -519,10 +519,14 @@ CFLAGS		+= $(call add-align,CONFIG_CC_AL
 CFLAGS		+= $(call add-align,CONFIG_CC_ALIGN_LOOPS,-loops)
 CFLAGS		+= $(call add-align,CONFIG_CC_ALIGN_JUMPS,-jumps)
 
-ifdef CONFIG_FRAME_POINTER
-CFLAGS		+= -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,)
+ifdef CONFIG_MCOUNT
+CFLAGS		+= -pg -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,)
 else
-CFLAGS		+= -fomit-frame-pointer
+  ifdef CONFIG_FRAME_POINTER
+  CFLAGS	+= -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,)
+  else
+  CFLAGS	+= -fomit-frame-pointer
+  endif
 endif
 
 ifdef CONFIG_DEBUG_INFO
Index: linux.prev/arch/arm/Kconfig
===================================================================
--- linux.prev.orig/arch/arm/Kconfig
+++ linux.prev/arch/arm/Kconfig
@@ -50,6 +50,10 @@ config UID16
 	bool
 	default y
 
+config GENERIC_HARDIRQS
+	bool
+	default y
+
 config RWSEM_GENERIC_SPINLOCK
 	bool
 	default y
@@ -368,18 +372,7 @@ config LOCAL_TIMERS
 	  accounting to be spread across the timer interval, preventing
 	  a "thundering herd" at every timer tick.
-config PREEMPT - bool "Preemptible Kernel (EXPERIMENTAL)" - depends on EXPERIMENTAL - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. - - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. +source kernel/Kconfig.preempt config NO_IDLE_HZ bool "Dynamic tick timer" Index: linux.prev/arch/arm/boot/compressed/head.S =================================================================== --- linux.prev.orig/arch/arm/boot/compressed/head.S +++ linux.prev/arch/arm/boot/compressed/head.S @@ -710,6 +710,19 @@ memdump: mov r12, r0 mov pc, r10 #endif +#ifdef CONFIG_MCOUNT +/* CONFIG_MCOUNT causes boot header to be built with -pg requiring this + * trampoline + */ + .text + .align 0 + .type mcount %function + .global mcount +mcount: + mov pc, lr @ just return +#endif + + reloc_end: .align Index: linux.prev/arch/arm/boot/compressed/misc.c =================================================================== --- linux.prev.orig/arch/arm/boot/compressed/misc.c +++ linux.prev/arch/arm/boot/compressed/misc.c @@ -199,6 +199,7 @@ static ulg free_mem_ptr_end; #define HEAP_SIZE 0x2000 +#define ZLIB_INFLATE_NO_INFLATE_LOCK #include "../../../../lib/inflate.c" #ifndef STANDALONE_DEBUG Index: linux.prev/arch/arm/common/dmabounce.c =================================================================== --- linux.prev.orig/arch/arm/common/dmabounce.c +++ linux.prev/arch/arm/common/dmabounce.c @@ -404,11 +404,11 @@ dma_map_single(struct device *dev, void BUG_ON(dir == DMA_NONE); - local_irq_save(flags); + raw_local_irq_save(flags); dma_addr = map_single(dev, ptr, size, dir); - local_irq_restore(flags); + raw_local_irq_restore(flags); return dma_addr; } @@ -431,11 +431,11 @@ dma_unmap_single(struct device *dev, dma BUG_ON(dir == DMA_NONE); - local_irq_save(flags); + raw_local_irq_save(flags); unmap_single(dev, dma_addr, size, dir); - local_irq_restore(flags); + raw_local_irq_restore(flags); } int @@ -450,7 +450,7 @@ dma_map_sg(struct device *dev, struct sc BUG_ON(dir == DMA_NONE); - local_irq_save(flags); + raw_local_irq_save(flags); for (i = 0; i < nents; i++, sg++) { struct page *page = sg->page; @@ -462,7 +462,7 @@ dma_map_sg(struct device *dev, struct sc map_single(dev, ptr, length, dir); } - local_irq_restore(flags); + raw_local_irq_restore(flags); return nents; } @@ -479,7 +479,7 @@ dma_unmap_sg(struct device *dev, struct BUG_ON(dir == DMA_NONE); - local_irq_save(flags); + raw_local_irq_save(flags); for (i = 0; i < nents; i++, sg++) { dma_addr_t dma_addr = sg->dma_address; @@ -488,7 +488,7 @@ dma_unmap_sg(struct device *dev, struct unmap_single(dev, dma_addr, length, dir); } - local_irq_restore(flags); + raw_local_irq_restore(flags); } void @@ -500,11 +500,11 @@ dma_sync_single_for_cpu(struct device *d dev_dbg(dev, "%s(ptr=%p,size=%d,dir=%x)\n", __func__, (void *) dma_addr, size, dir); - local_irq_save(flags); + raw_local_irq_save(flags); sync_single(dev, dma_addr, size, dir); - local_irq_restore(flags); + raw_local_irq_restore(flags); } void @@ -516,11 +516,11 @@ dma_sync_single_for_device(struct device dev_dbg(dev, "%s(ptr=%p,size=%d,dir=%x)\n", __func__, (void *) dma_addr, size, dir); - local_irq_save(flags); + raw_local_irq_save(flags); sync_single(dev, dma_addr, size, dir); - local_irq_restore(flags); + 
raw_local_irq_restore(flags); } void @@ -535,7 +535,7 @@ dma_sync_sg_for_cpu(struct device *dev, BUG_ON(dir == DMA_NONE); - local_irq_save(flags); + raw_local_irq_save(flags); for (i = 0; i < nents; i++, sg++) { dma_addr_t dma_addr = sg->dma_address; @@ -544,7 +544,7 @@ dma_sync_sg_for_cpu(struct device *dev, sync_single(dev, dma_addr, length, dir); } - local_irq_restore(flags); + raw_local_irq_restore(flags); } void @@ -559,7 +559,7 @@ dma_sync_sg_for_device(struct device *de BUG_ON(dir == DMA_NONE); - local_irq_save(flags); + raw_local_irq_save(flags); for (i = 0; i < nents; i++, sg++) { dma_addr_t dma_addr = sg->dma_address; @@ -568,7 +568,7 @@ dma_sync_sg_for_device(struct device *de sync_single(dev, dma_addr, length, dir); } - local_irq_restore(flags); + raw_local_irq_restore(flags); } static int Index: linux.prev/arch/arm/common/locomo.c =================================================================== --- linux.prev.orig/arch/arm/common/locomo.c +++ linux.prev/arch/arm/common/locomo.c @@ -425,6 +425,12 @@ static struct irqchip locomo_spi_chip = .unmask = locomo_spi_unmask_irq, }; +static DEFINE_IRQ_CHAINED_TYPE(locomo_handler); +static DEFINE_IRQ_CHAINED_TYPE(locomo_key_handler); +static DEFINE_IRQ_CHAINED_TYPE(locomo_gpio_handler); +static DEFINE_IRQ_CHAINED_TYPE(locomo_lt_handler); +static DEFINE_IRQ_CHAINED_TYPE(locomo_spi_handler); + static void locomo_setup_irq(struct locomo *lchip) { int irq; Index: linux.prev/arch/arm/common/sa1111.c =================================================================== --- linux.prev.orig/arch/arm/common/sa1111.c +++ linux.prev/arch/arm/common/sa1111.c @@ -171,11 +171,11 @@ sa1111_irq_handler(unsigned int irq, str for (i = IRQ_SA1111_START; stat0; i++, stat0 >>= 1) if (stat0 & 1) - do_edge_IRQ(i, irq_desc + i, regs); + handle_edge_irq(i, irq_desc + i, regs); for (i = IRQ_SA1111_START + 32; stat1; i++, stat1 >>= 1) if (stat1 & 1) - do_edge_IRQ(i, irq_desc + i, regs); + handle_edge_irq(i, irq_desc + i, regs); /* For level-based interrupts */ desc->chip->unmask(irq); @@ -380,6 +380,8 @@ static struct irqchip sa1111_high_chip = .set_wake = sa1111_wake_highirq, }; +static DEFINE_IRQ_CHAINED_TYPE(sa1111_irq_handler); + static void sa1111_setup_irq(struct sa1111 *sachip) { void __iomem *irqbase = sachip->base + SA1111_INTC; Index: linux.prev/arch/arm/common/time-acorn.c =================================================================== --- linux.prev.orig/arch/arm/common/time-acorn.c +++ linux.prev/arch/arm/common/time-acorn.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -76,7 +77,7 @@ ioc_timer_interrupt(int irq, void *dev_i static struct irqaction ioc_timer_irq = { .name = "timer", - .flags = SA_INTERRUPT, + .flags = SA_INTERRUPT | SA_NODELAY, .handler = ioc_timer_interrupt }; Index: linux.prev/arch/arm/kernel/calls.S =================================================================== --- linux.prev.orig/arch/arm/kernel/calls.S +++ linux.prev/arch/arm/kernel/calls.S @@ -7,11 +7,8 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
* - * This file is included twice in entry-common.S + * NR_syscalls now defined in include/asm-arm/unistd.h - tglx */ -#ifndef NR_syscalls -#define NR_syscalls 328 -#else __syscall_start: /* 0 */ .long sys_restart_syscall @@ -341,4 +338,3 @@ __syscall_end: .rept NR_syscalls - (__syscall_end - __syscall_start) / 4 .long sys_ni_syscall .endr -#endif Index: linux.prev/arch/arm/kernel/dma.c =================================================================== --- linux.prev.orig/arch/arm/kernel/dma.c +++ linux.prev/arch/arm/kernel/dma.c @@ -22,7 +22,7 @@ #include -DEFINE_SPINLOCK(dma_spin_lock); +DEFINE_RAW_SPINLOCK(dma_spin_lock); #if MAX_DMA_CHANNELS > 0 Index: linux.prev/arch/arm/kernel/ecard.c =================================================================== --- linux.prev.orig/arch/arm/kernel/ecard.c +++ linux.prev/arch/arm/kernel/ecard.c @@ -619,7 +619,7 @@ ecard_irqexp_handler(unsigned int irq, s ecard_t *ec = slot_to_ecard(slot); if (ec->claimed) { - struct irqdesc *d = irqdesc + ec->irq; + struct irqdesc *d = irq_desc + ec->irq; /* * this ugly code is so that we can operate a * prioritorising system: @@ -1052,6 +1052,9 @@ ecard_probe(int slot, card_type_t type) return rc; } +static DEFINE_IRQ_CHAINED_TYPE(ecard_irqexp_handler); +static DEFINE_IRQ_CHAINED_TYPE(ecard_irq_handler); + /* * Initialise the expansion card system. * Locate all hardware - interrupt management and @@ -1081,8 +1084,10 @@ static int __init ecard_init(void) irqhw = ecard_probeirqhw(); - set_irq_chained_handler(IRQ_EXPANSIONCARD, - irqhw ? ecard_irqexp_handler : ecard_irq_handler); + if (irqhw) + set_irq_chained_handler(IRQ_EXPANSIONCARD, ecard_irqexp_handler); + else + set_irq_chained_handler(IRQ_EXPANSIONCARD, ecard_irq_handler); ecard_proc_init(); Index: linux.prev/arch/arm/kernel/entry-armv.S =================================================================== --- linux.prev.orig/arch/arm/kernel/entry-armv.S +++ linux.prev/arch/arm/kernel/entry-armv.S @@ -192,7 +192,7 @@ __irq_svc: irq_handler #ifdef CONFIG_PREEMPT ldr r0, [tsk, #TI_FLAGS] @ get flags - tst r0, #_TIF_NEED_RESCHED + tst r0, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED blne svc_preempt preempt_return: ldr r0, [tsk, #TI_PREEMPT] @ read preempt value @@ -219,7 +219,7 @@ svc_preempt: str r7, [tsk, #TI_PREEMPT] @ expects preempt_count == 0 1: bl preempt_schedule_irq @ irq en/disable is done inside ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS - tst r0, #_TIF_NEED_RESCHED + tst r0, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED beq preempt_return @ go again b 1b #endif Index: linux.prev/arch/arm/kernel/entry-common.S =================================================================== --- linux.prev.orig/arch/arm/kernel/entry-common.S +++ linux.prev/arch/arm/kernel/entry-common.S @@ -3,6 +3,8 @@ * * Copyright (C) 2000 Russell King * + * LATENCY_TRACE/mcount support (C) 2005 Timesys john.cooper@timesys.com + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. @@ -41,7 +43,7 @@ ret_fast_syscall: fast_work_pending: str r0, [sp, #S_R0+S_OFF]! @ returned r0 work_pending: - tst r1, #_TIF_NEED_RESCHED + tst r1, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED bne work_resched tst r1, #_TIF_NOTIFY_RESUME | _TIF_SIGPENDING beq no_work_pending @@ -51,7 +53,8 @@ work_pending: b ret_slow_syscall @ Check work again work_resched: - bl schedule + bl __schedule + /* * "slow" syscall return path. 
"why" tells us if this was a real syscall. */ @@ -87,8 +90,6 @@ ENTRY(ret_from_fork) b ret_slow_syscall -#include "calls.S" - /*============================================================================= * SWI handler *----------------------------------------------------------------------------- @@ -271,3 +272,110 @@ sys_mmap2: str r5, [sp, #4] b do_mmap2 #endif + +#ifdef CONFIG_FRAME_POINTER + +#ifdef CONFIG_MCOUNT +/* + * At the point where we are in mcount() we maintain the + * frame of the prologue code and keep the call to mcount() + * out of the stack frame list: + + saved pc <---\ caller of instrumented routine + saved lr | + ip/prev_sp | + fp -----^ | + : | + | + -> saved pc | instrumented routine + | saved lr | + | ip/prev_sp | + | fp ---------/ + | : + | + | mcount + | saved pc + | saved lr + | ip/prev sp + -- fp + r3 + r2 + r1 + sp-> r0 + : + */ + + .text + .align 0 + .type mcount %function + .global mcount + +/* gcc -pg generated FUNCTION_PROLOGUE references mcount() + * and has already created the stack frame invocation for + * the routine we have been called to instrument. We create + * a complete frame nevertheless, as we want to use the same + * call to mcount() from c code. + */ +mcount: + + ldr ip, =mcount_enabled @ leave early, if disabled + ldr ip, [ip] + cmp ip, #0 + moveq pc,lr + + mov ip, sp + stmdb sp!, {r0 - r3, fp, ip, lr, pc} @ create stack frame + + ldr r1, [fp, #-4] @ get lr (the return address + @ of the caller of the + @ instrumented function) + mov r0, lr @ get lr - (the return address + @ of the instrumented function) + + sub fp, ip, #4 @ point fp at this frame + + bl __trace +1: + ldmdb fp, {r0 - r3, fp, sp, pc} @ pop entry frame and return + +#endif + +/* ARM replacement for unsupported gcc __builtin_return_address(n) + * where 0 < n. n == 0 is supported here as well. + * + * Walk up the stack frame until the desired frame is found or a NULL + * fp is encountered, return NULL in the latter case. + * + * Note: it is possible under code optimization for the stack invocation + * of an ancestor function (level N) to be removed before calling a + * descendant function (level N+1). No easy means is available to deduce + * this scenario with the result being [for example] caller_addr(0) when + * called from level N+1 returning level N-1 rather than the expected + * level N. This optimization issue appears isolated to the case of + * a call to a level N+1 routine made at the tail end of a level N + * routine -- the level N frame is deleted and a simple branch is made + * to the level N+1 routine. + */ + + .text + .align 0 + .type arm_return_addr %function + .global arm_return_addr + +arm_return_addr: + mov ip, r0 + mov r0, fp +3: + cmp r0, #0 + beq 1f @ frame list hit end, bail + cmp ip, #0 + beq 2f @ reached desired frame + ldr r0, [r0, #-12] @ else continue, get next fp + sub ip, ip, #1 + b 3b +2: + ldr r0, [r0, #-4] @ get target return address +1: + mov pc, lr + +#endif Index: linux.prev/arch/arm/kernel/fiq.c =================================================================== --- linux.prev.orig/arch/arm/kernel/fiq.c +++ linux.prev/arch/arm/kernel/fiq.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -88,7 +89,7 @@ void set_fiq_handler(void *start, unsign * disable irqs for the duration. Note - these functions are almost * entirely coded in assembly. 
*/ -void __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) +void notrace __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( @@ -106,7 +107,7 @@ void __attribute__((naked)) set_fiq_regs : "r" (®s->ARM_r8), "I" (PSR_I_BIT | PSR_F_BIT | FIQ_MODE)); } -void __attribute__((naked)) get_fiq_regs(struct pt_regs *regs) +void notrace __attribute__((naked)) get_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( Index: linux.prev/arch/arm/kernel/init_task.c =================================================================== --- linux.prev.orig/arch/arm/kernel/init_task.c +++ linux.prev/arch/arm/kernel/init_task.c @@ -12,8 +12,8 @@ #include #include -static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; +static struct fs_struct init_fs = INIT_FS(init_fs); +static struct files_struct init_files = INIT_FILES(init_files); static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); Index: linux.prev/arch/arm/kernel/irq.c =================================================================== --- linux.prev.orig/arch/arm/kernel/irq.c +++ linux.prev/arch/arm/kernel/irq.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -38,193 +39,11 @@ #include #include -#include #include -#include #include -/* - * Maximum IRQ count. Currently, this is arbitary. However, it should - * not be set too low to prevent false triggering. Conversely, if it - * is set too high, then you could miss a stuck IRQ. - * - * Maybe we ought to set a timer and re-enable the IRQ at a later time? - */ -#define MAX_IRQ_CNT 100000 - -static int noirqdebug; -static volatile unsigned long irq_err_count; -static DEFINE_SPINLOCK(irq_controller_lock); -static LIST_HEAD(irq_pending); - -struct irqdesc irq_desc[NR_IRQS]; void (*init_arch_irq)(void) __initdata = NULL; -/* - * No architecture-specific irq_finish function defined in arm/arch/irqs.h. - */ -#ifndef irq_finish -#define irq_finish(irq) do { } while (0) -#endif - -/* - * Dummy mask/unmask handler - */ -void dummy_mask_unmask_irq(unsigned int irq) -{ -} - -irqreturn_t no_action(int irq, void *dev_id, struct pt_regs *regs) -{ - return IRQ_NONE; -} - -void do_bad_IRQ(unsigned int irq, struct irqdesc *desc, struct pt_regs *regs) -{ - irq_err_count += 1; - printk(KERN_ERR "IRQ: spurious interrupt %d\n", irq); -} - -static struct irqchip bad_chip = { - .ack = dummy_mask_unmask_irq, - .mask = dummy_mask_unmask_irq, - .unmask = dummy_mask_unmask_irq, -}; - -static struct irqdesc bad_irq_desc = { - .chip = &bad_chip, - .handle = do_bad_IRQ, - .pend = LIST_HEAD_INIT(bad_irq_desc.pend), - .disable_depth = 1, -}; - -#ifdef CONFIG_SMP -void synchronize_irq(unsigned int irq) -{ - struct irqdesc *desc = irq_desc + irq; - - while (desc->running) - barrier(); -} -EXPORT_SYMBOL(synchronize_irq); - -#define smp_set_running(desc) do { desc->running = 1; } while (0) -#define smp_clear_running(desc) do { desc->running = 0; } while (0) -#else -#define smp_set_running(desc) do { } while (0) -#define smp_clear_running(desc) do { } while (0) -#endif - -/** - * disable_irq_nosync - disable an irq without waiting - * @irq: Interrupt to disable - * - * Disable the selected interrupt line. Enables and disables - * are nested. We do this lazily. - * - * This function may be called from IRQ context. 
- */ -void disable_irq_nosync(unsigned int irq) -{ - struct irqdesc *desc = irq_desc + irq; - unsigned long flags; - - spin_lock_irqsave(&irq_controller_lock, flags); - desc->disable_depth++; - list_del_init(&desc->pend); - spin_unlock_irqrestore(&irq_controller_lock, flags); -} -EXPORT_SYMBOL(disable_irq_nosync); - -/** - * disable_irq - disable an irq and wait for completion - * @irq: Interrupt to disable - * - * Disable the selected interrupt line. Enables and disables - * are nested. This functions waits for any pending IRQ - * handlers for this interrupt to complete before returning. - * If you use this function while holding a resource the IRQ - * handler may need you will deadlock. - * - * This function may be called - with care - from IRQ context. - */ -void disable_irq(unsigned int irq) -{ - struct irqdesc *desc = irq_desc + irq; - - disable_irq_nosync(irq); - if (desc->action) - synchronize_irq(irq); -} -EXPORT_SYMBOL(disable_irq); - -/** - * enable_irq - enable interrupt handling on an irq - * @irq: Interrupt to enable - * - * Re-enables the processing of interrupts on this IRQ line. - * Note that this may call the interrupt handler, so you may - * get unexpected results if you hold IRQs disabled. - * - * This function may be called from IRQ context. - */ -void enable_irq(unsigned int irq) -{ - struct irqdesc *desc = irq_desc + irq; - unsigned long flags; - - spin_lock_irqsave(&irq_controller_lock, flags); - if (unlikely(!desc->disable_depth)) { - printk("enable_irq(%u) unbalanced from %p\n", irq, - __builtin_return_address(0)); - } else if (!--desc->disable_depth) { - desc->probing = 0; - desc->chip->unmask(irq); - - /* - * If the interrupt is waiting to be processed, - * try to re-run it. We can't directly run it - * from here since the caller might be in an - * interrupt-protected region. - */ - if (desc->pending && list_empty(&desc->pend)) { - desc->pending = 0; - if (!desc->chip->retrigger || - desc->chip->retrigger(irq)) - list_add(&desc->pend, &irq_pending); - } - } - spin_unlock_irqrestore(&irq_controller_lock, flags); -} -EXPORT_SYMBOL(enable_irq); - -/* - * Enable wake on selected irq - */ -void enable_irq_wake(unsigned int irq) -{ - struct irqdesc *desc = irq_desc + irq; - unsigned long flags; - - spin_lock_irqsave(&irq_controller_lock, flags); - if (desc->chip->set_wake) - desc->chip->set_wake(irq, 1); - spin_unlock_irqrestore(&irq_controller_lock, flags); -} -EXPORT_SYMBOL(enable_irq_wake); - -void disable_irq_wake(unsigned int irq) -{ - struct irqdesc *desc = irq_desc + irq; - unsigned long flags; - - spin_lock_irqsave(&irq_controller_lock, flags); - if (desc->chip->set_wake) - desc->chip->set_wake(irq, 0); - spin_unlock_irqrestore(&irq_controller_lock, flags); -} -EXPORT_SYMBOL(disable_irq_wake); - int show_interrupts(struct seq_file *p, void *v) { int i = *(loff_t *) v, cpu; @@ -243,7 +62,7 @@ int show_interrupts(struct seq_file *p, } if (i < NR_IRQS) { - spin_lock_irqsave(&irq_controller_lock, flags); + spin_lock_irqsave(&irq_desc[i].lock, flags); action = irq_desc[i].action; if (!action) goto unlock; @@ -257,7 +76,7 @@ int show_interrupts(struct seq_file *p, seq_putc(p, '\n'); unlock: - spin_unlock_irqrestore(&irq_controller_lock, flags); + spin_unlock_irqrestore(&irq_desc[i].lock, flags); } else if (i == NR_IRQS) { #ifdef CONFIG_ARCH_ACORN show_fiq_list(p, v); @@ -266,374 +85,83 @@ unlock: show_ipi_list(p); show_local_irqs(p); #endif +#ifdef FIXME_TGLX seq_printf(p, "Err: %10lu\n", irq_err_count); - } - return 0; -} - -/* - * IRQ lock detection. 
- * - * Hopefully, this should get us out of a few locked situations. - * However, it may take a while for this to happen, since we need - * a large number if IRQs to appear in the same jiffie with the - * same instruction pointer (or within 2 instructions). - */ -static int check_irq_lock(struct irqdesc *desc, int irq, struct pt_regs *regs) -{ - unsigned long instr_ptr = instruction_pointer(regs); - - if (desc->lck_jif == jiffies && - desc->lck_pc >= instr_ptr && desc->lck_pc < instr_ptr + 8) { - desc->lck_cnt += 1; - - if (desc->lck_cnt > MAX_IRQ_CNT) { - printk(KERN_ERR "IRQ LOCK: IRQ%d is locking the system, disabled\n", irq); - return 1; - } - } else { - desc->lck_cnt = 0; - desc->lck_pc = instruction_pointer(regs); - desc->lck_jif = jiffies; - } - return 0; -} - -static void -report_bad_irq(unsigned int irq, struct pt_regs *regs, struct irqdesc *desc, int ret) -{ - static int count = 100; - struct irqaction *action; - - if (!count || noirqdebug) - return; - - count--; - - if (ret != IRQ_HANDLED && ret != IRQ_NONE) { - printk("irq%u: bogus retval mask %x\n", irq, ret); - } else { - printk("irq%u: nobody cared\n", irq); - } - show_regs(regs); - dump_stack(); - printk(KERN_ERR "handlers:"); - action = desc->action; - do { - printk("\n" KERN_ERR "[<%p>]", action->handler); - print_symbol(" (%s)", (unsigned long)action->handler); - action = action->next; - } while (action); - printk("\n"); -} - -static int -__do_irq(unsigned int irq, struct irqaction *action, struct pt_regs *regs) -{ - unsigned int status; - int ret, retval = 0; - - spin_unlock(&irq_controller_lock); - -#ifdef CONFIG_NO_IDLE_HZ - if (!(action->flags & SA_TIMER) && system_timer->dyn_tick != NULL) { - write_seqlock(&xtime_lock); - if (system_timer->dyn_tick->state & DYN_TICK_ENABLED) - system_timer->dyn_tick->handler(irq, 0, regs); - write_sequnlock(&xtime_lock); - } #endif - - if (!(action->flags & SA_INTERRUPT)) - local_irq_enable(); - - status = 0; - do { - ret = action->handler(irq, action->dev_id, regs); - if (ret == IRQ_HANDLED) - status |= action->flags; - retval |= ret; - action = action->next; - } while (action); - - if (status & SA_SAMPLE_RANDOM) - add_interrupt_randomness(irq); - - spin_lock_irq(&irq_controller_lock); - - return retval; -} - -/* - * This is for software-decoded IRQs. The caller is expected to - * handle the ack, clear, mask and unmask issues. - */ -void -do_simple_IRQ(unsigned int irq, struct irqdesc *desc, struct pt_regs *regs) -{ - struct irqaction *action; - const unsigned int cpu = smp_processor_id(); - - desc->triggered = 1; - - kstat_cpu(cpu).irqs[irq]++; - - smp_set_running(desc); - - action = desc->action; - if (action) { - int ret = __do_irq(irq, action, regs); - if (ret != IRQ_HANDLED) - report_bad_irq(irq, regs, desc, ret); - } - - smp_clear_running(desc); -} - -/* - * Most edge-triggered IRQ implementations seem to take a broken - * approach to this. Hence the complexity. - */ -void -do_edge_IRQ(unsigned int irq, struct irqdesc *desc, struct pt_regs *regs) -{ - const unsigned int cpu = smp_processor_id(); - - desc->triggered = 1; - - /* - * If we're currently running this IRQ, or its disabled, - * we shouldn't process the IRQ. Instead, turn on the - * hardware masks. - */ - if (unlikely(desc->running || desc->disable_depth)) - goto running; - - /* - * Acknowledge and clear the IRQ, but don't mask it. - */ - desc->chip->ack(irq); - - /* - * Mark the IRQ currently in progress. 
- */ - desc->running = 1; - - kstat_cpu(cpu).irqs[irq]++; - - do { - struct irqaction *action; - - action = desc->action; - if (!action) - break; - - if (desc->pending && !desc->disable_depth) { - desc->pending = 0; - desc->chip->unmask(irq); - } - - __do_irq(irq, action, regs); - } while (desc->pending && !desc->disable_depth); - - desc->running = 0; - - /* - * If we were disabled or freed, shut down the handler. - */ - if (likely(desc->action && !check_irq_lock(desc, irq, regs))) - return; - - running: - /* - * We got another IRQ while this one was masked or - * currently running. Delay it. - */ - desc->pending = 1; - desc->chip->mask(irq); - desc->chip->ack(irq); -} - -/* - * Level-based IRQ handler. Nice and simple. - */ -void -do_level_IRQ(unsigned int irq, struct irqdesc *desc, struct pt_regs *regs) -{ - struct irqaction *action; - const unsigned int cpu = smp_processor_id(); - - desc->triggered = 1; - - /* - * Acknowledge, clear _AND_ disable the interrupt. - */ - desc->chip->ack(irq); - - if (likely(!desc->disable_depth)) { - kstat_cpu(cpu).irqs[irq]++; - - smp_set_running(desc); - - /* - * Return with this interrupt masked if no action - */ - action = desc->action; - if (action) { - int ret = __do_irq(irq, desc->action, regs); - - if (ret != IRQ_HANDLED) - report_bad_irq(irq, regs, desc, ret); - - if (likely(!desc->disable_depth && - !check_irq_lock(desc, irq, regs))) - desc->chip->unmask(irq); - } - - smp_clear_running(desc); } + return 0; } -static void do_pending_irqs(struct pt_regs *regs) -{ - struct list_head head, *l, *n; - - do { - struct irqdesc *desc; - - /* - * First, take the pending interrupts off the list. - * The act of calling the handlers may add some IRQs - * back onto the list. - */ - head = irq_pending; - INIT_LIST_HEAD(&irq_pending); - head.next->prev = &head; - head.prev->next = &head; - - /* - * Now run each entry. We must delete it from our - * list before calling the handler. - */ - list_for_each_safe(l, n, &head) { - desc = list_entry(l, struct irqdesc, pend); - list_del_init(&desc->pend); - desc_handle_irq(desc - irq_desc, desc, regs); - } - - /* - * The list must be empty. - */ - BUG_ON(!list_empty(&head)); - } while (!list_empty(&irq_pending)); -} +/* Handle bad interrupts */ +static struct irq_desc bad_irq = { + .handler = &no_irq_type, + .lock = RAW_SPIN_LOCK_UNLOCKED +}; /* - * do_IRQ handles all hardware IRQ's. Decoded IRQs should not + * asm_do_IRQ handles all hardware IRQ's. Decoded IRQs should not * come via this function. Instead, they should provide their * own 'handler' */ -asmlinkage void asm_do_IRQ(unsigned int irq, struct pt_regs *regs) +asmlinkage notrace void asm_do_IRQ(unsigned int irq, struct pt_regs *regs) { struct irqdesc *desc = irq_desc + irq; + trace_special(instruction_pointer(regs), irq, 0); + /* * Some hardware gives randomly wrong interrupts. Rather * than crashing, do something sensible. */ if (irq >= NR_IRQS) - desc = &bad_irq_desc; + desc = &bad_irq; irq_enter(); - spin_lock(&irq_controller_lock); - desc_handle_irq(irq, desc, regs); - /* - * Now re-run any pending interrupts. 
- */ - if (!list_empty(&irq_pending)) - do_pending_irqs(regs); - - irq_finish(irq); + desc_handle_irq(irq, desc, regs); - spin_unlock(&irq_controller_lock); irq_exit(); } -void __set_irq_handler(unsigned int irq, irq_handler_t handle, int is_chained) +void __set_irq_handler(unsigned int irq, struct irq_type *type, int is_chained) { struct irqdesc *desc; unsigned long flags; if (irq >= NR_IRQS) { - printk(KERN_ERR "Trying to install handler for IRQ%d\n", irq); + printk(KERN_ERR "Trying to install type control for IRQ%d\n", irq); return; } - if (handle == NULL) - handle = do_bad_IRQ; - desc = irq_desc + irq; - if (is_chained && desc->chip == &bad_chip) - printk(KERN_WARNING "Trying to install chained handler for IRQ%d\n", irq); - - spin_lock_irqsave(&irq_controller_lock, flags); - if (handle == do_bad_IRQ) { - desc->chip->mask(irq); - desc->chip->ack(irq); - desc->disable_depth = 1; - } - desc->handle = handle; - if (handle != do_bad_IRQ && is_chained) { - desc->valid = 0; - desc->probe_ok = 0; - desc->disable_depth = 0; - desc->chip->unmask(irq); + /* Uninstall ? */ + if (type == NULL || type == &no_irq_type) { + spin_lock_irqsave(&desc->lock, flags); + if (desc->chip) { + desc->chip->mask(irq); + desc->chip->ack(irq); + } + desc->depth = 1; + spin_unlock_irqrestore(&desc->lock, flags); } - spin_unlock_irqrestore(&irq_controller_lock, flags); -} -void set_irq_chip(unsigned int irq, struct irqchip *chip) -{ - struct irqdesc *desc; - unsigned long flags; - - if (irq >= NR_IRQS) { - printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq); + /* Install the irq_type */ + if (generic_set_irq_type(irq, type)) return; - } - - if (chip == NULL) - chip = &bad_chip; - - desc = irq_desc + irq; - spin_lock_irqsave(&irq_controller_lock, flags); - desc->chip = chip; - spin_unlock_irqrestore(&irq_controller_lock, flags); -} -int set_irq_type(unsigned int irq, unsigned int type) -{ - struct irqdesc *desc; - unsigned long flags; - int ret = -ENXIO; + spin_lock_irqsave(&desc->lock, flags); + if (is_chained && (desc->handler == &no_irq_type || !desc->chip)) + printk(KERN_WARNING "Trying to install chained interrupt type for IRQ%d\n", irq); - if (irq >= NR_IRQS) { - printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); - return -ENODEV; - } - - desc = irq_desc + irq; - if (desc->chip->set_type) { - spin_lock_irqsave(&irq_controller_lock, flags); - ret = desc->chip->set_type(irq, type); - spin_unlock_irqrestore(&irq_controller_lock, flags); + if (type != NULL && is_chained) { + desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; + desc->depth = 0; + if (desc->chip) + desc->chip->unmask(irq); } - - return ret; + spin_unlock_irqrestore(&desc->lock, flags); } -EXPORT_SYMBOL(set_irq_type); void set_irq_flags(unsigned int irq, unsigned int iflags) { @@ -646,400 +174,28 @@ void set_irq_flags(unsigned int irq, uns } desc = irq_desc + irq; - spin_lock_irqsave(&irq_controller_lock, flags); - desc->valid = (iflags & IRQF_VALID) != 0; - desc->probe_ok = (iflags & IRQF_PROBE) != 0; - desc->noautoenable = (iflags & IRQF_NOAUTOEN) != 0; - spin_unlock_irqrestore(&irq_controller_lock, flags); -} - -int setup_irq(unsigned int irq, struct irqaction *new) -{ - int shared = 0; - struct irqaction *old, **p; - unsigned long flags; - struct irqdesc *desc; - - /* - * Some drivers like serial.c use request_irq() heavily, - * so we have to be careful not to interfere with a - * running system. - */ - if (new->flags & SA_SAMPLE_RANDOM) { - /* - * This function might sleep, we want to call it first, - * outside of the atomic block. 
- * Yes, this might clear the entropy pool if the wrong - * driver is attempted to be loaded, without actually - * installing a new handler, but is this really a problem, - * only the sysadmin is able to do this. - */ - rand_initialize_irq(irq); - } - - /* - * The following block of code has to be executed atomically - */ - desc = irq_desc + irq; - spin_lock_irqsave(&irq_controller_lock, flags); - p = &desc->action; - if ((old = *p) != NULL) { - /* Can't share interrupts unless both agree to */ - if (!(old->flags & new->flags & SA_SHIRQ)) { - spin_unlock_irqrestore(&irq_controller_lock, flags); - return -EBUSY; - } - - /* add new interrupt at end of irq queue */ - do { - p = &old->next; - old = *p; - } while (old); - shared = 1; - } - - *p = new; - - if (!shared) { - desc->probing = 0; - desc->running = 0; - desc->pending = 0; - desc->disable_depth = 1; - if (!desc->noautoenable) { - desc->disable_depth = 0; - desc->chip->unmask(irq); - } - } - - spin_unlock_irqrestore(&irq_controller_lock, flags); - return 0; -} - -/** - * request_irq - allocate an interrupt line - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs - * @irqflags: Interrupt type flags - * @devname: An ascii name for the claiming device - * @dev_id: A cookie passed back to the handler function - * - * This call allocates interrupt resources and enables the - * interrupt line and IRQ handling. From the point this - * call is made your handler function may be invoked. Since - * your handler function must clear any interrupt the board - * raises, you must take care both to initialise your hardware - * and to set up the interrupt handler in the right order. - * - * Dev_id must be globally unique. Normally the address of the - * device data structure is used as the cookie. Since the handler - * receives this value it makes sense to use it. - * - * If your interrupt is shared you must pass a non NULL dev_id - * as this is required when freeing the interrupt. - * - * Flags: - * - * SA_SHIRQ Interrupt is shared - * - * SA_INTERRUPT Disable local interrupts while processing - * - * SA_SAMPLE_RANDOM The interrupt can be used for entropy - * - */ -int request_irq(unsigned int irq, irqreturn_t (*handler)(int, void *, struct pt_regs *), - unsigned long irq_flags, const char * devname, void *dev_id) -{ - unsigned long retval; - struct irqaction *action; - - if (irq >= NR_IRQS || !irq_desc[irq].valid || !handler || - (irq_flags & SA_SHIRQ && !dev_id)) - return -EINVAL; - - action = (struct irqaction *)kmalloc(sizeof(struct irqaction), GFP_KERNEL); - if (!action) - return -ENOMEM; - - action->handler = handler; - action->flags = irq_flags; - cpus_clear(action->mask); - action->name = devname; - action->next = NULL; - action->dev_id = dev_id; - - retval = setup_irq(irq, action); - - if (retval) - kfree(action); - return retval; -} - -EXPORT_SYMBOL(request_irq); - -/** - * free_irq - free an interrupt - * @irq: Interrupt line to free - * @dev_id: Device identity to free - * - * Remove an interrupt handler. The handler is removed and if the - * interrupt line is no longer in use by any driver it is disabled. - * On a shared IRQ the caller must ensure the interrupt is disabled - * on the card it drives before calling this function. - * - * This function must not be called from interrupt context. 
- */ -void free_irq(unsigned int irq, void *dev_id) -{ - struct irqaction * action, **p; - unsigned long flags; - - if (irq >= NR_IRQS || !irq_desc[irq].valid) { - printk(KERN_ERR "Trying to free IRQ%d\n",irq); - dump_stack(); - return; - } - - spin_lock_irqsave(&irq_controller_lock, flags); - for (p = &irq_desc[irq].action; (action = *p) != NULL; p = &action->next) { - if (action->dev_id != dev_id) - continue; - - /* Found it - now free it */ - *p = action->next; - break; - } - spin_unlock_irqrestore(&irq_controller_lock, flags); - - if (!action) { - printk(KERN_ERR "Trying to free free IRQ%d\n",irq); - dump_stack(); - } else { - synchronize_irq(irq); - kfree(action); - } -} - -EXPORT_SYMBOL(free_irq); - -static DECLARE_MUTEX(probe_sem); - -/* Start the interrupt probing. Unlike other architectures, - * we don't return a mask of interrupts from probe_irq_on, - * but return the number of interrupts enabled for the probe. - * The interrupts which have been enabled for probing is - * instead recorded in the irq_desc structure. - */ -unsigned long probe_irq_on(void) -{ - unsigned int i, irqs = 0; - unsigned long delay; - - down(&probe_sem); - - /* - * first snaffle up any unassigned but - * probe-able interrupts - */ - spin_lock_irq(&irq_controller_lock); - for (i = 0; i < NR_IRQS; i++) { - if (!irq_desc[i].probe_ok || irq_desc[i].action) - continue; - - irq_desc[i].probing = 1; - irq_desc[i].triggered = 0; - if (irq_desc[i].chip->set_type) - irq_desc[i].chip->set_type(i, IRQT_PROBE); - irq_desc[i].chip->unmask(i); - irqs += 1; - } - spin_unlock_irq(&irq_controller_lock); - - /* - * wait for spurious interrupts to mask themselves out again - */ - for (delay = jiffies + HZ/10; time_before(jiffies, delay); ) - /* min 100ms delay */; - - /* - * now filter out any obviously spurious interrupts - */ - spin_lock_irq(&irq_controller_lock); - for (i = 0; i < NR_IRQS; i++) { - if (irq_desc[i].probing && irq_desc[i].triggered) { - irq_desc[i].probing = 0; - irqs -= 1; - } - } - spin_unlock_irq(&irq_controller_lock); - - return irqs; -} - -EXPORT_SYMBOL(probe_irq_on); - -unsigned int probe_irq_mask(unsigned long irqs) -{ - unsigned int mask = 0, i; - - spin_lock_irq(&irq_controller_lock); - for (i = 0; i < 16 && i < NR_IRQS; i++) - if (irq_desc[i].probing && irq_desc[i].triggered) - mask |= 1 << i; - spin_unlock_irq(&irq_controller_lock); - - up(&probe_sem); - - return mask; -} -EXPORT_SYMBOL(probe_irq_mask); - -/* - * Possible return values: - * >= 0 - interrupt number - * -1 - no interrupt/many interrupts - */ -int probe_irq_off(unsigned long irqs) -{ - unsigned int i; - int irq_found = NO_IRQ; - - /* - * look at the interrupts, and find exactly one - * that we were probing has been triggered - */ - spin_lock_irq(&irq_controller_lock); - for (i = 0; i < NR_IRQS; i++) { - if (irq_desc[i].probing && - irq_desc[i].triggered) { - if (irq_found != NO_IRQ) { - irq_found = NO_IRQ; - goto out; - } - irq_found = i; - } - } - - if (irq_found == -1) - irq_found = NO_IRQ; -out: - spin_unlock_irq(&irq_controller_lock); - - up(&probe_sem); - - return irq_found; -} - -EXPORT_SYMBOL(probe_irq_off); - -#ifdef CONFIG_SMP -static void route_irq(struct irqdesc *desc, unsigned int irq, unsigned int cpu) -{ - pr_debug("IRQ%u: moving from cpu%u to cpu%u\n", irq, desc->cpu, cpu); - - spin_lock_irq(&irq_controller_lock); - desc->cpu = cpu; - desc->chip->set_cpu(desc, irq, cpu); - spin_unlock_irq(&irq_controller_lock); -} - -#ifdef CONFIG_PROC_FS -static int -irq_affinity_read_proc(char *page, char **start, off_t off, int 
count, - int *eof, void *data) -{ - struct irqdesc *desc = irq_desc + ((int)data); - int len = cpumask_scnprintf(page, count, desc->affinity); - - if (count - len < 2) - return -EINVAL; - page[len++] = '\n'; - page[len] = '\0'; - - return len; -} - -static int -irq_affinity_write_proc(struct file *file, const char __user *buffer, - unsigned long count, void *data) -{ - unsigned int irq = (unsigned int)data; - struct irqdesc *desc = irq_desc + irq; - cpumask_t affinity, tmp; - int ret = -EIO; - - if (!desc->chip->set_cpu) - goto out; - - ret = cpumask_parse(buffer, count, affinity); - if (ret) - goto out; - - cpus_and(tmp, affinity, cpu_online_map); - if (cpus_empty(tmp)) { - ret = -EINVAL; - goto out; - } - - desc->affinity = affinity; - route_irq(desc, irq, first_cpu(tmp)); - ret = count; - - out: - return ret; -} -#endif -#endif - -void __init init_irq_proc(void) -{ -#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) - struct proc_dir_entry *dir; - int irq; - - dir = proc_mkdir("irq", NULL); - if (!dir) - return; - - for (irq = 0; irq < NR_IRQS; irq++) { - struct proc_dir_entry *entry; - struct irqdesc *desc; - char name[16]; - - desc = irq_desc + irq; - memset(name, 0, sizeof(name)); - snprintf(name, sizeof(name) - 1, "%u", irq); - - desc->procdir = proc_mkdir(name, dir); - if (!desc->procdir) - continue; - - entry = create_proc_entry("smp_affinity", 0600, desc->procdir); - if (entry) { - entry->nlink = 1; - entry->data = (void *)irq; - entry->read_proc = irq_affinity_read_proc; - entry->write_proc = irq_affinity_write_proc; - } - } -#endif + spin_lock_irqsave(&desc->lock, flags); + desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; + if (iflags & IRQF_VALID) + desc->status &= ~IRQ_NOREQUEST; + if (iflags & IRQF_PROBE) + desc->status &= ~IRQ_NOPROBE; + spin_unlock_irqrestore(&desc->lock, flags); } void __init init_IRQ(void) { - struct irqdesc *desc; extern void init_dma(void); int irq; + for (irq = 0; irq < NR_IRQS; irq++) + irq_desc[irq].status |= IRQ_NOREQUEST; + #ifdef CONFIG_SMP bad_irq_desc.affinity = CPU_MASK_ALL; bad_irq_desc.cpu = smp_processor_id(); #endif - for (irq = 0, desc = irq_desc; irq < NR_IRQS; irq++, desc++) { - *desc = bad_irq_desc; - INIT_LIST_HEAD(&desc->pend); - } - init_arch_irq(); init_dma(); } Index: linux.prev/arch/arm/kernel/process.c =================================================================== --- linux.prev.orig/arch/arm/kernel/process.c +++ linux.prev/arch/arm/kernel/process.c @@ -89,12 +89,12 @@ void default_idle(void) if (hlt_counter) cpu_relax(); else { - local_irq_disable(); + raw_local_irq_disable(); if (!need_resched()) { timer_dyn_reprogram(); arch_idle(); } - local_irq_enable(); + raw_local_irq_enable(); } } @@ -124,8 +124,8 @@ void cpu_idle(void) while (!need_resched()) idle(); leds_event(led_idle_end); - preempt_enable_no_resched(); - schedule(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); } } Index: linux.prev/arch/arm/kernel/semaphore.c =================================================================== --- linux.prev.orig/arch/arm/kernel/semaphore.c +++ linux.prev/arch/arm/kernel/semaphore.c @@ -49,14 +49,16 @@ * we cannot lose wakeup events. 
*/ -void __up(struct semaphore *sem) +fastcall void __attribute_used__ __compat_up(struct compat_semaphore *sem) { wake_up(&sem->wait); } +EXPORT_SYMBOL(__compat_up); + static DEFINE_SPINLOCK(semaphore_lock); -void __sched __down(struct semaphore * sem) +fastcall void __attribute_used__ __sched __compat_down(struct compat_semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -89,7 +91,9 @@ void __sched __down(struct semaphore * s wake_up(&sem->wait); } -int __sched __down_interruptible(struct semaphore * sem) +EXPORT_SYMBOL(__compat_down); + +fastcall int __attribute_used__ __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -140,6 +144,8 @@ int __sched __down_interruptible(struct return retval; } +EXPORT_SYMBOL(__compat_down_interruptible); + /* * Trylock failed - make sure we correct for * having decremented the count. @@ -148,7 +154,7 @@ int __sched __down_interruptible(struct * single "cmpxchg" without failure cases, * but then it wouldn't work on a 386. */ -int __down_trylock(struct semaphore * sem) +fastcall int __attribute_used__ __compat_down_trylock(struct compat_semaphore * sem) { int sleepers; unsigned long flags; @@ -168,6 +174,15 @@ int __down_trylock(struct semaphore * se return 1; } +EXPORT_SYMBOL(__compat_down_trylock); + +fastcall int compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} + +EXPORT_SYMBOL(compat_sem_is_locked); + /* * The semaphore operations have a special calling sequence that * allow us to do a simpler in-line version of them. These routines @@ -184,7 +199,7 @@ asm(" .section .sched.text,\"ax\",%progb __down_failed: \n\ stmfd sp!, {r0 - r3, lr} \n\ mov r0, ip \n\ - bl __down \n\ + bl __compat_down \n\ ldmfd sp!, {r0 - r3, pc} \n\ \n\ .align 5 \n\ @@ -192,7 +207,7 @@ __down_failed: \n\ __down_interruptible_failed: \n\ stmfd sp!, {r0 - r3, lr} \n\ mov r0, ip \n\ - bl __down_interruptible \n\ + bl __compat_down_interruptible \n\ mov ip, r0 \n\ ldmfd sp!, {r0 - r3, pc} \n\ \n\ @@ -201,7 +216,7 @@ __down_interruptible_failed: \n\ __down_trylock_failed: \n\ stmfd sp!, {r0 - r3, lr} \n\ mov r0, ip \n\ - bl __down_trylock \n\ + bl __compat_down_trylock \n\ mov ip, r0 \n\ ldmfd sp!, {r0 - r3, pc} \n\ \n\ @@ -210,7 +225,7 @@ __down_trylock_failed: \n\ __up_wakeup: \n\ stmfd sp!, {r0 - r3, lr} \n\ mov r0, ip \n\ - bl __up \n\ + bl __compat_up \n\ ldmfd sp!, {r0 - r3, pc} \n\ "); Index: linux.prev/arch/arm/kernel/signal.c =================================================================== --- linux.prev.orig/arch/arm/kernel/signal.c +++ linux.prev/arch/arm/kernel/signal.c @@ -628,6 +628,14 @@ static int do_signal(sigset_t *oldset, s siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + raw_local_irq_enable(); + preempt_check_resched(); +#endif + /* * We want the common case to go fast, which * is why we may in certain cases get here from Index: linux.prev/arch/arm/kernel/smp.c =================================================================== --- linux.prev.orig/arch/arm/kernel/smp.c +++ linux.prev/arch/arm/kernel/smp.c @@ -56,6 +56,7 @@ struct ipi_data { unsigned long bits; }; +/* FIXME */ static DEFINE_PER_CPU(struct ipi_data, ipi_data) = { .lock = SPIN_LOCK_UNLOCKED, }; @@ -348,7 +349,7 @@ static void send_ipi_message(cpumask_t c unsigned long flags; unsigned int cpu; - local_irq_save(flags); + raw_local_irq_save(flags); 
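/*
 * Editor's sketch, not part of the patch: the raw_local_irq_* primitives
 * introduced by the -rt tree always operate on the hardware interrupt
 * flag, while the unqualified local_irq_* forms are left free for
 * PREEMPT_RT to redefine. Paths like the IPI bookkeeping here must
 * genuinely stop interrupts, hence the mechanical conversions in these
 * hunks. The pattern, condensed (example_rmw and 'word' are
 * hypothetical names used only for illustration):
 */
#if 0	/* illustration only */
static void example_rmw(volatile unsigned long *word, unsigned long bits)
{
	unsigned long flags;

	raw_local_irq_save(flags);	/* hard-disables this CPU's IRQs */
	*word |= bits;			/* RMW now atomic w.r.t. local IRQs */
	raw_local_irq_restore(flags);	/* restores saved hardware state */
}
#endif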
for_each_cpu_mask(cpu, callmap) { struct ipi_data *ipi = &per_cpu(ipi_data, cpu); @@ -363,7 +364,7 @@ static void send_ipi_message(cpumask_t c */ smp_cross_call(callmap); - local_irq_restore(flags); + raw_local_irq_restore(flags); } /* @@ -520,7 +521,7 @@ static void ipi_call_function(unsigned i cpu_clear(cpu, data->unfinished); } -static DEFINE_SPINLOCK(stop_lock); +static DEFINE_RAW_SPINLOCK(stop_lock); /* * ipi_cpu_stop - handle IPI from smp_send_stop() @@ -535,7 +536,7 @@ static void ipi_cpu_stop(unsigned int cp cpu_clear(cpu, cpu_online_map); local_fiq_disable(); - local_irq_disable(); + raw_local_irq_disable(); while (1) cpu_relax(); Index: linux.prev/arch/arm/kernel/traps.c =================================================================== --- linux.prev.orig/arch/arm/kernel/traps.c +++ linux.prev/arch/arm/kernel/traps.c @@ -177,6 +177,8 @@ void dump_stack(void) { #ifdef CONFIG_DEBUG_ERRORS __backtrace(); + print_traces(current); + show_held_locks(current); #endif } @@ -217,7 +219,7 @@ static void __die(const char *str, int e } } -DEFINE_SPINLOCK(die_lock); +DEFINE_RAW_SPINLOCK(die_lock); /* * This function is protected against re-entrancy. @@ -249,7 +251,7 @@ void notify_die(const char *str, struct } static LIST_HEAD(undef_hook); -static DEFINE_SPINLOCK(undef_lock); +static DEFINE_RAW_SPINLOCK(undef_lock); void register_undef_hook(struct undef_hook *hook) { @@ -341,7 +343,7 @@ asmlinkage void bad_mode(struct pt_regs handler[reason], processor_modes[proc_mode]); die("Oops - bad mode", regs, 0); - local_irq_disable(); + raw_local_irq_disable(); panic("bad mode"); } Index: linux.prev/arch/arm/mach-clps711x/p720t-leds.c =================================================================== --- linux.prev.orig/arch/arm/mach-clps711x/p720t-leds.c +++ linux.prev/arch/arm/mach-clps711x/p720t-leds.c @@ -36,7 +36,7 @@ static void p720t_leds_event(led_event_t unsigned long flags; u32 pddr; - local_irq_save(flags); + raw_local_irq_save(flags); switch(ledevt) { case led_idle_start: break; @@ -53,7 +53,7 @@ static void p720t_leds_event(led_event_t break; } - local_irq_restore(flags); + raw_local_irq_restore(flags); } static int __init leds_init(void) Index: linux.prev/arch/arm/mach-clps711x/time.c =================================================================== --- linux.prev.orig/arch/arm/mach-clps711x/time.c +++ linux.prev/arch/arm/mach-clps711x/time.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include Index: linux.prev/arch/arm/mach-clps7500/core.c =================================================================== --- linux.prev.orig/arch/arm/mach-clps7500/core.c +++ linux.prev/arch/arm/mach-clps7500/core.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include Index: linux.prev/arch/arm/mach-ebsa110/core.c =================================================================== --- linux.prev.orig/arch/arm/mach-ebsa110/core.c +++ linux.prev/arch/arm/mach-ebsa110/core.c @@ -56,14 +56,14 @@ static void __init ebsa110_init_irq(void unsigned long flags; unsigned int irq; - local_irq_save(flags); + raw_local_irq_save(flags); __raw_writeb(0xff, IRQ_MCLR); __raw_writeb(0x55, IRQ_MSET); __raw_writeb(0x00, IRQ_MSET); if (__raw_readb(IRQ_MASK) != 0x55) while (1); __raw_writeb(0xff, IRQ_MCLR); /* clear all interrupt enables */ - local_irq_restore(flags); + raw_local_irq_restore(flags); for (irq = 0; irq < NR_IRQS; irq++) { set_irq_chip(irq, &ebsa110_irq_chip); Index: linux.prev/arch/arm/mach-footbridge/dc21285-timer.c 
=================================================================== --- linux.prev.orig/arch/arm/mach-footbridge/dc21285-timer.c +++ linux.prev/arch/arm/mach-footbridge/dc21285-timer.c @@ -6,6 +6,7 @@ */ #include #include +#include #include Index: linux.prev/arch/arm/mach-footbridge/isa-irq.c =================================================================== --- linux.prev.orig/arch/arm/mach-footbridge/isa-irq.c +++ linux.prev/arch/arm/mach-footbridge/isa-irq.c @@ -102,6 +102,17 @@ static struct irqaction irq_cascade = { static struct resource pic1_resource = { "pic1", 0x20, 0x3f }; static struct resource pic2_resource = { "pic2", 0xa0, 0xbf }; +static DEFINE_IRQ_CHAINED_TYPE(isa_irq_handler); + +static unsigned int startup_irq_disabled(unsigned int irq) +{ + return 0; +} + +/* Interrupt type for irqs which must not be + * automatically enabled in request_irq */ +static struct irq_type level_type_nostart; + void __init isa_init_irq(unsigned int host_irq) { unsigned int irq; @@ -159,9 +170,11 @@ void __init isa_init_irq(unsigned int ho * There appears to be a missing pull-up * resistor on this line. */ - if (machine_is_netwinder()) - set_irq_flags(_ISA_IRQ(11), IRQF_VALID | - IRQF_PROBE | IRQF_NOAUTOEN); + if (machine_is_netwinder()) { + level_type_nostart = default_level_type; + level_type_nostart.startup = startup_irq_disabled; + set_irq_handler(_ISA_IRQ(11), &level_type_nostart); + } } } Index: linux.prev/arch/arm/mach-footbridge/isa-timer.c =================================================================== --- linux.prev.orig/arch/arm/mach-footbridge/isa-timer.c +++ linux.prev/arch/arm/mach-footbridge/isa-timer.c @@ -6,6 +6,7 @@ */ #include #include +#include #include #include Index: linux.prev/arch/arm/mach-footbridge/netwinder-hw.c =================================================================== --- linux.prev.orig/arch/arm/mach-footbridge/netwinder-hw.c +++ linux.prev/arch/arm/mach-footbridge/netwinder-hw.c @@ -68,7 +68,7 @@ static inline void wb977_ww(int reg, int /* * This is a lock for accessing ports GP1_IO_BASE and GP2_IO_BASE */ -DEFINE_SPINLOCK(gpio_lock); +DEFINE_RAW_SPINLOCK(gpio_lock); static unsigned int current_gpio_op; static unsigned int current_gpio_io; Index: linux.prev/arch/arm/mach-footbridge/netwinder-leds.c =================================================================== --- linux.prev.orig/arch/arm/mach-footbridge/netwinder-leds.c +++ linux.prev/arch/arm/mach-footbridge/netwinder-leds.c @@ -33,7 +33,7 @@ static char led_state; static char hw_led_state; static DEFINE_SPINLOCK(leds_lock); -extern spinlock_t gpio_lock; +extern raw_spinlock_t gpio_lock; static void netwinder_leds_event(led_event_t evt) { Index: linux.prev/arch/arm/mach-h720x/common.c =================================================================== --- linux.prev.orig/arch/arm/mach-h720x/common.c +++ linux.prev/arch/arm/mach-h720x/common.c @@ -163,6 +163,11 @@ h720x_gpiod_demux_handler(unsigned int i h720x_gpio_handler(mask, irq, desc, regs); } +static DEFINE_IRQ_CHAINED_TYPE(h720x_gpioa_demux_handler); +static DEFINE_IRQ_CHAINED_TYPE(h720x_gpiob_demux_handler); +static DEFINE_IRQ_CHAINED_TYPE(h720x_gpioc_demux_handler); +static DEFINE_IRQ_CHAINED_TYPE(h720x_gpiod_demux_handler); + #ifdef CONFIG_CPU_H7202 static void h720x_gpioe_demux_handler(unsigned int irq_unused, struct irqdesc *desc, @@ -175,6 +180,7 @@ h720x_gpioe_demux_handler(unsigned int i IRQDBG("%s mask: 0x%08x irq: %d\n",__FUNCTION__,mask,irq); h720x_gpio_handler(mask, irq, desc, regs); } +static
DEFINE_IRQ_CHAINED_TYPE(h720x_gpioe_demux_handler); #endif static struct irqchip h720x_global_chip = { Index: linux.prev/arch/arm/mach-h720x/cpu-h7202.c =================================================================== --- linux.prev.orig/arch/arm/mach-h720x/cpu-h7202.c +++ linux.prev/arch/arm/mach-h720x/cpu-h7202.c @@ -175,6 +175,8 @@ static struct irqaction h7202_timer_irq .handler = h7202_timer_interrupt, }; +static DEFINE_IRQ_CHAINED_TYPE(h7202_timerx_demux_handler); + /* * Setup TIMER0 as system timer */ Index: linux.prev/arch/arm/mach-imx/dma.c =================================================================== --- linux.prev.orig/arch/arm/mach-imx/dma.c +++ linux.prev/arch/arm/mach-imx/dma.c @@ -43,7 +43,7 @@ imx_request_dma(char *name, imx_dma_prio if (!name || !irq_handler) return -EINVAL; - local_irq_save(flags); + raw_local_irq_save(flags); /* try grabbing a DMA channel with the requested priority */ for (i = prio; i < prio + (prio == DMA_PRIO_LOW) ? 8 : 4; i++) { @@ -75,7 +75,7 @@ imx_request_dma(char *name, imx_dma_prio i = -ENODEV; } - local_irq_restore(flags); + raw_local_irq_restore(flags); return i; } @@ -91,10 +91,10 @@ imx_free_dma(int dma_ch) return; } - local_irq_save(flags); + raw_local_irq_save(flags); DIMR &= ~(1 << dma_ch); dma_channels[dma_ch].name = NULL; - local_irq_restore(flags); + raw_local_irq_restore(flags); } static irqreturn_t Index: linux.prev/arch/arm/mach-imx/irq.c =================================================================== --- linux.prev.orig/arch/arm/mach-imx/irq.c +++ linux.prev/arch/arm/mach-imx/irq.c @@ -217,6 +217,11 @@ static struct irqchip imx_gpio_chip = { .set_type = imx_gpio_irq_type, }; +static DEFINE_IRQ_CHAINED_TYPE(imx_gpioa_demux_handler); +static DEFINE_IRQ_CHAINED_TYPE(imx_gpiob_demux_handler); +static DEFINE_IRQ_CHAINED_TYPE(imx_gpioc_demux_handler); +static DEFINE_IRQ_CHAINED_TYPE(imx_gpiod_demux_handler); + void __init imx_init_irq(void) { Index: linux.prev/arch/arm/mach-imx/leds-mx1ads.c =================================================================== --- linux.prev.orig/arch/arm/mach-imx/leds-mx1ads.c +++ linux.prev/arch/arm/mach-imx/leds-mx1ads.c @@ -29,7 +29,7 @@ mx1ads_leds_event(led_event_t ledevt) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); switch (ledevt) { #ifdef CONFIG_LEDS_CPU @@ -49,5 +49,5 @@ mx1ads_leds_event(led_event_t ledevt) default: break; } - local_irq_restore(flags); + raw_local_irq_restore(flags); } Index: linux.prev/arch/arm/mach-imx/time.c =================================================================== --- linux.prev.orig/arch/arm/mach-imx/time.c +++ linux.prev/arch/arm/mach-imx/time.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include Index: linux.prev/arch/arm/mach-integrator/core.c =================================================================== --- linux.prev.orig/arch/arm/mach-integrator/core.c +++ linux.prev/arch/arm/mach-integrator/core.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -117,7 +118,7 @@ arch_initcall(integrator_init); #define CM_CTRL IO_ADDRESS(INTEGRATOR_HDR_BASE) + INTEGRATOR_HDR_CTRL_OFFSET -static DEFINE_SPINLOCK(cm_lock); +static DEFINE_RAW_SPINLOCK(cm_lock); /** * cm_control - update the CM_CTRL register. 
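Editor's note on the DEFINE_RAW_SPINLOCK conversions running through these board files: on PREEMPT_RT a plain spinlock_t is substituted by a sleeping rt-mutex, so a lock taken from a context that cannot sleep (low-level interrupt entry, idle, CPU-stop paths) must be declared raw to remain a true spinning lock. A minimal sketch, assuming this patchset's type-switching spin_lock API; hw_reg_lock and hw_reg_update are hypothetical names:

#if 0	/* illustration only */
static DEFINE_RAW_SPINLOCK(hw_reg_lock);	/* stays a real spinlock on -rt */

static void hw_reg_update(unsigned int set, unsigned int clear)
{
	unsigned long flags;

	/* on a raw lock this still hard-disables local interrupts */
	spin_lock_irqsave(&hw_reg_lock, flags);
	/* ... device register read-modify-write ... */
	spin_unlock_irqrestore(&hw_reg_lock, flags);
}
#endif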
Index: linux.prev/arch/arm/mach-integrator/leds.c =================================================================== --- linux.prev.orig/arch/arm/mach-integrator/leds.c +++ linux.prev/arch/arm/mach-integrator/leds.c @@ -41,7 +41,7 @@ static void integrator_leds_event(led_ev unsigned int update_alpha_leds; // yup, change the LEDs - local_irq_save(flags); + raw_local_irq_save(flags); update_alpha_leds = 0; switch(ledevt) { @@ -76,7 +76,7 @@ static void integrator_leds_event(led_ev while (__raw_readl(dbg_base + INTEGRATOR_DBG_ALPHA_OFFSET) & 1); __raw_writel(saved_leds, dbg_base + INTEGRATOR_DBG_LEDS_OFFSET); } - local_irq_restore(flags); + raw_local_irq_restore(flags); } static int __init leds_init(void) Index: linux.prev/arch/arm/mach-integrator/pci_v3.c =================================================================== --- linux.prev.orig/arch/arm/mach-integrator/pci_v3.c +++ linux.prev/arch/arm/mach-integrator/pci_v3.c @@ -163,7 +163,7 @@ * 7:2 register number * */ -static DEFINE_SPINLOCK(v3_lock); +static DEFINE_RAW_SPINLOCK(v3_lock); #define PCI_BUS_NONMEM_START 0x00000000 #define PCI_BUS_NONMEM_SIZE SZ_256M Index: linux.prev/arch/arm/mach-integrator/platsmp.c =================================================================== --- linux.prev.orig/arch/arm/mach-integrator/platsmp.c +++ linux.prev/arch/arm/mach-integrator/platsmp.c @@ -31,7 +31,7 @@ extern void integrator_secondary_startup volatile int __cpuinitdata pen_release = -1; unsigned long __cpuinitdata phys_pen_release = 0; -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); void __cpuinit platform_secondary_init(unsigned int cpu) { Index: linux.prev/arch/arm/mach-integrator/time.c =================================================================== --- linux.prev.orig/arch/arm/mach-integrator/time.c +++ linux.prev/arch/arm/mach-integrator/time.c @@ -96,7 +96,8 @@ static struct rtc_ops rtc_ops = { .set_alarm = rtc_set_alarm, }; -static irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) +static irqreturn_t arm_rtc_interrupt(int irq, void *dev_id, + struct pt_regs *regs) { writel(0, rtc_base + RTC_EOI); return IRQ_HANDLED; @@ -124,7 +125,7 @@ static int rtc_probe(struct amba_device xtime.tv_sec = __raw_readl(rtc_base + RTC_DR); - ret = request_irq(dev->irq[0], rtc_interrupt, SA_INTERRUPT, + ret = request_irq(dev->irq[0], arm_rtc_interrupt, SA_INTERRUPT, "rtc-pl030", dev); if (ret) goto map_out; Index: linux.prev/arch/arm/mach-ixp2000/core.c =================================================================== --- linux.prev.orig/arch/arm/mach-ixp2000/core.c +++ linux.prev/arch/arm/mach-ixp2000/core.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -276,9 +277,9 @@ void gpio_line_config(int line, int dire { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); if (direction == GPIO_OUT) { - irq_desc[line + IRQ_IXP2000_GPIO0].valid = 0; + set_irq_flags(line + IRQ_IXP2000_GPIO0, 0); /* if it's an output, it ain't an interrupt anymore */ GPIO_IRQ_falling_edge &= ~(1 << line); @@ -291,7 +292,7 @@ void gpio_line_config(int line, int dire } else if (direction == GPIO_IN) { ixp2000_reg_wrb(IXP2000_GPIO_PDCR, 1 << line); } - local_irq_restore(flags); + raw_local_irq_restore(flags); } @@ -344,8 +345,7 @@ static int ixp2000_GPIO_irq_type(unsigne /* * Finally, mark the corresponding IRQ as valid. 
*/ - irq_desc[irq].valid = 1; - + set_irq_flags(irq, IRQF_VALID); return 0; } @@ -449,6 +449,8 @@ static struct irqchip ixp2000_irq_chip = .unmask = ixp2000_irq_unmask }; +static DEFINE_IRQ_CHAINED_TYPE(ixp2000_GPIO_irq_handler); + void __init ixp2000_init_irq(void) { int irq; Index: linux.prev/arch/arm/mach-ixp2000/ixdp2x00.c =================================================================== --- linux.prev.orig/arch/arm/mach-ixp2000/ixdp2x00.c +++ linux.prev/arch/arm/mach-ixp2000/ixdp2x00.c @@ -146,6 +146,8 @@ static struct irqchip ixdp2x00_cpld_irq_ .unmask = ixdp2x00_irq_unmask }; +static DEFINE_IRQ_CHAINED_TYPE(ixdp2x00_irq_handler); + void ixdp2x00_init_irq(volatile unsigned long *stat_reg, volatile unsigned long *mask_reg, unsigned long nr_irqs) { unsigned int irq; @@ -168,7 +170,7 @@ void ixdp2x00_init_irq(volatile unsigned } /* Hook into PCI interrupt */ - set_irq_chained_handler(IRQ_IXP2000_PCIB, &ixdp2x00_irq_handler); + set_irq_chained_handler(IRQ_IXP2000_PCIB, ixdp2x00_irq_handler); } /************************************************************************* Index: linux.prev/arch/arm/mach-ixp2000/ixdp2x01.c =================================================================== --- linux.prev.orig/arch/arm/mach-ixp2000/ixdp2x01.c +++ linux.prev/arch/arm/mach-ixp2000/ixdp2x01.c @@ -95,6 +95,8 @@ static struct irqchip ixdp2x01_irq_chip .unmask = ixdp2x01_irq_unmask }; +static DEFINE_IRQ_CHAINED_TYPE(ixdp2x01_irq_handler); + /* * We only do anything if we are the master NPU on the board. * The slave NPU only has the ethernet chip going directly to @@ -127,7 +129,7 @@ void __init ixdp2x01_init_irq(void) } /* Hook into PCI interrupts */ - set_irq_chained_handler(IRQ_IXP2000_PCIB, &ixdp2x01_irq_handler); + set_irq_chained_handler(IRQ_IXP2000_PCIB, ixdp2x01_irq_handler); } Index: linux.prev/arch/arm/mach-ixp2000/pci.c =================================================================== --- linux.prev.orig/arch/arm/mach-ixp2000/pci.c +++ linux.prev/arch/arm/mach-ixp2000/pci.c @@ -145,7 +145,7 @@ int ixp2000_pci_abort_handler(unsigned l pci_master_aborts = 1; - local_irq_save(flags); + raw_local_irq_save(flags); temp = *(IXP2000_PCI_CONTROL); if (temp & ((1 << 8) | (1 << 5))) { ixp2000_reg_wrb(IXP2000_PCI_CONTROL, temp); @@ -158,7 +158,7 @@ int ixp2000_pci_abort_handler(unsigned l temp = *(IXP2000_PCI_CMDSTAT); } } - local_irq_restore(flags); + raw_local_irq_restore(flags); /* * If it was an imprecise abort, then we need to correct the @@ -176,7 +176,7 @@ clear_master_aborts(void) volatile u32 temp; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); temp = *(IXP2000_PCI_CONTROL); if (temp & ((1 << 8) | (1 << 5))) { ixp2000_reg_wrb(IXP2000_PCI_CONTROL, temp); @@ -189,7 +189,7 @@ clear_master_aborts(void) temp = *(IXP2000_PCI_CMDSTAT); } } - local_irq_restore(flags); + raw_local_irq_restore(flags); return 0; } Index: linux.prev/arch/arm/mach-ixp4xx/common-pci.c =================================================================== --- linux.prev.orig/arch/arm/mach-ixp4xx/common-pci.c +++ linux.prev/arch/arm/mach-ixp4xx/common-pci.c @@ -53,7 +53,7 @@ unsigned long ixp4xx_pci_reg_base = 0; * these transactions are atomic or we will end up * with corrupt data on the bus or in a driver. 
*/ -static DEFINE_SPINLOCK(ixp4xx_pci_lock); +static DEFINE_RAW_SPINLOCK(ixp4xx_pci_lock); /* * Read from PCI config space Index: linux.prev/arch/arm/mach-ixp4xx/coyote-pci.c =================================================================== --- linux.prev.orig/arch/arm/mach-ixp4xx/coyote-pci.c +++ linux.prev/arch/arm/mach-ixp4xx/coyote-pci.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include Index: linux.prev/arch/arm/mach-ixp4xx/ixdp425-pci.c =================================================================== --- linux.prev.orig/arch/arm/mach-ixp4xx/ixdp425-pci.c +++ linux.prev/arch/arm/mach-ixp4xx/ixdp425-pci.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include Index: linux.prev/arch/arm/mach-ixp4xx/ixdpg425-pci.c =================================================================== --- linux.prev.orig/arch/arm/mach-ixp4xx/ixdpg425-pci.c +++ linux.prev/arch/arm/mach-ixp4xx/ixdpg425-pci.c @@ -16,10 +16,10 @@ #include #include #include +#include #include #include -#include #include Index: linux.prev/arch/arm/mach-l7200/core.c =================================================================== --- linux.prev.orig/arch/arm/mach-l7200/core.c +++ linux.prev/arch/arm/mach-l7200/core.c @@ -7,6 +7,7 @@ */ #include #include +#include #include #include Index: linux.prev/arch/arm/mach-lh7a40x/arch-kev7a400.c =================================================================== --- linux.prev.orig/arch/arm/mach-lh7a40x/arch-kev7a400.c +++ linux.prev/arch/arm/mach-lh7a40x/arch-kev7a400.c @@ -81,6 +81,8 @@ static void kev7a400_cpld_handler (unsig } } +static DEFINE_IRQ_CHAINED_TYPE(kev7a400_cpld_handler); + void __init lh7a40x_init_board_irq (void) { int irq; Index: linux.prev/arch/arm/mach-lh7a40x/arch-lpd7a40x.c =================================================================== --- linux.prev.orig/arch/arm/mach-lh7a40x/arch-lpd7a40x.c +++ linux.prev/arch/arm/mach-lh7a40x/arch-lpd7a40x.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -173,6 +174,7 @@ static void lpd7a40x_cpld_handler (unsig desc->chip->unmask (irq); /* Level-triggered need this */ } +static DEFINE_IRQ_CHAINED_TYPE(lpd7a40x_cpld_handler); void __init lh7a40x_init_board_irq (void) { Index: linux.prev/arch/arm/mach-lh7a40x/irq-kev7a400.c =================================================================== --- linux.prev.orig/arch/arm/mach-lh7a40x/irq-kev7a400.c +++ linux.prev/arch/arm/mach-lh7a40x/irq-kev7a400.c @@ -60,6 +60,8 @@ lh7a400_cpld_handler (unsigned int irq, } } +static DEFINE_IRQ_CHAINED_TYPE(kev7a400_cpld_handler); + /* IRQ initialization */ void __init Index: linux.prev/arch/arm/mach-lh7a40x/irq-lpd7a40x.c =================================================================== --- linux.prev.orig/arch/arm/mach-lh7a40x/irq-lpd7a40x.c +++ linux.prev/arch/arm/mach-lh7a40x/irq-lpd7a40x.c @@ -71,6 +71,7 @@ static void lh7a40x_cpld_handler (unsign desc->chip->unmask (irq); /* Level-triggered need this */ } +static DEFINE_IRQ_CHAINED_TYPE(lh7a40x_cpld_handler); /* IRQ initialization */ Index: linux.prev/arch/arm/mach-lh7a40x/time.c =================================================================== --- linux.prev.orig/arch/arm/mach-lh7a40x/time.c +++ linux.prev/arch/arm/mach-lh7a40x/time.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include Index: linux.prev/arch/arm/mach-omap1/board-osk.c =================================================================== --- linux.prev.orig/arch/arm/mach-omap1/board-osk.c +++ 
linux.prev/arch/arm/mach-omap1/board-osk.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include Index: linux.prev/arch/arm/mach-omap1/fpga.c =================================================================== --- linux.prev.orig/arch/arm/mach-omap1/fpga.c +++ linux.prev/arch/arm/mach-omap1/fpga.c @@ -120,6 +120,8 @@ static struct irqchip omap_fpga_irq = { .unmask = fpga_unmask_irq, }; +static DEFINE_IRQ_CHAINED_TYPE(innovator_fpga_IRQ_demux); + /* * All of the FPGA interrupt request inputs except for the touchscreen are * edge-sensitive; the touchscreen is level-sensitive. The edge-sensitive Index: linux.prev/arch/arm/mach-omap1/leds-h2p2-debug.c =================================================================== --- linux.prev.orig/arch/arm/mach-omap1/leds-h2p2-debug.c +++ linux.prev/arch/arm/mach-omap1/leds-h2p2-debug.c @@ -45,7 +45,7 @@ void h2p2_dbg_leds_event(led_event_t evt static struct h2p2_dbg_fpga __iomem *fpga; static u16 led_state, hw_led_state; - local_irq_save(flags); + raw_local_irq_save(flags); if (!(led_state & LED_STATE_ENABLED) && evt != led_start) goto done; @@ -164,5 +164,5 @@ void h2p2_dbg_leds_event(led_event_t evt __raw_writew(~hw_led_state, &fpga->leds); done: - local_irq_restore(flags); + raw_local_irq_restore(flags); } Index: linux.prev/arch/arm/mach-omap1/serial.c =================================================================== --- linux.prev.orig/arch/arm/mach-omap1/serial.c +++ linux.prev/arch/arm/mach-omap1/serial.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include Index: linux.prev/arch/arm/mach-pxa/dma.c =================================================================== --- linux.prev.orig/arch/arm/mach-pxa/dma.c +++ linux.prev/arch/arm/mach-pxa/dma.c @@ -43,7 +43,7 @@ int pxa_request_dma (char *name, pxa_dma if (!name || !irq_handler) return -EINVAL; - local_irq_save(flags); + raw_local_irq_save(flags); /* try grabbing a DMA channel with the requested priority */ for (i = prio; i < prio + PXA_DMA_NBCH(prio); i++) { @@ -73,7 +73,7 @@ int pxa_request_dma (char *name, pxa_dma i = -ENODEV; } - local_irq_restore(flags); + raw_local_irq_restore(flags); return i; } @@ -88,10 +88,10 @@ void pxa_free_dma (int dma_ch) return; } - local_irq_save(flags); + raw_local_irq_save(flags); DCSR(dma_ch) = DCSR_STARTINTR|DCSR_ENDINTR|DCSR_BUSERR; dma_channels[dma_ch].name = NULL; - local_irq_restore(flags); + raw_local_irq_restore(flags); } static irqreturn_t dma_irq_handler(int irq, void *dev_id, struct pt_regs *regs) Index: linux.prev/arch/arm/mach-pxa/generic.c =================================================================== --- linux.prev.orig/arch/arm/mach-pxa/generic.c +++ linux.prev/arch/arm/mach-pxa/generic.c @@ -51,7 +51,7 @@ void pxa_gpio_mode(int gpio_mode) int fn = (gpio_mode & GPIO_MD_MASK_FN) >> 8; int gafr; - local_irq_save(flags); + raw_local_irq_save(flags); if (gpio_mode & GPIO_DFLT_LOW) GPCR(gpio) = GPIO_bit(gpio); else if (gpio_mode & GPIO_DFLT_HIGH) @@ -62,7 +62,7 @@ void pxa_gpio_mode(int gpio_mode) GPDR(gpio) &= ~GPIO_bit(gpio); gafr = GAFR(gpio) & ~(0x3 << (((gpio) & 0xf)*2)); GAFR(gpio) = gafr | (fn << (((gpio) & 0xf)*2)); - local_irq_restore(flags); + raw_local_irq_restore(flags); } EXPORT_SYMBOL(pxa_gpio_mode); @@ -73,14 +73,14 @@ EXPORT_SYMBOL(pxa_gpio_mode); void pxa_set_cken(int clock, int enable) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); if (enable) CKEN |= clock; else CKEN &= ~clock; - local_irq_restore(flags); + raw_local_irq_restore(flags); 
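/*
 * Editor's sketch, not from the patch: the DEFINE_IRQ_CHAINED_TYPE()
 * lines added across these board files wrap a demultiplexing handler
 * in an irq_type so the generic IRQ layer can install it as a chained
 * handler. Typical shape (board_demux_handler and IRQ_BOARD_CASCADE
 * are hypothetical names):
 */
#if 0	/* illustration only */
static void board_demux_handler(unsigned int irq, struct irqdesc *desc,
				struct pt_regs *regs)
{
	/* read the cascade status register, then for each set bit:
	 * desc_handle_irq(subirq, irq_desc + subirq, regs); */
}

static DEFINE_IRQ_CHAINED_TYPE(board_demux_handler);

static void __init board_init_irq(void)
{
	/* note: the handler function itself is passed, no longer its address */
	set_irq_chained_handler(IRQ_BOARD_CASCADE, board_demux_handler);
}
#endif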
} EXPORT_SYMBOL(pxa_set_cken); Index: linux.prev/arch/arm/mach-pxa/idp.c =================================================================== --- linux.prev.orig/arch/arm/mach-pxa/idp.c +++ linux.prev/arch/arm/mach-pxa/idp.c @@ -18,6 +18,7 @@ #include #include +#include #include #include Index: linux.prev/arch/arm/mach-pxa/irq.c =================================================================== --- linux.prev.orig/arch/arm/mach-pxa/irq.c +++ linux.prev/arch/arm/mach-pxa/irq.c @@ -244,6 +244,7 @@ static struct irqchip pxa_muxed_gpio_chi .set_type = pxa_gpio_irq_type, }; +static DEFINE_IRQ_CHAINED_TYPE(pxa_gpio_demux_handler); void __init pxa_init_irq(void) { Index: linux.prev/arch/arm/mach-pxa/leds-idp.c =================================================================== --- linux.prev.orig/arch/arm/mach-pxa/leds-idp.c +++ linux.prev/arch/arm/mach-pxa/leds-idp.c @@ -34,7 +34,7 @@ void idp_leds_event(led_event_t evt) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); switch (evt) { case led_start: @@ -113,5 +113,5 @@ void idp_leds_event(led_event_t evt) else IDP_CPLD_LED_CONTROL |= IDP_LEDS_MASK; - local_irq_restore(flags); + raw_local_irq_restore(flags); } Index: linux.prev/arch/arm/mach-pxa/leds-lubbock.c =================================================================== --- linux.prev.orig/arch/arm/mach-pxa/leds-lubbock.c +++ linux.prev/arch/arm/mach-pxa/leds-lubbock.c @@ -48,7 +48,7 @@ void lubbock_leds_event(led_event_t evt) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); switch (evt) { case led_start: @@ -122,5 +122,5 @@ void lubbock_leds_event(led_event_t evt) else LUB_DISC_BLNK_LED |= 0xff; - local_irq_restore(flags); + raw_local_irq_restore(flags); } Index: linux.prev/arch/arm/mach-pxa/leds-mainstone.c =================================================================== --- linux.prev.orig/arch/arm/mach-pxa/leds-mainstone.c +++ linux.prev/arch/arm/mach-pxa/leds-mainstone.c @@ -43,7 +43,7 @@ void mainstone_leds_event(led_event_t ev { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); switch (evt) { case led_start: @@ -117,5 +117,5 @@ void mainstone_leds_event(led_event_t ev else MST_LEDCTRL |= 0xff; - local_irq_restore(flags); + raw_local_irq_restore(flags); } Index: linux.prev/arch/arm/mach-pxa/lubbock.c =================================================================== --- linux.prev.orig/arch/arm/mach-pxa/lubbock.c +++ linux.prev/arch/arm/mach-pxa/lubbock.c @@ -52,9 +52,9 @@ void lubbock_set_misc_wr(unsigned int ma { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); LUB_MISC_WR = (LUB_MISC_WR & ~mask) | (set & mask); - local_irq_restore(flags); + raw_local_irq_restore(flags); } EXPORT_SYMBOL(lubbock_set_misc_wr); @@ -95,6 +95,8 @@ static void lubbock_irq_handler(unsigned } while (pending); } +static DEFINE_IRQ_CHAINED_TYPE(lubbock_irq_handler); + static void __init lubbock_init_irq(void) { int irq; Index: linux.prev/arch/arm/mach-pxa/mainstone.c =================================================================== --- linux.prev.orig/arch/arm/mach-pxa/mainstone.c +++ linux.prev/arch/arm/mach-pxa/mainstone.c @@ -84,6 +84,8 @@ static void mainstone_irq_handler(unsign } while (pending); } +static DEFINE_IRQ_CHAINED_TYPE(mainstone_irq_handler); + static void __init mainstone_init_irq(void) { int irq; Index: linux.prev/arch/arm/mach-rpc/dma.c =================================================================== --- linux.prev.orig/arch/arm/mach-rpc/dma.c +++ 
linux.prev/arch/arm/mach-rpc/dma.c @@ -171,11 +171,11 @@ static void iomd_disable_dma(dmach_t cha unsigned long dma_base = dma->dma_base; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); if (dma->state != ~DMA_ST_AB) disable_irq(dma->dma_irq); iomd_writeb(0, dma_base + CR); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static int iomd_set_dma_speed(dmach_t channel, dma_t *dma, int cycle) Index: linux.prev/arch/arm/mach-rpc/irq.c =================================================================== --- linux.prev.orig/arch/arm/mach-rpc/irq.c +++ linux.prev/arch/arm/mach-rpc/irq.c @@ -112,6 +112,15 @@ static struct irqchip iomd_fiq_chip = { .unmask = iomd_unmask_irq_fiq, }; +static unsigned int startup_irq_disabled(unsigned int irq) +{ + return 0; +} + +/* Interrupt type for irqs which must not be + * automatically enabled in request_irq */ +static struct irq_type level_type_nostart; + void __init rpc_init_irq(void) { unsigned int irq, flags; @@ -121,16 +130,15 @@ void __init rpc_init_irq(void) iomd_writeb(0, IOMD_FIQMASK); iomd_writeb(0, IOMD_DMAMASK); + level_type_nostart = default_level_type; + level_type_nostart.startup = startup_irq_disabled; + for (irq = 0; irq < NR_IRQS; irq++) { flags = IRQF_VALID; if (irq <= 6 || (irq >= 9 && irq <= 15)) flags |= IRQF_PROBE; - if (irq == 21 || (irq >= 16 && irq <= 19) || - irq == IRQ_KEYBOARDTX) - flags |= IRQF_NOAUTOEN; - switch (irq) { case 0 ... 7: set_irq_chip(irq, &iomd_a_chip); @@ -155,6 +163,10 @@ void __init rpc_init_irq(void) set_irq_flags(irq, IRQF_VALID); break; } + + if (irq == 21 || (irq >= 16 && irq <= 19) || + irq == IRQ_KEYBOARDTX) + set_irq_handler(irq, &level_type_nostart); } init_FIQ(); Index: linux.prev/arch/arm/mach-s3c2410/bast-irq.c =================================================================== --- linux.prev.orig/arch/arm/mach-s3c2410/bast-irq.c +++ linux.prev/arch/arm/mach-s3c2410/bast-irq.c @@ -136,13 +136,15 @@ bast_irq_pc104_demux(unsigned int irq, for (i = 0; stat != 0; i++, stat >>= 1) { if (stat & 1) { irqno = bast_pc104_irqs[i]; - - desc_handle_irq(irqno, irq_desc + irqno, regs); + desc = irq_desc + irqno; + desc_handle_irq(irqno, desc, regs); } } } } +DEFINE_IRQ_CHAINED_TYPE(bast_irq_pc104_demux); + static __init int bast_irq_init(void) { unsigned int i; @@ -156,7 +158,7 @@ static __init int bast_irq_init(void) set_irq_chained_handler(IRQ_ISA, bast_irq_pc104_demux); - /* reigster our IRQs */ + /* register our IRQs */ for (i = 0; i < 4; i++) { unsigned int irqno = bast_pc104_irqs[i]; Index: linux.prev/arch/arm/mach-s3c2410/clock.c =================================================================== --- linux.prev.orig/arch/arm/mach-s3c2410/clock.c +++ linux.prev/arch/arm/mach-s3c2410/clock.c @@ -61,7 +61,7 @@ void inline s3c24xx_clk_enable(unsigned unsigned long clkcon; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); clkcon = __raw_readl(S3C2410_CLKCON); clkcon &= ~clocks; @@ -74,7 +74,7 @@ void inline s3c24xx_clk_enable(unsigned __raw_writel(clkcon, S3C2410_CLKCON); - local_irq_restore(flags); + raw_local_irq_restore(flags); } /* enable and disable calls for use with the clk struct */ Index: linux.prev/arch/arm/mach-s3c2410/dma.c =================================================================== --- linux.prev.orig/arch/arm/mach-s3c2410/dma.c +++ linux.prev/arch/arm/mach-s3c2410/dma.c @@ -329,11 +329,11 @@ static int s3c2410_dma_start(s3c2410_dma pr_debug("s3c2410_start_dma: channel=%d\n", chan->number); - local_irq_save(flags); +
raw_local_irq_save(flags); if (chan->state == S3C2410_DMA_RUNNING) { pr_debug("s3c2410_start_dma: already running (%d)\n", chan->state); - local_irq_restore(flags); + raw_local_irq_restore(flags); return 0; } @@ -348,7 +348,7 @@ static int s3c2410_dma_start(s3c2410_dma printk(KERN_ERR "dma%d: channel has nothing loaded\n", chan->number); chan->state = S3C2410_DMA_IDLE; - local_irq_restore(flags); + raw_local_irq_restore(flags); return -EINVAL; } @@ -385,7 +385,7 @@ static int s3c2410_dma_start(s3c2410_dma dbg_showchan(chan); - local_irq_restore(flags); + raw_local_irq_restore(flags); return 0; } @@ -451,7 +451,7 @@ int s3c2410_dma_enqueue(unsigned int cha buf->id = id; buf->magic = BUF_MAGIC; - local_irq_save(flags); + raw_local_irq_save(flags); if (chan->curr == NULL) { /* we've got nothing loaded... */ @@ -485,7 +485,7 @@ int s3c2410_dma_enqueue(unsigned int cha "timeout loading buffer\n", chan->number); dbg_showchan(chan); - local_irq_restore(flags); + raw_local_irq_restore(flags); return -EINVAL; } } @@ -499,7 +499,7 @@ int s3c2410_dma_enqueue(unsigned int cha } } - local_irq_restore(flags); + raw_local_irq_restore(flags); return 0; } @@ -661,9 +661,9 @@ s3c2410_dma_irq(int irq, void *devpw, st return IRQ_HANDLED; } - local_irq_save(flags); + raw_local_irq_save(flags); s3c2410_dma_loadbuffer(chan, chan->next); - local_irq_restore(flags); + raw_local_irq_restore(flags); } else { s3c2410_dma_lastxfer(chan); @@ -698,14 +698,14 @@ int s3c2410_dma_request(unsigned int cha check_channel(channel); - local_irq_save(flags); + raw_local_irq_save(flags); dbg_showchan(chan); if (chan->in_use) { if (client != chan->client) { printk(KERN_ERR "dma%d: already in use\n", channel); - local_irq_restore(flags); + raw_local_irq_restore(flags); return -EBUSY; } else { printk(KERN_ERR "dma%d: client already has channel\n", channel); @@ -724,7 +724,7 @@ int s3c2410_dma_request(unsigned int cha if (err) { chan->in_use = 0; - local_irq_restore(flags); + raw_local_irq_restore(flags); printk(KERN_ERR "%s: cannot get IRQ %d for DMA %d\n", client->name, chan->irq, chan->number); @@ -735,7 +735,7 @@ int s3c2410_dma_request(unsigned int cha chan->irq_enabled = 1; } - local_irq_restore(flags); + raw_local_irq_restore(flags); /* need to setup */ @@ -764,7 +764,7 @@ int s3c2410_dma_free(dmach_t channel, s3 check_channel(channel); - local_irq_save(flags); + raw_local_irq_save(flags); if (chan->client != client) { @@ -789,7 +789,7 @@ int s3c2410_dma_free(dmach_t channel, s3 free_irq(chan->irq, (void *)chan); chan->irq_claimed = 0; - local_irq_restore(flags); + raw_local_irq_restore(flags); return 0; } @@ -805,7 +805,7 @@ static int s3c2410_dma_dostop(s3c2410_dm dbg_showchan(chan); - local_irq_save(flags); + raw_local_irq_save(flags); s3c2410_dma_call_op(chan, S3C2410_DMAOP_STOP); @@ -823,7 +823,7 @@ static int s3c2410_dma_dostop(s3c2410_dm chan->state = S3C2410_DMA_IDLE; chan->load_state = S3C2410_DMALOAD_NONE; - local_irq_restore(flags); + raw_local_irq_restore(flags); return 0; } @@ -840,7 +840,7 @@ static int s3c2410_dma_flush(s3c2410_dma pr_debug("%s:\n", __FUNCTION__); - local_irq_save(flags); + raw_local_irq_save(flags); if (chan->state != S3C2410_DMA_IDLE) { pr_debug("%s: stopping channel...\n", __FUNCTION__ ); @@ -865,7 +865,7 @@ static int s3c2410_dma_flush(s3c2410_dma } } - local_irq_restore(flags); + raw_local_irq_restore(flags); return 0; } Index: linux.prev/arch/arm/mach-s3c2410/gpio.c =================================================================== --- linux.prev.orig/arch/arm/mach-s3c2410/gpio.c +++ 
linux.prev/arch/arm/mach-s3c2410/gpio.c @@ -80,7 +80,7 @@ void s3c2410_gpio_cfgpin(unsigned int pi /* modify the specified register wwith IRQs off */ - local_irq_save(flags); + raw_local_irq_save(flags); con = __raw_readl(base + 0x00); con &= ~mask; @@ -88,7 +88,7 @@ void s3c2410_gpio_cfgpin(unsigned int pi __raw_writel(con, base + 0x00); - local_irq_restore(flags); + raw_local_irq_restore(flags); } EXPORT_SYMBOL(s3c2410_gpio_cfgpin); @@ -119,14 +119,14 @@ void s3c2410_gpio_pullup(unsigned int pi if (pin < S3C2410_GPIO_BANKB) return; - local_irq_save(flags); + raw_local_irq_save(flags); up = __raw_readl(base + 0x08); up &= ~(1L << offs); up |= to << offs; __raw_writel(up, base + 0x08); - local_irq_restore(flags); + raw_local_irq_restore(flags); } EXPORT_SYMBOL(s3c2410_gpio_pullup); @@ -138,14 +138,14 @@ void s3c2410_gpio_setpin(unsigned int pi unsigned long flags; unsigned long dat; - local_irq_save(flags); + raw_local_irq_save(flags); dat = __raw_readl(base + 0x04); dat &= ~(1 << offs); dat |= to << offs; __raw_writel(dat, base + 0x04); - local_irq_restore(flags); + raw_local_irq_restore(flags); } EXPORT_SYMBOL(s3c2410_gpio_setpin); @@ -165,12 +165,12 @@ unsigned int s3c2410_modify_misccr(unsig unsigned long flags; unsigned long misccr; - local_irq_save(flags); + raw_local_irq_save(flags); misccr = __raw_readl(S3C2410_MISCCR); misccr &= ~clear; misccr ^= change; __raw_writel(misccr, S3C2410_MISCCR); - local_irq_restore(flags); + raw_local_irq_restore(flags); return misccr; } @@ -211,7 +211,7 @@ int s3c2410_gpio_irqfilter(unsigned int pin -= S3C2410_GPG8_EINT16; reg += pin & ~3; - local_irq_save(flags); + raw_local_irq_save(flags); /* update filter width and clock source */ @@ -227,7 +227,7 @@ int s3c2410_gpio_irqfilter(unsigned int val |= on << ((pin * 4) + 3); __raw_writel(val, S3C2410_EXTINT2); - local_irq_restore(flags); + raw_local_irq_restore(flags); return 0; } Index: linux.prev/arch/arm/mach-s3c2410/irq.c =================================================================== --- linux.prev.orig/arch/arm/mach-s3c2410/irq.c +++ linux.prev/arch/arm/mach-s3c2410/irq.c @@ -573,6 +573,11 @@ s3c_irq_demux_uart2(unsigned int irq, } +static DEFINE_IRQ_CHAINED_TYPE(s3c_irq_demux_uart0); +static DEFINE_IRQ_CHAINED_TYPE(s3c_irq_demux_uart1); +static DEFINE_IRQ_CHAINED_TYPE(s3c_irq_demux_uart2); +static DEFINE_IRQ_CHAINED_TYPE(s3c_irq_demux_adc); + /* s3c24xx_init_irq * * Initialise S3C2410 IRQ system Index: linux.prev/arch/arm/mach-s3c2410/s3c2440-dsc.c =================================================================== --- linux.prev.orig/arch/arm/mach-s3c2410/s3c2440-dsc.c +++ linux.prev/arch/arm/mach-s3c2410/s3c2440-dsc.c @@ -45,14 +45,14 @@ int s3c2440_set_dsc(unsigned int pin, un base = (pin & S3C2440_SELECT_DSC1) ? 
S3C2440_DSC1 : S3C2440_DSC0; mask = 3 << S3C2440_DSC_GETSHIFT(pin); - local_irq_save(flags); + raw_local_irq_save(flags); val = __raw_readl(base); val &= ~mask; val |= value & mask; __raw_writel(val, base); - local_irq_restore(flags); + raw_local_irq_restore(flags); return 0; } Index: linux.prev/arch/arm/mach-s3c2410/s3c2440-irq.c =================================================================== --- linux.prev.orig/arch/arm/mach-s3c2410/s3c2440-irq.c +++ linux.prev/arch/arm/mach-s3c2410/s3c2440-irq.c @@ -157,6 +157,9 @@ static struct irqchip s3c_irq_cam = { .ack = s3c_irq_cam_ack, }; +static DEFINE_IRQ_CHAINED_TYPE(s3c_irq_demux_wdtac97); +static DEFINE_IRQ_CHAINED_TYPE(s3c_irq_demux_cam); + static int s3c2440_irq_add(struct sys_device *sysdev) { unsigned int irqno; Index: linux.prev/arch/arm/mach-s3c2410/time.c =================================================================== --- linux.prev.orig/arch/arm/mach-s3c2410/time.c +++ linux.prev/arch/arm/mach-s3c2410/time.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include Index: linux.prev/arch/arm/mach-sa1100/assabet.c =================================================================== --- linux.prev.orig/arch/arm/mach-sa1100/assabet.c +++ linux.prev/arch/arm/mach-sa1100/assabet.c @@ -61,10 +61,10 @@ void ASSABET_BCR_frob(unsigned int mask, { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); BCR_value = (BCR_value & ~mask) | val; ASSABET_BCR = BCR_value; - local_irq_restore(flags); + raw_local_irq_restore(flags); } EXPORT_SYMBOL(ASSABET_BCR_frob); Index: linux.prev/arch/arm/mach-sa1100/badge4.c =================================================================== --- linux.prev.orig/arch/arm/mach-sa1100/badge4.c +++ linux.prev/arch/arm/mach-sa1100/badge4.c @@ -227,7 +227,7 @@ void badge4_set_5V(unsigned subsystem, i unsigned long flags; unsigned old_5V_bitmap; - local_irq_save(flags); + raw_local_irq_save(flags); old_5V_bitmap = badge4_5V_bitmap; @@ -240,15 +240,22 @@ void badge4_set_5V(unsigned subsystem, i /* detect on->off and off->on transitions */ if ((!old_5V_bitmap) && (badge4_5V_bitmap)) { /* was off, now on */ - printk(KERN_INFO "%s: enabling 5V supply rail\n", __FUNCTION__); GPSR = BADGE4_GPIO_PCMEN5V; } else if ((old_5V_bitmap) && (!badge4_5V_bitmap)) { /* was on, now off */ - printk(KERN_INFO "%s: disabling 5V supply rail\n", __FUNCTION__); GPCR = BADGE4_GPIO_PCMEN5V; } - local_irq_restore(flags); + raw_local_irq_restore(flags); + + /* detect on->off and off->on transitions */ + if ((!old_5V_bitmap) && (badge4_5V_bitmap)) { + /* was off, now on */ + printk(KERN_INFO "%s: enabling 5V supply rail\n", __FUNCTION__); + } else if ((old_5V_bitmap) && (!badge4_5V_bitmap)) { + /* was on, now off */ + printk(KERN_INFO "%s: disabling 5V supply rail\n", __FUNCTION__); + } } EXPORT_SYMBOL(badge4_set_5V); Index: linux.prev/arch/arm/mach-sa1100/cerf.c =================================================================== --- linux.prev.orig/arch/arm/mach-sa1100/cerf.c +++ linux.prev/arch/arm/mach-sa1100/cerf.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include Index: linux.prev/arch/arm/mach-sa1100/cpu-sa1110.c =================================================================== --- linux.prev.orig/arch/arm/mach-sa1100/cpu-sa1110.c +++ linux.prev/arch/arm/mach-sa1100/cpu-sa1110.c @@ -282,7 +282,7 @@ static int sa1110_target(struct cpufreq_ * This means that we won't access SDRAM for the duration of * the programming. 
*/ - local_irq_save(flags); + raw_local_irq_save(flags); asm("mcr p15, 0, %0, c7, c10, 4" : : "r" (0)); udelay(10); __asm__ __volatile__(" \n\ @@ -303,7 +303,7 @@ static int sa1110_target(struct cpufreq_ : "r" (&MDCNFG), "r" (&PPCR), "0" (sd.mdcnfg), "r" (sd.mdrefr), "r" (sd.mdcas[0]), "r" (sd.mdcas[1]), "r" (sd.mdcas[2]), "r" (ppcr)); - local_irq_restore(flags); + raw_local_irq_restore(flags); /* * Now, return the SDRAM refresh back to normal. Index: linux.prev/arch/arm/mach-sa1100/dma.c =================================================================== --- linux.prev.orig/arch/arm/mach-sa1100/dma.c +++ linux.prev/arch/arm/mach-sa1100/dma.c @@ -227,7 +227,7 @@ int sa1100_start_dma(dma_regs_t *regs, d if (size > MAX_DMA_SIZE) return -EOVERFLOW; - local_irq_save(flags); + raw_local_irq_save(flags); status = regs->RdDCSR; /* If both DMA buffers are started, there's nothing else we can do. */ @@ -262,7 +262,7 @@ int sa1100_start_dma(dma_regs_t *regs, d ret = 0; out: - local_irq_restore(flags); + raw_local_irq_restore(flags); return ret; } Index: linux.prev/arch/arm/mach-sa1100/generic.c =================================================================== --- linux.prev.orig/arch/arm/mach-sa1100/generic.c +++ linux.prev/arch/arm/mach-sa1100/generic.c @@ -138,7 +138,7 @@ unsigned long long sched_clock(void) static void sa1100_power_off(void) { mdelay(100); - local_irq_disable(); + raw_local_irq_disable(); /* disable internal oscillator, float CS lines */ PCFR = (PCFR_OPDE | PCFR_FP | PCFR_FS); /* enable wake-up on GPIO0 (Assabet...) */ @@ -411,7 +411,7 @@ void __init sa1110_mb_disable(void) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); PGSR &= ~GPIO_MBGNT; GPCR = GPIO_MBGNT; @@ -419,7 +419,7 @@ void __init sa1110_mb_disable(void) GAFR &= ~(GPIO_MBGNT | GPIO_MBREQ); - local_irq_restore(flags); + raw_local_irq_restore(flags); } /* @@ -430,7 +430,7 @@ void __init sa1110_mb_enable(void) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); PGSR &= ~GPIO_MBGNT; GPCR = GPIO_MBGNT; @@ -439,6 +439,6 @@ void __init sa1110_mb_enable(void) GAFR |= (GPIO_MBGNT | GPIO_MBREQ); TUCR |= TUCR_MR; - local_irq_restore(flags); + raw_local_irq_restore(flags); } Index: linux.prev/arch/arm/mach-sa1100/h3600.c =================================================================== --- linux.prev.orig/arch/arm/mach-sa1100/h3600.c +++ linux.prev/arch/arm/mach-sa1100/h3600.c @@ -331,7 +331,7 @@ static void h3100_control_egpio(enum ipa } if (egpio || gpio) { - local_irq_save(flags); + raw_local_irq_save(flags); if (setp) { h3100_egpio |= egpio; GPSR = gpio; @@ -340,7 +340,7 @@ static void h3100_control_egpio(enum ipa GPCR = gpio; } H3100_EGPIO = h3100_egpio; - local_irq_restore(flags); + raw_local_irq_restore(flags); } } @@ -463,13 +463,13 @@ static void h3600_control_egpio(enum ipa } if (egpio) { - local_irq_save(flags); + raw_local_irq_save(flags); if (setp) h3600_egpio |= egpio; else h3600_egpio &= ~egpio; H3600_EGPIO = h3600_egpio; - local_irq_restore(flags); + raw_local_irq_restore(flags); } } @@ -800,6 +800,8 @@ static void h3800_unmask_gpio_irq(unsign H3800_ASIC2_GPIINTSTAT |= mask; } +static DEFINE_IRQ_CHAINED_TYPE(h3800_IRQ_demux); + static void __init h3800_init_irq(void) { int i; @@ -838,7 +840,7 @@ static void __init h3800_init_irq(void) } #endif set_irq_type(IRQ_GPIO_H3800_ASIC, IRQT_RISING); - set_irq_chained_handler(IRQ_GPIO_H3800_ASIC, &h3800_IRQ_demux); + set_irq_chained_handler(IRQ_GPIO_H3800_ASIC, h3800_IRQ_demux); } Index: 
linux.prev/arch/arm/mach-sa1100/irq.c =================================================================== --- linux.prev.orig/arch/arm/mach-sa1100/irq.c +++ linux.prev/arch/arm/mach-sa1100/irq.c @@ -11,12 +11,13 @@ */ #include #include +#include +#include #include #include #include #include -#include #include #include "generic.h" @@ -281,6 +282,8 @@ static int __init sa1100irq_init_devicef return sysdev_register(&sa1100irq_device); } +static DEFINE_IRQ_CHAINED_TYPE(sa1100_high_gpio_handler); + device_initcall(sa1100irq_init_devicefs); void __init sa1100_init_irq(void) Index: linux.prev/arch/arm/mach-sa1100/leds-assabet.c =================================================================== --- linux.prev.orig/arch/arm/mach-sa1100/leds-assabet.c +++ linux.prev/arch/arm/mach-sa1100/leds-assabet.c @@ -32,7 +32,7 @@ void assabet_leds_event(led_event_t evt) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); switch (evt) { case led_start: @@ -111,5 +111,5 @@ void assabet_leds_event(led_event_t evt) if (led_state & LED_STATE_ENABLED) ASSABET_BCR_frob(ASSABET_BCR_LED_MASK, hw_led_state); - local_irq_restore(flags); + raw_local_irq_restore(flags); } Index: linux.prev/arch/arm/mach-sa1100/leds-badge4.c =================================================================== --- linux.prev.orig/arch/arm/mach-sa1100/leds-badge4.c +++ linux.prev/arch/arm/mach-sa1100/leds-badge4.c @@ -36,7 +36,7 @@ void badge4_leds_event(led_event_t evt) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); switch (evt) { case led_start: @@ -108,5 +108,5 @@ void badge4_leds_event(led_event_t evt) GPCR = hw_led_state ^ LED_MASK; } - local_irq_restore(flags); + raw_local_irq_restore(flags); } Index: linux.prev/arch/arm/mach-sa1100/leds-cerf.c =================================================================== --- linux.prev.orig/arch/arm/mach-sa1100/leds-cerf.c +++ linux.prev/arch/arm/mach-sa1100/leds-cerf.c @@ -29,7 +29,7 @@ void cerf_leds_event(led_event_t evt) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); switch (evt) { case led_start: @@ -107,5 +107,5 @@ void cerf_leds_event(led_event_t evt) GPCR = hw_led_state ^ LED_MASK; } - local_irq_restore(flags); + raw_local_irq_restore(flags); } Index: linux.prev/arch/arm/mach-sa1100/leds-hackkit.c =================================================================== --- linux.prev.orig/arch/arm/mach-sa1100/leds-hackkit.c +++ linux.prev/arch/arm/mach-sa1100/leds-hackkit.c @@ -33,7 +33,7 @@ void hackkit_leds_event(led_event_t evt) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); switch(evt) { case led_start: @@ -109,5 +109,5 @@ void hackkit_leds_event(led_event_t evt) GPCR = hw_led_state ^ LED_MASK; } - local_irq_restore(flags); + raw_local_irq_restore(flags); } Index: linux.prev/arch/arm/mach-sa1100/leds-lart.c =================================================================== --- linux.prev.orig/arch/arm/mach-sa1100/leds-lart.c +++ linux.prev/arch/arm/mach-sa1100/leds-lart.c @@ -32,7 +32,7 @@ void lart_leds_event(led_event_t evt) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); switch(evt) { case led_start: @@ -98,5 +98,5 @@ void lart_leds_event(led_event_t evt) GPCR = hw_led_state ^ LED_MASK; } - local_irq_restore(flags); + raw_local_irq_restore(flags); } Index: linux.prev/arch/arm/mach-sa1100/neponset.c =================================================================== --- linux.prev.orig/arch/arm/mach-sa1100/neponset.c +++ 
linux.prev/arch/arm/mach-sa1100/neponset.c @@ -137,6 +137,8 @@ static struct sa1100_port_fns neponset_p .get_mctrl = neponset_get_mctrl, }; +static DEFINE_IRQ_CHAINED_TYPE(neponset_irq_handler); + static int neponset_probe(struct platform_device *dev) { sa1100_register_uart_fns(&neponset_port_fns); Index: linux.prev/arch/arm/mach-sa1100/pleb.c =================================================================== --- linux.prev.orig/arch/arm/mach-sa1100/pleb.c +++ linux.prev/arch/arm/mach-sa1100/pleb.c @@ -7,6 +7,7 @@ #include #include #include +#include #include Index: linux.prev/arch/arm/mach-sa1100/simpad.c =================================================================== --- linux.prev.orig/arch/arm/mach-sa1100/simpad.c +++ linux.prev/arch/arm/mach-sa1100/simpad.c @@ -174,7 +174,7 @@ static void __init simpad_map_io(void) static void simpad_power_off(void) { - local_irq_disable(); // was cli + raw_local_irq_disable(); // was cli set_cs3(0x800); /* only SD_MEDIAQ */ /* disable internal oscillator, float CS lines */ @@ -191,7 +191,7 @@ static void simpad_power_off(void) PMCR = PMCR_SF; while(1); - local_irq_enable(); /* we won't ever call it */ + raw_local_irq_enable(); /* we won't ever call it */ } Index: linux.prev/arch/arm/mach-sa1100/time.c =================================================================== --- linux.prev.orig/arch/arm/mach-sa1100/time.c +++ linux.prev/arch/arm/mach-sa1100/time.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include Index: linux.prev/arch/arm/mach-shark/core.c =================================================================== --- linux.prev.orig/arch/arm/mach-shark/core.c +++ linux.prev/arch/arm/mach-shark/core.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include Index: linux.prev/arch/arm/mach-shark/leds.c =================================================================== --- linux.prev.orig/arch/arm/mach-shark/leds.c +++ linux.prev/arch/arm/mach-shark/leds.c @@ -33,7 +33,7 @@ static char led_state; static short hw_led_state; static short saved_state; -static DEFINE_SPINLOCK(leds_lock); +static DEFINE_RAW_SPINLOCK(leds_lock); short sequoia_read(int addr) { outw(addr,0x24); Index: linux.prev/arch/arm/mach-versatile/core.c =================================================================== --- linux.prev.orig/arch/arm/mach-versatile/core.c +++ linux.prev/arch/arm/mach-versatile/core.c @@ -113,6 +113,8 @@ sic_handle_irq(unsigned int irq, struct } while (status); } +static DEFINE_IRQ_CHAINED_TYPE(sic_handle_irq); + #if 1 #define IRQ_MMCI0A IRQ_VICSOURCE22 #define IRQ_AACI IRQ_VICSOURCE24 @@ -162,7 +164,7 @@ void __init versatile_init_irq(void) } } - set_irq_handler(IRQ_VICSOURCE31, sic_handle_irq); + set_irq_chained_handler(IRQ_VICSOURCE31, sic_handle_irq); vic_unmask_irq(IRQ_VICSOURCE31); /* Do second interrupt controller */ @@ -785,7 +787,7 @@ static void versatile_leds_event(led_eve unsigned long flags; u32 val; - local_irq_save(flags); + raw_local_irq_save(flags); val = readl(VA_LEDS_BASE); switch (ledevt) { @@ -810,7 +812,7 @@ static void versatile_leds_event(led_eve } writel(val, VA_LEDS_BASE); - local_irq_restore(flags); + raw_local_irq_restore(flags); } #endif /* CONFIG_LEDS */ Index: linux.prev/arch/arm/mm/consistent.c =================================================================== --- linux.prev.orig/arch/arm/mm/consistent.c +++ linux.prev/arch/arm/mm/consistent.c @@ -30,7 +30,7 @@ * This is the page table (2MB) covering uncached, DMA consistent allocations */ static pte_t *consistent_pte; -static 
DEFINE_SPINLOCK(consistent_lock); +static DEFINE_RAW_SPINLOCK(consistent_lock); /* * VM region handling support. Index: linux.prev/arch/arm/mm/copypage-v4mc.c =================================================================== --- linux.prev.orig/arch/arm/mm/copypage-v4mc.c +++ linux.prev/arch/arm/mm/copypage-v4mc.c @@ -29,7 +29,7 @@ #define TOP_PTE(x) pte_offset_kernel(top_pmd, x) -static DEFINE_SPINLOCK(minicache_lock); +static DEFINE_RAW_SPINLOCK(minicache_lock); /* * ARMv4 mini-dcache optimised copy_user_page @@ -43,7 +43,7 @@ static DEFINE_SPINLOCK(minicache_lock); * instruction. If your processor does not supply this, you have to write your * own copy_user_page that does the right thing. */ -static void __attribute__((naked)) +static void notrace __attribute__((naked)) mc_copy_user_page(void *from, void *to) { asm volatile( @@ -82,7 +82,7 @@ void v4_mc_copy_user_page(void *kto, con /* * ARMv4 optimised clear_user_page */ -void __attribute__((naked)) +void notrace __attribute__((naked)) v4_mc_clear_user_page(void *kaddr, unsigned long vaddr) { asm volatile( Index: linux.prev/arch/arm/mm/copypage-v6.c =================================================================== --- linux.prev.orig/arch/arm/mm/copypage-v6.c +++ linux.prev/arch/arm/mm/copypage-v6.c @@ -26,7 +26,7 @@ #define TOP_PTE(x) pte_offset_kernel(top_pmd, x) -static DEFINE_SPINLOCK(v6_lock); +static DEFINE_RAW_SPINLOCK(v6_lock); /* * Copy the user page. No aliasing to deal with so we can just Index: linux.prev/arch/arm/mm/copypage-xscale.c =================================================================== --- linux.prev.orig/arch/arm/mm/copypage-xscale.c +++ linux.prev/arch/arm/mm/copypage-xscale.c @@ -31,7 +31,7 @@ #define TOP_PTE(x) pte_offset_kernel(top_pmd, x) -static DEFINE_SPINLOCK(minicache_lock); +static DEFINE_RAW_SPINLOCK(minicache_lock); /* * XScale mini-dcache optimised copy_user_page @@ -41,7 +41,7 @@ static DEFINE_SPINLOCK(minicache_lock); * Dcache aliasing issue. The writes will be forwarded to the write buffer, * and merged as appropriate. */ -static void __attribute__((naked)) +static void notrace __attribute__((naked)) mc_copy_user_page(void *from, void *to) { /* @@ -104,7 +104,7 @@ void xscale_mc_copy_user_page(void *kto, /* * XScale optimised clear_user_page */ -void __attribute__((naked)) +void notrace __attribute__((naked)) xscale_mc_clear_user_page(void *kaddr, unsigned long vaddr) { asm volatile( Index: linux.prev/arch/arm/mm/fault-armv.c =================================================================== --- linux.prev.orig/arch/arm/mm/fault-armv.c +++ linux.prev/arch/arm/mm/fault-armv.c @@ -166,7 +166,7 @@ static int __init check_writebuffer(unsi { register unsigned long zero = 0, one = 1, val; - local_irq_disable(); + raw_local_irq_disable(); mb(); *p1 = one; mb(); @@ -174,7 +174,7 @@ static int __init check_writebuffer(unsi mb(); val = *p1; mb(); - local_irq_enable(); + raw_local_irq_enable(); return val != zero; } Index: linux.prev/arch/arm/mm/fault.c =================================================================== --- linux.prev.orig/arch/arm/mm/fault.c +++ linux.prev/arch/arm/mm/fault.c @@ -216,7 +216,7 @@ out: return fault; } -static int +static notrace int do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { struct task_struct *tsk; @@ -316,7 +316,7 @@ no_context: * interrupt or a critical region, and should only copy the information * from the master page table, nothing more. 
*/ -static int +static notrace int do_translation_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { @@ -362,7 +362,7 @@ bad_area: * Some section permission faults need to be handled gracefully. * They can happen due to a __{get,put}_user during an oops. */ -static int +static notrace int do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { struct task_struct *tsk = current; @@ -373,7 +373,7 @@ do_sect_fault(unsigned long addr, unsign /* * This abort handler always returns "fault". */ -static int +static notrace int do_bad(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { return 1; @@ -428,7 +428,7 @@ static struct fsr_info { { do_bad, SIGBUS, 0, "unknown 31" } }; -void __init +void __init notrace hook_fault_code(int nr, int (*fn)(unsigned long, unsigned int, struct pt_regs *), int sig, const char *name) { @@ -442,7 +442,7 @@ hook_fault_code(int nr, int (*fn)(unsign /* * Dispatch a data abort to the relevant handler. */ -asmlinkage void +asmlinkage notrace void do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { const struct fsr_info *inf = fsr_info + (fsr & 15) + ((fsr & (1 << 10)) >> 6); @@ -461,7 +461,7 @@ do_DataAbort(unsigned long addr, unsigne notify_die("", regs, &info, fsr, 0); } -asmlinkage void +asmlinkage notrace void do_PrefetchAbort(unsigned long addr, struct pt_regs *regs) { do_translation_fault(addr, 0, regs); Index: linux.prev/arch/arm/mm/init.c =================================================================== --- linux.prev.orig/arch/arm/mm/init.c +++ linux.prev/arch/arm/mm/init.c @@ -28,7 +28,7 @@ #define TABLE_SIZE (2 * PTRS_PER_PTE * sizeof(pte_t)) -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern void _stext, _text, _etext, __data_start, _end, __init_begin, __init_end; Index: linux.prev/arch/arm/plat-omap/clock.c =================================================================== --- linux.prev.orig/arch/arm/plat-omap/clock.c +++ linux.prev/arch/arm/plat-omap/clock.c @@ -28,7 +28,7 @@ LIST_HEAD(clocks); static DECLARE_MUTEX(clocks_sem); -DEFINE_SPINLOCK(clockfw_lock); +DEFINE_RAW_SPINLOCK(clockfw_lock); static struct clk_functions *arch_clock; Index: linux.prev/arch/arm/plat-omap/dma.c =================================================================== --- linux.prev.orig/arch/arm/plat-omap/dma.c +++ linux.prev/arch/arm/plat-omap/dma.c @@ -557,7 +557,7 @@ void omap_clear_dma(int lch) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); if (cpu_class_is_omap1()) { int status; @@ -574,7 +574,7 @@ void omap_clear_dma(int lch) omap_writel(0, lch_base + i); } - local_irq_restore(flags); + raw_local_irq_restore(flags); } void omap_start_dma(int lch) @@ -903,7 +903,7 @@ static struct irqaction omap24xx_dma_irq /*----------------------------------------------------------------------------*/ static struct lcd_dma_info { - spinlock_t lock; + raw_spinlock_t lock; int reserved; void (* callback)(u16 status, void *data); void *cb_data; Index: linux.prev/arch/arm/plat-omap/gpio.c =================================================================== --- linux.prev.orig/arch/arm/plat-omap/gpio.c +++ linux.prev/arch/arm/plat-omap/gpio.c @@ -121,7 +121,7 @@ struct gpio_bank { u32 reserved_map; u32 suspend_wakeup; u32 saved_wakeup; - spinlock_t lock; + raw_spinlock_t lock; }; #define METHOD_MPUIO 0 @@ -736,7 +736,7 @@ static void gpio_irq_handler(unsigned in desc->chip->ack(irq); - bank 
= (struct gpio_bank *) desc->data; + bank = (struct gpio_bank *) desc->handler_data; if (bank->method == METHOD_MPUIO) isr_reg = bank->base + OMAP_MPUIO_GPIO_INT; #ifdef CONFIG_ARCH_OMAP15XX @@ -837,6 +837,8 @@ static struct irqchip mpuio_irq_chip = { .unmask = mpuio_unmask_irq }; +static DEFINE_IRQ_CHAINED_TYPE(gpio_irq_handler); + static int initialized; static struct clk * gpio_ick; static struct clk * gpio_fck; Index: linux.prev/arch/arm/plat-omap/mux.c =================================================================== --- linux.prev.orig/arch/arm/plat-omap/mux.c +++ linux.prev/arch/arm/plat-omap/mux.c @@ -57,7 +57,7 @@ int __init omap_mux_register(struct pin_ */ int __init_or_module omap_cfg_reg(const unsigned long index) { - static DEFINE_SPINLOCK(mux_spin_lock); + static DEFINE_RAW_SPINLOCK(mux_spin_lock); unsigned long flags; struct pin_config *cfg; Index: linux.prev/arch/arm/plat-omap/pm.c =================================================================== --- linux.prev.orig/arch/arm/plat-omap/pm.c +++ linux.prev/arch/arm/plat-omap/pm.c @@ -82,11 +82,11 @@ void omap_pm_idle(void) * seconds for wait for interrupt. */ - local_irq_disable(); + raw_local_irq_disable(); local_fiq_disable(); if (need_resched()) { local_fiq_enable(); - local_irq_enable(); + raw_local_irq_enable(); return; } mask32 = omap_readl(ARM_SYSST); @@ -111,7 +111,7 @@ void omap_pm_idle(void) omap_sram_idle(); local_fiq_enable(); - local_irq_enable(); + raw_local_irq_enable(); } /* @@ -182,7 +182,7 @@ void omap_pm_suspend(void) * Step 1: turn off interrupts (FIXME: NOTE: already disabled) */ - local_irq_disable(); + raw_local_irq_disable(); local_fiq_disable(); /* @@ -335,7 +335,7 @@ void omap_pm_suspend(void) * Reenable interrupts */ - local_irq_enable(); + raw_local_irq_enable(); local_fiq_enable(); omap_serial_wake_trigger(0); Index: linux.prev/arch/arm26/boot/compressed/misc.c =================================================================== --- linux.prev.orig/arch/arm26/boot/compressed/misc.c +++ linux.prev/arch/arm26/boot/compressed/misc.c @@ -184,6 +184,7 @@ static ulg free_mem_ptr_end; #define HEAP_SIZE 0x2000 +#define ZLIB_INFLATE_NO_INFLATE_LOCK #include "../../../../lib/inflate.c" #ifndef STANDALONE_DEBUG Index: linux.prev/arch/i386/Kconfig =================================================================== --- linux.prev.orig/arch/i386/Kconfig +++ linux.prev/arch/i386/Kconfig @@ -14,6 +14,10 @@ config X86_32 486, 586, Pentiums, and various instruction-set-compatible chips by AMD, Cyrix, and others. +config GENERIC_TIME + bool + default y + config SEMAPHORE_SLEEPERS bool default y @@ -173,6 +177,8 @@ config HPET_EMULATE_RTC depends on HPET_TIMER && RTC=y default y +source "kernel/time/Kconfig" + config SMP bool "Symmetric multi-processing support" ---help--- @@ -228,6 +234,19 @@ config SCHED_SMT source "kernel/Kconfig.preempt" +config RWSEM_GENERIC_SPINLOCK + bool + depends on M386 || PREEMPT_RT + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + default y if !RWSEM_GENERIC_SPINLOCK + config X86_UP_APIC bool "Local APIC support on uniprocessors" depends on !SMP && !(X86_VISWS || X86_VOYAGER) @@ -619,7 +638,7 @@ config BOOT_IOREMAP config REGPARM bool "Use register arguments (EXPERIMENTAL)" - depends on EXPERIMENTAL + depends on EXPERIMENTAL && !MCOUNT default n help Compile the kernel with -mregparm=3. 
This uses a different ABI @@ -1055,3 +1074,7 @@ config X86_TRAMPOLINE bool depends on X86_SMP || (X86_VOYAGER && SMP) default y + +config KTIME_SCALAR + bool + default y Index: linux.prev/arch/i386/Kconfig.cpu =================================================================== --- linux.prev.orig/arch/i386/Kconfig.cpu +++ linux.prev/arch/i386/Kconfig.cpu @@ -229,11 +229,6 @@ config RWSEM_GENERIC_SPINLOCK depends on M386 default y -config RWSEM_XCHGADD_ALGORITHM - bool - depends on !M386 - default y - config GENERIC_CALIBRATE_DELAY bool default y Index: linux.prev/arch/i386/Kconfig.debug =================================================================== --- linux.prev.orig/arch/i386/Kconfig.debug +++ linux.prev/arch/i386/Kconfig.debug @@ -18,6 +18,7 @@ config EARLY_PRINTK config DEBUG_STACKOVERFLOW bool "Check for stack overflows" depends on DEBUG_KERNEL + default y help This option will cause messages to be printed if free stack space drops below a certain limit. @@ -25,6 +26,7 @@ config DEBUG_STACKOVERFLOW config DEBUG_STACK_USAGE bool "Stack utilization instrumentation" depends on DEBUG_KERNEL + default y help Enables the display of the minimum amount of free stack which each task has ever had available in the sysrq-T and sysrq-P debug output. Index: linux.prev/arch/i386/boot/compressed/misc.c =================================================================== --- linux.prev.orig/arch/i386/boot/compressed/misc.c +++ linux.prev/arch/i386/boot/compressed/misc.c @@ -15,6 +15,12 @@ #include #include +#ifdef CONFIG_MCOUNT +void notrace mcount(void) +{ +} +#endif + /* * gzip declarations */ @@ -112,7 +118,7 @@ static long free_mem_end_ptr; #define INPLACE_MOVE_ROUTINE 0x1000 #define LOW_BUFFER_START 0x2000 #define LOW_BUFFER_MAX 0x90000 -#define HEAP_SIZE 0x3000 +#define HEAP_SIZE 0x4000 static unsigned int low_buffer_end, low_buffer_size; static int high_loaded =0; static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/; @@ -125,6 +131,7 @@ static int lines, cols; static void * xquad_portio = NULL; #endif +#define ZLIB_INFLATE_NO_INFLATE_LOCK #include "../../../../lib/inflate.c" static void *malloc(int size) Index: linux.prev/arch/i386/kernel/Makefile =================================================================== --- linux.prev.orig/arch/i386/kernel/Makefile +++ linux.prev/arch/i386/kernel/Makefile @@ -4,13 +4,13 @@ extra-y := head.o init_task.o vmlinux.lds -obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \ +obj-y := process.o signal.o entry.o traps.o irq.o vm86.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \ - doublefault.o quirks.o i8237.o + doublefault.o quirks.o i8237.o i8253.o tsc.o +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o obj-y += cpu/ -obj-y += timers/ obj-$(CONFIG_ACPI) += acpi/ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o obj-$(CONFIG_MCA) += mca.o @@ -20,6 +20,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_X86_SMP) += smp.o smpboot.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o +obj-$(CONFIG_MCOUNT) += mcount-wrapper.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o @@ -34,6 +35,8 @@ obj-$(CONFIG_ACPI_SRAT) += srat.o obj-$(CONFIG_HPET_TIMER) += time_hpet.o obj-$(CONFIG_EFI) += efi.o efi_stub.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +obj-$(CONFIG_SYSFS) += switch2poll.o +obj-$(CONFIG_HPET_TIMER) += hpet.o EXTRA_AFLAGS := -traditional Index: 
linux.prev/arch/i386/kernel/acpi/boot.c =================================================================== --- linux.prev.orig/arch/i386/kernel/acpi/boot.c +++ linux.prev/arch/i386/kernel/acpi/boot.c @@ -567,7 +567,7 @@ static int __init acpi_parse_sbf(unsigne } #ifdef CONFIG_HPET_TIMER - +#include static int __init acpi_parse_hpet(unsigned long phys, unsigned long size) { struct acpi_table_hpet *hpet_tbl; @@ -589,6 +589,7 @@ static int __init acpi_parse_hpet(unsign #ifdef CONFIG_X86_64 vxtime.hpet_address = hpet_tbl->addr.addrl | ((long)hpet_tbl->addr.addrh << 32); + hpet_address = vxtime.hpet_address; printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", hpet_tbl->id, vxtime.hpet_address); @@ -597,10 +598,10 @@ static int __init acpi_parse_hpet(unsign extern unsigned long hpet_address; hpet_address = hpet_tbl->addr.addrl; - printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", - hpet_tbl->id, hpet_address); } -#endif /* X86 */ +#endif /* X86 */ + printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", + hpet_tbl->id, hpet_address); return 0; } @@ -608,9 +609,8 @@ static int __init acpi_parse_hpet(unsign #define acpi_parse_hpet NULL #endif -#ifdef CONFIG_X86_PM_TIMER -extern u32 pmtmr_ioport; -#endif +u32 acpi_pmtmr_ioport; +int acpi_pmtmr_buggy; static int __init acpi_parse_fadt(unsigned long phys, unsigned long size) { @@ -629,7 +629,6 @@ static int __init acpi_parse_fadt(unsign acpi_fadt.force_apic_physical_destination_mode = fadt->force_apic_physical_destination_mode; -#ifdef CONFIG_X86_PM_TIMER /* detect the location of the ACPI PM Timer */ if (fadt->revision >= FADT2_REVISION_ID) { /* FADT rev. 2 */ @@ -637,22 +636,22 @@ static int __init acpi_parse_fadt(unsign ACPI_ADR_SPACE_SYSTEM_IO) return 0; - pmtmr_ioport = fadt->xpm_tmr_blk.address; + acpi_pmtmr_ioport = fadt->xpm_tmr_blk.address; /* * "X" fields are optional extensions to the original V1.0 * fields, so we must selectively expand V1.0 fields if the * corresponding X field is zero. */ - if (!pmtmr_ioport) - pmtmr_ioport = fadt->V1_pm_tmr_blk; + if (!acpi_pmtmr_ioport) + acpi_pmtmr_ioport = fadt->V1_pm_tmr_blk; } else { /* FADT rev. 1 */ - pmtmr_ioport = fadt->V1_pm_tmr_blk; + acpi_pmtmr_ioport = fadt->V1_pm_tmr_blk; } - if (pmtmr_ioport) - printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n", - pmtmr_ioport); -#endif + + if (acpi_pmtmr_ioport) + printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n", acpi_pmtmr_ioport); + return 0; } Index: linux.prev/arch/i386/kernel/apic.c =================================================================== --- linux.prev.orig/arch/i386/kernel/apic.c +++ linux.prev/arch/i386/kernel/apic.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -50,6 +51,23 @@ int enable_local_apic __initdata = 0; /* */ int apic_verbosity; +static unsigned int calibration_result; + +static void lapic_next_event(unsigned long evt); +static void lapic_timer_setup(int mode); + +static struct clock_event lapic_clockevent = { + .name = "lapic", + .capabilities = CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE | + CLOCK_HAS_IRQHANDLER +#ifdef CONFIG_SMP + | CLOCK_CAP_UPDATE +#endif + , + .shift = 32, + .set_mode = lapic_timer_setup, + .set_next_event = lapic_next_event, +}; static void apic_pm_activate(void); @@ -92,10 +110,6 @@ void __init apic_intr_init(void) /* Using APIC to generate smp_local_timer_interrupt? 
*/ int using_apic_timer = 0; -static DEFINE_PER_CPU(int, prof_multiplier) = 1; -static DEFINE_PER_CPU(int, prof_old_multiplier) = 1; -static DEFINE_PER_CPU(int, prof_counter) = 1; - static int enabled_via_apicbase; void enable_NMI_through_LVT0 (void * dummy) @@ -567,13 +581,13 @@ void lapic_shutdown(void) if (!cpu_has_apic) return; - local_irq_disable(); + raw_local_irq_disable(); clear_local_APIC(); if (enabled_via_apicbase) disable_local_APIC(); - local_irq_enable(); + raw_local_irq_enable(); } #ifdef CONFIG_PM @@ -617,9 +631,9 @@ static int lapic_suspend(struct sys_devi apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); - local_irq_save(flags); + raw_local_irq_save(flags); disable_local_APIC(); - local_irq_restore(flags); + raw_local_irq_restore(flags); return 0; } @@ -631,7 +645,7 @@ static int lapic_resume(struct sys_devic if (!apic_pm_state.active) return 0; - local_irq_save(flags); + raw_local_irq_save(flags); /* * Make sure the APICBASE points to the right address @@ -662,7 +676,7 @@ static int lapic_resume(struct sys_devic apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); apic_write(APIC_ESR, 0); apic_read(APIC_ESR); - local_irq_restore(flags); + raw_local_irq_restore(flags); return 0; } @@ -875,6 +889,11 @@ fake_ioapic_page: */ /* + * FIXME: Move this to i8253.h. There is no need to keep the access to + * the PIT scattered all around the place -tglx + */ + +/* * The timer chip is already set up at HZ interrupts per second here, * but we do not accept timer interrupts yet. We only allow the BP * to calibrate. @@ -932,12 +951,16 @@ void (*wait_timer_tick)(void) __devinitd #define APIC_DIVISOR 16 -static void __setup_APIC_LVTT(unsigned int clocks) +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot) { unsigned int lvtt_value, tmp_value, ver; ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; + + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; + if (!APIC_INTEGRATED(ver)) lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); apic_write_around(APIC_LVTT, lvtt_value); @@ -950,23 +973,27 @@ static void __setup_APIC_LVTT(unsigned i & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | APIC_TDR_DIV_16); - apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); + if (!oneshot) + apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); } -static void __devinit setup_APIC_timer(unsigned int clocks) +static void lapic_next_event(unsigned long evt) { - unsigned long flags; - - local_irq_save(flags); + apic_write_around(APIC_TMICT, evt); +} - /* - * Wait for IRQ0's slice: - */ - wait_timer_tick(); +static void lapic_timer_setup(int mode) +{ + unsigned long flags; - __setup_APIC_LVTT(clocks); + raw_local_irq_save(flags); + __setup_APIC_LVTT(calibration_result, mode == CLOCK_EVT_ONESHOT); + raw_local_irq_restore(flags); +} - local_irq_restore(flags); +static void __devinit setup_APIC_timer(void) +{ + setup_local_clockevent(&lapic_clockevent, CPU_MASK_NONE); } /* @@ -975,6 +1002,8 @@ static void __devinit setup_APIC_timer(u * to calibrate, since some later bootup code depends on getting * the first irq? Ugh. * + * TODO: Fix this rather than saying "Ugh" -tglx + * * We want to do the calibration only once since we * want to have local timer irqs syncron. CPUs connected * by the same APIC bus have the very same bus frequency. 
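[Note on the hunks below: they fold the APIC calibration result into the clockevent's scaled multiplier with div_sc32() and derive the nanosecond bounds with clockevent_delta2ns(). A minimal standalone sketch of that shift-32 fixed-point math follows; the helper behavior is inferred from the call sites in this patch, not from the kernel's actual definitions, so treat it as illustrative only.]

#include <stdint.h>
#include <stdio.h>

/* ticks per nanosecond, scaled by 2^32 -- this becomes the .mult
 * of the event source (assumed behavior of div_sc32()) */
static uint32_t div_sc32(uint64_t ticks, uint64_t nsecs)
{
	return (uint32_t)((ticks << 32) / nsecs);
}

/* invert the scaling: how many nanoseconds does a tick delta cover?
 * (assumed behavior of clockevent_delta2ns()) */
static uint64_t clockevent_delta2ns(uint64_t delta, uint32_t mult)
{
	return (delta << 32) / mult;
}

int main(void)
{
	/* e.g. a 12.5 MHz bus clock: 12500 ticks per 1000000 ns jiffy */
	uint32_t mult = div_sc32(12500, 1000000);

	/* 0x7FFFFF is the max delta programmed below in the patch */
	printf("mult=%u max event=%llu ns\n", mult,
	       (unsigned long long)clockevent_delta2ns(0x7FFFFF, mult));
	return 0;
}
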
@@ -997,7 +1026,7 @@ static int __init calibrate_APIC_clock(v * value into the APIC clock, we just want to get the * counter running for calibration. */ - __setup_APIC_LVTT(1000000000); + __setup_APIC_LVTT(1000000000, 0); /* * The timer chip counts down to zero. Let's wait @@ -1034,6 +1063,13 @@ static int __init calibrate_APIC_clock(v result = (tt1-tt2)*APIC_DIVISOR/LOOPS; + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc32(tt1-tt2, TICK_NSEC * LOOPS); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + if (cpu_has_tsc) apic_printk(APIC_VERBOSE, "..... CPU clock speed is " "%ld.%04ld MHz.\n", @@ -1048,28 +1084,26 @@ static int __init calibrate_APIC_clock(v return result; } -static unsigned int calibration_result; - void __init setup_boot_APIC_clock(void) { unsigned long flags; apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"); using_apic_timer = 1; - local_irq_save(flags); + raw_local_irq_save(flags); calibration_result = calibrate_APIC_clock(); /* * Now set up the timer for real. */ - setup_APIC_timer(calibration_result); + setup_APIC_timer(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } void __devinit setup_secondary_APIC_clock(void) { - setup_APIC_timer(calibration_result); + setup_APIC_timer(); } void __devinit disable_APIC_timer(void) @@ -1092,6 +1126,8 @@ void enable_APIC_timer(void) } } +static DEFINE_PER_CPU(int, prof_multiplier) = 1; + /* * the frequency of the profiling timer can be changed * by writing a multiplier value into /proc/profile. @@ -1119,60 +1155,6 @@ int setup_profiling_timer(unsigned int m return 0; } - -#undef APIC_DIVISOR - -/* - * Local timer interrupt handler. It does both profiling and - * process statistics/rescheduling. - * - * We do profiling in every local tick, statistics/rescheduling - * happen only every 'profiling multiplier' ticks. The default - * multiplier is 1 and it can be changed by writing the new multiplier - * value into /proc/profile. - */ - -inline void smp_local_timer_interrupt(struct pt_regs * regs) -{ - int cpu = smp_processor_id(); - - profile_tick(CPU_PROFILING, regs); - if (--per_cpu(prof_counter, cpu) <= 0) { - /* - * The multiplier may have changed since the last time we got - * to this point as a result of the user writing to - * /proc/profile. In this case we need to adjust the APIC - * timer accordingly. - * - * Interrupts are already masked off at this point. - */ - per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu); - if (per_cpu(prof_counter, cpu) != - per_cpu(prof_old_multiplier, cpu)) { - __setup_APIC_LVTT( - calibration_result/ - per_cpu(prof_counter, cpu)); - per_cpu(prof_old_multiplier, cpu) = - per_cpu(prof_counter, cpu); - } - -#ifdef CONFIG_SMP - update_process_times(user_mode_vm(regs)); -#endif - } - - /* - * We take the 'long' return path, and there every subsystem - * grabs the apropriate locks (kernel lock/ irq lock). - * - * we might want to decouple profiling from the 'long path', - * and do the profiling totally in assembly. - * - * Currently this isn't too much of an issue (performance wise), - * we can take more than 100K local irqs per second on a 100 MHz P5. - */ -} - /* * Local APIC timer interrupt. This is the most natural way for doing * local interrupts, but local timer interrupts can be emulated by @@ -1182,7 +1164,7 @@ inline void smp_local_timer_interrupt(st * interrupt as well. 
Thus we cannot inline the local irq ... ] */ -fastcall void smp_apic_timer_interrupt(struct pt_regs *regs) +fastcall notrace void smp_apic_timer_interrupt(struct pt_regs *regs) { int cpu = smp_processor_id(); @@ -1191,6 +1173,8 @@ fastcall void smp_apic_timer_interrupt(s */ per_cpu(irq_stat, cpu).apic_timer_irqs++; + trace_special(regs->eip, 0, 0); + /* * NOTE! We'd better ACK the irq immediately, * because timer handling can be slow. @@ -1202,7 +1186,17 @@ fastcall void smp_apic_timer_interrupt(s * interrupt lock, which is the WrongThing (tm) to do. */ irq_enter(); - smp_local_timer_interrupt(regs); + /* + * If the task is currently running in user mode, don't + * detect soft lockups. If CONFIG_DETECT_SOFTLOCKUP is not + * configured, this should be optimized out. + */ + if (user_mode(regs)) + touch_softlockup_watchdog(); + + if (lapic_clockevent.event_handler) + lapic_clockevent.event_handler(regs); + irq_exit(); } @@ -1257,6 +1251,7 @@ fastcall void smp_error_interrupt(struct */ printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", smp_processor_id(), v , v1); + dump_stack(); irq_exit(); } Index: linux.prev/arch/i386/kernel/apm.c =================================================================== --- linux.prev.orig/arch/i386/kernel/apm.c +++ linux.prev/arch/i386/kernel/apm.c @@ -552,9 +552,9 @@ static inline void apm_restore_cpus(cpum */ #define APM_DO_CLI \ if (apm_info.allow_ints) \ - local_irq_enable(); \ + raw_local_irq_enable(); \ else \ - local_irq_disable(); + raw_local_irq_disable(); #ifdef APM_ZERO_SEGS # define APM_DECL_SEGS \ @@ -606,12 +606,12 @@ static u8 apm_bios_call(u32 func, u32 eb save_desc_40 = gdt[0x40 / 8]; gdt[0x40 / 8] = bad_bios_desc; - local_save_flags(flags); + raw_local_save_flags(flags); APM_DO_CLI; APM_DO_SAVE_SEGS; apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi); APM_DO_RESTORE_SEGS; - local_irq_restore(flags); + raw_local_irq_restore(flags); gdt[0x40 / 8] = save_desc_40; put_cpu(); apm_restore_cpus(cpus); @@ -650,12 +650,12 @@ static u8 apm_bios_call_simple(u32 func, save_desc_40 = gdt[0x40 / 8]; gdt[0x40 / 8] = bad_bios_desc; - local_save_flags(flags); + raw_local_save_flags(flags); APM_DO_CLI; APM_DO_SAVE_SEGS; error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax); APM_DO_RESTORE_SEGS; - local_irq_restore(flags); + raw_local_irq_restore(flags); gdt[0x40 / 8] = save_desc_40; put_cpu(); apm_restore_cpus(cpus); @@ -1215,7 +1215,7 @@ static int suspend(int vetoable) } device_suspend(PMSG_SUSPEND); - local_irq_disable(); + raw_local_irq_disable(); device_power_down(PMSG_SUSPEND); /* serialize with the timer interrupt */ @@ -1231,14 +1231,14 @@ static int suspend(int vetoable) */ spin_unlock(&i8253_lock); write_sequnlock(&xtime_lock); - local_irq_enable(); + raw_local_irq_enable(); save_processor_state(); err = set_system_power_state(APM_STATE_SUSPEND); ignore_normal_resume = 1; restore_processor_state(); - local_irq_disable(); + raw_local_irq_disable(); write_seqlock(&xtime_lock); spin_lock(&i8253_lock); reinit_timer(); @@ -1253,7 +1253,7 @@ static int suspend(int vetoable) apm_error("suspend", err); err = (err == APM_SUCCESS) ? 
0 : -EIO; device_power_up(); - local_irq_enable(); + raw_local_irq_enable(); device_resume(); pm_send_all(PM_RESUME, (void *)0); queue_event(APM_NORMAL_RESUME, NULL); @@ -1272,22 +1272,22 @@ static void standby(void) { int err; - local_irq_disable(); + raw_local_irq_disable(); device_power_down(PMSG_SUSPEND); /* serialize with the timer interrupt */ write_seqlock(&xtime_lock); /* If needed, notify drivers here */ get_time_diff(); write_sequnlock(&xtime_lock); - local_irq_enable(); + raw_local_irq_enable(); err = set_system_power_state(APM_STATE_STANDBY); if ((err != APM_SUCCESS) && (err != APM_NO_ERROR)) apm_error("standby", err); - local_irq_disable(); + raw_local_irq_disable(); device_power_up(); - local_irq_enable(); + raw_local_irq_enable(); } static apm_event_t get_event(void) Index: linux.prev/arch/i386/kernel/cpu/cpufreq/longhaul.c =================================================================== --- linux.prev.orig/arch/i386/kernel/cpu/cpufreq/longhaul.c +++ linux.prev/arch/i386/kernel/cpu/cpufreq/longhaul.c @@ -144,7 +144,7 @@ static void do_powersaver(union msr_long longhaul->bits.RevisionKey = 0; preempt_disable(); - local_irq_save(flags); + raw_local_irq_save(flags); /* * get current pci bus master state for all devices @@ -166,11 +166,11 @@ static void do_powersaver(union msr_long outb(0xFE,0x21); /* TMR0 only */ outb(0xFF,0x80); /* delay */ - safe_halt(); + raw_safe_halt(); wrmsrl(MSR_VIA_LONGHAUL, longhaul->val); halt(); - local_irq_disable(); + raw_local_irq_disable(); outb(tmp_mask,0x21); /* restore mask */ @@ -184,7 +184,7 @@ static void do_powersaver(union msr_long pci_write_config_byte(dev, PCI_COMMAND, pci_cmd); } } while (dev != NULL); - local_irq_restore(flags); + raw_local_irq_restore(flags); preempt_enable(); /* disable bus ratio bit */ @@ -245,16 +245,16 @@ static void longhaul_setstate(unsigned i /* Enable software clock multiplier */ bcr2.bits.ESOFTBF = 1; bcr2.bits.CLOCKMUL = clock_ratio_index; - local_irq_disable(); + raw_local_irq_disable(); wrmsrl (MSR_VIA_BCR2, bcr2.val); - safe_halt(); + raw_safe_halt(); /* Disable software clock multiplier */ rdmsrl (MSR_VIA_BCR2, bcr2.val); bcr2.bits.ESOFTBF = 0; - local_irq_disable(); + raw_local_irq_disable(); wrmsrl (MSR_VIA_BCR2, bcr2.val); - local_irq_enable(); + raw_local_irq_enable(); break; /* Index: linux.prev/arch/i386/kernel/cpu/mtrr/cyrix.c =================================================================== --- linux.prev.orig/arch/i386/kernel/cpu/mtrr/cyrix.c +++ linux.prev/arch/i386/kernel/cpu/mtrr/cyrix.c @@ -17,7 +17,7 @@ cyrix_get_arr(unsigned int reg, unsigned arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ /* Save flags and disable interrupts */ - local_irq_save(flags); + raw_local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ @@ -28,7 +28,7 @@ cyrix_get_arr(unsigned int reg, unsigned setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ /* Enable interrupts if it was enabled previously */ - local_irq_restore(flags); + raw_local_irq_restore(flags); shift = ((unsigned char *) base)[1] & 0x0f; *base >>= PAGE_SHIFT; Index: linux.prev/arch/i386/kernel/cpu/mtrr/generic.c =================================================================== --- linux.prev.orig/arch/i386/kernel/cpu/mtrr/generic.c +++ linux.prev/arch/i386/kernel/cpu/mtrr/generic.c @@ -234,7 +234,7 @@ static unsigned long set_mtrr_state(u32 static unsigned long cr4 = 0; static u32 deftype_lo, deftype_hi; -static DEFINE_SPINLOCK(set_atomicity_lock); +static 
DEFINE_RAW_SPINLOCK(set_atomicity_lock); /* * Since we are disabling the cache don't allow any interrupts - they @@ -296,14 +296,14 @@ static void generic_set_all(void) unsigned long mask, count; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); prepare_set(); /* Actually set the state */ mask = set_mtrr_state(deftype_lo,deftype_hi); post_set(); - local_irq_restore(flags); + raw_local_irq_restore(flags); /* Use the atomic bitops to update the global mask */ for (count = 0; count < sizeof mask * 8; ++count) { @@ -331,7 +331,7 @@ static void generic_set_mtrr(unsigned in vr = &mtrr_state.var_ranges[reg]; - local_irq_save(flags); + raw_local_irq_save(flags); prepare_set(); if (size == 0) { @@ -350,7 +350,7 @@ static void generic_set_mtrr(unsigned in } post_set(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type) Index: linux.prev/arch/i386/kernel/cpu/mtrr/main.c =================================================================== --- linux.prev.orig/arch/i386/kernel/cpu/mtrr/main.c +++ linux.prev/arch/i386/kernel/cpu/mtrr/main.c @@ -146,7 +146,7 @@ static void ipi_handler(void *info) struct set_mtrr_data *data = info; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); atomic_dec(&data->count); while(!atomic_read(&data->gate)) @@ -164,7 +164,7 @@ static void ipi_handler(void *info) cpu_relax(); atomic_dec(&data->count); - local_irq_restore(flags); + raw_local_irq_restore(flags); } #endif @@ -225,7 +225,7 @@ static void set_mtrr(unsigned int reg, u if (smp_call_function(ipi_handler, &data, 1, 0) != 0) panic("mtrr: timed out waiting for other CPUs\n"); - local_irq_save(flags); + raw_local_irq_save(flags); while(atomic_read(&data.count)) cpu_relax(); @@ -259,7 +259,7 @@ static void set_mtrr(unsigned int reg, u while(atomic_read(&data.count)) cpu_relax(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } /** @@ -695,11 +695,11 @@ void mtrr_ap_init(void) * 2.cpu hotadd time. 
We let mtrr_add/del_page hold cpuhotplug lock to * prevent mtrr entry changes */ - local_irq_save(flags); + raw_local_irq_save(flags); mtrr_if->set_all(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static int __init mtrr_init_finialize(void) Index: linux.prev/arch/i386/kernel/cpu/mtrr/state.c =================================================================== --- linux.prev.orig/arch/i386/kernel/cpu/mtrr/state.c +++ linux.prev/arch/i386/kernel/cpu/mtrr/state.c @@ -12,7 +12,7 @@ void set_mtrr_prepare_save(struct set_mt unsigned int cr0; /* Disable interrupts locally */ - local_irq_save(ctxt->flags); + raw_local_irq_save(ctxt->flags); if (use_intel() || is_cpu(CYRIX)) { @@ -73,6 +73,6 @@ void set_mtrr_done(struct set_mtrr_conte write_cr4(ctxt->cr4val); } /* Re-enable interrupts locally (if enabled previously) */ - local_irq_restore(ctxt->flags); + raw_local_irq_restore(ctxt->flags); } Index: linux.prev/arch/i386/kernel/entry.S =================================================================== --- linux.prev.orig/arch/i386/kernel/entry.S +++ linux.prev/arch/i386/kernel/entry.S @@ -76,10 +76,10 @@ NT_MASK = 0x00004000 VM_MASK = 0x00020000 #ifdef CONFIG_PREEMPT -#define preempt_stop cli +# define preempt_stop cli #else -#define preempt_stop -#define resume_kernel restore_nocheck +# define preempt_stop +# define resume_kernel restore_nocheck #endif #define SAVE_ALL \ @@ -160,14 +160,17 @@ ENTRY(resume_userspace) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) cli + cmpl $0, kernel_preemption + jz restore_nocheck cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_nocheck need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl - jz restore_all + jz restore_nocheck testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? 
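The conversions in the hunks above all follow one pattern: code that must genuinely run with hardware interrupts off switches from local_irq_*() to raw_local_irq_*(). A minimal kernel-flavored sketch of that pattern, assuming the PREEMPT_RT convention that the raw_*() variants always toggle the real interrupt flag while the plain variants may become soft/traced operations:

	static void program_timing_sensitive_hw(void)
	{
		unsigned long flags;

		/* hard-disable interrupts on this CPU, saving the prior state */
		raw_local_irq_save(flags);

		/* ... touch hardware that must not observe an interrupt ... */

		/* restore the exact previous hardware interrupt state */
		raw_local_irq_restore(flags);
	}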
- jz restore_all + jz restore_nocheck + cli call preempt_schedule_irq jmp need_resched #endif @@ -200,6 +203,11 @@ sysenter_past_esp: pushl %eax SAVE_ALL +#ifdef CONFIG_LATENCY_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ @@ -213,6 +221,11 @@ sysenter_past_esp: movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work +#ifdef CONFIG_LATENCY_TRACE + pushl %eax + call sys_ret + popl %eax +#endif /* if something modifies registers it must also disable sysexit */ movl EIP(%esp), %edx movl OLDESP(%esp), %ecx @@ -225,6 +238,11 @@ sysenter_past_esp: ENTRY(system_call) pushl %eax # save orig_eax SAVE_ALL +#ifdef CONFIG_LATENCY_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) # system call tracing in operation / emulation /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ @@ -254,6 +272,17 @@ restore_all: cmpl $((4 << 8) | 3), %eax je ldt_ss # returning to user-space with LDT SS restore_nocheck: +#if defined(CONFIG_CRITICAL_IRQSOFF_TIMING) || defined(CONFIG_LATENCY_TRACE) + pushl %eax +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + call trace_irqs_on +#endif +#ifdef CONFIG_LATENCY_TRACE + call sys_ret +#endif + popl %eax +#endif +restore_nocheck_nmi: RESTORE_REGS addl $4, %esp 1: iret @@ -297,18 +326,19 @@ ldt_ss: # perform work that needs to be done immediately before resumption ALIGN work_pending: - testb $_TIF_NEED_RESCHED, %cl + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx jz work_notifysig work_resched: - call schedule - cli # make sure we don't miss an interrupt + cli + call __schedule + # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? 
jz restore_all - testb $_TIF_NEED_RESCHED, %cl + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx jnz work_resched work_notifysig: # deal with pending signals and @@ -351,6 +381,11 @@ syscall_trace_entry: syscall_exit_work: testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl jz work_pending +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + pushl %eax + call trace_irqs_on + popl %eax +#endif sti # could let do_syscall_trace() call # schedule() instead movl %esp, %eax @@ -412,9 +447,16 @@ ENTRY(irq_entries_start) vector=vector+1 .endr +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING +# define TRACE_IRQS_OFF call trace_irqs_off_lowlevel; +#else +# define TRACE_IRQS_OFF +#endif + ALIGN common_interrupt: SAVE_ALL + TRACE_IRQS_OFF movl %esp,%eax call do_IRQ jmp ret_from_intr @@ -423,6 +465,7 @@ common_interrupt: ENTRY(name) \ pushl $nr-256; \ SAVE_ALL \ + TRACE_IRQS_OFF \ movl %esp,%eax; \ call smp_/**/name; \ jmp ret_from_intr; @@ -552,7 +595,7 @@ nmi_stack_correct: xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_nmi - jmp restore_all + jmp restore_nocheck_nmi nmi_stack_fixup: FIX_STACK(12,nmi_stack_correct, 1) Index: linux.prev/arch/i386/kernel/hpet.c =================================================================== --- /dev/null +++ linux.prev/arch/i386/kernel/hpet.c @@ -0,0 +1,69 @@ +#include +#include +#include +#include + +#include +#include + +#define HPET_MASK 0xFFFFFFFF +#define HPET_SHIFT 22 + +/* FSEC = 10^-15 NSEC = 10^-9 */ +#define FSEC_PER_NSEC 1000000 + +static void *hpet_ptr; + +static cycle_t read_hpet(void) +{ + return (cycle_t)readl(hpet_ptr); +} + +struct clocksource clocksource_hpet = { + .name = "hpet", + .rating = 250, + .read = read_hpet, + .mask = (cycle_t)HPET_MASK, + .mult = 0, /* set below */ + .shift = HPET_SHIFT, + .is_continuous = 1, +}; + +static int __init init_hpet_clocksource(void) +{ + unsigned long hpet_period; + void __iomem* hpet_base; + u64 tmp; + + if (!hpet_address) + return -ENODEV; + + /* calculate the hpet address: */ + hpet_base = + (void __iomem*)ioremap_nocache(hpet_address, HPET_MMAP_SIZE); + hpet_ptr = hpet_base + HPET_COUNTER; + + /* calculate the frequency: */ + hpet_period = readl(hpet_base + HPET_PERIOD); + + /* + * hpet period is in femto seconds per cycle + * so we need to convert this to ns/cyc units + * approximated by mult/2^shift + * + * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift + * fsec/cyc * 1ns/1000000fsec * 2^shift = mult + * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult + * (fsec/cyc << shift)/1000000 = mult + * (hpet_period << shift)/FSEC_PER_NSEC = mult + */ + tmp = (u64)hpet_period << HPET_SHIFT; + do_div(tmp, FSEC_PER_NSEC); + clocksource_hpet.mult = (u32)tmp; + + register_clocksource(&clocksource_hpet); + + return 0; +} + +module_init(init_hpet_clocksource); Index: linux.prev/arch/i386/kernel/i386_ksyms.c =================================================================== --- linux.prev.orig/arch/i386/kernel/i386_ksyms.c +++ linux.prev/arch/i386/kernel/i386_ksyms.c @@ -6,10 +6,12 @@ /* This is definitely a GPL-only symbol */ EXPORT_SYMBOL_GPL(cpu_gdt_table); -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); +#ifdef CONFIG_ASM_SEMAPHORES +EXPORT_SYMBOL(__compat_down_failed); +EXPORT_SYMBOL(__compat_down_failed_interruptible); +EXPORT_SYMBOL(__compat_down_failed_trylock); +EXPORT_SYMBOL(__compat_up_wakeup); +#endif /* Networking helper routines.
*/ EXPORT_SYMBOL(csum_partial_copy_generic); @@ -25,7 +27,7 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(strstr); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_ASM_SEMAPHORES) extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); EXPORT_SYMBOL(__write_lock_failed); Index: linux.prev/arch/i386/kernel/i8253.c =================================================================== --- /dev/null +++ linux.prev/arch/i386/kernel/i8253.c @@ -0,0 +1,137 @@ +/* + * i8253.c 8253/PIT functions + * + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "io_ports.h" + +DEFINE_RAW_SPINLOCK(i8253_lock); +EXPORT_SYMBOL(i8253_lock); + +static void init_pit_timer(int mode) +{ + unsigned long flags; + + spin_lock_irqsave(&i8253_lock, flags); + + if (mode != CLOCK_EVT_ONESHOT) { + /* binary, mode 2, LSB/MSB, ch 0 */ + outb_p(0x34, PIT_MODE); + udelay(10); + outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ + outb(LATCH >> 8 , PIT_CH0); /* MSB */ + } else { + /* One shot setup */ + outb_p(0x38, PIT_MODE); + udelay(10); + } + + spin_unlock_irqrestore(&i8253_lock, flags); +} + +static void pit_next_event(unsigned long evt) +{ + unsigned long flags; + + spin_lock_irqsave(&i8253_lock, flags); + outb_p(evt & 0xff , PIT_CH0); /* LSB */ + outb(evt >> 8 , PIT_CH0); /* MSB */ + spin_unlock_irqrestore(&i8253_lock, flags); +} + +static struct clock_event pit_clockevent = { + .name = "pit", + .capabilities = CLOCK_CAP_TICK +#ifndef CONFIG_SMP + | CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE | + CLOCK_CAP_UPDATE +#endif + , + .set_mode = init_pit_timer, + .set_next_event = pit_next_event, + .start_event = io_apic_timer_ack, + .end_event = mca_timer_ack, + .shift = 32, + .irq = 0, +}; + +void setup_pit_timer(void) +{ + pit_clockevent.mult = div_sc32(CLOCK_TICK_RATE, NSEC_PER_SEC); + pit_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFF, &pit_clockevent); + pit_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &pit_clockevent); + setup_global_clockevent(&pit_clockevent, CPU_MASK_NONE); +} + +/* + * Since the PIT overflows every tick, it's not very useful + * to just read by itself. So use jiffies to emulate a free + * running counter: + */ +static cycle_t pit_read(void) +{ + unsigned long flags, seq; + int count; + u64 jifs; + + do { + seq = read_seqbegin(&xtime_lock); + + spin_lock_irqsave(&i8253_lock, flags); + outb_p(0x00, PIT_MODE); /* latch the count ASAP */ + count = inb_p(PIT_CH0); /* read the latched count */ + count |= inb_p(PIT_CH0) << 8; + + /* VIA686a test code... reset the latch if count > max + 1 */ + if (count > LATCH) { + outb_p(0x34, PIT_MODE); + outb_p(LATCH & 0xff, PIT_CH0); + outb(LATCH >> 8, PIT_CH0); + count = LATCH - 1; + } + spin_unlock_irqrestore(&i8253_lock, flags); + + jifs = jiffies_64; + } while (read_seqretry(&xtime_lock, seq)); + + jifs -= INITIAL_JIFFIES; + count = (LATCH-1) - count; + + return (cycle_t)(jifs * LATCH) + count; +} + +static struct clocksource clocksource_pit = { + .name = "pit", + .rating = 110, + .read = pit_read, + .mask = (cycle_t)-1, + .mult = 0, + .shift = 20, +}; + +static int __init init_pit_clocksource(void) +{ + if (num_possible_cpus() > 4) /* PIT does not scale!
*/ + return 0; + + clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); + register_clocksource(&clocksource_pit); + + return 0; +} +module_init(init_pit_clocksource); Index: linux.prev/arch/i386/kernel/i8259.c =================================================================== --- linux.prev.orig/arch/i386/kernel/i8259.c +++ linux.prev/arch/i386/kernel/i8259.c @@ -35,7 +35,7 @@ * moves to arch independent land */ -DEFINE_SPINLOCK(i8259A_lock); +DEFINE_RAW_SPINLOCK(i8259A_lock); static void end_8259A_irq (unsigned int irq) { @@ -366,7 +366,7 @@ static irqreturn_t math_error_irq(int cp * New motherboards sometimes make IRQ 13 be a PCI interrupt, * so allow interrupt sharing. */ -static struct irqaction fpu_irq = { math_error_irq, 0, CPU_MASK_NONE, "fpu", NULL, NULL }; +static struct irqaction fpu_irq = { math_error_irq, SA_NODELAY, CPU_MASK_NONE, "fpu", NULL, NULL }; void __init init_ISA_irqs (void) { @@ -422,12 +422,6 @@ void __init init_IRQ(void) intr_init_hook(); /* - * Set the clock to HZ Hz, we already have a valid - * vector now: - */ - setup_pit_timer(); - - /* * External FPU? Set up irq13 if so, for * original braindamaged IBM FERR coupling. */ Index: linux.prev/arch/i386/kernel/init_task.c =================================================================== --- linux.prev.orig/arch/i386/kernel/init_task.c +++ linux.prev/arch/i386/kernel/init_task.c @@ -10,8 +10,8 @@ #include #include -static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; +static struct fs_struct init_fs = INIT_FS(init_fs); +static struct files_struct init_files = INIT_FILES(init_files); static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); Index: linux.prev/arch/i386/kernel/io_apic.c =================================================================== --- linux.prev.orig/arch/i386/kernel/io_apic.c +++ linux.prev/arch/i386/kernel/io_apic.c @@ -49,7 +49,7 @@ atomic_t irq_mis_count; /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; -static DEFINE_SPINLOCK(ioapic_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); /* * Is the SiS APIC rmw bug present ? @@ -90,6 +90,27 @@ int vector_irq[NR_VECTORS] __read_mostly #define vector_to_irq(vector) (vector) #endif +static int timer_ack; + +void io_apic_timer_ack(void *priv) +{ + unsigned long flags; + + if (timer_ack) { + /* + * Subtle, when I/O APICs are used we have to ack timer IRQ + * manually to reset the IRR bit for do_slow_gettimeoffset(). + * This will also deassert NMI lines for the watchdog if run + * on an 82489DX-based system. + */ + spin_lock_irqsave(&i8259A_lock, flags); + outb(0x0c, PIC_MASTER_OCW3); + /* Ack the IRQ; AEOI will end it automatically. */ + inb(PIC_MASTER_POLL); + spin_unlock_irqrestore(&i8259A_lock, flags); + } +} + /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are * shared ISA-space IRQs, so we have to support them. 
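For reference, here is the mult/shift fixed-point conversion used by the HPET and PIT clocksource code above, as a standalone user-space sketch; the HPET period value is an illustrative example, not taken from the patch:

	#include <stdint.h>
	#include <stdio.h>

	#define HPET_SHIFT	22
	#define FSEC_PER_NSEC	1000000ULL

	int main(void)
	{
		/* ~14.318 MHz HPET: one cycle is roughly 69841279 fsec */
		uint64_t period_fsec = 69841279;

		/* mult = (fsec/cyc << shift) / FSEC_PER_NSEC, exactly as
		 * computed in init_hpet_clocksource() */
		uint32_t mult = (uint32_t)((period_fsec << HPET_SHIFT) /
					   FSEC_PER_NSEC);

		/* converting a cycle count to nanoseconds is then just
		 * ns = (cycles * mult) >> shift */
		uint64_t cycles = 14318180;	/* about one second's worth */

		printf("mult=%u -> %llu ns\n", mult,
		       (unsigned long long)((cycles * mult) >> HPET_SHIFT));
		return 0;
	}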
We are super @@ -133,6 +154,105 @@ static void __init replace_pin_at_irq(un } } +//#define IOAPIC_CACHE + +#ifdef IOAPIC_CACHE +# define MAX_IOAPIC_CACHE 512 + +/* + * Cache register values: + */ +static unsigned int io_apic_cache[MAX_IO_APICS][MAX_IOAPIC_CACHE] + ____cacheline_aligned_in_smp; +#endif + +inline unsigned int __raw_io_apic_read(unsigned int apic, unsigned int reg) +{ + *IO_APIC_BASE(apic) = reg; + return *(IO_APIC_BASE(apic)+4); +} + +unsigned int raw_io_apic_read(unsigned int apic, unsigned int reg) +{ + unsigned int val = __raw_io_apic_read(apic, reg); + +#ifdef IOAPIC_CACHE + io_apic_cache[apic][reg] = val; +#endif + return val; +} + +unsigned int io_apic_read(unsigned int apic, unsigned int reg) +{ +#ifdef IOAPIC_CACHE + if (unlikely(reg >= MAX_IOAPIC_CACHE)) { + static int once = 1; + + if (once) { + once = 0; + printk("WARNING: ioapic register cache overflow: %d.\n", + reg); + dump_stack(); + } + return __raw_io_apic_read(apic, reg); + } + if (io_apic_cache[apic][reg] && !sis_apic_bug) + return io_apic_cache[apic][reg]; +#endif + return raw_io_apic_read(apic, reg); +} + +void io_apic_write(unsigned int apic, unsigned int reg, unsigned int val) +{ +#ifdef IOAPIC_CACHE + if (unlikely(reg >= MAX_IOAPIC_CACHE)) { + static int once = 1; + + if (once) { + once = 0; + printk("WARNING: ioapic register cache overflow: %d.\n", + reg); + dump_stack(); + } + } else + io_apic_cache[apic][reg] = val; +#endif + *IO_APIC_BASE(apic) = reg; + *(IO_APIC_BASE(apic)+4) = val; +} + +/* + * Some systems need a POST flush or else level-triggered interrupts + * generate lots of spurious interrupts due to the POST-ed write not + * reaching the IOAPIC before the IRQ is ACK-ed in the local APIC. + */ +#ifdef CONFIG_SMP +# define IOAPIC_POSTFLUSH +#endif + +/* + * Re-write a value: to be used for read-modify-write + * cycles where the read already set up the index register. 
+ * + * Older SiS APIC requires we rewrite the index register + */ +void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val) +{ +#ifdef IOAPIC_CACHE + io_apic_cache[apic][reg] = val; +#endif + if (unlikely(sis_apic_bug)) + *IO_APIC_BASE(apic) = reg; + *(IO_APIC_BASE(apic)+4) = val; +#ifndef IOAPIC_POSTFLUSH + if (unlikely(sis_apic_bug)) +#endif + /* + * Force POST flush by reading: + */ + val = *(IO_APIC_BASE(apic)+4); +} + static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable) { struct irq_pin_list *entry = irq_2_pin + irq; @@ -164,18 +284,6 @@ static void __unmask_IO_APIC_irq (unsign __modify_IO_APIC_irq(irq, 0, 0x00010000); } -/* mask = 1, trigger = 0 */ -static void __mask_and_edge_IO_APIC_irq (unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); -} - -/* mask = 0, trigger = 1 */ -static void __unmask_and_level_IO_APIC_irq (unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); -} - static void mask_IO_APIC_irq (unsigned int irq) { unsigned long flags; @@ -1430,8 +1538,8 @@ void __init print_IO_APIC(void) struct IO_APIC_route_entry entry; spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); + *(((int *)&entry)+0) = raw_io_apic_read(apic, 0x10+i*2); + *(((int *)&entry)+1) = raw_io_apic_read(apic, 0x11+i*2); spin_unlock_irqrestore(&ioapic_lock, flags); printk(KERN_DEBUG " %02x %03X %02X ", @@ -1477,7 +1585,7 @@ void __init print_IO_APIC(void) return; } -#if 0 +#if 1 static void print_APIC_bitfield (int base) { @@ -1866,7 +1974,7 @@ static int __init timer_irq_works(void) { unsigned long t1 = jiffies; - local_irq_enable(); + raw_local_irq_enable(); /* Let ten ticks pass... */ mdelay((10 * 1000) / HZ); @@ -1877,7 +1985,7 @@ static int __init timer_irq_works(void) * might have cached one ExtINT interrupt. Finally, at * least one tick may be lost due to delays. */ - if (jiffies - t1 > 4) + if (jiffies - t1 > 4 && jiffies - t1 < 16) return 1; return 0; @@ -1930,9 +2038,11 @@ static unsigned int startup_edge_ioapic_ static void ack_edge_ioapic_irq(unsigned int irq) { move_irq(irq); +#if 0 if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) == (IRQ_PENDING | IRQ_DISABLED)) mask_IO_APIC_irq(irq); +#endif ack_APIC_irq(); } @@ -1957,6 +2067,30 @@ static unsigned int startup_level_ioapic return 0; /* don't check for pending */ } +#ifdef CONFIG_PREEMPT_HARDIRQS + +/* + * In the PREEMPT_HARDIRQS case we don't want to keep the local + * APIC unacked, because that prevents further interrupts from + * being handled - and with IRQ threads being delayed arbitrarily, + * that's unacceptable. So we first mask the IRQ, then ack it. + * The hardirq thread will then unmask it.
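Schematically, the mask-then-ack flow described in the comment above and its thread-side counterpart look as follows; a sketch only, where wake_irq_thread() is a hypothetical stand-in for the real hardirq-thread wakeup:

	/* hard-IRQ context: keep the critical section minimal */
	static void level_irq_arrived(unsigned int irq)
	{
		mask_IO_APIC_irq(irq);	/* keep the level line from re-firing */
		ack_APIC_irq();		/* local APIC can accept new vectors */
		wake_irq_thread(irq);	/* hypothetical: hand off to the thread */
	}

	/* IRQ-thread context, possibly much later: */
	static void level_irq_finished(unsigned int irq)
	{
		/* the handlers have run; safe to unmask the line again */
		unmask_IO_APIC_irq(irq);
	}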
+ */ +static void mask_and_ack_level_ioapic_irq(unsigned int irq) +{ + move_irq(irq); + mask_IO_APIC_irq(irq); + ack_APIC_irq(); +} + +#else + +static void mask_and_ack_level_ioapic_irq(unsigned int irq) +{ +} + +#endif + static void end_level_ioapic_irq (unsigned int irq) { unsigned long v; @@ -1991,8 +2125,10 @@ static void end_level_ioapic_irq (unsign if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); + /* mask = 1, trigger = 0 */ + __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); + /* mask = 0, trigger = 1 */ + __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); spin_unlock(&ioapic_lock); } } @@ -2020,6 +2156,13 @@ static unsigned int startup_level_ioapic return startup_level_ioapic_irq (irq); } +static void mask_and_ack_level_ioapic_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + mask_and_ack_level_ioapic_irq(irq); +} + static void end_level_ioapic_vector (unsigned int vector) { int irq = vector_to_irq(vector); Index: linux.prev/arch/i386/kernel/irq.c =================================================================== --- linux.prev.orig/arch/i386/kernel/irq.c +++ linux.prev/arch/i386/kernel/irq.c @@ -51,7 +51,7 @@ static union irq_ctx *softirq_ctx[NR_CPU * SMP cross-CPU interrupts have their own specific * handlers). */ -fastcall unsigned int do_IRQ(struct pt_regs *regs) +fastcall notrace unsigned int do_IRQ(struct pt_regs *regs) { /* high bits used in ret_from_ code */ int irq = regs->orig_eax & 0xff; @@ -59,8 +59,12 @@ fastcall unsigned int do_IRQ(struct pt_r union irq_ctx *curctx, *irqctx; u32 *isp; #endif - irq_enter(); +#ifdef CONFIG_LATENCY_TRACE + if (irq == trace_user_trigger_irq) + user_trace_start(); +#endif + trace_special(regs->eip, irq, 0); #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? */ { @@ -69,7 +73,7 @@ fastcall unsigned int do_IRQ(struct pt_r __asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (THREAD_SIZE - 1)); if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { - printk("do_IRQ: stack overflow: %ld\n", + printk("BUG: do_IRQ: stack overflow: %ld\n", esp - sizeof(struct thread_info)); dump_stack(); } @@ -173,7 +177,7 @@ asmlinkage void do_softirq(void) if (in_interrupt()) return; - local_irq_save(flags); + raw_local_irq_save(flags); if (local_softirq_pending()) { curctx = current_thread_info(); @@ -194,7 +198,7 @@ asmlinkage void do_softirq(void) ); } - local_irq_restore(flags); + raw_local_irq_restore(flags); } EXPORT_SYMBOL(do_softirq); @@ -224,8 +228,10 @@ int show_interrupts(struct seq_file *p, } if (i < NR_IRQS) { - spin_lock_irqsave(&irq_desc[i].lock, flags); - action = irq_desc[i].action; + irq_desc_t *desc = irq_desc + i; + + spin_lock_irqsave(&desc->lock, flags); + action = desc->action; if (!action) goto skip; seq_printf(p, "%3d: ",i); @@ -235,15 +241,27 @@ int show_interrupts(struct seq_file *p, for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); #endif - seq_printf(p, " %14s", irq_desc[i].handler->typename); + seq_printf(p, " %-14s", desc->handler->typename); +#define F(x,c) ((desc->status & x) ? 
c : '.') + seq_printf(p, " [%c%c%c%c%c%c%c%c%c/", + F(IRQ_INPROGRESS, 'I'), + F(IRQ_DISABLED, 'D'), + F(IRQ_PENDING, 'P'), + F(IRQ_REPLAY, 'R'), + F(IRQ_AUTODETECT, 'A'), + F(IRQ_WAITING, 'W'), + F(IRQ_LEVEL, 'L'), + F(IRQ_MASKED, 'M'), + F(IRQ_NODELAY, 'N')); +#undef F + seq_printf(p, "%3d]", desc->irqs_unhandled); seq_printf(p, " %s", action->name); - for (action=action->next; action; action = action->next) seq_printf(p, ", %s", action->name); seq_putc(p, '\n'); skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + spin_unlock_irqrestore(&desc->lock, flags); } else if (i == NR_IRQS) { seq_printf(p, "NMI: "); for_each_online_cpu(j) @@ -298,9 +316,9 @@ void fixup_irqs(cpumask_t map) barrier(); #else /* That doesn't seem sufficient. Give it 1ms. */ - local_irq_enable(); + raw_local_irq_enable(); mdelay(1); - local_irq_disable(); + raw_local_irq_disable(); #endif } #endif Index: linux.prev/arch/i386/kernel/mca.c =================================================================== --- linux.prev.orig/arch/i386/kernel/mca.c +++ linux.prev/arch/i386/kernel/mca.c @@ -472,3 +472,22 @@ void mca_handle_nmi(void) mca_nmi_hook(); } /* mca_handle_nmi */ + +void mca_timer_ack(void *priv) +{ + int irq; + + if (MCA_bus) { + /* The PS/2 uses level-triggered interrupts. You can't + turn them off, nor would you want to (any attempt to + enable edge-triggered interrupts usually gets intercepted by a + special hardware circuit). Hence we have to acknowledge + the timer interrupt. Through some incredibly stupid + design idea, the reset for IRQ 0 is done by setting the + high bit of the PPI port B (0x61). Note that some PS/2s, + notably the 55SX, work fine if this is removed. */ + + irq = inb_p( 0x61 ); /* read the current state */ + outb_p( irq|0x80, 0x61 ); /* reset the IRQ */ + } +} Index: linux.prev/arch/i386/kernel/mcount-wrapper.S =================================================================== --- /dev/null +++ linux.prev/arch/i386/kernel/mcount-wrapper.S @@ -0,0 +1,27 @@ +/* + * linux/arch/i386/mcount-wrapper.S + * + * Copyright (C) 2004 Ingo Molnar + */ + +.globl mcount +mcount: + + cmpl $0, mcount_enabled + jz out + + push %ebp + mov %esp, %ebp + pushl %eax + pushl %ecx + pushl %edx + + call __mcount + + popl %edx + popl %ecx + popl %eax + popl %ebp +out: + ret + Index: linux.prev/arch/i386/kernel/microcode.c =================================================================== --- linux.prev.orig/arch/i386/kernel/microcode.c +++ linux.prev/arch/i386/kernel/microcode.c @@ -109,7 +109,7 @@ MODULE_LICENSE("GPL"); #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) /* serialize access to the physical write to MSR 0x79 */ -static DEFINE_SPINLOCK(microcode_update_lock); +static DEFINE_RAW_SPINLOCK(microcode_update_lock); /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ static DECLARE_MUTEX(microcode_sem); Index: linux.prev/arch/i386/kernel/nmi.c =================================================================== --- linux.prev.orig/arch/i386/kernel/nmi.c +++ linux.prev/arch/i386/kernel/nmi.c @@ -34,7 +34,7 @@ unsigned int nmi_watchdog = NMI_NONE; extern int unknown_nmi_panic; -static unsigned int nmi_hz = HZ; +static unsigned int nmi_hz = 1000; static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ static unsigned int nmi_p4_cccr_val; extern void show_registers(struct pt_regs *regs); @@ -108,7 +108,7 @@ int nmi_active; static __init void nmi_cpu_busy(void *data) { volatile int *endflag = data; - local_irq_enable(); + 
raw_local_irq_enable(); /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, even if there is a simulator or similar that catches the @@ -140,8 +140,8 @@ static int __init check_nmi_watchdog(voi for (cpu = 0; cpu < NR_CPUS; cpu++) prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count; - local_irq_enable(); - mdelay((10*1000)/nmi_hz); // wait 10 ticks + raw_local_irq_enable(); + mdelay((100*1000)/nmi_hz); // wait 100 ticks for (cpu = 0; cpu < NR_CPUS; cpu++) { #ifdef CONFIG_SMP @@ -168,7 +168,7 @@ static int __init check_nmi_watchdog(voi /* now that we know it works we can reduce NMI frequency to something more reasonable; makes a difference in some configs */ if (nmi_watchdog == NMI_LOCAL_APIC) - nmi_hz = 1; + nmi_hz = 10000; kfree(prev_nmi_count); return 0; @@ -521,9 +521,34 @@ void touch_nmi_watchdog (void) extern void die_nmi(struct pt_regs *, const char *msg); -void nmi_watchdog_tick (struct pt_regs * regs) +int nmi_show_regs[NR_CPUS]; + +void nmi_show_all_regs(void) { + int i; + + if (nmi_watchdog == NMI_NONE) + return; + if (system_state != SYSTEM_RUNNING) { + printk("nmi_show_all_regs(): system state %d, not doing.\n", + system_state); + return; + } + printk("nmi_show_all_regs(): start on CPU#%d.\n", + raw_smp_processor_id()); + dump_stack(); + + for_each_online_cpu(i) + nmi_show_regs[i] = 1; + for_each_online_cpu(i) + while (nmi_show_regs[i] == 1) + barrier(); +} + +static DEFINE_RAW_SPINLOCK(nmi_print_lock); +void notrace nmi_watchdog_tick (struct pt_regs * regs) +{ /* * Since current_thread_info()-> is always on the stack, and we * always switch the stack NMI-atomically, it's safe to use @@ -531,7 +556,16 @@ void nmi_watchdog_tick (struct pt_regs * */ int sum, cpu = smp_processor_id(); - sum = per_cpu(irq_stat, cpu).apic_timer_irqs; + sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_irqs(0); + + profile_tick(CPU_PROFILING, regs); + if (nmi_show_regs[cpu]) { + nmi_show_regs[cpu] = 0; + spin_lock(&nmi_print_lock); + printk("NMI show regs on CPU#%d:\n", cpu); + show_regs(regs); + spin_unlock(&nmi_print_lock); + } if (last_irq_sums[cpu] == sum) { /* @@ -539,12 +573,25 @@ void nmi_watchdog_tick (struct pt_regs * * wait a few IRQs (5 seconds) before doing the oops ... */ alert_counter[cpu]++; - if (alert_counter[cpu] == 5*nmi_hz) - /* - * die_nmi will return ONLY if NOTIFY_STOP happens.. 
- */ - die_nmi(regs, "NMI Watchdog detected LOCKUP"); + if (alert_counter[cpu] && !(alert_counter[cpu] % (5*nmi_hz))) { + int i; + + bust_spinlocks(1); + spin_lock(&nmi_print_lock); + printk("NMI watchdog detected lockup on CPU#%d (%d/%d)\n", cpu, alert_counter[cpu], 5*nmi_hz); + show_regs(regs); + spin_unlock(&nmi_print_lock); + + for_each_online_cpu(i) + if (i != cpu) + nmi_show_regs[i] = 1; + for_each_online_cpu(i) + while (nmi_show_regs[i] == 1) + barrier(); + die_nmi(regs, "NMI Watchdog detected LOCKUP"); + } + } else { last_irq_sums[cpu] = sum; alert_counter[cpu] = 0; } Index: linux.prev/arch/i386/kernel/process.c =================================================================== --- linux.prev.orig/arch/i386/kernel/process.c +++ linux.prev/arch/i386/kernel/process.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -64,6 +65,12 @@ static int hlt_counter; unsigned long boot_option_idle_override = 0; EXPORT_SYMBOL(boot_option_idle_override); +DEFINE_SPINLOCK(pm_idle_switch_lock); +EXPORT_SYMBOL_GPL(pm_idle_switch_lock); + +int pm_idle_locked = 0; +EXPORT_SYMBOL_GPL(pm_idle_locked); + /* * Return saved PC of a blocked thread. */ @@ -99,21 +106,21 @@ EXPORT_SYMBOL(enable_hlt); */ void default_idle(void) { - local_irq_enable(); + raw_local_irq_enable(); if (!hlt_counter && boot_cpu_data.hlt_works_ok) { clear_thread_flag(TIF_POLLING_NRFLAG); smp_mb__after_clear_bit(); - while (!need_resched()) { - local_irq_disable(); - if (!need_resched()) - safe_halt(); + while (!need_resched() && !need_resched_delayed()) { + raw_local_irq_disable(); + if (!need_resched() && !need_resched_delayed()) + raw_safe_halt(); else - local_irq_enable(); + raw_local_irq_enable(); } set_thread_flag(TIF_POLLING_NRFLAG); } else { - while (!need_resched()) + while (!need_resched() && !need_resched_delayed()) cpu_relax(); } } @@ -126,16 +133,17 @@ EXPORT_SYMBOL(default_idle); * to poll the ->work.need_resched flag instead of waiting for the * cross-CPU IPI to arrive. Use this option with caution. 
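The rewritten idle loops that follow all rely on the same lost-wakeup guard: re-check the resched flags with interrupts hard-disabled before halting, since the enable-and-halt sequence executes atomically. Condensed to a sketch, where need_resched_delayed() is the PREEMPT_RT addition checked alongside need_resched():

	while (!need_resched() && !need_resched_delayed()) {
		raw_local_irq_disable();
		/* re-check with interrupts off: a wakeup IPI can no
		 * longer slip in between this check and the halt */
		if (!need_resched() && !need_resched_delayed())
			raw_safe_halt();	/* enables interrupts, then halts */
		else
			raw_local_irq_enable();
	}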
*/ -static void poll_idle (void) +void poll_idle (void) { - local_irq_enable(); + raw_local_irq_enable(); asm volatile( "2:" "testl %0, %1;" "rep; nop;" "je 2b;" - : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); + : : "i"(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED), + "m" (current_thread_info()->flags)); } #ifdef CONFIG_HOTPLUG_CPU @@ -153,7 +161,7 @@ static inline void play_dead(void) /* * With physical CPU hotplug, we should halt the cpu */ - local_irq_disable(); + raw_local_irq_disable(); while (1) halt(); } @@ -178,7 +186,9 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - while (!need_resched()) { + BUG_ON(raw_irqs_disabled()); + + while (!need_resched() && !need_resched_delayed()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) @@ -196,9 +206,11 @@ void cpu_idle(void) __get_cpu_var(irq_stat).idle_timestamp = jiffies; idle(); } - preempt_enable_no_resched(); - schedule(); + raw_local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + raw_local_irq_enable(); } } @@ -239,12 +251,12 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ static void mwait_idle(void) { - local_irq_enable(); + raw_local_irq_enable(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); - if (need_resched()) + if (need_resched() || need_resched_delayed()) break; __mwait(0, 0); } @@ -372,11 +384,16 @@ void exit_thread(void) /* The process may have allocated an io port bitmap... nuke it. */ if (unlikely(NULL != t->io_bitmap_ptr)) { - int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + int cpu; + struct tss_struct *tss; + void *io_bitmap_ptr = t->io_bitmap_ptr; - kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; + mb(); + kfree(io_bitmap_ptr); + + cpu = get_cpu(); + tss = &per_cpu(init_tss, cpu); /* * Careful, clear this in the TSS too: */ Index: linux.prev/arch/i386/kernel/reboot.c =================================================================== --- linux.prev.orig/arch/i386/kernel/reboot.c +++ linux.prev/arch/i386/kernel/reboot.c @@ -202,7 +202,7 @@ void machine_real_restart(unsigned char { unsigned long flags; - local_irq_disable(); + raw_local_irq_disable(); /* Write zero to CMOS register number 0x0f, which the BIOS POST routine will recognize as telling it to do a proper reboot. 
(Well Index: linux.prev/arch/i386/kernel/semaphore.c =================================================================== --- linux.prev.orig/arch/i386/kernel/semaphore.c +++ linux.prev/arch/i386/kernel/semaphore.c @@ -13,6 +13,7 @@ * rw semaphores implemented November 1999 by Benjamin LaHaise */ #include +#include #include /* @@ -28,15 +29,15 @@ asm( ".section .sched.text\n" ".align 4\n" -".globl __down_failed\n" -"__down_failed:\n\t" +".globl __compat_down_failed\n" +"__compat_down_failed:\n\t" #if defined(CONFIG_FRAME_POINTER) "pushl %ebp\n\t" "movl %esp,%ebp\n\t" #endif "pushl %edx\n\t" "pushl %ecx\n\t" - "call __down\n\t" + "call __compat_down\n\t" "popl %ecx\n\t" "popl %edx\n\t" #if defined(CONFIG_FRAME_POINTER) @@ -49,15 +50,15 @@ asm( asm( ".section .sched.text\n" ".align 4\n" -".globl __down_failed_interruptible\n" -"__down_failed_interruptible:\n\t" +".globl __compat_down_failed_interruptible\n" +"__compat_down_failed_interruptible:\n\t" #if defined(CONFIG_FRAME_POINTER) "pushl %ebp\n\t" "movl %esp,%ebp\n\t" #endif "pushl %edx\n\t" "pushl %ecx\n\t" - "call __down_interruptible\n\t" + "call __compat_down_interruptible\n\t" "popl %ecx\n\t" "popl %edx\n\t" #if defined(CONFIG_FRAME_POINTER) @@ -70,15 +71,15 @@ asm( asm( ".section .sched.text\n" ".align 4\n" -".globl __down_failed_trylock\n" -"__down_failed_trylock:\n\t" +".globl __compat_down_failed_trylock\n" +"__compat_down_failed_trylock:\n\t" #if defined(CONFIG_FRAME_POINTER) "pushl %ebp\n\t" "movl %esp,%ebp\n\t" #endif "pushl %edx\n\t" "pushl %ecx\n\t" - "call __down_trylock\n\t" + "call __compat_down_trylock\n\t" "popl %ecx\n\t" "popl %edx\n\t" #if defined(CONFIG_FRAME_POINTER) @@ -91,45 +92,13 @@ asm( asm( ".section .sched.text\n" ".align 4\n" -".globl __up_wakeup\n" -"__up_wakeup:\n\t" +".globl __compat_up_wakeup\n" +"__compat_up_wakeup:\n\t" "pushl %edx\n\t" "pushl %ecx\n\t" - "call __up\n\t" + "call __compat_up\n\t" "popl %ecx\n\t" "popl %edx\n\t" "ret" ); -/* - * rw spinlock fallbacks - */ -#if defined(CONFIG_SMP) -asm( -".section .sched.text\n" -".align 4\n" -".globl __write_lock_failed\n" -"__write_lock_failed:\n\t" - LOCK "addl $" RW_LOCK_BIAS_STR ",(%eax)\n" -"1: rep; nop\n\t" - "cmpl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" - "jne 1b\n\t" - LOCK "subl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" - "jnz __write_lock_failed\n\t" - "ret" -); - -asm( -".section .sched.text\n" -".align 4\n" -".globl __read_lock_failed\n" -"__read_lock_failed:\n\t" - LOCK "incl (%eax)\n" -"1: rep; nop\n\t" - "cmpl $1,(%eax)\n\t" - "js 1b\n\t" - LOCK "decl (%eax)\n\t" - "js __read_lock_failed\n\t" - "ret" -); -#endif Index: linux.prev/arch/i386/kernel/setup.c =================================================================== --- linux.prev.orig/arch/i386/kernel/setup.c +++ linux.prev/arch/i386/kernel/setup.c @@ -1620,6 +1620,7 @@ void __init setup_arch(char **cmdline_p) conswitchp = &dummy_con; #endif #endif + tsc_init(); } #include "setup_arch_post.h" Index: linux.prev/arch/i386/kernel/signal.c =================================================================== --- linux.prev.orig/arch/i386/kernel/signal.c +++ linux.prev/arch/i386/kernel/signal.c @@ -604,6 +604,13 @@ int fastcall do_signal(struct pt_regs *r int signr; struct k_sigaction ka; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + raw_local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which * is why we may in certain cases get here from Index: linux.prev/arch/i386/kernel/smp.c 
=================================================================== --- linux.prev.orig/arch/i386/kernel/smp.c +++ linux.prev/arch/i386/kernel/smp.c @@ -163,7 +163,7 @@ void send_IPI_mask_bitmask(cpumask_t cpu unsigned long cfg; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); /* * Wait for idle. @@ -186,7 +186,7 @@ void send_IPI_mask_bitmask(cpumask_t cpu */ apic_write_around(APIC_ICR, cfg); - local_irq_restore(flags); + raw_local_irq_restore(flags); } void send_IPI_mask_sequence(cpumask_t mask, int vector) @@ -200,7 +200,7 @@ void send_IPI_mask_sequence(cpumask_t ma * should be modified to do 1 message per cluster ID - mbligh */ - local_irq_save(flags); + raw_local_irq_save(flags); for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) { if (cpu_isset(query_cpu, mask)) { @@ -227,7 +227,7 @@ void send_IPI_mask_sequence(cpumask_t ma apic_write_around(APIC_ICR, cfg); } } - local_irq_restore(flags); + raw_local_irq_restore(flags); } #include /* must come after the send_IPI functions above for inlining */ @@ -245,7 +245,7 @@ void send_IPI_mask_sequence(cpumask_t ma static cpumask_t flush_cpumask; static struct mm_struct * flush_mm; static unsigned long flush_va; -static DEFINE_SPINLOCK(tlbstate_lock); +static DEFINE_RAW_SPINLOCK(tlbstate_lock); #define FLUSH_ALL 0xffffffff /* @@ -390,7 +390,7 @@ static void flush_tlb_others(cpumask_t c while (!cpus_empty(flush_cpumask)) /* nothing. lockup detection does not belong here */ - mb(); + cpu_relax(); flush_mm = NULL; flush_va = 0; @@ -481,10 +481,20 @@ void smp_send_reschedule(int cpu) } /* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + send_IPI_allbutself(RESCHEDULE_VECTOR); +} + +/* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. */ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); struct call_data_struct { void (*func) (void *info); @@ -538,7 +548,7 @@ int smp_call_function (void (*func) (voi } /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); + WARN_ON(raw_irqs_disabled()); data.func = func; data.info = info; @@ -572,7 +582,7 @@ static void stop_this_cpu (void * dummy) * Remove this CPU: */ cpu_clear(smp_processor_id(), cpu_online_map); - local_irq_disable(); + raw_local_irq_disable(); disable_local_APIC(); if (cpu_data[smp_processor_id()].hlt_works_ok) for(;;) halt(); @@ -587,19 +597,20 @@ void smp_send_stop(void) { smp_call_function(stop_this_cpu, NULL, 1, 0); - local_irq_disable(); + raw_local_irq_disable(); disable_local_APIC(); - local_irq_enable(); + raw_local_irq_enable(); } /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. Trigger a reschedule pass so that + * RT-overload balancing can pass tasks around. 
*/ -fastcall void smp_reschedule_interrupt(struct pt_regs *regs) +fastcall notrace void smp_reschedule_interrupt(struct pt_regs *regs) { + trace_special(regs->eip, 0, 0); ack_APIC_irq(); + set_tsk_need_resched(current); } fastcall void smp_call_function_interrupt(struct pt_regs *regs) Index: linux.prev/arch/i386/kernel/smpboot.c =================================================================== --- linux.prev.orig/arch/i386/kernel/smpboot.c +++ linux.prev/arch/i386/kernel/smpboot.c @@ -212,142 +212,299 @@ valid_k7: ; } -/* - * TSC synchronization. - * - * We first check whether all CPUs have their TSC's synchronized, - * then we print a warning if not, and always resync. - */ +static atomic_t tsc_start_flag, tsc_check_start, tsc_check_stop; -static atomic_t tsc_start_flag = ATOMIC_INIT(0); -static atomic_t tsc_count_start = ATOMIC_INIT(0); -static atomic_t tsc_count_stop = ATOMIC_INIT(0); -static unsigned long long tsc_values[NR_CPUS]; - -#define NR_LOOPS 5 - -static void __init synchronize_tsc_bp (void) +static int __init check_tsc_warp(void) { - int i; - unsigned long long t0; - unsigned long long sum, avg; - long long delta; - unsigned int one_usec; - int buggy = 0; - - printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus()); - - /* convert from kcyc/sec to cyc/usec */ - one_usec = cpu_khz / 1000; + static DEFINE_RAW_SPINLOCK(warp_lock); + static long long prev; + static unsigned int error; - atomic_set(&tsc_start_flag, 1); - wmb(); + int cpus = num_booting_cpus(), nr = 0; + long long start, now, end, delta; + atomic_inc(&tsc_check_start); + while (atomic_read(&tsc_check_start) != cpus) + cpu_relax(); /* - * We loop a few times to get a primed instruction cache, - * then the last pass is more or less synchronized and - * the BP and APs set their cycle counters to zero all at - * once. This reduces the chance of having random offsets - * between the processors, and guarantees that the maximum - * delay between the cycle counters is never bigger than - * the latency of information-passing (cachelines) between - * two CPUs. 
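The warp test that replaces the old skew measurement is conceptually tiny: every CPU repeatedly publishes its TSC and checks that the shared value never moves backwards. Reduced to a sketch (the protecting raw spinlock and the 500 ms loop bound live in the real check_tsc_warp() below):

	static unsigned long long prev_tsc;	/* shared; lock-protected in the patch */

	/* returns nonzero if this CPU saw the TSC go backwards relative
	 * to the last observation made by any CPU */
	static int tsc_warped(unsigned long long now)
	{
		int warp = ((long long)(now - prev_tsc) < 0);

		prev_tsc = now;
		return warp;
	}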
+ * Run the check for 500 msecs: */ - for (i = 0; i < NR_LOOPS; i++) { - /* - * all APs synchronize but they loop on '== num_cpus' - */ - while (atomic_read(&tsc_count_start) != num_booting_cpus()-1) - mb(); - atomic_set(&tsc_count_stop, 0); - wmb(); - /* - * this lets the APs save their current TSC: - */ - atomic_inc(&tsc_count_start); + rdtscll(start); + end = start + cpu_khz*500; - rdtscll(tsc_values[smp_processor_id()]); + for (;;) { /* - * We clear the TSC in the last loop: + * Check for the TSC going backwards (between CPUs): */ - if (i == NR_LOOPS-1) - write_tsc(0, 0); + spin_lock(&warp_lock); + rdtscll(now); + delta = now - prev; + prev = now; + spin_unlock(&warp_lock); + if (unlikely(delta < 0)) + error = 1; + if (now > end) + break; /* - * Wait for all APs to leave the synchronization point: + * Take it easy every couple of iterations, + * to not starve other CPUs: */ - while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1) - mb(); - atomic_set(&tsc_count_start, 0); - wmb(); - atomic_inc(&tsc_count_stop); + nr++; + if (!(nr % 31)) + cpu_relax(); } - sum = 0; - for (i = 0; i < NR_CPUS; i++) { - if (cpu_isset(i, cpu_callout_map)) { - t0 = tsc_values[i]; - sum += t0; - } - } - avg = sum; - do_div(avg, num_booting_cpus()); + atomic_inc(&tsc_check_stop); + while (atomic_read(&tsc_check_stop) != cpus) + cpu_relax(); - sum = 0; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_isset(i, cpu_callout_map)) - continue; - delta = tsc_values[i] - avg; - if (delta < 0) - delta = -delta; - /* - * We report bigger than 2 microseconds clock differences. - */ - if (delta > 2*one_usec) { - long realdelta; - if (!buggy) { - buggy = 1; - printk("\n"); - } - realdelta = delta; - do_div(realdelta, one_usec); - if (tsc_values[i] < avg) - realdelta = -realdelta; + return error; +} - printk(KERN_INFO "CPU#%d had %ld usecs TSC skew, fixed it up.\n", i, realdelta); - } +/* + * TSC synchronization based on ia64 itc synchronization code. Synchronize + * pairs of processors rather than trying to synchronize all of the processors + * with a single event. When several processors are all waiting for an + * event they don't all see it at the same time. The write will cause + * an invalidate on each processor's cache and then they all scramble to + * re-read that cache line. + * + * Writing the TSC resets the upper 32-bits, so we need to be careful + * that all of the cpus can be synchronized before we overflow the + * 32-bit count.
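One measurement round of the pairwise scheme, seen from the slave side — a sketch that mirrors get_delta() below and reuses the go[], MASTER/SLAVE and rep_nop() definitions from the patch:

	static unsigned long one_round(unsigned long i, unsigned long last_reply,
				       unsigned long *t0, unsigned long *t1)
	{
		unsigned long tm;

		rdtscl(*t0);			/* slave timestamp before the request */
		go[MASTER] = i + 1;		/* publish a new request */
		while ((tm = go[SLAVE]) == last_reply)
			rep_nop();		/* wait for the master's TSC sample */
		rdtscl(*t1);			/* slave timestamp after the reply */

		return tm;			/* the master's timestamp */
	}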
+ */ - sum += delta; +#define MASTER 0 +#define SLAVE (SMP_CACHE_BYTES/sizeof(long)) + +#define NUM_ROUNDS 64 /* magic value */ +#define NUM_ITERS 5 /* likewise */ + +static volatile unsigned long go[2*SLAVE] __cacheline_aligned; +static volatile int current_slave = -1; +static volatile int tsc_sync_complete = 0; +static volatile int tsc_adj_latency = 0; +static unsigned int max_rt = 0; +static unsigned int max_delta = 0; + +#define DEBUG_TSC_SYNC 0 +#if DEBUG_TSC_SYNC +struct tsc_sync_debug { + long rt; /* roundtrip time */ + long master; /* master's timestamp */ + long diff; /* difference between midpoint and master's timestamp */ + long lat; /* estimate of tsc adjustment latency */ +} tsc_sync_debug[NUM_ROUNDS*NR_CPUS]; +#endif + +void +sync_master(void) +{ + unsigned long n, tsc, last_go_master; + + last_go_master = 0; + while (1) { + while ((n = go[MASTER]) == last_go_master) + rep_nop(); + if (n == ~0) + break; + rdtscl(tsc); + if (unlikely(!tsc)) + tsc = 1; + go[SLAVE] = tsc; + last_go_master = n; } - if (!buggy) - printk("passed.\n"); } -static void __init synchronize_tsc_ap (void) +/* + * Return the number of cycles by which our TSC differs from the TSC on + * the master (time-keeper) CPU. A positive number indicates our TSC is + * ahead of the master, negative that it is behind. + */ +static inline long +get_delta (long *rt, long *master) { - int i; + unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0; + unsigned long tcenter, t0, t1, tm, last_go_slave; + long i; + + last_go_slave = go[SLAVE]; + for (i = 0; i < NUM_ITERS; ++i) { + rdtscl(t0); + go[MASTER] = i+1; + while ((tm = go[SLAVE]) == last_go_slave) + rep_nop(); + rdtscl(t1); + + if (t1 - t0 < best_t1 - best_t0) + best_t0 = t0, best_t1 = t1, best_tm = tm; + last_go_slave = tm; + } + + *rt = best_t1 - best_t0; + *master = best_tm - best_t0; + + /* average best_t0 and best_t1 without overflow: */ + tcenter = (best_t0/2 + best_t1/2); + if (best_t0 % 2 + best_t1 % 2 == 2) + ++tcenter; + return tcenter - best_tm; +} + +/* + * Synchronize TSC of the current (slave) CPU with the TSC of the MASTER CPU + * (normally the time-keeper CPU). We use a closed loop to eliminate the + * possibility of unaccounted-for errors (such as getting a machine check in + * the middle of a calibration step). The basic idea is for the slave to ask + * the master what TSC value it has and to read its own TSC before and after + * the master responds. Each iteration gives us three + * timestamps: + * + * slave master + * + * t0 ---\ + * ---\ + * ---> + * tm + * /--- + * /--- + * t1 <--- + * + * + * The goal is to adjust the slave's TSC such that tm falls exactly half-way + * between t0 and t1. If we achieve this, the clocks are synchronized provided + * the interconnect between the slave and the master is symmetric. Even if the + * interconnect were asymmetric, we would still know that the synchronization + * error is smaller than the roundtrip latency (t0 - t1). + * + * When the interconnect is quiet and symmetric, this lets us synchronize the + * TSC to within one or two cycles. However, we can only *guarantee* that the + * synchronization is accurate to within a round-trip time, which is typically + * in the range of several hundred cycles (e.g., ~500 cycles). In practice, + * this means that the TSC's are usually almost perfectly synchronized, but we + * shouldn't assume that the accuracy is much better than half a micro second + * or so. 
+ */ + +static void __init +synchronize_tsc_ap (void) +{ + long i, delta, adj, adjust_latency, n_rounds; + unsigned long rt, master_time_stamp, tsc; +#if DEBUG_TSC_SYNC + struct tsc_sync_debug *t = + &tsc_sync_debug[smp_processor_id() * NUM_ROUNDS]; +#endif + + while (!atomic_read(&tsc_start_flag)) + mb(); + + if (!check_tsc_warp()) + return; /* - * Not every cpu is online at the time - * this gets called, so we first wait for the BP to - * finish SMP initialization: + * Wait for our turn to synchronize with the boot processor. */ - while (!atomic_read(&tsc_start_flag)) mb(); + while (current_slave != smp_processor_id()) + rep_nop(); + adjust_latency = tsc_adj_latency; + + go[SLAVE] = 0; + go[MASTER] = 0; + write_tsc(0,0); + for (i = 0; i < NUM_ROUNDS; ++i) { + delta = get_delta(&rt, &master_time_stamp); + if (delta == 0) + break; + + if (i > 0) + adjust_latency += -delta; + adj = -delta + adjust_latency/8; + rdtscl(tsc); + write_tsc(tsc + adj, 0); +#if DEBUG_TSC_SYNC + t[i].rt = rt; + t[i].master = master_time_stamp; + t[i].diff = delta; + t[i].lat = adjust_latency/8; +#endif + } + n_rounds = i; + go[MASTER] = ~0; + +#if (DEBUG_TSC_SYNC == 2) + for (i = 0; i < n_rounds; ++i) + printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n", + t[i].rt, t[i].master, t[i].diff, t[i].lat); + + printk("CPU %d: synchronized TSC (last diff %ld cycles, maxerr %lu cycles)\n", + smp_processor_id(), delta, rt); + + printk("It took %ld rounds\n", n_rounds); +#endif + if (rt > max_rt) + max_rt = rt; + if (delta < 0) + delta = -delta; + if (delta > max_delta) + max_delta = delta; + tsc_adj_latency = adjust_latency; + current_slave = -1; + while (!tsc_sync_complete) + rep_nop(); +} + +/* + * The boot processor set its own TSC to zero and then gives each + * slave processor the chance to synchronize itself. + */ - for (i = 0; i < NR_LOOPS; i++) { - atomic_inc(&tsc_count_start); - while (atomic_read(&tsc_count_start) != num_booting_cpus()) - mb(); +static void __init synchronize_tsc_bp (void) +{ + unsigned int tsc_low, tsc_high, error; + int cpu; + + atomic_set(&tsc_start_flag, 1); - rdtscll(tsc_values[smp_processor_id()]); - if (i == NR_LOOPS-1) - write_tsc(0, 0); + printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", + num_booting_cpus()); - atomic_inc(&tsc_count_stop); - while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb(); + if (!check_tsc_warp()) { + printk("passed.\n"); + return; + } + printk("failed.\n"); + + printk(KERN_INFO "starting TSC synchronization\n"); + write_tsc(0, 0); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + if (!cpu_isset(cpu, cpu_callout_map)) + continue; + if (cpu == smp_processor_id()) + continue; + go[MASTER] = 0; + current_slave = cpu; + sync_master(); + while (current_slave != -1) + rep_nop(); + } + rdtsc(tsc_low, tsc_high); + if (tsc_high) + printk("TSC overflowed during synchronization\n"); + else + printk("TSC synchronization complete max_delta=%d cycles\n", + max_delta); + if (max_rt < 4293) { + error = (max_rt * 1000000)/cpu_khz; + printk("TSC sync round-trip time %d.%03d microseconds\n", + error/1000, error%1000); + } else { + printk("TSC sync round-trip time %d cycles\n", max_rt); } + tsc_sync_complete = 1; } -#undef NR_LOOPS extern void calibrate_delay(void); @@ -547,7 +704,7 @@ static void __devinit start_secondary(vo per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; /* We can take interrupts now: we're officially "up". 
*/ - local_irq_enable(); + raw_local_irq_enable(); wmb(); cpu_idle(); @@ -1340,9 +1497,9 @@ int __cpu_disable(void) clear_local_APIC(); /* Allow any queued timer interrupts to get serviced */ - local_irq_enable(); + raw_local_irq_enable(); mdelay(1); - local_irq_disable(); + raw_local_irq_disable(); remove_siblinginfo(cpu); @@ -1386,11 +1543,11 @@ int __devinit __cpu_up(unsigned int cpu) /* In case one didn't come up */ if (!cpu_isset(cpu, cpu_callin_map)) { printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu); - local_irq_enable(); + raw_local_irq_enable(); return -EIO; } - local_irq_enable(); + raw_local_irq_enable(); per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; /* Unleash the CPU! */ cpu_set(cpu, smp_commenced_mask); Index: linux.prev/arch/i386/kernel/switch2poll.c =================================================================== --- /dev/null +++ linux.prev/arch/i386/kernel/switch2poll.c @@ -0,0 +1,5 @@ +/* + * Same type of hack used for early_printk. This keeps the code + * in one place. + */ +#include "../../x86_64/kernel/switch2poll.c" Index: linux.prev/arch/i386/kernel/time.c =================================================================== --- linux.prev.orig/arch/i386/kernel/time.c +++ linux.prev/arch/i386/kernel/time.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -56,6 +57,7 @@ #include #include #include +#include #include "mach_time.h" @@ -79,16 +81,9 @@ EXPORT_SYMBOL(cpu_khz); extern unsigned long wall_jiffies; -DEFINE_SPINLOCK(rtc_lock); +DEFINE_RAW_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); -#include - -DEFINE_SPINLOCK(i8253_lock); -EXPORT_SYMBOL(i8253_lock); - -struct timer_opts *cur_timer __read_mostly = &timer_none; - /* * This is a special lock that is owned by the CPU and holds the index * register we are working with. It is required for NMI access to the @@ -118,118 +113,25 @@ void rtc_cmos_write(unsigned char val, u } EXPORT_SYMBOL(rtc_cmos_write); -/* - * This version of gettimeofday has microsecond resolution - * and better than microsecond precision on fast x86 machines with TSC. - */ -void do_gettimeofday(struct timeval *tv) -{ - unsigned long seq; - unsigned long usec, sec; - unsigned long max_ntp_tick; - - do { - unsigned long lost; - - seq = read_seqbegin(&xtime_lock); - - usec = cur_timer->get_offset(); - lost = jiffies - wall_jiffies; - - /* - * If time_adjust is negative then NTP is slowing the clock - * so make sure not to go into next possible interval. - * Better to lose some accuracy than have time go backwards.. - */ - if (unlikely(time_adjust < 0)) { - max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj; - usec = min(usec, max_ntp_tick); - - if (lost) - usec += lost * max_ntp_tick; - } - else if (unlikely(lost)) - usec += lost * (USEC_PER_SEC / HZ); - - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); - } while (read_seqretry(&xtime_lock, seq)); - - while (usec >= 1000000) { - usec -= 1000000; - sec++; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; -} - -EXPORT_SYMBOL(do_gettimeofday); - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - /* - * This is revolting. We need to set "xtime" correctly. However, the - * value in this location is the value at the most recent update of - * wall time. Discover what correction gettimeofday() would have - * made, and then undo it! 
- */ - nsec -= cur_timer->get_offset() * NSEC_PER_USEC; - nsec -= (jiffies - wall_jiffies) * TICK_NSEC; - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - static int set_rtc_mmss(unsigned long nowtime) { int retval; - - WARN_ON(irqs_disabled()); + unsigned long flags; /* gets recalled with irq locally disabled */ - spin_lock_irq(&rtc_lock); + /* XXX - does irqsave resolve this? -johnstul */ + spin_lock_irqsave(&rtc_lock, flags); if (efi_enabled) retval = efi_set_rtc_mmss(nowtime); else retval = mach_set_rtc_mmss(nowtime); - spin_unlock_irq(&rtc_lock); + spin_unlock_irqrestore(&rtc_lock, flags); return retval; } - -int timer_ack; - -/* monotonic_clock(): returns # of nanoseconds passed since time_init() - * Note: This function is required to return accurate - * time even in the absence of multiple timer ticks. - */ -unsigned long long monotonic_clock(void) -{ - return cur_timer->monotonic_clock(); -} -EXPORT_SYMBOL(monotonic_clock); - #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) -unsigned long profile_pc(struct pt_regs *regs) +unsigned long notrace profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); @@ -241,70 +143,6 @@ unsigned long profile_pc(struct pt_regs EXPORT_SYMBOL(profile_pc); #endif -/* - * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick - */ -static inline void do_timer_interrupt(int irq, struct pt_regs *regs) -{ -#ifdef CONFIG_X86_IO_APIC - if (timer_ack) { - /* - * Subtle, when I/O APICs are used we have to ack timer IRQ - * manually to reset the IRR bit for do_slow_gettimeoffset(). - * This will also deassert NMI lines for the watchdog if run - * on an 82489DX-based system. - */ - spin_lock(&i8259A_lock); - outb(0x0c, PIC_MASTER_OCW3); - /* Ack the IRQ; AEOI will end it automatically. */ - inb(PIC_MASTER_POLL); - spin_unlock(&i8259A_lock); - } -#endif - - do_timer_interrupt_hook(regs); - - - if (MCA_bus) { - /* The PS/2 uses level-triggered interrupts. You can't - turn them off, nor would you want to (any attempt to - enable edge-triggered interrupts usually gets intercepted by a - special hardware circuit). Hence we have to acknowledge - the timer interrupt. Through some incredibly stupid - design idea, the reset for IRQ 0 is done by setting the - high bit of the PPI port B (0x61). Note that some PS/2s, - notably the 55SX, work fine if this is removed. */ - - irq = inb_p( 0x61 ); /* read the current state */ - outb_p( irq|0x80, 0x61 ); /* reset the IRQ */ - } -} - -/* - * This is the same as the above, except we _also_ save the current - * Time Stamp Counter value at the time of the timer interrupt, so that - * we later on can estimate the time of day more exactly. - */ -irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) -{ - /* - * Here we are in the timer irq handler. We just have irqs locally - * disabled but we don't know if the timer_bh is running on the other - * CPU. We need to avoid to SMP race with it. NOTE: we don' t need - * the irq version of write_lock because as just said we have irq - * locally disabled. 
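Both the new pit_read() earlier in this patch and the timekeeping code being deleted here lean on the xtime_lock seqlock pattern: writers publish under write_seqlock(), readers loop until they observe a consistent snapshot. A generic kernel-flavored sketch (the demo_* names are illustrative):

	static seqlock_t demo_lock = SEQLOCK_UNLOCKED;
	static u64 demo_value;

	static u64 demo_read(void)
	{
		unsigned long seq;
		u64 v;

		do {
			seq = read_seqbegin(&demo_lock);
			v = demo_value;		/* may race with a writer */
		} while (read_seqretry(&demo_lock, seq));	/* retry if it did */

		return v;
	}

	static void demo_write(u64 v)
	{
		write_seqlock(&demo_lock);
		demo_value = v;
		write_sequnlock(&demo_lock);
	}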
-arca - */ - write_seqlock(&xtime_lock); - - cur_timer->mark_offset(); - - do_timer_interrupt(irq, regs); - - write_sequnlock(&xtime_lock); - return IRQ_HANDLED; -} - /* not static: needed by APM */ unsigned long get_cmos_time(void) { @@ -323,139 +161,42 @@ unsigned long get_cmos_time(void) } EXPORT_SYMBOL(get_cmos_time); -static void sync_cmos_clock(unsigned long dummy); - -static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); - -static void sync_cmos_clock(unsigned long dummy) +/* arch specific timeofday hooks */ +nsec_t read_persistent_clock(void) { - struct timeval now, next; - int fail = 1; + return (nsec_t)get_cmos_time() * NSEC_PER_SEC; +} +void sync_persistent_clock(struct timespec ts) +{ + static unsigned long last_rtc_update; /* * If we have an externally synchronized Linux clock, then update * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be * called as close as possible to 500 ms before the new second starts. - * This code is run on a timer. If the clock is set, that timer - * may not expire at the correct time. Thus, we adjust... */ - if (!ntp_synced()) - /* - * Not synced, exit, do not restart a timer (if one is - * running, let it run out). - */ + if (ts.tv_sec <= last_rtc_update + 660) return; - do_gettimeofday(&now); - if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && - now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) - fail = set_rtc_mmss(now.tv_sec); - - next.tv_usec = USEC_AFTER - now.tv_usec; - if (next.tv_usec <= 0) - next.tv_usec += USEC_PER_SEC; - - if (!fail) - next.tv_sec = 659; - else - next.tv_sec = 0; - - if (next.tv_usec >= USEC_PER_SEC) { - next.tv_sec++; - next.tv_usec -= USEC_PER_SEC; + if((ts.tv_nsec / 1000) >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && + (ts.tv_nsec / 1000) <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) { + /* horrible...FIXME */ + if (set_rtc_mmss(ts.tv_sec) == 0) + last_rtc_update = ts.tv_sec; + else + last_rtc_update = ts.tv_sec - 600; /* do it again in 60 s */ } - mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next)); -} - -void notify_arch_cmos_timer(void) -{ - mod_timer(&sync_cmos_timer, jiffies + 1); -} - -static long clock_cmos_diff, sleep_start; - -static struct timer_opts *last_timer; -static int timer_suspend(struct sys_device *dev, pm_message_t state) -{ - /* - * Estimate time zone so that set_time can update the clock - */ - clock_cmos_diff = -get_cmos_time(); - clock_cmos_diff += get_seconds(); - sleep_start = get_cmos_time(); - last_timer = cur_timer; - cur_timer = &timer_none; - if (last_timer->suspend) - last_timer->suspend(state); - return 0; -} - -static int timer_resume(struct sys_device *dev) -{ - unsigned long flags; - unsigned long sec; - unsigned long sleep_length; - -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled()) - hpet_reenable(); -#endif - setup_pit_timer(); - sec = get_cmos_time() + clock_cmos_diff; - sleep_length = (get_cmos_time() - sleep_start) * HZ; - write_seqlock_irqsave(&xtime_lock, flags); - xtime.tv_sec = sec; - xtime.tv_nsec = 0; - write_sequnlock_irqrestore(&xtime_lock, flags); - jiffies += sleep_length; - wall_jiffies += sleep_length; - if (last_timer->resume) - last_timer->resume(); - cur_timer = last_timer; - last_timer = NULL; - touch_softlockup_watchdog(); - return 0; } -static struct sysdev_class timer_sysclass = { - .resume = timer_resume, - .suspend = timer_suspend, - set_kset_name("timer"), -}; - - -/* XXX this driverfs stuff should probably go elsewhere later -john */ -static struct sys_device device_timer = { - .id = 0, - .cls = 
&timer_sysclass, -}; - -static int time_init_device(void) -{ - int error = sysdev_class_register(&timer_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(time_init_device); - #ifdef CONFIG_HPET_TIMER extern void (*late_time_init)(void); /* Duplicate of time_init() below, with hpet_enable part added */ static void __init hpet_time_init(void) { - xtime.tv_sec = get_cmos_time(); - xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); - if ((hpet_enable() >= 0) && hpet_use_timer) { printk("Using HPET for base-timer\n"); } - cur_timer = select_timer(); - printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name); time_init_hook(); } @@ -463,6 +204,9 @@ static void __init hpet_time_init(void) void __init time_init(void) { + /* Set the clock to HZ Hz: */ + setup_pit_timer(); + #ifdef CONFIG_HPET_TIMER if (is_hpet_capable()) { /* @@ -473,13 +217,5 @@ void __init time_init(void) return; } #endif - xtime.tv_sec = get_cmos_time(); - xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); - - cur_timer = select_timer(); - printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name); - time_init_hook(); } Index: linux.prev/arch/i386/kernel/time_hpet.c =================================================================== --- linux.prev.orig/arch/i386/kernel/time_hpet.c +++ linux.prev/arch/i386/kernel/time_hpet.c @@ -259,8 +259,6 @@ __setup("hpet=", hpet_setup); #include #include -extern irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs); - #define DEFAULT_RTC_INT_FREQ 64 #define RTC_NUM_INTS 1 @@ -303,12 +301,12 @@ int hpet_rtc_timer_init(void) else hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - local_irq_save(flags); + raw_local_irq_save(flags); cnt = hpet_readl(HPET_COUNTER); cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); hpet_writel(cnt, HPET_T1_CMP); hpet_t1_cmp = cnt; - local_irq_restore(flags); + raw_local_irq_restore(flags); cfg = hpet_readl(HPET_T1_CFG); cfg &= ~HPET_TN_PERIODIC; Index: linux.prev/arch/i386/kernel/timers/Makefile =================================================================== --- linux.prev.orig/arch/i386/kernel/timers/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# -# Makefile for x86 timers -# - -obj-y := timer.o timer_none.o timer_tsc.o timer_pit.o common.o - -obj-$(CONFIG_X86_CYCLONE_TIMER) += timer_cyclone.o -obj-$(CONFIG_HPET_TIMER) += timer_hpet.o -obj-$(CONFIG_X86_PM_TIMER) += timer_pm.o Index: linux.prev/arch/i386/kernel/timers/common.c =================================================================== --- linux.prev.orig/arch/i386/kernel/timers/common.c +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Common functions used across the timers go here - */ - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "mach_timer.h" - -/* ------ Calibrate the TSC ------- - * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset(). - * Too much 64-bit arithmetic here to do this cleanly in C, and for - * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2) - * output busy loop as low as possible. We avoid reading the CTC registers - * directly because of the awkward 8-bit access mechanism of the 82C54 - * device. 
- */ - -#define CALIBRATE_TIME (5 * 1000020/HZ) - -unsigned long calibrate_tsc(void) -{ - mach_prepare_counter(); - - { - unsigned long startlow, starthigh; - unsigned long endlow, endhigh; - unsigned long count; - - rdtsc(startlow,starthigh); - mach_countup(&count); - rdtsc(endlow,endhigh); - - - /* Error: ECTCNEVERSET */ - if (count <= 1) - goto bad_ctc; - - /* 64-bit subtract - gcc just messes up with long longs */ - __asm__("subl %2,%0\n\t" - "sbbl %3,%1" - :"=a" (endlow), "=d" (endhigh) - :"g" (startlow), "g" (starthigh), - "0" (endlow), "1" (endhigh)); - - /* Error: ECPUTOOFAST */ - if (endhigh) - goto bad_ctc; - - /* Error: ECPUTOOSLOW */ - if (endlow <= CALIBRATE_TIME) - goto bad_ctc; - - __asm__("divl %2" - :"=a" (endlow), "=d" (endhigh) - :"r" (endlow), "0" (0), "1" (CALIBRATE_TIME)); - - return endlow; - } - - /* - * The CTC wasn't reliable: we got a hit on the very first read, - * or the CPU was so fast/slow that the quotient wouldn't fit in - * 32 bits.. - */ -bad_ctc: - return 0; -} - -#ifdef CONFIG_HPET_TIMER -/* ------ Calibrate the TSC using HPET ------- - * Return 2^32 * (1 / (TSC clocks per usec)) for getting the CPU freq. - * Second output is parameter 1 (when non NULL) - * Set 2^32 * (1 / (tsc per HPET clk)) for delay_hpet(). - * calibrate_tsc() calibrates the processor TSC by comparing - * it to the HPET timer of known frequency. - * Too much 64-bit arithmetic here to do this cleanly in C - */ -#define CALIBRATE_CNT_HPET (5 * hpet_tick) -#define CALIBRATE_TIME_HPET (5 * KERNEL_TICK_USEC) - -unsigned long __devinit calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr) -{ - unsigned long tsc_startlow, tsc_starthigh; - unsigned long tsc_endlow, tsc_endhigh; - unsigned long hpet_start, hpet_end; - unsigned long result, remain; - - hpet_start = hpet_readl(HPET_COUNTER); - rdtsc(tsc_startlow, tsc_starthigh); - do { - hpet_end = hpet_readl(HPET_COUNTER); - } while ((hpet_end - hpet_start) < CALIBRATE_CNT_HPET); - rdtsc(tsc_endlow, tsc_endhigh); - - /* 64-bit subtract - gcc just messes up with long longs */ - __asm__("subl %2,%0\n\t" - "sbbl %3,%1" - :"=a" (tsc_endlow), "=d" (tsc_endhigh) - :"g" (tsc_startlow), "g" (tsc_starthigh), - "0" (tsc_endlow), "1" (tsc_endhigh)); - - /* Error: ECPUTOOFAST */ - if (tsc_endhigh) - goto bad_calibration; - - /* Error: ECPUTOOSLOW */ - if (tsc_endlow <= CALIBRATE_TIME_HPET) - goto bad_calibration; - - ASM_DIV64_REG(result, remain, tsc_endlow, 0, CALIBRATE_TIME_HPET); - if (remain > (tsc_endlow >> 1)) - result++; /* rounding the result */ - - if (tsc_hpet_quotient_ptr) { - unsigned long tsc_hpet_quotient; - - ASM_DIV64_REG(tsc_hpet_quotient, remain, tsc_endlow, 0, - CALIBRATE_CNT_HPET); - if (remain > (tsc_endlow >> 1)) - tsc_hpet_quotient++; /* rounding the result */ - *tsc_hpet_quotient_ptr = tsc_hpet_quotient; - } - - return result; -bad_calibration: - /* - * the CPU was so fast/slow that the quotient wouldn't fit in - * 32 bits.. - */ - return 0; -} -#endif - - -unsigned long read_timer_tsc(void) -{ - unsigned long retval; - rdtscl(retval); - return retval; -} - - -/* calculate cpu_khz */ -void init_cpu_khz(void) -{ - if (cpu_has_tsc) { - unsigned long tsc_quotient = calibrate_tsc(); - if (tsc_quotient) { - /* report CPU clock rate in Hz. - * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = - * clock/second. Our precision is about 100 ppm. 
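To put numbers on the quotient: for a hypothetical 800 MHz part, 800 TSC clocks elapse per microsecond, so calibrate_tsc() returns roughly 2^32 / 800 = 5368709. The divl just below then recovers the clock rate as (1000 * 2^32) / quotient = 1000 * clocks-per-usec = 800000 kHz. A standalone sketch of the same arithmetic:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t clocks_per_usec = 800;	/* assumed 800 MHz CPU */
	uint64_t quotient = (1ULL << 32) / clocks_per_usec;
	uint64_t cpu_khz = (1000ULL << 32) / quotient;

	/* prints: quotient=5368709 cpu_khz=800000 */
	printf("quotient=%llu cpu_khz=%llu\n",
	       (unsigned long long)quotient,
	       (unsigned long long)cpu_khz);
	return 0;
}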
- */ - { unsigned long eax=0, edx=1000; - __asm__("divl %2" - :"=a" (cpu_khz), "=d" (edx) - :"r" (tsc_quotient), - "0" (eax), "1" (edx)); - printk("Detected %u.%03u MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - } - } - } -} - Index: linux.prev/arch/i386/kernel/timers/timer.c =================================================================== --- linux.prev.orig/arch/i386/kernel/timers/timer.c +++ /dev/null @@ -1,75 +0,0 @@ -#include -#include -#include -#include - -#ifdef CONFIG_HPET_TIMER -/* - * HPET memory read is slower than tsc reads, but is more dependable as it - * always runs at constant frequency and reduces complexity due to - * cpufreq. So, we prefer HPET timer to tsc based one. Also, we cannot use - * timer_pit when HPET is active. So, we default to timer_tsc. - */ -#endif -/* list of timers, ordered by preference, NULL terminated */ -static struct init_timer_opts* __initdata timers[] = { -#ifdef CONFIG_X86_CYCLONE_TIMER - &timer_cyclone_init, -#endif -#ifdef CONFIG_HPET_TIMER - &timer_hpet_init, -#endif -#ifdef CONFIG_X86_PM_TIMER - &timer_pmtmr_init, -#endif - &timer_tsc_init, - &timer_pit_init, - NULL, -}; - -static char clock_override[10] __initdata; - -static int __init clock_setup(char* str) -{ - if (str) - strlcpy(clock_override, str, sizeof(clock_override)); - return 1; -} -__setup("clock=", clock_setup); - - -/* The chosen timesource has been found to be bad. - * Fall back to a known good timesource (the PIT) - */ -void clock_fallback(void) -{ - cur_timer = &timer_pit; -} - -/* iterates through the list of timers, returning the first - * one that initializes successfully. - */ -struct timer_opts* __init select_timer(void) -{ - int i = 0; - - /* find most preferred working timer */ - while (timers[i]) { - if (timers[i]->init) - if (timers[i]->init(clock_override) == 0) - return timers[i]->opts; - ++i; - } - - panic("select_timer: Cannot find a suitable timer\n"); - return NULL; -} - -int read_current_timer(unsigned long *timer_val) -{ - if (cur_timer->read_timer) { - *timer_val = cur_timer->read_timer(); - return 0; - } - return -1; -} Index: linux.prev/arch/i386/kernel/timers/timer_cyclone.c =================================================================== --- linux.prev.orig/arch/i386/kernel/timers/timer_cyclone.c +++ /dev/null @@ -1,259 +0,0 @@ -/* Cyclone-timer: - * This code implements timer_ops for the cyclone counter found - * on IBM x440, x360, and other Summit based systems. 
- * - * Copyright (C) 2002 IBM, John Stultz (johnstul@us.ibm.com) - */ - - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "io_ports.h" - -/* Number of usecs that the last interrupt was delayed */ -static int delay_at_last_interrupt; - -#define CYCLONE_CBAR_ADDR 0xFEB00CD0 -#define CYCLONE_PMCC_OFFSET 0x51A0 -#define CYCLONE_MPMC_OFFSET 0x51D0 -#define CYCLONE_MPCS_OFFSET 0x51A8 -#define CYCLONE_TIMER_FREQ 100000000 -#define CYCLONE_TIMER_MASK (((u64)1<<40)-1) /* 40 bit mask */ -int use_cyclone = 0; - -static u32* volatile cyclone_timer; /* Cyclone MPMC0 register */ -static u32 last_cyclone_low; -static u32 last_cyclone_high; -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* helper macro to atomically read both cyclone counter registers */ -#define read_cyclone_counter(low,high) \ - do{ \ - high = cyclone_timer[1]; low = cyclone_timer[0]; \ - } while (high != cyclone_timer[1]); - - -static void mark_offset_cyclone(void) -{ - unsigned long lost, delay; - unsigned long delta = last_cyclone_low; - int count; - unsigned long long this_offset, last_offset; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; - - spin_lock(&i8253_lock); - read_cyclone_counter(last_cyclone_low,last_cyclone_high); - - /* read values for delay_at_last_interrupt */ - outb_p(0x00, 0x43); /* latch the count ASAP */ - - count = inb_p(0x40); /* read the latched count */ - count |= inb(0x40) << 8; - - /* - * VIA686a test code... reset the latch if count > max + 1 - * from timer_pit.c - cjb - */ - if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - spin_unlock(&i8253_lock); - - /* lost tick compensation */ - delta = last_cyclone_low - delta; - delta /= (CYCLONE_TIMER_FREQ/1000000); - delta += delay_at_last_interrupt; - lost = delta/(1000000/HZ); - delay = delta%(1000000/HZ); - if (lost >= 2) - jiffies_64 += lost-1; - - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; - monotonic_base += (this_offset - last_offset) & CYCLONE_TIMER_MASK; - write_sequnlock(&monotonic_lock); - - /* calculate delay_at_last_interrupt */ - count = ((LATCH-1) - count) * TICK_SIZE; - delay_at_last_interrupt = (count + LATCH/2) / LATCH; - - - /* catch corner case where tick rollover occured - * between cyclone and pit reads (as noted when - * usec delta is > 90% # of usecs/tick) - */ - if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ)) - jiffies_64++; -} - -static unsigned long get_offset_cyclone(void) -{ - u32 offset; - - if(!cyclone_timer) - return delay_at_last_interrupt; - - /* Read the cyclone timer */ - offset = cyclone_timer[0]; - - /* .. relative to previous jiffy */ - offset = offset - last_cyclone_low; - - /* convert cyclone ticks to microseconds */ - /* XXX slow, can we speed this up? 
*/ - offset = offset/(CYCLONE_TIMER_FREQ/1000000); - - /* our adjusted time offset in microseconds */ - return delay_at_last_interrupt + offset; -} - -static unsigned long long monotonic_clock_cyclone(void) -{ - u32 now_low, now_high; - unsigned long long last_offset, this_offset, base; - unsigned long long ret; - unsigned seq; - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - - /* Read the cyclone counter */ - read_cyclone_counter(now_low,now_high); - this_offset = ((unsigned long long)now_high<<32)|now_low; - - /* convert to nanoseconds */ - ret = base + ((this_offset - last_offset)&CYCLONE_TIMER_MASK); - return ret * (1000000000 / CYCLONE_TIMER_FREQ); -} - -static int __init init_cyclone(char* override) -{ - u32* reg; - u32 base; /* saved cyclone base address */ - u32 pageaddr; /* page that contains cyclone_timer register */ - u32 offset; /* offset from pageaddr to cyclone_timer register */ - int i; - - /* check clock override */ - if (override[0] && strncmp(override,"cyclone",7)) - return -ENODEV; - - /*make sure we're on a summit box*/ - if(!use_cyclone) return -ENODEV; - - printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n"); - - /* find base address */ - pageaddr = (CYCLONE_CBAR_ADDR)&PAGE_MASK; - offset = (CYCLONE_CBAR_ADDR)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!reg){ - printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n"); - return -ENODEV; - } - base = *reg; - if(!base){ - printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n"); - return -ENODEV; - } - - /* setup PMCC */ - pageaddr = (base + CYCLONE_PMCC_OFFSET)&PAGE_MASK; - offset = (base + CYCLONE_PMCC_OFFSET)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!reg){ - printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n"); - return -ENODEV; - } - reg[0] = 0x00000001; - - /* setup MPCS */ - pageaddr = (base + CYCLONE_MPCS_OFFSET)&PAGE_MASK; - offset = (base + CYCLONE_MPCS_OFFSET)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!reg){ - printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n"); - return -ENODEV; - } - reg[0] = 0x00000001; - - /* map in cyclone_timer */ - pageaddr = (base + CYCLONE_MPMC_OFFSET)&PAGE_MASK; - offset = (base + CYCLONE_MPMC_OFFSET)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - cyclone_timer = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!cyclone_timer){ - printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n"); - return -ENODEV; - } - - /*quick test to make sure its ticking*/ - for(i=0; i<3; i++){ - u32 old = cyclone_timer[0]; - int stall = 100; - while(stall--) barrier(); - if(cyclone_timer[0] == old){ - printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n"); - cyclone_timer = 0; - return -ENODEV; - } - } - - init_cpu_khz(); - - /* Everything looks good! 
*/ - return 0; -} - - -static void delay_cyclone(unsigned long loops) -{ - unsigned long bclock, now; - if(!cyclone_timer) - return; - bclock = cyclone_timer[0]; - do { - rep_nop(); - now = cyclone_timer[0]; - } while ((now-bclock) < loops); -} -/************************************************************/ - -/* cyclone timer_opts struct */ -static struct timer_opts timer_cyclone = { - .name = "cyclone", - .mark_offset = mark_offset_cyclone, - .get_offset = get_offset_cyclone, - .monotonic_clock = monotonic_clock_cyclone, - .delay = delay_cyclone, -}; - -struct init_timer_opts __initdata timer_cyclone_init = { - .init = init_cyclone, - .opts = &timer_cyclone, -}; Index: linux.prev/arch/i386/kernel/timers/timer_hpet.c =================================================================== --- linux.prev.orig/arch/i386/kernel/timers/timer_hpet.c +++ /dev/null @@ -1,217 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "io_ports.h" -#include "mach_timer.h" -#include - -static unsigned long hpet_usec_quotient __read_mostly; /* convert hpet clks to usec */ -static unsigned long tsc_hpet_quotient __read_mostly; /* convert tsc to hpet clks */ -static unsigned long hpet_last; /* hpet counter value at last tick*/ -static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ -static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better percision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" 
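The scaling trick above is easy to sanity-check numerically: for an assumed cpu_khz of 2000000 (2 GHz), cyc2ns_scale = (10^6 << 10) / 2000000 = 512, i.e. each cycle contributes 512/1024 = 0.5 ns, exactly what a 2 GHz clock should give. A runnable userspace check:

#include <stdint.h>
#include <stdio.h>

#define SC 10	/* CYC2NS_SCALE_FACTOR: 2^10 */

int main(void)
{
	uint32_t cpu_khz = 2000000;			/* assumed 2 GHz */
	uint32_t scale = (1000000 << SC) / cpu_khz;	/* = 512 */
	uint64_t cycles = 2000000000ULL;		/* one second's worth */

	/* prints 1000000000, i.e. one second in ns */
	printf("%llu ns\n", (unsigned long long)((cycles * scale) >> SC));
	return 0;
}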
- */ -static unsigned long cyc2ns_scale; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline void set_cyc2ns_scale(unsigned long cpu_khz) -{ - cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} - -static unsigned long long monotonic_clock_hpet(void) -{ - unsigned long long last_offset, this_offset, base; - unsigned seq; - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return base + cycles_2_ns(this_offset - last_offset); -} - -static unsigned long get_offset_hpet(void) -{ - register unsigned long eax, edx; - - eax = hpet_readl(HPET_COUNTER); - eax -= hpet_last; /* hpet delta */ - eax = min(hpet_tick, eax); - /* - * Time offset = (hpet delta) * ( usecs per HPET clock ) - * = (hpet delta) * ( usecs per tick / HPET clocks per tick) - * = (hpet delta) * ( hpet_usec_quotient ) / (2^32) - * - * Where, - * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick - * - * Using a mull instead of a divl saves some cycles in critical path. - */ - ASM_MUL64_REG(eax, edx, hpet_usec_quotient, eax); - - /* our adjusted time offset in microseconds */ - return edx; -} - -static void mark_offset_hpet(void) -{ - unsigned long long this_offset, last_offset; - unsigned long offset; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - rdtsc(last_tsc_low, last_tsc_high); - - if (hpet_use_timer) - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - offset = hpet_readl(HPET_COUNTER); - if (unlikely(((offset - hpet_last) >= (2*hpet_tick)) && (hpet_last != 0))) { - int lost_ticks = ((offset - hpet_last) / hpet_tick) - 1; - jiffies_64 += lost_ticks; - } - hpet_last = offset; - - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); -} - -static void delay_hpet(unsigned long loops) -{ - unsigned long hpet_start, hpet_end; - unsigned long eax; - - /* loops is the number of cpu cycles. Convert it to hpet clocks */ - ASM_MUL64_REG(eax, loops, tsc_hpet_quotient, loops); - - hpet_start = hpet_readl(HPET_COUNTER); - do { - rep_nop(); - hpet_end = hpet_readl(HPET_COUNTER); - } while ((hpet_end - hpet_start) < (loops)); -} - -static struct timer_opts timer_hpet; - -static int __init init_hpet(char* override) -{ - unsigned long result, remain; - - /* check clock override */ - if (override[0] && strncmp(override,"hpet",4)) - return -ENODEV; - - if (!is_hpet_enabled()) - return -ENODEV; - - printk("Using HPET for gettimeofday\n"); - if (cpu_has_tsc) { - unsigned long tsc_quotient = calibrate_tsc_hpet(&tsc_hpet_quotient); - if (tsc_quotient) { - /* report CPU clock rate in Hz. - * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = - * clock/second. Our precision is about 100 ppm. 
- */ - { unsigned long eax=0, edx=1000; - ASM_DIV64_REG(cpu_khz, edx, tsc_quotient, - eax, edx); - printk("Detected %u.%03u MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - } - set_cyc2ns_scale(cpu_khz); - } - /* set this only when cpu_has_tsc */ - timer_hpet.read_timer = read_timer_tsc; - } - - /* - * Math to calculate hpet to usec multiplier - * Look for the comments at get_offset_hpet() - */ - ASM_DIV64_REG(result, remain, hpet_tick, 0, KERNEL_TICK_USEC); - if (remain > (hpet_tick >> 1)) - result++; /* rounding the result */ - hpet_usec_quotient = result; - - return 0; -} - -static int hpet_resume(void) -{ - write_seqlock(&monotonic_lock); - /* Assume this is the last mark offset time */ - rdtsc(last_tsc_low, last_tsc_high); - - if (hpet_use_timer) - hpet_last = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - hpet_last = hpet_readl(HPET_COUNTER); - write_sequnlock(&monotonic_lock); - return 0; -} -/************************************************************/ - -/* tsc timer_opts struct */ -static struct timer_opts timer_hpet __read_mostly = { - .name = "hpet", - .mark_offset = mark_offset_hpet, - .get_offset = get_offset_hpet, - .monotonic_clock = monotonic_clock_hpet, - .delay = delay_hpet, - .resume = hpet_resume, -}; - -struct init_timer_opts __initdata timer_hpet_init = { - .init = init_hpet, - .opts = &timer_hpet, -}; Index: linux.prev/arch/i386/kernel/timers/timer_none.c =================================================================== --- linux.prev.orig/arch/i386/kernel/timers/timer_none.c +++ /dev/null @@ -1,39 +0,0 @@ -#include -#include - -static void mark_offset_none(void) -{ - /* nothing needed */ -} - -static unsigned long get_offset_none(void) -{ - return 0; -} - -static unsigned long long monotonic_clock_none(void) -{ - return 0; -} - -static void delay_none(unsigned long loops) -{ - int d0; - __asm__ __volatile__( - "\tjmp 1f\n" - ".align 16\n" - "1:\tjmp 2f\n" - ".align 16\n" - "2:\tdecl %0\n\tjns 2b" - :"=&a" (d0) - :"0" (loops)); -} - -/* none timer_opts struct */ -struct timer_opts timer_none = { - .name = "none", - .mark_offset = mark_offset_none, - .get_offset = get_offset_none, - .monotonic_clock = monotonic_clock_none, - .delay = delay_none, -}; Index: linux.prev/arch/i386/kernel/timers/timer_pit.c =================================================================== --- linux.prev.orig/arch/i386/kernel/timers/timer_pit.c +++ /dev/null @@ -1,177 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "do_timer.h" -#include "io_ports.h" - -static int count_p; /* counter in get_offset_pit() */ - -static int __init init_pit(char* override) -{ - /* check clock override */ - if (override[0] && strncmp(override,"pit",3)) - printk(KERN_ERR "Warning: clock= override failed. Defaulting " - "to PIT\n"); - init_cpu_khz(); - count_p = LATCH; - return 0; -} - -static void mark_offset_pit(void) -{ - /* nothing needed */ -} - -static unsigned long long monotonic_clock_pit(void) -{ - return 0; -} - -static void delay_pit(unsigned long loops) -{ - int d0; - __asm__ __volatile__( - "\tjmp 1f\n" - ".align 16\n" - "1:\tjmp 2f\n" - ".align 16\n" - "2:\tdecl %0\n\tjns 2b" - :"=&a" (d0) - :"0" (loops)); -} - - -/* This function must be called with xtime_lock held. - * It was inspired by Steve McCanne's microtime-i386 for BSD. 
-- jrs - * - * However, the pc-audio speaker driver changes the divisor so that - * it gets interrupted rather more often - it loads 64 into the - * counter rather than 11932! This has an adverse impact on - * do_gettimeoffset() -- it stops working! What is also not - * good is that the interval that our timer function gets called - * is no longer 10.0002 ms, but 9.9767 ms. To get around this - * would require using a different timing source. Maybe someone - * could use the RTC - I know that this can interrupt at frequencies - * ranging from 8192Hz to 2Hz. If I had the energy, I'd somehow fix - * it so that at startup, the timer code in sched.c would select - * using either the RTC or the 8253 timer. The decision would be - * based on whether there was any other device around that needed - * to trample on the 8253. I'd set up the RTC to interrupt at 1024 Hz, - * and then do some jiggery to have a version of do_timer that - * advanced the clock by 1/1024 s. Every time that reached over 1/100 - * of a second, then do all the old code. If the time was kept correct - * then do_gettimeoffset could just return 0 - there is no low order - * divider that can be accessed. - * - * Ideally, you would be able to use the RTC for the speaker driver, - * but it appears that the speaker driver really needs interrupt more - * often than every 120 us or so. - * - * Anyway, this needs more thought.... pjsg (1993-08-28) - * - * If you are really that interested, you should be reading - * comp.protocols.time.ntp! - */ - -static unsigned long get_offset_pit(void) -{ - int count; - unsigned long flags; - static unsigned long jiffies_p = 0; - - /* - * cache volatile jiffies temporarily; we have xtime_lock. - */ - unsigned long jiffies_t; - - spin_lock_irqsave(&i8253_lock, flags); - /* timer count may underflow right here */ - outb_p(0x00, PIT_MODE); /* latch the count ASAP */ - - count = inb_p(PIT_CH0); /* read the latched count */ - - /* - * We do this guaranteed double memory access instead of a _p - * postfix in the previous port access. Wheee, hackady hack - */ - jiffies_t = jiffies; - - count |= inb_p(PIT_CH0) << 8; - - /* VIA686a test code... reset the latch if count > max + 1 */ - if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - - /* - * avoiding timer inconsistencies (they are rare, but they happen)... - * there are two kinds of problems that must be avoided here: - * 1. the timer counter underflows - * 2. hardware problem with the timer, not giving us continuous time, - * the counter does small "jumps" upwards on some Pentium systems, - * (see c't 95/10 page 335 for Neptun bug.) 
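Once a sane latched count is in hand, the conversion just below turns it into microseconds within the current tick: the counter falls from LATCH-1 toward 0, so the distance it has already fallen, scaled by the tick length, gives the offset. A minimal sketch of that conversion with explicit rounding, assuming tick_usec is the tick length in microseconds (about 10000 at HZ=100):

/* count: latched PIT value; latch: reload value; tick_usec: usec per tick */
static unsigned long pit_count_to_usec(int count, int latch, int tick_usec)
{
	int elapsed = ((latch - 1) - count) * tick_usec;

	/* adding latch/2 rounds to the nearest microsecond */
	return (elapsed + latch / 2) / latch;
}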
- */ - - if( jiffies_t == jiffies_p ) { - if( count > count_p ) { - /* the nutcase */ - count = do_timer_overflow(count); - } - } else - jiffies_p = jiffies_t; - - count_p = count; - - spin_unlock_irqrestore(&i8253_lock, flags); - - count = ((LATCH-1) - count) * TICK_SIZE; - count = (count + LATCH/2) / LATCH; - - return count; -} - - -/* tsc timer_opts struct */ -struct timer_opts timer_pit = { - .name = "pit", - .mark_offset = mark_offset_pit, - .get_offset = get_offset_pit, - .monotonic_clock = monotonic_clock_pit, - .delay = delay_pit, -}; - -struct init_timer_opts __initdata timer_pit_init = { - .init = init_pit, - .opts = &timer_pit, -}; - -void setup_pit_timer(void) -{ - unsigned long flags; - - spin_lock_irqsave(&i8253_lock, flags); - outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ - udelay(10); - outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ - udelay(10); - outb(LATCH >> 8 , PIT_CH0); /* MSB */ - spin_unlock_irqrestore(&i8253_lock, flags); -} Index: linux.prev/arch/i386/kernel/timers/timer_pm.c =================================================================== --- linux.prev.orig/arch/i386/kernel/timers/timer_pm.c +++ /dev/null @@ -1,268 +0,0 @@ -/* - * (C) Dominik Brodowski 2003 - * - * Driver to use the Power Management Timer (PMTMR) available in some - * southbridges as primary timing source for the Linux kernel. - * - * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, - * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. - * - * This file is licensed under the GPL v2. - */ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "mach_timer.h" - -/* Number of PMTMR ticks expected during calibration run */ -#define PMTMR_TICKS_PER_SEC 3579545 -#define PMTMR_EXPECTED_RATE \ - ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (CLOCK_TICK_RATE>>10)) - - -/* The I/O port the PMTMR resides at. - * The location is detected during setup_arch(), - * in arch/i386/acpi/boot.c */ -u32 pmtmr_ioport = 0; - - -/* value of the Power timer at last timer interrupt */ -static u32 offset_tick; -static u32 offset_delay; - -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ - -/*helper function to safely read acpi pm timesource*/ -static inline u32 read_pmtmr(void) -{ - u32 v1=0,v2=0,v3=0; - /* It has been reported that because of various broken - * chipsets (ICH4, PIIX4 and PIIX4E) where the ACPI PM time - * source is not latched, so you must read it multiple - * times to insure a safe value is read. - */ - do { - v1 = inl(pmtmr_ioport); - v2 = inl(pmtmr_ioport); - v3 = inl(pmtmr_ioport); - } while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1) - || (v3 > v1 && v3 < v2)); - - /* mask the output to 24 bits */ - return v2 & ACPI_PM_MASK; -} - - -/* - * Some boards have the PMTMR running way too fast. We check - * the PMTMR rate against PIT channel 2 to catch these cases. 
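The triple read in read_pmtmr() above is the standard idiom for an unlatched counter that can glitch mid-read: sample three times and only trust the middle value once it sits between its neighbours (modulo wrap). A condensed, self-contained version, with read_reg() as a stand-in for inl(pmtmr_ioport):

static unsigned int read_unlatched(unsigned int (*read_reg)(void))
{
	unsigned int v1, v2, v3;

	do {
		v1 = read_reg();
		v2 = read_reg();
		v3 = read_reg();
	} while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1)
			|| (v3 > v1 && v3 < v2));

	/* v2 is bracketed by v1 and v3, so it cannot be a stray glitch */
	return v2;
}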
- */ -static int verify_pmtmr_rate(void) -{ - u32 value1, value2; - unsigned long count, delta; - - mach_prepare_counter(); - value1 = read_pmtmr(); - mach_countup(&count); - value2 = read_pmtmr(); - delta = (value2 - value1) & ACPI_PM_MASK; - - /* Check that the PMTMR delta is within 5% of what we expect */ - if (delta < (PMTMR_EXPECTED_RATE * 19) / 20 || - delta > (PMTMR_EXPECTED_RATE * 21) / 20) { - printk(KERN_INFO "PM-Timer running at invalid rate: %lu%% of normal - aborting.\n", 100UL * delta / PMTMR_EXPECTED_RATE); - return -1; - } - - return 0; -} - - -static int init_pmtmr(char* override) -{ - u32 value1, value2; - unsigned int i; - - if (override[0] && strncmp(override,"pmtmr",5)) - return -ENODEV; - - if (!pmtmr_ioport) - return -ENODEV; - - /* we use the TSC for delay_pmtmr, so make sure it exists */ - if (!cpu_has_tsc) - return -ENODEV; - - /* "verify" this timing source */ - value1 = read_pmtmr(); - for (i = 0; i < 10000; i++) { - value2 = read_pmtmr(); - if (value2 == value1) - continue; - if (value2 > value1) - goto pm_good; - if ((value2 < value1) && ((value2) < 0xFFF)) - goto pm_good; - printk(KERN_INFO "PM-Timer had inconsistent results: 0x%#x, 0x%#x - aborting.\n", value1, value2); - return -EINVAL; - } - printk(KERN_INFO "PM-Timer had no reasonable result: 0x%#x - aborting.\n", value1); - return -ENODEV; - -pm_good: - if (verify_pmtmr_rate() != 0) - return -ENODEV; - - init_cpu_khz(); - return 0; -} - -static inline u32 cyc2us(u32 cycles) -{ - /* The Power Management Timer ticks at 3.579545 ticks per microsecond. - * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] - * - * Even with HZ = 100, delta is at maximum 35796 ticks, so it can - * easily be multiplied with 286 (=0x11E) without having to fear - * u32 overflows. 
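The 286/1024 constant above deserves a numeric check: the ACPI PM timer ticks at 3579545 Hz, i.e. 0.27936511 us per tick, while 286/1024 = 0.27929688, within the quoted 0.024%. For one full HZ=100 tick (35796 PM timer ticks), (35796 * 286) >> 10 = 9997 us against a true 10000 us. A runnable check of that worked example:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t ticks = 35796;	/* one 10 ms tick at HZ=100 */

	/* prints 9997 (true value ~10000; about 0.024% low) */
	printf("%u us\n", (ticks * 286) >> 10);
	return 0;
}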
- */ - cycles *= 286; - return (cycles >> 10); -} - -/* - * this gets called during each timer interrupt - * - Called while holding the writer xtime_lock - */ -static void mark_offset_pmtmr(void) -{ - u32 lost, delta, last_offset; - static int first_run = 1; - last_offset = offset_tick; - - write_seqlock(&monotonic_lock); - - offset_tick = read_pmtmr(); - - /* calculate tick interval */ - delta = (offset_tick - last_offset) & ACPI_PM_MASK; - - /* convert to usecs */ - delta = cyc2us(delta); - - /* update the monotonic base value */ - monotonic_base += delta * NSEC_PER_USEC; - write_sequnlock(&monotonic_lock); - - /* convert to ticks */ - delta += offset_delay; - lost = delta / (USEC_PER_SEC / HZ); - offset_delay = delta % (USEC_PER_SEC / HZ); - - - /* compensate for lost ticks */ - if (lost >= 2) - jiffies_64 += lost - 1; - - /* don't calculate delay for first run, - or if we've got less then a tick */ - if (first_run || (lost < 1)) { - first_run = 0; - offset_delay = 0; - } -} - -static int pmtmr_resume(void) -{ - write_seqlock(&monotonic_lock); - /* Assume this is the last mark offset time */ - offset_tick = read_pmtmr(); - write_sequnlock(&monotonic_lock); - return 0; -} - -static unsigned long long monotonic_clock_pmtmr(void) -{ - u32 last_offset, this_offset; - unsigned long long base, ret; - unsigned seq; - - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = offset_tick; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - /* Read the pmtmr */ - this_offset = read_pmtmr(); - - /* convert to nanoseconds */ - ret = (this_offset - last_offset) & ACPI_PM_MASK; - ret = base + (cyc2us(ret) * NSEC_PER_USEC); - return ret; -} - -static void delay_pmtmr(unsigned long loops) -{ - unsigned long bclock, now; - - rdtscl(bclock); - do - { - rep_nop(); - rdtscl(now); - } while ((now-bclock) < loops); -} - - -/* - * get the offset (in microseconds) from the last call to mark_offset() - * - Called holding a reader xtime_lock - */ -static unsigned long get_offset_pmtmr(void) -{ - u32 now, offset, delta = 0; - - offset = offset_tick; - now = read_pmtmr(); - delta = (now - offset)&ACPI_PM_MASK; - - return (unsigned long) offset_delay + cyc2us(delta); -} - - -/* acpi timer_opts struct */ -static struct timer_opts timer_pmtmr = { - .name = "pmtmr", - .mark_offset = mark_offset_pmtmr, - .get_offset = get_offset_pmtmr, - .monotonic_clock = monotonic_clock_pmtmr, - .delay = delay_pmtmr, - .read_timer = read_timer_tsc, - .resume = pmtmr_resume, -}; - -struct init_timer_opts __initdata timer_pmtmr_init = { - .init = init_pmtmr, - .opts = &timer_pmtmr, -}; - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Dominik Brodowski "); -MODULE_DESCRIPTION("Power Management Timer (PMTMR) as primary timing source for x86"); Index: linux.prev/arch/i386/kernel/timers/timer_tsc.c =================================================================== --- linux.prev.orig/arch/i386/kernel/timers/timer_tsc.c +++ /dev/null @@ -1,600 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. - * - * 2004-06-25 Jesper Juhl - * moved mark_offset_tsc below cpufreq_delayed_get to avoid gcc 3.4 - * failing to inline. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -/* processor.h for distable_tsc flag */ -#include - -#include "io_ports.h" -#include "mach_timer.h" - -#include -#include - -#ifdef CONFIG_HPET_TIMER -static unsigned long hpet_usec_quotient; -static unsigned long hpet_last; -static struct timer_opts timer_tsc; -#endif - -static inline void cpufreq_delayed_get(void); - -int tsc_disable __devinitdata = 0; - -static int use_tsc; -/* Number of usecs that the last interrupt was delayed */ -static int delay_at_last_interrupt; - -static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ -static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better percision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ -static unsigned long cyc2ns_scale; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline void set_cyc2ns_scale(unsigned long cpu_khz) -{ - cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} - -static int count2; /* counter for mark_offset_tsc() */ - -/* Cached *multiplier* to convert TSC counts to microseconds. - * (see the equation below). - * Equal to 2^32 * (1 / (clocks per usec) ). - * Initialized in time_init. - */ -static unsigned long fast_gettimeoffset_quotient; - -static unsigned long get_offset_tsc(void) -{ - register unsigned long eax, edx; - - /* Read the Time Stamp Counter */ - - rdtsc(eax,edx); - - /* .. relative to previous jiffy (32 bits is enough) */ - eax -= last_tsc_low; /* tsc_low delta */ - - /* - * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient - * = (tsc_low delta) * (usecs_per_clock) - * = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy) - * - * Using a mull instead of a divl saves up to 31 clock cycles - * in the critical path. - */ - - __asm__("mull %2" - :"=a" (eax), "=d" (edx) - :"rm" (fast_gettimeoffset_quotient), - "0" (eax)); - - /* our adjusted time offset in microseconds */ - return delay_at_last_interrupt + edx; -} - -static unsigned long long monotonic_clock_tsc(void) -{ - unsigned long long last_offset, this_offset, base; - unsigned seq; - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return base + cycles_2_ns(this_offset - last_offset); -} - -/* - * Scheduler clock - returns current time in nanosec units. 
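Before the scheduler clock below, note that monotonic_clock_tsc() above and its cyclone, hpet and pmtmr siblings all share one shape: a nanosecond base accumulated at every tick under the seqlock, plus the free-running delta since that tick. A condensed userspace model of the idea (locking elided here; the seqlock sketch earlier shows it), with a toy counter where one count equals 10 ns:

#include <stdint.h>
#include <stdio.h>

static uint64_t base_ns;	/* accumulated at each tick */
static uint64_t last_count;	/* raw counter value at last tick */

static uint64_t to_ns(uint64_t delta) { return delta * 10; }

/* writer side: fold the elapsed counter delta into the base */
static void tick(uint64_t now)
{
	base_ns += to_ns(now - last_count);
	last_count = now;
}

/* reader side: base plus whatever has elapsed since the tick */
static uint64_t monotonic(uint64_t now)
{
	return base_ns + to_ns(now - last_count);
}

int main(void)
{
	tick(100);
	/* prints 1500: 1000 ns banked at the tick + 500 ns since */
	printf("%llu\n", (unsigned long long)monotonic(150));
	return 0;
}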
- */ -unsigned long long sched_clock(void) -{ - unsigned long long this_offset; - - /* - * In the NUMA case we dont use the TSC as they are not - * synchronized across all CPUs. - */ -#ifndef CONFIG_NUMA - if (!use_tsc) -#endif - /* no locking but a rare wrong value is not a big deal */ - return jiffies_64 * (1000000000 / HZ); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return cycles_2_ns(this_offset); -} - -static void delay_tsc(unsigned long loops) -{ - unsigned long bclock, now; - - rdtscl(bclock); - do - { - rep_nop(); - rdtscl(now); - } while ((now-bclock) < loops); -} - -#ifdef CONFIG_HPET_TIMER -static void mark_offset_tsc_hpet(void) -{ - unsigned long long this_offset, last_offset; - unsigned long offset, temp, hpet_current; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - /* - * It is important that these two operations happen almost at - * the same time. We do the RDTSC stuff first, since it's - * faster. To avoid any inconsistencies, we need interrupts - * disabled locally. - */ - /* - * Interrupts are just disabled locally since the timer irq - * has the SA_INTERRUPT flag set. -arca - */ - /* read Pentium cycle counter */ - - hpet_current = hpet_readl(HPET_COUNTER); - rdtsc(last_tsc_low, last_tsc_high); - - /* lost tick compensation */ - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - if (unlikely(((offset - hpet_last) > hpet_tick) && (hpet_last != 0))) { - int lost_ticks = (offset - hpet_last) / hpet_tick; - jiffies_64 += lost_ticks; - } - hpet_last = hpet_current; - - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); - - /* calculate delay_at_last_interrupt */ - /* - * Time offset = (hpet delta) * ( usecs per HPET clock ) - * = (hpet delta) * ( usecs per tick / HPET clocks per tick) - * = (hpet delta) * ( hpet_usec_quotient ) / (2^32) - * Where, - * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick - */ - delay_at_last_interrupt = hpet_current - offset; - ASM_MUL64_REG(temp, delay_at_last_interrupt, - hpet_usec_quotient, delay_at_last_interrupt); -} -#endif - - -#ifdef CONFIG_CPU_FREQ -#include - -static unsigned int cpufreq_delayed_issched = 0; -static unsigned int cpufreq_init = 0; -static struct work_struct cpufreq_delayed_get_work; - -static void handle_cpufreq_delayed_get(void *v) -{ - unsigned int cpu; - for_each_online_cpu(cpu) { - cpufreq_get(cpu); - } - cpufreq_delayed_issched = 0; -} - -/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries - * to verify the CPU frequency the timing core thinks the CPU is running - * at is still correct. - */ -static inline void cpufreq_delayed_get(void) -{ - if (cpufreq_init && !cpufreq_delayed_issched) { - cpufreq_delayed_issched = 1; - printk(KERN_DEBUG "Losing some ticks... checking if CPU frequency changed.\n"); - schedule_work(&cpufreq_delayed_get_work); - } -} - -/* If the CPU frequency is scaled, TSC-based delays will need a different - * loops_per_jiffy value to function properly. 
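The rescaling the notifier below applies via cpufreq_scale() is, in essence, old * new / ref carried out in 64 bits so the product cannot overflow. For example, a CPU calibrated at 1000000 kHz with loops_per_jiffy = 500000 that drops to 600000 kHz ends up with 500000 * 600000 / 1000000 = 300000. A hedged sketch of that arithmetic, not the kernel's exact helper:

#include <stdint.h>

static unsigned long example_scale(unsigned long old,
				   uint32_t ref_khz, uint32_t new_khz)
{
	/* widen before multiplying: 500000 * 600000 needs 64 bits */
	return (unsigned long)((uint64_t)old * new_khz / ref_khz);
}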
- */ - -static unsigned int ref_freq = 0; -static unsigned long loops_per_jiffy_ref = 0; - -#ifndef CONFIG_SMP -static unsigned long fast_gettimeoffset_ref = 0; -static unsigned int cpu_khz_ref = 0; -#endif - -static int -time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct cpufreq_freqs *freq = data; - - if (val != CPUFREQ_RESUMECHANGE) - write_seqlock_irq(&xtime_lock); - if (!ref_freq) { - ref_freq = freq->old; - loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; -#ifndef CONFIG_SMP - fast_gettimeoffset_ref = fast_gettimeoffset_quotient; - cpu_khz_ref = cpu_khz; -#endif - } - - if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - cpu_data[freq->cpu].loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); -#ifndef CONFIG_SMP - if (cpu_khz) - cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); - if (use_tsc) { - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { - fast_gettimeoffset_quotient = cpufreq_scale(fast_gettimeoffset_ref, freq->new, ref_freq); - set_cyc2ns_scale(cpu_khz); - } - } -#endif - } - - if (val != CPUFREQ_RESUMECHANGE) - write_sequnlock_irq(&xtime_lock); - - return 0; -} - -static struct notifier_block time_cpufreq_notifier_block = { - .notifier_call = time_cpufreq_notifier -}; - - -static int __init cpufreq_tsc(void) -{ - int ret; - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); - ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - if (!ret) - cpufreq_init = 1; - return ret; -} -core_initcall(cpufreq_tsc); - -#else /* CONFIG_CPU_FREQ */ -static inline void cpufreq_delayed_get(void) { return; } -#endif - -int recalibrate_cpu_khz(void) -{ -#ifndef CONFIG_SMP - unsigned int cpu_khz_old = cpu_khz; - - if (cpu_has_tsc) { - init_cpu_khz(); - cpu_data[0].loops_per_jiffy = - cpufreq_scale(cpu_data[0].loops_per_jiffy, - cpu_khz_old, - cpu_khz); - return 0; - } else - return -ENODEV; -#else - return -ENODEV; -#endif -} -EXPORT_SYMBOL(recalibrate_cpu_khz); - -static void mark_offset_tsc(void) -{ - unsigned long lost,delay; - unsigned long delta = last_tsc_low; - int count; - int countmp; - static int count1 = 0; - unsigned long long this_offset, last_offset; - static int lost_count = 0; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - /* - * It is important that these two operations happen almost at - * the same time. We do the RDTSC stuff first, since it's - * faster. To avoid any inconsistencies, we need interrupts - * disabled locally. - */ - - /* - * Interrupts are just disabled locally since the timer irq - * has the SA_INTERRUPT flag set. -arca - */ - - /* read Pentium cycle counter */ - - rdtsc(last_tsc_low, last_tsc_high); - - spin_lock(&i8253_lock); - outb_p(0x00, PIT_MODE); /* latch the count ASAP */ - - count = inb_p(PIT_CH0); /* read the latched count */ - count |= inb(PIT_CH0) << 8; - - /* - * VIA686a test code... 
reset the latch if count > max + 1 - * from timer_pit.c - cjb - */ - if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - - spin_unlock(&i8253_lock); - - if (pit_latch_buggy) { - /* get center value of last 3 time lutch */ - if ((count2 >= count && count >= count1) - || (count1 >= count && count >= count2)) { - count2 = count1; count1 = count; - } else if ((count1 >= count2 && count2 >= count) - || (count >= count2 && count2 >= count1)) { - countmp = count;count = count2; - count2 = count1;count1 = countmp; - } else { - count2 = count1; count1 = count; count = count1; - } - } - - /* lost tick compensation */ - delta = last_tsc_low - delta; - { - register unsigned long eax, edx; - eax = delta; - __asm__("mull %2" - :"=a" (eax), "=d" (edx) - :"rm" (fast_gettimeoffset_quotient), - "0" (eax)); - delta = edx; - } - delta += delay_at_last_interrupt; - lost = delta/(1000000/HZ); - delay = delta%(1000000/HZ); - if (lost >= 2) { - jiffies_64 += lost-1; - - /* sanity check to ensure we're not always losing ticks */ - if (lost_count++ > 100) { - printk(KERN_WARNING "Losing too many ticks!\n"); - printk(KERN_WARNING "TSC cannot be used as a timesource. \n"); - printk(KERN_WARNING "Possible reasons for this are:\n"); - printk(KERN_WARNING " You're running with Speedstep,\n"); - printk(KERN_WARNING " You don't have DMA enabled for your hard disk (see hdparm),\n"); - printk(KERN_WARNING " Incorrect TSC synchronization on an SMP system (see dmesg).\n"); - printk(KERN_WARNING "Falling back to a sane timesource now.\n"); - - clock_fallback(); - } - /* ... but give the TSC a fair chance */ - if (lost_count > 25) - cpufreq_delayed_get(); - } else - lost_count = 0; - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); - - /* calculate delay_at_last_interrupt */ - count = ((LATCH-1) - count) * TICK_SIZE; - delay_at_last_interrupt = (count + LATCH/2) / LATCH; - - /* catch corner case where tick rollover occured - * between tsc and pit reads (as noted when - * usec delta is > 90% # of usecs/tick) - */ - if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ)) - jiffies_64++; -} - -static int __init init_tsc(char* override) -{ - - /* check clock override */ - if (override[0] && strncmp(override,"tsc",3)) { -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled()) { - printk(KERN_ERR "Warning: clock= override failed. Defaulting to tsc\n"); - } else -#endif - { - return -ENODEV; - } - } - - /* - * If we have APM enabled or the CPU clock speed is variable - * (CPU stops clock on HLT or slows clock to save power) - * then the TSC timestamps may diverge by up to 1 jiffy from - * 'real time' but nothing will break. - * The most frequent case is that the CPU is "woken" from a halt - * state by the timer interrupt itself, so we get 0 error. In the - * rare cases where a driver would "wake" the CPU and request a - * timestamp, the maximum error is < 1 jiffy. But timestamps are - * still perfectly ordered. - * Note that the TSC counter will be reset if APM suspends - * to disk; this won't break the kernel, though, 'cuz we're - * smart. See arch/i386/kernel/apm.c. - */ - /* - * Firstly we have to do a CPU check for chips with - * a potentially buggy TSC. At this point we haven't run - * the ident/bugs checks so we must run this hook as it - * may turn off the TSC flag. 
- * - * NOTE: this doesn't yet handle SMP 486 machines where only - * some CPU's have a TSC. Thats never worked and nobody has - * moaned if you have the only one in the world - you fix it! - */ - - count2 = LATCH; /* initialize counter for mark_offset_tsc() */ - - if (cpu_has_tsc) { - unsigned long tsc_quotient; -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled() && hpet_use_timer) { - unsigned long result, remain; - printk("Using TSC for gettimeofday\n"); - tsc_quotient = calibrate_tsc_hpet(NULL); - timer_tsc.mark_offset = &mark_offset_tsc_hpet; - /* - * Math to calculate hpet to usec multiplier - * Look for the comments at get_offset_tsc_hpet() - */ - ASM_DIV64_REG(result, remain, hpet_tick, - 0, KERNEL_TICK_USEC); - if (remain > (hpet_tick >> 1)) - result++; /* rounding the result */ - - hpet_usec_quotient = result; - } else -#endif - { - tsc_quotient = calibrate_tsc(); - } - - if (tsc_quotient) { - fast_gettimeoffset_quotient = tsc_quotient; - use_tsc = 1; - /* - * We could be more selective here I suspect - * and just enable this for the next intel chips ? - */ - /* report CPU clock rate in Hz. - * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = - * clock/second. Our precision is about 100 ppm. - */ - { unsigned long eax=0, edx=1000; - __asm__("divl %2" - :"=a" (cpu_khz), "=d" (edx) - :"r" (tsc_quotient), - "0" (eax), "1" (edx)); - printk("Detected %u.%03u MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - } - set_cyc2ns_scale(cpu_khz); - return 0; - } - } - return -ENODEV; -} - -static int tsc_resume(void) -{ - write_seqlock(&monotonic_lock); - /* Assume this is the last mark offset time */ - rdtsc(last_tsc_low, last_tsc_high); -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled() && hpet_use_timer) - hpet_last = hpet_readl(HPET_COUNTER); -#endif - write_sequnlock(&monotonic_lock); - return 0; -} - -#ifndef CONFIG_X86_TSC -/* disable flag for tsc. 
Takes effect by clearing the TSC cpu flag - * in cpu/common.c */ -static int __init tsc_setup(char *str) -{ - tsc_disable = 1; - return 1; -} -#else -static int __init tsc_setup(char *str) -{ - printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " - "cannot disable TSC.\n"); - return 1; -} -#endif -__setup("notsc", tsc_setup); - - - -/************************************************************/ - -/* tsc timer_opts struct */ -static struct timer_opts timer_tsc = { - .name = "tsc", - .mark_offset = mark_offset_tsc, - .get_offset = get_offset_tsc, - .monotonic_clock = monotonic_clock_tsc, - .delay = delay_tsc, - .read_timer = read_timer_tsc, - .resume = tsc_resume, -}; - -struct init_timer_opts __initdata timer_tsc_init = { - .init = init_tsc, - .opts = &timer_tsc, -}; Index: linux.prev/arch/i386/kernel/traps.c =================================================================== --- linux.prev.orig/arch/i386/kernel/traps.c +++ linux.prev/arch/i386/kernel/traps.c @@ -93,7 +93,7 @@ asmlinkage void machine_check(void); static int kstack_depth_to_print = 24; struct notifier_block *i386die_chain; -static DEFINE_SPINLOCK(die_notifier_lock); +static DEFINE_RAW_SPINLOCK(die_notifier_lock); int register_die_notifier(struct notifier_block *nb) { @@ -116,22 +116,27 @@ static inline unsigned long print_contex unsigned long *stack, unsigned long ebp) { unsigned long addr; +#ifndef CONFIG_FRAME_POINTER + unsigned long prev_frame; +#endif -#ifdef CONFIG_FRAME_POINTER +#ifdef CONFIG_FRAME_POINTER while (valid_stack_ptr(tinfo, (void *)ebp)) { addr = *(unsigned long *)(ebp + 4); printk(" [<%08lx>] ", addr); print_symbol("%s", addr); - printk("\n"); + printk(" (%ld)\n", *(unsigned long *)ebp - ebp); ebp = *(unsigned long *)ebp; } #else + prev_frame = (unsigned long)stack; while (valid_stack_ptr(tinfo, stack)) { addr = *stack++; if (__kernel_text_address(addr)) { printk(" [<%08lx>]", addr); print_symbol(" %s", addr); - printk("\n"); + printk(" (%ld)\n", (unsigned long)stack - prev_frame); + prev_frame = (unsigned long)stack; } } #endif @@ -163,6 +168,8 @@ void show_trace(struct task_struct *task break; printk(" =======================\n"); } + print_traces(task); + show_held_locks(task); } void show_stack(struct task_struct *task, unsigned long *esp) @@ -201,6 +208,12 @@ void dump_stack(void) EXPORT_SYMBOL(dump_stack); +#if defined(CONFIG_DEBUG_STACKOVERFLOW) && defined(CONFIG_LATENCY_TRACE) +extern unsigned long worst_stack_left; +#else +# define worst_stack_left -1L +#endif + void show_registers(struct pt_regs *regs) { int i; @@ -225,10 +238,17 @@ void show_registers(struct pt_regs *regs regs->eax, regs->ebx, regs->ecx, regs->edx); printk("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", regs->esi, regs->edi, regs->ebp, esp); - printk("ds: %04x es: %04x ss: %04x\n", - regs->xds & 0xffff, regs->xes & 0xffff, ss); - printk("Process %s (pid: %d, threadinfo=%p task=%p)", + printk("ds: %04x es: %04x ss: %04x preempt: %08x\n", + regs->xds & 0xffff, regs->xes & 0xffff, ss, preempt_count()); + printk("Process %s (pid: %d, threadinfo=%p task=%p", current->comm, current->pid, current_thread_info(), current); + + if (in_kernel) + printk(" stack_left=%ld worst_left=%ld)", + (esp & (THREAD_SIZE-1))-sizeof(struct thread_info), + worst_stack_left); + else + printk(")"); /* * When in-kernel, we also print out the stack and code at the * time of the fault.. 
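The stack_left figure printed above leans on the i386 stack layout: kernel stacks are THREAD_SIZE-aligned with struct thread_info at the bottom, so masking esp with THREAD_SIZE-1 yields the offset into the stack area, and subtracting sizeof(struct thread_info) leaves the bytes still free before a growing stack would trample thread_info. As a one-line helper, a sketch mirroring the expression in the printk above:

static inline long stack_bytes_left(unsigned long esp)
{
	/* offset into the aligned stack, minus the reserved bottom */
	return (esp & (THREAD_SIZE - 1)) - sizeof(struct thread_info);
}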
@@ -297,11 +317,11 @@ bug: void die(const char * str, struct pt_regs * regs, long err) { static struct { - spinlock_t lock; + raw_spinlock_t lock; u32 lock_owner; int lock_owner_depth; } die = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED, .lock_owner = -1, .lock_owner_depth = 0 }; @@ -379,6 +399,11 @@ static void __kprobes do_trap(int trapnr if (!user_mode(regs)) goto kernel_trap; +#ifdef CONFIG_PREEMPT_RT + raw_local_irq_enable(); + preempt_check_resched(); +#endif + trap_signal: { if (info) force_sig_info(signr, info, tsk); @@ -509,7 +534,7 @@ fastcall void __kprobes do_general_prote return; gp_in_vm86: - local_irq_enable(); + raw_local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); return; @@ -563,10 +588,12 @@ static void unknown_nmi_error(unsigned c printk("Do you have a strange power saving mode enabled?\n"); } -static DEFINE_SPINLOCK(nmi_print_lock); +static DEFINE_RAW_SPINLOCK(nmi_print_lock); void die_nmi (struct pt_regs *regs, const char *msg) { + deadlock_trace_off(); + if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 0, SIGINT) == NOTIFY_STOP) return; @@ -594,10 +621,11 @@ void die_nmi (struct pt_regs *regs, cons crash_kexec(regs); } + nmi_exit(); do_exit(SIGSEGV); } -static void default_do_nmi(struct pt_regs * regs) +static void notrace default_do_nmi(struct pt_regs * regs) { unsigned char reason = 0; @@ -616,6 +644,7 @@ static void default_do_nmi(struct pt_reg */ if (nmi_watchdog) { nmi_watchdog_tick(regs); +// trace_special(6, 1, 0); return; } #endif @@ -635,18 +664,19 @@ static void default_do_nmi(struct pt_reg reassert_nmi(); } -static int dummy_nmi_callback(struct pt_regs * regs, int cpu) +static notrace int dummy_nmi_callback(struct pt_regs * regs, int cpu) { return 0; } static nmi_callback_t nmi_callback = dummy_nmi_callback; -fastcall void do_nmi(struct pt_regs * regs, long error_code) +fastcall notrace void do_nmi(struct pt_regs * regs, long error_code) { int cpu; nmi_enter(); + nmi_trace((unsigned long)do_nmi, regs->eip, regs->eflags); cpu = smp_processor_id(); @@ -717,7 +747,7 @@ fastcall void __kprobes do_debug(struct return; /* It's safe to allow irq's after DR6 has been saved */ if (regs->eflags & X86_EFLAGS_IF) - local_irq_enable(); + raw_local_irq_enable(); /* Mask out spurious debug traps due to lazy DR7 setting */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { Index: linux.prev/arch/i386/kernel/tsc.c =================================================================== --- /dev/null +++ linux.prev/arch/i386/kernel/tsc.c @@ -0,0 +1,395 @@ +/* + * This code largely moved from arch/i386/kernel/timer/timer_tsc.c + * which was originally moved from arch/i386/kernel/time.c. + * See comments there for proper credits. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "mach_timer.h" + +/* + * On some systems the TSC frequency does not + * change with the cpu frequency. So we need + * an extra value to store the TSC freq + */ +unsigned int tsc_khz; + +int tsc_disable __initdata = 0; + +#ifdef CONFIG_X86_TSC +static int __init tsc_setup(char *str) +{ + printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " + "cannot disable TSC.\n"); + return 1; +} +#else +/* + * disable flag for tsc. 
Takes effect by clearing the TSC cpu flag
+ * in cpu/common.c
+ */
+static int __init tsc_setup(char *str)
+{
+	tsc_disable = 1;
+
+	return 1;
+}
+#endif
+
+__setup("notsc", tsc_setup);
+
+/*
+ * code to mark and check if the TSC is unstable
+ * due to cpufreq or due to unsynced TSCs
+ */
+static int tsc_unstable;
+
+static inline int check_tsc_unstable(void)
+{
+	return tsc_unstable;
+}
+
+void mark_tsc_unstable(void)
+{
+	tsc_unstable = 1;
+}
+EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+
+/* Accelerators for sched_clock()
+ * convert from cycles(64bits) => nanoseconds (64bits)
+ * basic equation:
+ *	ns = cycles / (freq / ns_per_sec)
+ *	ns = cycles * (ns_per_sec / freq)
+ *	ns = cycles * (10^9 / (cpu_khz * 10^3))
+ *	ns = cycles * (10^6 / cpu_khz)
+ *
+ * Then we use scaling math (suggested by george@mvista.com) to get:
+ *	ns = cycles * (10^6 * SC / cpu_khz) / SC
+ *	ns = cycles * cyc2ns_scale / SC
+ *
+ * And since SC is a constant power of two, we can convert the div
+ * into a shift.
+ * (e.g. cpu_khz = 2000000 gives cyc2ns_scale = (10^6 << 10) / 2000000
+ *  = 512, so ns = cycles * 512 >> 10 = cycles / 2 for a 2 GHz TSC)
+ *
+ * We can use khz divisor instead of mhz to keep a better precision, since
+ * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ * (mathieu.desnoyers@polymtl.ca)
+ *
+ *	-johnstul@us.ibm.com "math is hard, let's go shopping!"
+ */
+static unsigned long cyc2ns_scale;
+
+#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
+
+static inline void set_cyc2ns_scale(unsigned long cpu_khz)
+{
+	cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
+}
+
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
+}
+
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ */
+unsigned long long sched_clock(void)
+{
+	unsigned long long this_offset;
+
+	/*
+	 * in the NUMA case we don't use the TSC as they are not
+	 * synchronized across all CPUs.
+	 */
+#ifndef CONFIG_NUMA
+	if (!cpu_khz || check_tsc_unstable())
+#endif
+		/* no locking but a rare wrong value is not a big deal */
+		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
+
+	/* read the Time Stamp Counter: */
+	rdtscll(this_offset);
+
+	/* return the value in ns */
+	return cycles_2_ns(this_offset);
+}
+
+static unsigned long calculate_cpu_khz(void)
+{
+	unsigned long long start, end;
+	unsigned long count;
+	u64 delta64;
+	int i;
+	unsigned long flags;
+
+	raw_local_irq_save(flags);
+
+	/* run 3 times to ensure the cache is warm */
+	for (i = 0; i < 3; i++) {
+		mach_prepare_counter();
+		rdtscll(start);
+		mach_countup(&count);
+		rdtscll(end);
+	}
+	/*
+	 * Error: ECTCNEVERSET
+	 * The CTC wasn't reliable: we got a hit on the very first read,
+	 * or the CPU was so fast/slow that the quotient wouldn't fit in
+	 * 32 bits..
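+	 * (Editor's illustration of the math below, assuming the usual
+	 * CALIBRATE_TIME_MSEC of 30 from mach_timer.h: a 2.4 GHz CPU
+	 * retires ~72,000,000 TSC cycles while the CTC counts down 30 ms,
+	 * and 72,000,000 / 30 = 2,400,000 cycles/msec = 2,400,000 kHz.
+	 * The checks below reject a delta that overflows 32 bits, i.e.
+	 * roughly a 143 GHz clock, or one below one cycle per msec,
+	 * i.e. under 1 kHz.)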
+	 */
+	if (count <= 1)
+		goto err;
+
+	delta64 = end - start;
+
+	/* cpu freq too fast: */
+	if (delta64 > (1ULL<<32))
+		goto err;
+
+	/* cpu freq too slow: */
+	if (delta64 <= CALIBRATE_TIME_MSEC)
+		goto err;
+
+	delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */
+	do_div(delta64, CALIBRATE_TIME_MSEC);
+
+	raw_local_irq_restore(flags);
+	return (unsigned long)delta64;
+err:
+	raw_local_irq_restore(flags);
+	return 0;
+}
+
+int recalibrate_cpu_khz(void)
+{
+#ifndef CONFIG_SMP
+	unsigned long cpu_khz_old = cpu_khz;
+
+	if (cpu_has_tsc) {
+		cpu_khz = calculate_cpu_khz();
+		tsc_khz = cpu_khz;
+		cpu_data[0].loops_per_jiffy =
+			cpufreq_scale(cpu_data[0].loops_per_jiffy,
+					cpu_khz_old, cpu_khz);
+		return 0;
+	} else
+		return -ENODEV;
+#else
+	return -ENODEV;
+#endif
+}
+
+EXPORT_SYMBOL(recalibrate_cpu_khz);
+
+void tsc_init(void)
+{
+	if (!cpu_has_tsc || tsc_disable)
+		return;
+
+	cpu_khz = calculate_cpu_khz();
+	tsc_khz = cpu_khz;
+
+	if (!cpu_khz)
+		return;
+
+	printk("Detected %lu.%03lu MHz processor.\n",
+		(unsigned long)cpu_khz / 1000,
+		(unsigned long)cpu_khz % 1000);
+
+	set_cyc2ns_scale(cpu_khz);
+	use_tsc_delay();
+}
+
+#ifdef CONFIG_CPU_FREQ
+
+static unsigned int cpufreq_delayed_issched = 0;
+static unsigned int cpufreq_init = 0;
+static struct work_struct cpufreq_delayed_get_work;
+
+static void handle_cpufreq_delayed_get(void *v)
+{
+	unsigned int cpu;
+
+	for_each_online_cpu(cpu)
+		cpufreq_get(cpu);
+
+	cpufreq_delayed_issched = 0;
+}
+
+/*
+ * if we notice cpufreq oddness, schedule a call to cpufreq_get(); it
+ * verifies that the CPU frequency the timing core thinks the CPU is
+ * running at is still correct.
+ */
+static inline void cpufreq_delayed_get(void)
+{
+	if (cpufreq_init && !cpufreq_delayed_issched) {
+		cpufreq_delayed_issched = 1;
+		printk(KERN_DEBUG "Checking if CPU frequency changed.\n");
+		schedule_work(&cpufreq_delayed_get_work);
+	}
+}
+
+/*
+ * if the CPU frequency is scaled, TSC-based delays will need a different
+ * loops_per_jiffy value to function properly.
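+ * cpufreq_scale() is a linear rescale; as an editor's illustration, a
+ * loops_per_jiffy of 2,000,000 calibrated at 2000 MHz becomes roughly
+ * 1,000,000 once the governor drops the core to 1000 MHz, which keeps
+ * udelay() close to accurate across frequency transitions.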
+ */ +static unsigned int ref_freq = 0; +static unsigned long loops_per_jiffy_ref = 0; +static unsigned long cpu_khz_ref = 0; + +static int +time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = data; + + if (val != CPUFREQ_RESUMECHANGE) + write_seqlock_irq(&xtime_lock); + + if (!ref_freq) { + ref_freq = freq->old; + loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; + cpu_khz_ref = cpu_khz; + } + + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || + (val == CPUFREQ_RESUMECHANGE)) { + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + cpu_data[freq->cpu].loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); + + if (cpu_khz) { + + if (num_online_cpus() == 1) + cpu_khz = cpufreq_scale(cpu_khz_ref, + ref_freq, freq->new); + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { + tsc_khz = cpu_khz; + set_cyc2ns_scale(cpu_khz); + /* + * TSC based sched_clock turns + * to junk w/ cpufreq + */ + mark_tsc_unstable(); + } + } + } + + if (val != CPUFREQ_RESUMECHANGE) + write_sequnlock_irq(&xtime_lock); + + return 0; +} + +static struct notifier_block time_cpufreq_notifier_block = { + .notifier_call = time_cpufreq_notifier +}; + +static int __init cpufreq_tsc(void) +{ + int ret; + + INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); + ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + if (!ret) + cpufreq_init = 1; + + return ret; +} + +core_initcall(cpufreq_tsc); + +#endif + +/* clock source code */ + +static unsigned long current_tsc_khz = 0; +static int tsc_update_callback(void); + +static cycle_t read_tsc(void) +{ + cycle_t ret; + + rdtscll(ret); + + return ret; +} + +static struct clocksource clocksource_tsc = { + .name = "tsc", + .rating = 300, + .read = read_tsc, + .mask = (cycle_t)-1, + .mult = 0, /* to be set */ + .shift = 22, + .update_callback = tsc_update_callback, + .is_continuous = 1, +}; + +static int tsc_update_callback(void) +{ + int change = 0; + + /* check to see if we should switch to the safe clocksource: */ + if (clocksource_tsc.rating != 50 && check_tsc_unstable()) { + clocksource_tsc.rating = 50; + reselect_clocksource(); + change = 1; + } + + /* only update if tsc_khz has changed: */ + if (current_tsc_khz != tsc_khz) { + current_tsc_khz = tsc_khz; + clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, + clocksource_tsc.shift); + change = 1; + } + + return change; +} + +/* + * Make an educated guess if the TSC is trustworthy and synchronized + * over all CPUs. + */ +static __init int unsynchronized_tsc(void) +{ + /* + * Intel systems are normally all synchronized. 
+ * Exceptions must mark TSC as unstable: + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + return 0; + + /* assume multi socket systems are not synchronized: */ + return num_possible_cpus() > 1; +} + +/* NUMAQ can't use TSC: */ +static int __init init_tsc_clocksource(void) +{ + /* TSC initialization is done in arch/i386/kernel/tsc.c */ + if (cpu_has_tsc && tsc_khz && !tsc_disable) { + if (unsynchronized_tsc()) /* lower rating if unsynced */ + mark_tsc_unstable(); + current_tsc_khz = tsc_khz; + clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, + clocksource_tsc.shift); + register_clocksource(&clocksource_tsc); + } + + return 0; +} + +module_init(init_tsc_clocksource); Index: linux.prev/arch/i386/kernel/vm86.c =================================================================== --- linux.prev.orig/arch/i386/kernel/vm86.c +++ linux.prev/arch/i386/kernel/vm86.c @@ -105,9 +105,10 @@ struct pt_regs * fastcall save_v86_state * from process context. Enable interrupts here, before trying * to access user space. */ - local_irq_enable(); + raw_local_irq_enable(); if (!current->thread.vm86_info) { + raw_local_irq_disable(); printk("no vm86_info: BAD\n"); do_exit(SIGSEGV); } Index: linux.prev/arch/i386/lib/bitops.c =================================================================== --- linux.prev.orig/arch/i386/lib/bitops.c +++ linux.prev/arch/i386/lib/bitops.c @@ -68,3 +68,37 @@ int find_next_zero_bit(const unsigned lo return (offset + set + res); } EXPORT_SYMBOL(find_next_zero_bit); + + +/* + * rw spinlock fallbacks + */ +#if defined(CONFIG_SMP) +asm( +".section .sched.text\n" +".align 4\n" +".globl __write_lock_failed\n" +"__write_lock_failed:\n\t" + LOCK "addl $" RW_LOCK_BIAS_STR ",(%eax)\n" +"1: rep; nop\n\t" + "cmpl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" + "jne 1b\n\t" + LOCK "subl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" + "jnz __write_lock_failed\n\t" + "ret" +); + +asm( +".section .sched.text\n" +".align 4\n" +".globl __read_lock_failed\n" +"__read_lock_failed:\n\t" + LOCK "incl (%eax)\n" +"1: rep; nop\n\t" + "cmpl $1,(%eax)\n\t" + "js 1b\n\t" + LOCK "decl (%eax)\n\t" + "js __read_lock_failed\n\t" + "ret" +); +#endif Index: linux.prev/arch/i386/lib/delay.c =================================================================== --- linux.prev.orig/arch/i386/lib/delay.c +++ linux.prev/arch/i386/lib/delay.c @@ -10,43 +10,93 @@ * we have to worry about. 
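 * (Editor's note on the rework below: the old cur_timer->delay()
 * indirection is replaced by a delay_fn pointer, which starts out as
 * the software delay_loop() and is switched to the TSC-based
 * delay_tsc() when boot-time calibration calls use_tsc_delay().  The
 * constant 0x000010c7 in __udelay() is 2^32/10^6 rounded up, so after
 * the xloops *= 4 and the mull against loops_per_jiffy * (HZ/4), the
 * high 32 bits come out to usecs * loops_per_jiffy * HZ / 10^6, the
 * loop count that delay_fn must burn.)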
*/ +#include +#include #include #include #include -#include + #include #include #include #ifdef CONFIG_SMP -#include +# include #endif -extern struct timer_opts* timer; +/* simple loop based delay: */ +static void delay_loop(unsigned long loops) +{ + int d0; + + __asm__ __volatile__( + "\tjmp 1f\n" + ".align 16\n" + "1:\tjmp 2f\n" + ".align 16\n" + "2:\tdecl %0\n\tjns 2b" + :"=&a" (d0) + :"0" (loops)); +} + +/* TSC based delay: */ +static void delay_tsc(unsigned long loops) +{ + unsigned long bclock, now; + + rdtscl(bclock); + do { + rep_nop(); + rdtscl(now); + } while ((now-bclock) < loops); +} + +/* + * Since we calibrate only once at boot, this + * function should be set once at boot and not changed + */ +static void (*delay_fn)(unsigned long) = delay_loop; + +void use_tsc_delay(void) +{ + delay_fn = delay_tsc; +} + +int read_current_timer(unsigned long *timer_val) +{ + if (delay_fn == delay_tsc) { + rdtscl(*timer_val); + return 0; + } + return -1; +} void __delay(unsigned long loops) { - cur_timer->delay(loops); + delay_fn(loops); } inline void __const_udelay(unsigned long xloops) { int d0; + xloops *= 4; __asm__("mull %0" :"=d" (xloops), "=&a" (d0) - :"1" (xloops),"0" (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4))); - __delay(++xloops); + :"1" (xloops), "0" + (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4))); + + __delay(++xloops); } void __udelay(unsigned long usecs) { - __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ + __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ } void __ndelay(unsigned long nsecs) { - __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ + __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ } EXPORT_SYMBOL(__delay); Index: linux.prev/arch/i386/mach-default/setup.c =================================================================== --- linux.prev.orig/arch/i386/mach-default/setup.c +++ linux.prev/arch/i386/mach-default/setup.c @@ -34,7 +34,7 @@ void __init pre_intr_init_hook(void) /* * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { no_action, SA_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL}; /** * intr_init_hook - post gate setup interrupt initialisation @@ -78,8 +78,6 @@ void __init trap_init_hook(void) { } -static struct irqaction irq0 = { timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL}; - /** * time_init_hook - do any specific initialisations for the system timer. 
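 * (Editor's note: the hunk below empties this hook. The irq0 action
 * and its setup_irq(0, &irq0) call are deleted; presumably the timer
 * interrupt is now registered by shared timer code elsewhere in this
 * patch set, but that destination is not visible in this section and
 * is an assumption.)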
* @@ -89,7 +87,6 @@ static struct irqaction irq0 = { timer_ **/ void __init time_init_hook(void) { - setup_irq(0, &irq0); } #ifdef CONFIG_MCA Index: linux.prev/arch/i386/mach-visws/setup.c =================================================================== --- linux.prev.orig/arch/i386/mach-visws/setup.c +++ linux.prev/arch/i386/mach-visws/setup.c @@ -113,7 +113,7 @@ void __init pre_setup_arch_hook() static struct irqaction irq0 = { .handler = timer_interrupt, - .flags = SA_INTERRUPT, + .flags = SA_INTERRUPT | SA_NODELAY, .name = "timer", }; Index: linux.prev/arch/i386/mach-visws/visws_apic.c =================================================================== --- linux.prev.orig/arch/i386/mach-visws/visws_apic.c +++ linux.prev/arch/i386/mach-visws/visws_apic.c @@ -260,11 +260,13 @@ out_unlock: static struct irqaction master_action = { .handler = piix4_master_intr, .name = "PIIX4-8259", + .flags = SA_NODELAY, }; static struct irqaction cascade_action = { .handler = no_action, .name = "cascade", + .flags = SA_NODELAY, }; Index: linux.prev/arch/i386/mach-voyager/setup.c =================================================================== --- linux.prev.orig/arch/i386/mach-voyager/setup.c +++ linux.prev/arch/i386/mach-voyager/setup.c @@ -16,7 +16,7 @@ void __init pre_intr_init_hook(void) /* * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { no_action, SA_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL}; void __init intr_init_hook(void) { @@ -39,7 +39,7 @@ void __init trap_init_hook(void) { } -static struct irqaction irq0 = { timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL}; +static struct irqaction irq0 = { timer_interrupt, SA_INTERRUPT | SA_NODELAY, CPU_MASK_NONE, "timer", NULL, NULL}; void __init time_init_hook(void) { Index: linux.prev/arch/i386/mm/fault.c =================================================================== --- linux.prev.orig/arch/i386/mm/fault.c +++ linux.prev/arch/i386/mm/fault.c @@ -39,6 +39,8 @@ void bust_spinlocks(int yes) int loglevel_save = console_loglevel; if (yes) { + stop_trace(); + zap_rt_locks(); oops_in_progress = 1; return; } @@ -224,8 +226,8 @@ fastcall void do_invalid_op(struct pt_re * bit 1 == 0 means read, 1 means write * bit 2 == 0 means kernel, 1 means user-mode */ -fastcall void __kprobes do_page_fault(struct pt_regs *regs, - unsigned long error_code) +fastcall notrace void __kprobes do_page_fault(struct pt_regs *regs, + unsigned long error_code) { struct task_struct *tsk; struct mm_struct *mm; @@ -236,13 +238,14 @@ fastcall void __kprobes do_page_fault(st /* get the address */ address = read_cr2(); + trace_special(regs->eip, error_code, address); if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, SIGSEGV) == NOTIFY_STOP) return; /* It's safe to allow irq's after cr2 has been saved */ if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) - local_irq_enable(); + raw_local_irq_enable(); tsk = current; @@ -449,9 +452,9 @@ no_context: } #endif if (address < PAGE_SIZE) - printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); + printk(KERN_ALERT "BUG: Unable to handle kernel NULL pointer dereference"); else - printk(KERN_ALERT "Unable to handle kernel paging request"); + printk(KERN_ALERT "BUG: Unable to handle kernel paging request"); printk(" at virtual address %08lx\n",address); printk(KERN_ALERT " printing eip:\n"); printk("%08lx\n", regs->eip); Index: linux.prev/arch/i386/mm/highmem.c 
=================================================================== --- linux.prev.orig/arch/i386/mm/highmem.c +++ linux.prev/arch/i386/mm/highmem.c @@ -18,6 +18,27 @@ void kunmap(struct page *page) kunmap_high(page); } +void kunmap_virt(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return; + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + kunmap(page); +} + +struct page *kmap_to_page(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return virt_to_page(ptr); + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + return page; +} + + /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -26,7 +47,7 @@ void kunmap(struct page *page) * However when holding an atomic kmap is is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page, enum km_type type) { enum fixed_addresses idx; unsigned long vaddr; @@ -48,7 +69,7 @@ void *kmap_atomic(struct page *page, enu return (void*) vaddr; } -void kunmap_atomic(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr, enum km_type type) { #ifdef CONFIG_DEBUG_HIGHMEM unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; @@ -78,7 +99,7 @@ void kunmap_atomic(void *kvaddr, enum km /* This is the same as kmap_atomic() but can map memory that doesn't * have a struct page associated with it. */ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +void *__kmap_atomic_pfn(unsigned long pfn, enum km_type type) { enum fixed_addresses idx; unsigned long vaddr; @@ -93,7 +114,7 @@ void *kmap_atomic_pfn(unsigned long pfn, return (void*) vaddr; } -struct page *kmap_atomic_to_page(void *ptr) +struct page *__kmap_atomic_to_page(void *ptr) { unsigned long idx, vaddr = (unsigned long)ptr; pte_t *pte; @@ -108,6 +129,7 @@ struct page *kmap_atomic_to_page(void *p EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic); -EXPORT_SYMBOL(kmap_atomic_to_page); +EXPORT_SYMBOL(kunmap_virt); +EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(__kmap_atomic_to_page); Index: linux.prev/arch/i386/mm/init.c =================================================================== --- linux.prev.orig/arch/i386/mm/init.c +++ linux.prev/arch/i386/mm/init.c @@ -44,7 +44,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; static int noinline do_test_wp_bit(void); Index: linux.prev/arch/i386/mm/pageattr.c =================================================================== --- linux.prev.orig/arch/i386/mm/pageattr.c +++ linux.prev/arch/i386/mm/pageattr.c @@ -207,6 +207,9 @@ void kernel_map_pages(struct page *page, { if (PageHighMem(page)) return; + if (!enable) + check_no_locks_freed(page_address(page), page_address(page+numpages)); + /* the return value is ignored - the calls cannot fail, * large pages are disabled at boot time. 
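 * (Editor's note: the check_no_locks_freed() call added above is a
 * debugging assertion. When pages are being unmapped, i.e. !enable,
 * it verifies that no currently-held lock lives inside that address
 * range, catching locks whose backing memory goes away while they
 * are still acquired.)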
*/ Index: linux.prev/arch/i386/mm/pgtable.c =================================================================== --- linux.prev.orig/arch/i386/mm/pgtable.c +++ linux.prev/arch/i386/mm/pgtable.c @@ -183,7 +183,7 @@ void pmd_ctor(void *pmd, kmem_cache_t *c * recommendations and having no core impact whatsoever. * -- wli */ -DEFINE_SPINLOCK(pgd_lock); +DEFINE_RAW_SPINLOCK(pgd_lock); struct page *pgd_list; static inline void pgd_list_add(pgd_t *pgd) Index: linux.prev/arch/i386/oprofile/Kconfig =================================================================== --- linux.prev.orig/arch/i386/oprofile/Kconfig +++ linux.prev/arch/i386/oprofile/Kconfig @@ -15,3 +15,6 @@ config OPROFILE If unsure, say N. +config PROFILE_NMI + bool + default y Index: linux.prev/arch/i386/pci/Makefile =================================================================== --- linux.prev.orig/arch/i386/pci/Makefile +++ linux.prev/arch/i386/pci/Makefile @@ -4,8 +4,9 @@ obj-$(CONFIG_PCI_BIOS) += pcbios.o obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o obj-$(CONFIG_PCI_DIRECT) += direct.o +obj-$(CONFIG_ACPI) += acpi.o + pci-y := fixup.o -pci-$(CONFIG_ACPI) += acpi.o pci-y += legacy.o irq.o pci-$(CONFIG_X86_VISWS) := visws.o fixup.o Index: linux.prev/arch/i386/pci/direct.c =================================================================== --- linux.prev.orig/arch/i386/pci/direct.c +++ linux.prev/arch/i386/pci/direct.c @@ -211,16 +211,23 @@ static int __init pci_check_type1(void) unsigned int tmp; int works = 0; - local_irq_save(flags); + spin_lock_irqsave(&pci_config_lock, flags); outb(0x01, 0xCFB); tmp = inl(0xCF8); outl(0x80000000, 0xCF8); - if (inl(0xCF8) == 0x80000000 && pci_sanity_check(&pci_direct_conf1)) { - works = 1; + + if (inl(0xCF8) == 0x80000000) { + spin_unlock_irqrestore(&pci_config_lock, flags); + + if (pci_sanity_check(&pci_direct_conf1)) + works = 1; + + spin_lock_irqsave(&pci_config_lock, flags); } outl(tmp, 0xCF8); - local_irq_restore(flags); + + spin_unlock_irqrestore(&pci_config_lock, flags); return works; } @@ -230,17 +237,19 @@ static int __init pci_check_type2(void) unsigned long flags; int works = 0; - local_irq_save(flags); + spin_lock_irqsave(&pci_config_lock, flags); outb(0x00, 0xCFB); outb(0x00, 0xCF8); outb(0x00, 0xCFA); - if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00 && - pci_sanity_check(&pci_direct_conf2)) { - works = 1; - } - local_irq_restore(flags); + if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00) { + spin_unlock_irqrestore(&pci_config_lock, flags); + + if (pci_sanity_check(&pci_direct_conf2)) + works = 1; + } else + spin_unlock_irqrestore(&pci_config_lock, flags); return works; } Index: linux.prev/arch/i386/pci/pcbios.c =================================================================== --- linux.prev.orig/arch/i386/pci/pcbios.c +++ linux.prev/arch/i386/pci/pcbios.c @@ -70,7 +70,7 @@ static unsigned long bios32_service(unsi unsigned long entry; /* %edx */ unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); __asm__("lcall *(%%edi); cld" : "=a" (return_code), "=b" (address), @@ -79,7 +79,7 @@ static unsigned long bios32_service(unsi : "0" (service), "1" (0), "D" (&bios32_indirect)); - local_irq_restore(flags); + raw_local_irq_restore(flags); switch (return_code) { case 0: @@ -110,7 +110,7 @@ static int __devinit check_pcibios(void) if ((pcibios_entry = bios32_service(PCI_SERVICE))) { pci_indirect.address = pcibios_entry + PAGE_OFFSET; - local_irq_save(flags); + raw_local_irq_save(flags); __asm__( "lcall *(%%edi); cld\n\t" "jc 1f\n\t" @@ -123,7 +123,7 @@ static int 
__devinit check_pcibios(void) : "1" (PCIBIOS_PCI_BIOS_PRESENT), "D" (&pci_indirect) : "memory"); - local_irq_restore(flags); + raw_local_irq_restore(flags); status = (eax >> 8) & 0xff; hw_mech = eax & 0xff; Index: linux.prev/arch/mips/Kconfig =================================================================== --- linux.prev.orig/arch/mips/Kconfig +++ linux.prev/arch/mips/Kconfig @@ -362,6 +362,7 @@ config MOMENCO_JAGUAR_ATX config MOMENCO_OCELOT bool "Support for Momentum Ocelot board" select DMA_NONCOHERENT + select NO_SPINLOCK select HW_HAS_PCI select IRQ_CPU select IRQ_CPU_RM7K @@ -792,12 +793,21 @@ source "arch/mips/philips/pnx8550/common endmenu +source "kernel/Kconfig.preempt" + config RWSEM_GENERIC_SPINLOCK bool + depends on !PREEMPT_RT default y config RWSEM_XCHGADD_ALGORITHM bool + depends on !PREEMPT_RT + +config ASM_SEMAPHORES + bool +# depends on !PREEMPT_RT + default y config GENERIC_CALIBRATE_DELAY bool @@ -832,6 +842,9 @@ config DMA_NEED_PCI_MAP_STATE config OWN_DMA bool +config NO_SPINLOCK + bool + config EARLY_PRINTK bool @@ -1637,10 +1650,6 @@ config MIPS_INSANE_LARGE endmenu -config RWSEM_GENERIC_SPINLOCK - bool - default y - source "init/Kconfig" menu "Bus options (PCI, PCMCIA, EISA, ISA, TC)" Index: linux.prev/arch/mips/arc/misc.c =================================================================== --- linux.prev.orig/arch/mips/arc/misc.c +++ linux.prev/arch/mips/arc/misc.c @@ -27,7 +27,7 @@ VOID ArcHalt(VOID) { bc_disable(); - local_irq_disable(); + raw_local_irq_disable(); #ifdef CONFIG_SCSI_SGIWD93 reset_wd33c93(sgiwd93_host); #endif @@ -39,7 +39,7 @@ VOID ArcPowerDown(VOID) { bc_disable(); - local_irq_disable(); + raw_local_irq_disable(); #ifdef CONFIG_SCSI_SGIWD93 reset_wd33c93(sgiwd93_host); #endif @@ -52,7 +52,7 @@ VOID ArcRestart(VOID) { bc_disable(); - local_irq_disable(); + raw_local_irq_disable(); #ifdef CONFIG_SCSI_SGIWD93 reset_wd33c93(sgiwd93_host); #endif @@ -64,7 +64,7 @@ VOID ArcReboot(VOID) { bc_disable(); - local_irq_disable(); + raw_local_irq_disable(); #ifdef CONFIG_SCSI_SGIWD93 reset_wd33c93(sgiwd93_host); #endif @@ -76,7 +76,7 @@ VOID ArcEnterInteractiveMode(VOID) { bc_disable(); - local_irq_disable(); + raw_local_irq_disable(); #ifdef CONFIG_SCSI_SGIWD93 reset_wd33c93(sgiwd93_host); #endif Index: linux.prev/arch/mips/gt64120/ev64120/irq.c =================================================================== --- linux.prev.orig/arch/mips/gt64120/ev64120/irq.c +++ linux.prev/arch/mips/gt64120/ev64120/irq.c @@ -60,25 +60,25 @@ static void disable_ev64120_irq(unsigned { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); if (irq_nr >= 8) { // All PCI interrupts are on line 5 or 2 clear_c0_status(9 << 10); } else { clear_c0_status(1 << (irq_nr + 8)); } - local_irq_restore(flags); + raw_local_irq_restore(flags); } static void enable_ev64120_irq(unsigned int irq_nr) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); if (irq_nr >= 8) // All PCI interrupts are on line 5 or 2 set_c0_status(9 << 10); else set_c0_status(1 << (irq_nr + 8)); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static unsigned int startup_ev64120_irq(unsigned int irq) @@ -119,7 +119,7 @@ void gt64120_irq_setup(void) /* Sets the exception_handler array. */ set_except_vector(0, galileo_handle_int); - local_irq_disable(); + raw_local_irq_disable(); /* * Enable timer. 
Other interrupts will be enabled as they are Index: linux.prev/arch/mips/gt64120/momenco_ocelot/irq.c =================================================================== --- linux.prev.orig/arch/mips/gt64120/momenco_ocelot/irq.c +++ linux.prev/arch/mips/gt64120/momenco_ocelot/irq.c @@ -57,7 +57,7 @@ void __init arch_init_irq(void) * int-handler is not on bootstrap */ clear_c0_status(ST0_IM); - local_irq_disable(); + raw_local_irq_disable(); /* Sets the first-level interrupt dispatcher. */ set_except_vector(0, ocelot_handle_int); Index: linux.prev/arch/mips/ite-boards/generic/irq.c =================================================================== --- linux.prev.orig/arch/mips/ite-boards/generic/irq.c +++ linux.prev/arch/mips/ite-boards/generic/irq.c @@ -171,9 +171,9 @@ void enable_cpu_timer(void) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); set_c0_status(0x100 << EXT_IRQ5_TO_IP); - local_irq_restore(flags); + raw_local_irq_restore(flags); } void __init arch_init_irq(void) Index: linux.prev/arch/mips/ite-boards/generic/time.c =================================================================== --- linux.prev.orig/arch/mips/ite-boards/generic/time.c +++ linux.prev/arch/mips/ite-boards/generic/time.c @@ -124,7 +124,7 @@ static unsigned long __init cal_r4koff(v { unsigned int flags; - local_irq_save(flags); + raw_local_irq_save(flags); /* Start counter exactly on falling edge of update flag */ while (CMOS_READ(RTC_REG_A) & RTC_UIP); @@ -140,7 +140,7 @@ static unsigned long __init cal_r4koff(v mips_hpt_frequency = read_c0_count(); /* restore interrupts */ - local_irq_restore(flags); + raw_local_irq_restore(flags); return (mips_hpt_frequency / HZ); } @@ -153,11 +153,11 @@ it8172_rtc_get_time(void) /* avoid update-in-progress. */ for (;;) { - local_irq_save(flags); + raw_local_irq_save(flags); if (! (CMOS_READ(RTC_REG_A) & RTC_UIP)) break; /* don't hold intr closed all the time */ - local_irq_restore(flags); + raw_local_irq_restore(flags); } /* Read regs. */ @@ -170,7 +170,7 @@ it8172_rtc_get_time(void) hw_to_bin(*rtc_century_reg) * 100; /* restore interrupts */ - local_irq_restore(flags); + raw_local_irq_restore(flags); return mktime(year, mon, day, hour, min, sec); } @@ -186,11 +186,11 @@ it8172_rtc_set_time(unsigned long t) /* avoid update-in-progress. */ for (;;) { - local_irq_save(flags); + raw_local_irq_save(flags); if (! 
(CMOS_READ(RTC_REG_A) & RTC_UIP)) break; /* don't hold intr closed all the time */ - local_irq_restore(flags); + raw_local_irq_restore(flags); } *rtc_century_reg = bin_to_hw(tm.tm_year/100); @@ -202,7 +202,7 @@ it8172_rtc_set_time(unsigned long t) CMOS_WRITE(bin_to_hw(tm.tm_year%100), RTC_YEAR); /* restore interrupts */ - local_irq_restore(flags); + raw_local_irq_restore(flags); return 0; } @@ -211,7 +211,7 @@ void __init it8172_time_init(void) { unsigned int est_freq, flags; - local_irq_save(flags); + raw_local_irq_save(flags); saved_control = CMOS_READ(RTC_CONTROL); @@ -225,7 +225,7 @@ void __init it8172_time_init(void) printk("CPU frequency %d.%02d MHz\n", est_freq/1000000, (est_freq%1000000)*100/1000000); - local_irq_restore(flags); + raw_local_irq_restore(flags); rtc_get_time = it8172_rtc_get_time; rtc_set_time = it8172_rtc_set_time; Index: linux.prev/arch/mips/jmr3927/rbhma3100/setup.c =================================================================== --- linux.prev.orig/arch/mips/jmr3927/rbhma3100/setup.c +++ linux.prev/arch/mips/jmr3927/rbhma3100/setup.c @@ -115,7 +115,7 @@ static inline void do_reset(void) static void jmr3927_machine_restart(char *command) { - local_irq_disable(); + raw_local_irq_disable(); puts("Rebooting..."); do_reset(); } Index: linux.prev/arch/mips/kernel/Makefile =================================================================== --- linux.prev.orig/arch/mips/kernel/Makefile +++ linux.prev/arch/mips/kernel/Makefile @@ -5,7 +5,7 @@ extra-y := head.o init_task.o vmlinux.lds obj-y += cpu-probe.o branch.o entry.o genex.o irq.o process.o \ - ptrace.o reset.o semaphore.o setup.o signal.o syscall.o \ + ptrace.o reset.o setup.o signal.o syscall.o \ time.o traps.o unaligned.o binfmt_irix-objs := irixelf.o irixinv.o irixioctl.o irixsig.o \ @@ -13,6 +13,8 @@ binfmt_irix-objs := irixelf.o irixinv.o obj-$(CONFIG_MODULES) += mips_ksyms.o module.o +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o + obj-$(CONFIG_CPU_R3000) += r2300_fpu.o r2300_switch.o obj-$(CONFIG_CPU_TX39XX) += r2300_fpu.o r2300_switch.o obj-$(CONFIG_CPU_TX49XX) += r4k_fpu.o r4k_switch.o Index: linux.prev/arch/mips/kernel/asm-offsets.c =================================================================== --- linux.prev.orig/arch/mips/kernel/asm-offsets.c +++ linux.prev/arch/mips/kernel/asm-offsets.c @@ -11,6 +11,9 @@ #include #include #include +#include +#include +#include #include #include #include Index: linux.prev/arch/mips/kernel/cpu-bugs64.c =================================================================== --- linux.prev.orig/arch/mips/kernel/cpu-bugs64.c +++ linux.prev/arch/mips/kernel/cpu-bugs64.c @@ -48,7 +48,7 @@ static inline void mult_sh_align_mod(lon * used for. */ - local_irq_save(flags); + raw_local_irq_save(flags); /* * The following code leads to a wrong result of the first * dsll32 when executed on R4000 rev. 2.2 or 3.0 (PRId @@ -101,7 +101,7 @@ static inline void mult_sh_align_mod(lon "" : "=r" (lv2) : "0" (lv2), "r" (p)); - local_irq_restore(flags); + raw_local_irq_restore(flags); *v1 = lv1; *v2 = lv2; @@ -182,7 +182,7 @@ static inline void check_daddi(void) printk("Checking for the daddi bug... 
"); - local_irq_save(flags); + raw_local_irq_save(flags); handler = set_except_vector(12, handle_daddi_ov); /* * The following code fails to trigger an overflow exception @@ -208,7 +208,7 @@ static inline void check_daddi(void) : "=r" (v), "=&r" (tmp) : "I" (0xffffffffffffdb9a), "I" (0x1234)); set_except_vector(12, handler); - local_irq_restore(flags); + raw_local_irq_restore(flags); if (daddi_ov) { printk("no.\n"); @@ -217,7 +217,7 @@ static inline void check_daddi(void) printk("yes, workaround... "); - local_irq_save(flags); + raw_local_irq_save(flags); handler = set_except_vector(12, handle_daddi_ov); asm volatile( "addiu %1, $0, %2\n\t" @@ -226,7 +226,7 @@ static inline void check_daddi(void) : "=r" (v), "=&r" (tmp) : "I" (0xffffffffffffdb9a), "I" (0x1234)); set_except_vector(12, handler); - local_irq_restore(flags); + raw_local_irq_restore(flags); if (daddi_ov) { printk("yes.\n"); Index: linux.prev/arch/mips/kernel/entry.S =================================================================== --- linux.prev.orig/arch/mips/kernel/entry.S +++ linux.prev/arch/mips/kernel/entry.S @@ -23,7 +23,7 @@ .endm #else .macro preempt_stop - local_irq_disable + mips_raw_local_irq_disable .endm #define resume_kernel restore_all #endif @@ -38,7 +38,7 @@ FEXPORT(ret_from_irq) beqz t0, resume_kernel resume_userspace: - local_irq_disable # make sure we dont miss an + mips_raw_local_irq_disable # make sure we dont miss an # interrupt setting need_resched # between sampling and return LONG_L a2, TI_FLAGS($28) # current->work @@ -48,7 +48,9 @@ resume_userspace: #ifdef CONFIG_PREEMPT resume_kernel: - local_irq_disable + mips_local_irq_disable + lw t0, kernel_preemption + beqz t0, restore_all lw t0, TI_PRE_COUNT($28) bnez t0, restore_all need_resched: @@ -66,7 +68,7 @@ FEXPORT(ret_from_fork) jal schedule_tail # a0 = task_t *prev FEXPORT(syscall_exit) - local_irq_disable # make sure need_resched and + mips_raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) # current->work @@ -85,19 +87,21 @@ FEXPORT(restore_partial) # restore part .set at work_pending: - andi t0, a2, _TIF_NEED_RESCHED # a2 is preloaded with TI_FLAGS + # a2 is preloaded with TI_FLAGS + andi t0, a2, (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beqz t0, work_notifysig work_resched: + mips_raw_local_irq_enable t0 jal schedule - local_irq_disable # make sure need_resched and + mips_raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) andi t0, a2, _TIF_WORK_MASK # is there any work to be done # other than syscall tracing? beqz t0, restore_all - andi t0, a2, _TIF_NEED_RESCHED + andi t0, a2, (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bnez t0, work_resched work_notifysig: # deal with pending signals and @@ -113,7 +117,7 @@ syscall_exit_work: li t0, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT and t0, a2 # a2 is preloaded with TI_FLAGS beqz t0, work_pending # trace bit set? 
- local_irq_enable # could let do_syscall_trace() + mips_raw_local_irq_enable # could let do_syscall_trace() # call schedule() instead move a0, sp li a1, 1 Index: linux.prev/arch/mips/kernel/gdb-stub.c =================================================================== --- linux.prev.orig/arch/mips/kernel/gdb-stub.c +++ linux.prev/arch/mips/kernel/gdb-stub.c @@ -402,7 +402,7 @@ void set_debug_traps(void) unsigned long flags; unsigned char c; - local_irq_save(flags); + raw_local_irq_save(flags); for (ht = hard_trap_info; ht->tt && ht->signo; ht++) saved_vectors[ht->tt] = set_except_vector(ht->tt, trap_low); @@ -418,7 +418,7 @@ void set_debug_traps(void) putDebugChar('+'); /* ack it */ initialized = 1; - local_irq_restore(flags); + raw_local_irq_restore(flags); } void restore_debug_traps(void) @@ -426,10 +426,10 @@ void restore_debug_traps(void) struct hard_trap_info *ht; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); for (ht = hard_trap_info; ht->tt && ht->signo; ht++) set_except_vector(ht->tt, saved_vectors[ht->tt]); - local_irq_restore(flags); + raw_local_irq_restore(flags); } /* @@ -661,12 +661,12 @@ static void kgdb_wait(void *arg) unsigned flags; int cpu = smp_processor_id(); - local_irq_save(flags); + raw_local_irq_save(flags); __raw_spin_lock(&kgdb_cpulock[cpu]); __raw_spin_unlock(&kgdb_cpulock[cpu]); - local_irq_restore(flags); + raw_local_irq_restore(flags); } Index: linux.prev/arch/mips/kernel/i8259.c =================================================================== --- linux.prev.orig/arch/mips/kernel/i8259.c +++ linux.prev/arch/mips/kernel/i8259.c @@ -31,7 +31,7 @@ void disable_8259A_irq(unsigned int irq) * moves to arch independent land */ -DEFINE_SPINLOCK(i8259A_lock); +DEFINE_RAW_SPINLOCK(i8259A_lock); static void end_8259A_irq (unsigned int irq) { Index: linux.prev/arch/mips/kernel/init_task.c =================================================================== --- linux.prev.orig/arch/mips/kernel/init_task.c +++ linux.prev/arch/mips/kernel/init_task.c @@ -9,8 +9,8 @@ #include #include -static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; +static struct fs_struct init_fs = INIT_FS(init_fs); +static struct files_struct init_files = INIT_FILES(init_files); static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); Index: linux.prev/arch/mips/kernel/irq-rm7000.c =================================================================== --- linux.prev.orig/arch/mips/kernel/irq-rm7000.c +++ linux.prev/arch/mips/kernel/irq-rm7000.c @@ -33,18 +33,18 @@ static inline void rm7k_cpu_irq_enable(u { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); unmask_rm7k_irq(irq); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static void rm7k_cpu_irq_disable(unsigned int irq) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); mask_rm7k_irq(irq); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static unsigned int rm7k_cpu_irq_startup(unsigned int irq) Index: linux.prev/arch/mips/kernel/irq-rm9000.c =================================================================== --- linux.prev.orig/arch/mips/kernel/irq-rm9000.c +++ linux.prev/arch/mips/kernel/irq-rm9000.c @@ -34,18 +34,18 @@ static inline void rm9k_cpu_irq_enable(u { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); unmask_rm9k_irq(irq); - 
local_irq_restore(flags); + raw_local_irq_restore(flags); } static void rm9k_cpu_irq_disable(unsigned int irq) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); mask_rm9k_irq(irq); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static unsigned int rm9k_cpu_irq_startup(unsigned int irq) @@ -79,9 +79,9 @@ static void local_rm9k_perfcounter_irq_s unsigned int irq = (unsigned int) args; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); mask_rm9k_irq(irq); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static void rm9k_perfcounter_irq_shutdown(unsigned int irq) Index: linux.prev/arch/mips/kernel/irq.c =================================================================== --- linux.prev.orig/arch/mips/kernel/irq.c +++ linux.prev/arch/mips/kernel/irq.c @@ -125,7 +125,10 @@ void __init init_IRQ(void) irq_desc[i].action = NULL; irq_desc[i].depth = 1; irq_desc[i].handler = &no_irq_type; - spin_lock_init(&irq_desc[i].lock); + __raw_spin_lock_init(&irq_desc[i].lock); +#ifdef CONFIG_PREEMPT_HARDIRQS + irq_desc[i].thread = NULL; +#endif } arch_init_irq(); Index: linux.prev/arch/mips/kernel/irq_cpu.c =================================================================== --- linux.prev.orig/arch/mips/kernel/irq_cpu.c +++ linux.prev/arch/mips/kernel/irq_cpu.c @@ -54,20 +54,20 @@ static inline void mips_cpu_irq_enable(u { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); unmask_mips_irq(irq); back_to_back_c0_hazard(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static void mips_cpu_irq_disable(unsigned int irq) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); mask_mips_irq(irq); back_to_back_c0_hazard(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static unsigned int mips_cpu_irq_startup(unsigned int irq) Index: linux.prev/arch/mips/kernel/module.c =================================================================== --- linux.prev.orig/arch/mips/kernel/module.c +++ linux.prev/arch/mips/kernel/module.c @@ -39,7 +39,7 @@ struct mips_hi16 { static struct mips_hi16 *mips_hi16_list; static LIST_HEAD(dbe_list); -static DEFINE_SPINLOCK(dbe_lock); +static DEFINE_RAW_SPINLOCK(dbe_lock); void *module_alloc(unsigned long size) { Index: linux.prev/arch/mips/kernel/process.c =================================================================== --- linux.prev.orig/arch/mips/kernel/process.c +++ linux.prev/arch/mips/kernel/process.c @@ -47,13 +47,15 @@ */ ATTRIB_NORET void cpu_idle(void) { + raw_local_irq_enable(); + /* endless idle loop with no priority at all */ while (1) { - while (!need_resched()) + while (!need_resched() && !need_resched_delayed()) if (cpu_wait) (*cpu_wait)(); - preempt_enable_no_resched(); - schedule(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); } } Index: linux.prev/arch/mips/kernel/scall32-o32.S =================================================================== --- linux.prev.orig/arch/mips/kernel/scall32-o32.S +++ linux.prev/arch/mips/kernel/scall32-o32.S @@ -72,7 +72,7 @@ stack_done: 1: sw v0, PT_R2(sp) # result o32_syscall_exit: - local_irq_disable # make sure need_resched and + mips_raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return lw a2, TI_FLAGS($28) # current->work Index: linux.prev/arch/mips/kernel/scall64-64.S =================================================================== --- linux.prev.orig/arch/mips/kernel/scall64-64.S +++ 
linux.prev/arch/mips/kernel/scall64-64.S @@ -71,7 +71,7 @@ NESTED(handle_sys64, PT_SIZE, sp) 1: sd v0, PT_R2(sp) # result n64_syscall_exit: - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) # current->work Index: linux.prev/arch/mips/kernel/scall64-n32.S =================================================================== --- linux.prev.orig/arch/mips/kernel/scall64-n32.S +++ linux.prev/arch/mips/kernel/scall64-n32.S @@ -68,7 +68,7 @@ NESTED(handle_sysn32, PT_SIZE, sp) sd v0, PT_R0(sp) # set flag for syscall restarting 1: sd v0, PT_R2(sp) # result - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) # current->work Index: linux.prev/arch/mips/kernel/scall64-o32.S =================================================================== --- linux.prev.orig/arch/mips/kernel/scall64-o32.S +++ linux.prev/arch/mips/kernel/scall64-o32.S @@ -97,7 +97,7 @@ NESTED(handle_sys, PT_SIZE, sp) 1: sd v0, PT_R2(sp) # result o32_syscall_exit: - local_irq_disable # make need_resched and + raw_local_irq_disable # make need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) Index: linux.prev/arch/mips/kernel/semaphore.c =================================================================== --- linux.prev.orig/arch/mips/kernel/semaphore.c +++ linux.prev/arch/mips/kernel/semaphore.c @@ -36,7 +36,7 @@ * sem->count and sem->waking atomic. Scalability isn't an issue because * this lock is used on UP only so it's just an empty variable. */ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -67,7 +67,7 @@ static inline int __sem_update_count(str : "=&r" (old_count), "=&r" (tmp), "=m" (sem->count) : "r" (incr), "m" (sem->count)); } else { - static DEFINE_SPINLOCK(semaphore_lock); + static DEFINE_RAW_SPINLOCK(semaphore_lock); unsigned long flags; spin_lock_irqsave(&semaphore_lock, flags); @@ -80,7 +80,7 @@ static inline int __sem_update_count(str return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -94,7 +94,7 @@ void __up(struct semaphore *sem) wake_up(&sem->wait); } -EXPORT_SYMBOL(__up); +EXPORT_SYMBOL(__compat_up); /* * Note that when we come in to __down or __down_interruptible, @@ -104,7 +104,7 @@ EXPORT_SYMBOL(__up); * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. 
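 * (Editor's note: the renames below keep this code as the "compat"
 * semaphore implementation. In the PREEMPT_RT model the plain struct
 * semaphore is substituted by a sleeping lock, so the original
 * arch-specific path is re-typed to struct compat_semaphore and its
 * entry points gain a __compat_ prefix; this reading of intent is an
 * assumption based on the renames visible here.)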
*/ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -133,9 +133,9 @@ void __sched __down(struct semaphore *se wake_up(&sem->wait); } -EXPORT_SYMBOL(__down); +EXPORT_SYMBOL(__compat_down); -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -165,4 +165,4 @@ int __sched __down_interruptible(struct return retval; } -EXPORT_SYMBOL(__down_interruptible); +EXPORT_SYMBOL(__compat_down_interruptible); Index: linux.prev/arch/mips/kernel/signal.c =================================================================== --- linux.prev.orig/arch/mips/kernel/signal.c +++ linux.prev/arch/mips/kernel/signal.c @@ -426,6 +426,10 @@ int do_signal(sigset_t *oldset, struct p siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + raw_local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which is why we may in certain * cases get here from kernel mode. Just return without doing anything Index: linux.prev/arch/mips/kernel/signal32.c =================================================================== --- linux.prev.orig/arch/mips/kernel/signal32.c +++ linux.prev/arch/mips/kernel/signal32.c @@ -814,6 +814,10 @@ int do_signal32(sigset_t *oldset, struct siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + raw_local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which is why we may in certain * cases get here from kernel mode. Just return without doing anything Index: linux.prev/arch/mips/kernel/smp.c =================================================================== --- linux.prev.orig/arch/mips/kernel/smp.c +++ linux.prev/arch/mips/kernel/smp.c @@ -106,7 +106,22 @@ asmlinkage void start_secondary(void) cpu_idle(); } -DEFINE_SPINLOCK(smp_call_lock); +DEFINE_RAW_SPINLOCK(smp_call_lock); + +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them. + */ +void smp_send_reschedule_allbutself(void) +{ + int cpu = smp_processor_id(); + int i; + + for (i = 0; i < NR_CPUS; i++) + if (cpu_online(i) && i != cpu) + core_send_ipi(i, SMP_RESCHEDULE_YOURSELF); +} struct call_data_struct *call_data; @@ -215,7 +230,7 @@ static void stop_this_cpu(void *dummy) * Remove this CPU: */ cpu_clear(smp_processor_id(), cpu_online_map); - local_irq_enable(); /* May need to service _machine_restart IPI */ + raw_local_irq_enable(); /* May need to service _machine_restart IPI */ for (;;); /* Wait if available. 
*/ } @@ -289,6 +304,8 @@ int setup_profiling_timer(unsigned int m return 0; } +static DEFINE_RAW_SPINLOCK(tlbstate_lock); + static void flush_tlb_all_ipi(void *info) { local_flush_tlb_all(); @@ -320,6 +337,7 @@ static void flush_tlb_mm_ipi(void *mm) void flush_tlb_mm(struct mm_struct *mm) { preempt_disable(); + spin_lock(&tlbstate_lock); if ((atomic_read(&mm->mm_users) != 1) || (current->mm != mm)) { smp_call_function(flush_tlb_mm_ipi, (void *)mm, 1, 1); @@ -329,6 +347,7 @@ void flush_tlb_mm(struct mm_struct *mm) if (smp_processor_id() != i) cpu_context(i, mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_mm(mm); preempt_enable(); @@ -352,6 +371,8 @@ void flush_tlb_range(struct vm_area_stru struct mm_struct *mm = vma->vm_mm; preempt_disable(); + spin_lock(&tlbstate_lock); + if ((atomic_read(&mm->mm_users) != 1) || (current->mm != mm)) { struct flush_tlb_data fd; @@ -365,6 +386,7 @@ void flush_tlb_range(struct vm_area_stru if (smp_processor_id() != i) cpu_context(i, mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_range(vma, start, end); preempt_enable(); } @@ -395,6 +417,8 @@ static void flush_tlb_page_ipi(void *inf void flush_tlb_page(struct vm_area_struct *vma, unsigned long page) { preempt_disable(); + spin_lock(&tlbstate_lock); + if ((atomic_read(&vma->vm_mm->mm_users) != 1) || (current->mm != vma->vm_mm)) { struct flush_tlb_data fd; @@ -407,6 +431,7 @@ void flush_tlb_page(struct vm_area_struc if (smp_processor_id() != i) cpu_context(i, vma->vm_mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_page(vma, page); preempt_enable(); } Index: linux.prev/arch/mips/kernel/time.c =================================================================== --- linux.prev.orig/arch/mips/kernel/time.c +++ linux.prev/arch/mips/kernel/time.c @@ -50,7 +50,7 @@ */ extern volatile unsigned long wall_jiffies; -DEFINE_SPINLOCK(rtc_lock); +DEFINE_RAW_SPINLOCK(rtc_lock); /* * By default we provide the null RTC ops @@ -554,7 +554,7 @@ unsigned int mips_hpt_frequency; static struct irqaction timer_irqaction = { .handler = timer_interrupt, - .flags = SA_INTERRUPT, + .flags = SA_NODELAY | SA_INTERRUPT, .name = "timer", }; Index: linux.prev/arch/mips/kernel/traps.c =================================================================== --- linux.prev.orig/arch/mips/kernel/traps.c +++ linux.prev/arch/mips/kernel/traps.c @@ -274,7 +274,7 @@ void show_registers(struct pt_regs *regs printk("\n"); } -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); NORET_TYPE void ATTRIB_NORET die(const char * str, struct pt_regs * regs) { Index: linux.prev/arch/mips/lasat/interrupt.c =================================================================== --- linux.prev.orig/arch/mips/lasat/interrupt.c +++ linux.prev/arch/mips/lasat/interrupt.c @@ -39,18 +39,18 @@ void disable_lasat_irq(unsigned int irq_ { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); *lasat_int_mask &= ~(1 << irq_nr) << lasat_int_mask_shift; - local_irq_restore(flags); + raw_local_irq_restore(flags); } void enable_lasat_irq(unsigned int irq_nr) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); *lasat_int_mask |= (1 << irq_nr) << lasat_int_mask_shift; - local_irq_restore(flags); + raw_local_irq_restore(flags); } static unsigned int startup_lasat_irq(unsigned int irq) Index: linux.prev/arch/mips/lasat/reset.c =================================================================== --- linux.prev.orig/arch/mips/lasat/reset.c +++ linux.prev/arch/mips/lasat/reset.c @@ -33,7 +33,7 @@ int 
lasat_boot_to_service = 0; static void lasat_machine_restart(char *command) { - local_irq_disable(); + raw_local_irq_disable(); if (lasat_boot_to_service) { printk("machine_restart: Rebooting to service mode\n"); @@ -47,7 +47,7 @@ static void lasat_machine_restart(char * #define MESSAGE "System halted" static void lasat_machine_halt(void) { - local_irq_disable(); + raw_local_irq_disable(); /* Disable interrupts and loop forever */ printk(KERN_NOTICE MESSAGE "\n"); Index: linux.prev/arch/mips/lib-32/dump_tlb.c =================================================================== --- linux.prev.orig/arch/mips/lib-32/dump_tlb.c +++ linux.prev/arch/mips/lib-32/dump_tlb.c @@ -118,7 +118,7 @@ void dump_tlb_addr(unsigned long addr) unsigned int flags, oldpid; int index; - local_irq_save(flags); + raw_local_irq_save(flags); oldpid = read_c0_entryhi() & 0xff; BARRIER(); write_c0_entryhi((addr & PAGE_MASK) | oldpid); @@ -127,7 +127,7 @@ void dump_tlb_addr(unsigned long addr) BARRIER(); index = read_c0_index(); write_c0_entryhi(oldpid); - local_irq_restore(flags); + raw_local_irq_restore(flags); if (index < 0) { printk("No entry for address 0x%08lx in TLB\n", addr); Index: linux.prev/arch/mips/lib-32/r3k_dump_tlb.c =================================================================== --- linux.prev.orig/arch/mips/lib-32/r3k_dump_tlb.c +++ linux.prev/arch/mips/lib-32/r3k_dump_tlb.c @@ -79,13 +79,13 @@ void dump_tlb_addr(unsigned long addr) unsigned long flags, oldpid; int index; - local_irq_save(flags); + raw_local_irq_save(flags); oldpid = read_c0_entryhi() & 0xff; write_c0_entryhi((addr & PAGE_MASK) | oldpid); tlb_probe(); index = read_c0_index(); write_c0_entryhi(oldpid); - local_irq_restore(flags); + raw_local_irq_restore(flags); if (index < 0) { printk("No entry for address 0x%08lx in TLB\n", addr); Index: linux.prev/arch/mips/lib-64/dump_tlb.c =================================================================== --- linux.prev.orig/arch/mips/lib-64/dump_tlb.c +++ linux.prev/arch/mips/lib-64/dump_tlb.c @@ -112,7 +112,7 @@ void dump_tlb_addr(unsigned long addr) unsigned int flags, oldpid; int index; - local_irq_save(flags); + raw_local_irq_save(flags); oldpid = read_c0_entryhi() & 0xff; BARRIER(); write_c0_entryhi((addr & PAGE_MASK) | oldpid); @@ -121,7 +121,7 @@ void dump_tlb_addr(unsigned long addr) BARRIER(); index = read_c0_index(); write_c0_entryhi(oldpid); - local_irq_restore(flags); + raw_local_irq_restore(flags); if (index < 0) { printk("No entry for address 0x%08lx in TLB\n", addr); Index: linux.prev/arch/mips/math-emu/cp1emu.c =================================================================== --- linux.prev.orig/arch/mips/math-emu/cp1emu.c +++ linux.prev/arch/mips/math-emu/cp1emu.c @@ -1269,7 +1269,9 @@ int fpu_emulator_cop1Handler(struct pt_r if (sig) break; + preempt_enable(); cond_resched(); + preempt_disable(); } while (xcp->cp0_epc > prevepc); /* SIGILL indicates a non-fpu instruction */ Index: linux.prev/arch/mips/mips-boards/generic/time.c =================================================================== --- linux.prev.orig/arch/mips/mips-boards/generic/time.c +++ linux.prev/arch/mips/mips-boards/generic/time.c @@ -139,7 +139,7 @@ static unsigned int __init estimate_cpu_ #if defined(CONFIG_MIPS_ATLAS) || defined(CONFIG_MIPS_MALTA) unsigned int flags; - local_irq_save(flags); + raw_local_irq_save(flags); /* Start counter exactly on falling edge of update flag */ while (CMOS_READ(RTC_REG_A) & RTC_UIP); @@ -155,7 +155,7 @@ static unsigned int __init estimate_cpu_ count = 
read_c0_count(); /* restore interrupts */ - local_irq_restore(flags); + raw_local_irq_restore(flags); #endif mips_hpt_frequency = count; @@ -178,7 +178,7 @@ void __init mips_time_init(void) { unsigned int est_freq, flags; - local_irq_save(flags); + raw_local_irq_save(flags); /* Set Data mode - binary. */ CMOS_WRITE(CMOS_READ(RTC_CONTROL) | RTC_DM_BINARY, RTC_CONTROL); @@ -190,7 +190,7 @@ void __init mips_time_init(void) cpu_khz = est_freq / 1000; - local_irq_restore(flags); + raw_local_irq_restore(flags); } void __init mips_timer_setup(struct irqaction *irq) Index: linux.prev/arch/mips/mm/c-r4k.c =================================================================== --- linux.prev.orig/arch/mips/mm/c-r4k.c +++ linux.prev/arch/mips/mm/c-r4k.c @@ -117,9 +117,9 @@ static inline void blast_r4600_v1_icache { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); blast_icache32(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static inline void tx49_blast_icache32(void) @@ -147,9 +147,9 @@ static inline void blast_icache32_r4600_ { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); blast_icache32_page_indexed(page); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static inline void tx49_blast_icache32_page_indexed(unsigned long page) @@ -1090,7 +1090,7 @@ static int __init probe_scache(void) * This is such a bitch, you'd think they would make it easy to do * this. Away you daemons of stupidity! */ - local_irq_save(flags); + raw_local_irq_save(flags); /* Fill each size-multiple cache line with a valid tag. */ pow2 = (64 * 1024); @@ -1118,7 +1118,7 @@ static int __init probe_scache(void) break; pow2 <<= 1; } - local_irq_restore(flags); + raw_local_irq_restore(flags); addr -= begin; scache_size = addr; Index: linux.prev/arch/mips/mm/c-tx39.c =================================================================== --- linux.prev.orig/arch/mips/mm/c-tx39.c +++ linux.prev/arch/mips/mm/c-tx39.c @@ -49,7 +49,7 @@ static void tx39h_flush_icache_all(void) unsigned long flags, config; /* disable icache (set ICE#) */ - local_irq_save(flags); + raw_local_irq_save(flags); config = read_c0_conf(); write_c0_conf(config & ~TX39_CONF_ICE); TX39_STOP_STREAMING(); @@ -61,7 +61,7 @@ static void tx39h_flush_icache_all(void) } write_c0_conf(config); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static void tx39h_dma_cache_wback_inv(unsigned long addr, unsigned long size) @@ -104,39 +104,39 @@ static inline void tx39_blast_icache_pag { unsigned long flags, config; /* disable icache (set ICE#) */ - local_irq_save(flags); + raw_local_irq_save(flags); config = read_c0_conf(); write_c0_conf(config & ~TX39_CONF_ICE); TX39_STOP_STREAMING(); blast_icache16_page(addr); write_c0_conf(config); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static inline void tx39_blast_icache_page_indexed(unsigned long addr) { unsigned long flags, config; /* disable icache (set ICE#) */ - local_irq_save(flags); + raw_local_irq_save(flags); config = read_c0_conf(); write_c0_conf(config & ~TX39_CONF_ICE); TX39_STOP_STREAMING(); blast_icache16_page_indexed(addr); write_c0_conf(config); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static inline void tx39_blast_icache(void) { unsigned long flags, config; /* disable icache (set ICE#) */ - local_irq_save(flags); + raw_local_irq_save(flags); config = read_c0_conf(); write_c0_conf(config & ~TX39_CONF_ICE); TX39_STOP_STREAMING(); blast_icache16(); write_c0_conf(config); - 
local_irq_restore(flags); + raw_local_irq_restore(flags); } static inline void tx39_flush_cache_all(void) @@ -266,7 +266,7 @@ static void tx39_flush_icache_range(unsi addr = start & ~(dc_lsize - 1); aend = (end - 1) & ~(dc_lsize - 1); /* disable icache (set ICE#) */ - local_irq_save(flags); + raw_local_irq_save(flags); config = read_c0_conf(); write_c0_conf(config & ~TX39_CONF_ICE); TX39_STOP_STREAMING(); @@ -278,7 +278,7 @@ static void tx39_flush_icache_range(unsi addr += dc_lsize; } write_c0_conf(config); - local_irq_restore(flags); + raw_local_irq_restore(flags); } } @@ -367,13 +367,13 @@ static void tx39_flush_cache_sigtramp(un protected_writeback_dcache_line(addr & ~(dc_lsize - 1)); /* disable icache (set ICE#) */ - local_irq_save(flags); + raw_local_irq_save(flags); config = read_c0_conf(); write_c0_conf(config & ~TX39_CONF_ICE); TX39_STOP_STREAMING(); protected_flush_icache_line(addr & ~(ic_lsize - 1)); write_c0_conf(config); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static __init void tx39_probe_cache(void) Index: linux.prev/arch/mips/mm/init.c =================================================================== --- linux.prev.orig/arch/mips/mm/init.c +++ linux.prev/arch/mips/mm/init.c @@ -35,7 +35,7 @@ #include #include -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; Index: linux.prev/arch/mips/mm/sc-ip22.c =================================================================== --- linux.prev.orig/arch/mips/mm/sc-ip22.c +++ linux.prev/arch/mips/mm/sc-ip22.c @@ -72,7 +72,7 @@ static void indy_sc_wback_invalidate(uns first_line = SC_INDEX(addr); last_line = SC_INDEX(addr + size - 1); - local_irq_save(flags); + raw_local_irq_save(flags); if (first_line <= last_line) { indy_sc_wipe(first_line, last_line); goto out; @@ -81,7 +81,7 @@ static void indy_sc_wback_invalidate(uns indy_sc_wipe(first_line, SC_SIZE - SC_LINE); indy_sc_wipe(0, last_line); out: - local_irq_restore(flags); + raw_local_irq_restore(flags); } static void indy_sc_enable(void) Index: linux.prev/arch/mips/mm/sc-r5k.c =================================================================== --- linux.prev.orig/arch/mips/mm/sc-r5k.c +++ linux.prev/arch/mips/mm/sc-r5k.c @@ -61,20 +61,20 @@ static void r5k_sc_enable(void) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); set_c0_config(R5K_CONF_SE); blast_r5000_scache(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static void r5k_sc_disable(void) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); blast_r5000_scache(); clear_c0_config(R5K_CONF_SE); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static inline int __init r5k_sc_probe(void) Index: linux.prev/arch/mips/mm/tlb-andes.c =================================================================== --- linux.prev.orig/arch/mips/mm/tlb-andes.c +++ linux.prev/arch/mips/mm/tlb-andes.c @@ -27,7 +27,7 @@ void local_flush_tlb_all(void) unsigned long old_ctx; unsigned long entry; - local_irq_save(flags); + raw_local_irq_save(flags); /* Save old context and create impossible VPN2 value */ old_ctx = read_c0_entryhi() & ASID_MASK; write_c0_entryhi(CKSEG0); @@ -43,7 +43,7 @@ void local_flush_tlb_all(void) entry++; } write_c0_entryhi(old_ctx); - local_irq_restore(flags); + raw_local_irq_restore(flags); } void local_flush_tlb_mm(struct mm_struct *mm) @@ -64,7 +64,7 @@ void local_flush_tlb_range(struct vm_are unsigned long flags; int size; - 
local_irq_save(flags); + raw_local_irq_save(flags); size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT; size = (size + 1) >> 1; if (size <= NTLB_ENTRIES_HALF) { @@ -93,7 +93,7 @@ void local_flush_tlb_range(struct vm_are } else { drop_mmu_context(mm, cpu); } - local_irq_restore(flags); + raw_local_irq_restore(flags); } } @@ -105,7 +105,7 @@ void local_flush_tlb_kernel_range(unsign size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT; size = (size + 1) >> 1; - local_irq_save(flags); + raw_local_irq_save(flags); if (size <= NTLB_ENTRIES_HALF) { int pid = read_c0_entryhi(); @@ -131,7 +131,7 @@ void local_flush_tlb_kernel_range(unsign } else { local_flush_tlb_all(); } - local_irq_restore(flags); + raw_local_irq_restore(flags); } void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long page) @@ -143,7 +143,7 @@ void local_flush_tlb_page(struct vm_area newpid = (cpu_context(smp_processor_id(), vma->vm_mm) & ASID_MASK); page &= (PAGE_MASK << 1); - local_irq_save(flags); + raw_local_irq_save(flags); oldpid = (read_c0_entryhi() & ASID_MASK); write_c0_entryhi(page | newpid); tlb_probe(); @@ -157,7 +157,7 @@ void local_flush_tlb_page(struct vm_area finish: write_c0_entryhi(oldpid); - local_irq_restore(flags); + raw_local_irq_restore(flags); } } @@ -170,7 +170,7 @@ void local_flush_tlb_one(unsigned long p unsigned long flags; int oldpid, idx; - local_irq_save(flags); + raw_local_irq_save(flags); page &= (PAGE_MASK << 1); oldpid = read_c0_entryhi() & 0xff; write_c0_entryhi(page); @@ -185,7 +185,7 @@ void local_flush_tlb_one(unsigned long p } write_c0_entryhi(oldpid); - local_irq_restore(flags); + raw_local_irq_restore(flags); } /* XXX Simplify this. On the R10000 writing a TLB entry for an virtual @@ -216,7 +216,7 @@ void __update_tlb(struct vm_area_struct vma->vm_mm) & ASID_MASK), pid); } - local_irq_save(flags); + raw_local_irq_save(flags); address &= (PAGE_MASK << 1); write_c0_entryhi(address | (pid)); pgdp = pgd_offset(vma->vm_mm, address); @@ -234,7 +234,7 @@ void __update_tlb(struct vm_area_struct tlb_write_indexed(); } write_c0_entryhi(pid); - local_irq_restore(flags); + raw_local_irq_restore(flags); } void __init tlb_init(void) Index: linux.prev/arch/mips/mm/tlb-r3k.c =================================================================== --- linux.prev.orig/arch/mips/mm/tlb-r3k.c +++ linux.prev/arch/mips/mm/tlb-r3k.c @@ -49,7 +49,7 @@ void local_flush_tlb_all(void) printk("[tlball]"); #endif - local_irq_save(flags); + raw_local_irq_save(flags); old_ctx = read_c0_entryhi() & ASID_MASK; write_c0_entrylo0(0); entry = r3k_have_wired_reg ? 
read_c0_wired() : 8; @@ -60,7 +60,7 @@ void local_flush_tlb_all(void) tlb_write_indexed(); } write_c0_entryhi(old_ctx); - local_irq_restore(flags); + raw_local_irq_restore(flags); } void local_flush_tlb_mm(struct mm_struct *mm) @@ -89,7 +89,7 @@ void local_flush_tlb_range(struct vm_are printk("[tlbrange<%lu,0x%08lx,0x%08lx>]", cpu_context(cpu, mm) & ASID_MASK, start, end); #endif - local_irq_save(flags); + raw_local_irq_save(flags); size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT; if (size <= current_cpu_data.tlbsize) { int oldpid = read_c0_entryhi() & ASID_MASK; @@ -115,7 +115,7 @@ void local_flush_tlb_range(struct vm_are } else { drop_mmu_context(mm, cpu); } - local_irq_restore(flags); + raw_local_irq_restore(flags); } } @@ -127,7 +127,7 @@ void local_flush_tlb_kernel_range(unsign #ifdef DEBUG_TLB printk("[tlbrange<%lu,0x%08lx,0x%08lx>]", start, end); #endif - local_irq_save(flags); + raw_local_irq_save(flags); size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT; if (size <= current_cpu_data.tlbsize) { int pid = read_c0_entryhi(); @@ -153,7 +153,7 @@ void local_flush_tlb_kernel_range(unsign } else { local_flush_tlb_all(); } - local_irq_restore(flags); + raw_local_irq_restore(flags); } void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long page) @@ -169,7 +169,7 @@ void local_flush_tlb_page(struct vm_area #endif newpid = cpu_context(cpu, vma->vm_mm) & ASID_MASK; page &= PAGE_MASK; - local_irq_save(flags); + raw_local_irq_save(flags); oldpid = read_c0_entryhi() & ASID_MASK; write_c0_entryhi(page | newpid); BARRIER; @@ -183,7 +183,7 @@ void local_flush_tlb_page(struct vm_area finish: write_c0_entryhi(oldpid); - local_irq_restore(flags); + raw_local_irq_restore(flags); } } @@ -207,7 +207,7 @@ void __update_tlb(struct vm_area_struct } #endif - local_irq_save(flags); + raw_local_irq_save(flags); address &= PAGE_MASK; write_c0_entryhi(address | pid); BARRIER; @@ -221,7 +221,7 @@ void __update_tlb(struct vm_area_struct tlb_write_indexed(); } write_c0_entryhi(pid); - local_irq_restore(flags); + raw_local_irq_restore(flags); } void __init add_wired_entry(unsigned long entrylo0, unsigned long entrylo1, @@ -240,7 +240,7 @@ void __init add_wired_entry(unsigned lon entrylo0, entryhi, pagemask); #endif - local_irq_save(flags); + raw_local_irq_save(flags); /* Save old context and create impossible VPN2 value */ old_ctx = read_c0_entryhi() & ASID_MASK; old_pagemask = read_c0_pagemask(); @@ -260,7 +260,7 @@ void __init add_wired_entry(unsigned lon write_c0_entryhi(old_ctx); write_c0_pagemask(old_pagemask); local_flush_tlb_all(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } else if (wired < 8) { #ifdef DEBUG_TLB @@ -268,7 +268,7 @@ void __init add_wired_entry(unsigned lon entrylo0, entryhi); #endif - local_irq_save(flags); + raw_local_irq_save(flags); old_ctx = read_c0_entryhi() & ASID_MASK; write_c0_entrylo0(entrylo0); write_c0_entryhi(entryhi); @@ -277,7 +277,7 @@ void __init add_wired_entry(unsigned lon tlb_write_indexed(); write_c0_entryhi(old_ctx); local_flush_tlb_all(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } } Index: linux.prev/arch/mips/mm/tlb-r4k.c =================================================================== --- linux.prev.orig/arch/mips/mm/tlb-r4k.c +++ linux.prev/arch/mips/mm/tlb-r4k.c @@ -38,7 +38,7 @@ void local_flush_tlb_all(void) unsigned long old_ctx; int entry; - local_irq_save(flags); + raw_local_irq_save(flags); /* Save old context and create impossible VPN2 value */ old_ctx = read_c0_entryhi(); write_c0_entrylo0(0); @@ 
-57,7 +57,7 @@ void local_flush_tlb_all(void) } tlbw_use_hazard(); write_c0_entryhi(old_ctx); - local_irq_restore(flags); + raw_local_irq_restore(flags); } /* All entries common to a mm share an asid. To effectively flush @@ -89,7 +89,7 @@ void local_flush_tlb_range(struct vm_are size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT; size = (size + 1) >> 1; - local_irq_save(flags); + raw_local_irq_save(flags); if (size <= current_cpu_data.tlbsize/2) { int oldpid = read_c0_entryhi(); int newpid = cpu_asid(cpu, mm); @@ -120,7 +120,7 @@ void local_flush_tlb_range(struct vm_are } else { drop_mmu_context(mm, cpu); } - local_irq_restore(flags); + raw_local_irq_restore(flags); } } @@ -131,7 +131,7 @@ void local_flush_tlb_kernel_range(unsign size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT; size = (size + 1) >> 1; - local_irq_save(flags); + raw_local_irq_save(flags); if (size <= current_cpu_data.tlbsize / 2) { int pid = read_c0_entryhi(); @@ -162,7 +162,7 @@ void local_flush_tlb_kernel_range(unsign } else { local_flush_tlb_all(); } - local_irq_restore(flags); + raw_local_irq_restore(flags); } void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long page) @@ -175,7 +175,7 @@ void local_flush_tlb_page(struct vm_area newpid = cpu_asid(cpu, vma->vm_mm); page &= (PAGE_MASK << 1); - local_irq_save(flags); + raw_local_irq_save(flags); oldpid = read_c0_entryhi(); write_c0_entryhi(page | newpid); mtc0_tlbw_hazard(); @@ -194,7 +194,7 @@ void local_flush_tlb_page(struct vm_area finish: write_c0_entryhi(oldpid); - local_irq_restore(flags); + raw_local_irq_restore(flags); } } @@ -207,7 +207,7 @@ void local_flush_tlb_one(unsigned long p unsigned long flags; int oldpid, idx; - local_irq_save(flags); + raw_local_irq_save(flags); oldpid = read_c0_entryhi(); page &= (PAGE_MASK << 1); write_c0_entryhi(page); @@ -226,7 +226,7 @@ void local_flush_tlb_one(unsigned long p } write_c0_entryhi(oldpid); - local_irq_restore(flags); + raw_local_irq_restore(flags); } /* @@ -249,7 +249,7 @@ void __update_tlb(struct vm_area_struct if (current->active_mm != vma->vm_mm) return; - local_irq_save(flags); + raw_local_irq_save(flags); pid = read_c0_entryhi() & ASID_MASK; address &= (PAGE_MASK << 1); @@ -277,7 +277,7 @@ void __update_tlb(struct vm_area_struct else tlb_write_indexed(); tlbw_use_hazard(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } #if 0 @@ -291,7 +291,7 @@ static void r4k_update_mmu_cache_hwbug(s pte_t *ptep; int idx; - local_irq_save(flags); + raw_local_irq_save(flags); address &= (PAGE_MASK << 1); asid = read_c0_entryhi() & ASID_MASK; write_c0_entryhi(address | asid); @@ -310,7 +310,7 @@ static void r4k_update_mmu_cache_hwbug(s else tlb_write_indexed(); tlbw_use_hazard(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } #endif @@ -322,7 +322,7 @@ void __init add_wired_entry(unsigned lon unsigned long old_pagemask; unsigned long old_ctx; - local_irq_save(flags); + raw_local_irq_save(flags); /* Save old context and create impossible VPN2 value */ old_ctx = read_c0_entryhi(); old_pagemask = read_c0_pagemask(); @@ -342,7 +342,7 @@ void __init add_wired_entry(unsigned lon BARRIER; write_c0_pagemask(old_pagemask); local_flush_tlb_all(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } /* @@ -362,7 +362,7 @@ __init int add_temporary_entry(unsigned unsigned long old_pagemask; unsigned long old_ctx; - local_irq_save(flags); + raw_local_irq_save(flags); /* Save old context and create impossible VPN2 value */ old_ctx = read_c0_entryhi(); old_pagemask = read_c0_pagemask(); 
@@ -386,7 +386,7 @@ __init int add_temporary_entry(unsigned write_c0_entryhi(old_ctx); write_c0_pagemask(old_pagemask); out: - local_irq_restore(flags); + raw_local_irq_restore(flags); return ret; } Index: linux.prev/arch/mips/mm/tlb-r8k.c =================================================================== --- linux.prev.orig/arch/mips/mm/tlb-r8k.c +++ linux.prev/arch/mips/mm/tlb-r8k.c @@ -35,7 +35,7 @@ void local_flush_tlb_all(void) unsigned long old_ctx; int entry; - local_irq_save(flags); + raw_local_irq_save(flags); /* Save old context and create impossible VPN2 value */ old_ctx = read_c0_entryhi(); write_c0_entrylo(0); @@ -49,7 +49,7 @@ void local_flush_tlb_all(void) } tlbw_use_hazard(); write_c0_entryhi(old_ctx); - local_irq_restore(flags); + raw_local_irq_restore(flags); } void local_flush_tlb_mm(struct mm_struct *mm) @@ -74,7 +74,7 @@ void local_flush_tlb_range(struct vm_are size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT; size = (size + 1) >> 1; - local_irq_save(flags); + raw_local_irq_save(flags); if (size > TFP_TLB_SIZE / 2) { drop_mmu_context(mm, cpu); @@ -106,7 +106,7 @@ void local_flush_tlb_range(struct vm_are write_c0_entryhi(oldpid); out_restore: - local_irq_restore(flags); + raw_local_irq_restore(flags); } /* Usable for KV1 addresses only! */ @@ -123,7 +123,7 @@ void local_flush_tlb_kernel_range(unsign return; } - local_irq_save(flags); + raw_local_irq_save(flags); write_c0_entrylo(0); @@ -145,7 +145,7 @@ void local_flush_tlb_kernel_range(unsign tlb_write(); } - local_irq_restore(flags); + raw_local_irq_restore(flags); } void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long page) @@ -160,7 +160,7 @@ void local_flush_tlb_page(struct vm_area newpid = cpu_asid(cpu, vma->vm_mm); page &= PAGE_MASK; - local_irq_save(flags); + raw_local_irq_save(flags); oldpid = read_c0_entryhi(); write_c0_vaddr(page); write_c0_entryhi(newpid); @@ -175,7 +175,7 @@ void local_flush_tlb_page(struct vm_area finish: write_c0_entryhi(oldpid); - local_irq_restore(flags); + raw_local_irq_restore(flags); } /* @@ -199,7 +199,7 @@ void __update_tlb(struct vm_area_struct pid = read_c0_entryhi() & ASID_MASK; - local_irq_save(flags); + raw_local_irq_save(flags); address &= PAGE_MASK; write_c0_vaddr(address); write_c0_entryhi(pid); @@ -212,7 +212,7 @@ void __update_tlb(struct vm_area_struct tlb_write(); write_c0_entryhi(pid); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static void __init probe_tlb(unsigned long config) Index: linux.prev/arch/mips/momentum/ocelot_g/irq.c =================================================================== --- linux.prev.orig/arch/mips/momentum/ocelot_g/irq.c +++ linux.prev/arch/mips/momentum/ocelot_g/irq.c @@ -58,7 +58,7 @@ void __init arch_init_irq(void) * int-handler is not on bootstrap */ clear_c0_status(ST0_IM); - local_irq_disable(); + raw_local_irq_disable(); /* Sets the first-level interrupt dispatcher. 
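
All of these MIPS conversions follow one rule: under PREEMPT_RT the plain local_irq_save()/local_irq_restore() pair can become a soft, per-task interrupt disable that leaves hardware interrupts live, so any sequence that programs CP0, TLB or interrupt-controller registers directly must switch to the raw_ variants to get a genuine hardware-level disable. A minimal sketch of the pattern, with hypothetical register accessors standing in for the CP0 helpers:

	/*
	 * Sketch only.  read_hw_reg()/write_hw_reg() are hypothetical
	 * stand-ins for the CP0 accessors used in the hunks above.
	 */
	extern unsigned int read_hw_reg(void);
	extern void write_hw_reg(unsigned int val);

	static void update_hw_state(unsigned int set, unsigned int clear)
	{
		unsigned long flags;

		raw_local_irq_save(flags);	/* hard disable: no IRQ, no preemption */
		write_hw_reg((read_hw_reg() | set) & ~clear);
		raw_local_irq_restore(flags);
	}
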
*/ set_except_vector(0, ocelot_handle_int); Index: linux.prev/arch/mips/pci/ops-au1000.c =================================================================== --- linux.prev.orig/arch/mips/pci/ops-au1000.c +++ linux.prev/arch/mips/pci/ops-au1000.c @@ -93,7 +93,7 @@ static int config_access(unsigned char a return -1; } - local_irq_save(flags); + raw_local_irq_save(flags); au_writel(((0x2000 << 16) | (au_readl(Au1500_PCI_STATCMD) & 0xffff)), Au1500_PCI_STATCMD); au_sync_udelay(1); @@ -125,7 +125,7 @@ static int config_access(unsigned char a if (board_pci_idsel) { if (board_pci_idsel(device, 1) == 0) { *data = 0xffffffff; - local_irq_restore(flags); + raw_local_irq_restore(flags); return -1; } } @@ -184,7 +184,7 @@ static int config_access(unsigned char a (void)board_pci_idsel(device, 0); } - local_irq_restore(flags); + raw_local_irq_restore(flags); return error; #endif } Index: linux.prev/arch/mips/pmc-sierra/yosemite/smp.c =================================================================== --- linux.prev.orig/arch/mips/pmc-sierra/yosemite/smp.c +++ linux.prev/arch/mips/pmc-sierra/yosemite/smp.c @@ -19,7 +19,7 @@ static unsigned char launchstack[LAUNCHS static void __init prom_smp_bootstrap(void) { - local_irq_disable(); + raw_local_irq_disable(); while (spin_is_locked(&launch_lock)); Index: linux.prev/arch/mips/sgi-ip22/ip22-eisa.c =================================================================== --- linux.prev.orig/arch/mips/sgi-ip22/ip22-eisa.c +++ linux.prev/arch/mips/sgi-ip22/ip22-eisa.c @@ -98,13 +98,13 @@ static void enable_eisa1_irq(unsigned in unsigned long flags; u8 mask; - local_irq_save(flags); + raw_local_irq_save(flags); mask = inb(EISA_INT1_MASK); mask &= ~((u8) (1 << irq)); outb(mask, EISA_INT1_MASK); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static unsigned int startup_eisa1_irq(unsigned int irq) @@ -160,13 +160,13 @@ static void enable_eisa2_irq(unsigned in unsigned long flags; u8 mask; - local_irq_save(flags); + raw_local_irq_save(flags); mask = inb(EISA_INT2_MASK); mask &= ~((u8) (1 << (irq - 8))); outb(mask, EISA_INT2_MASK); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static unsigned int startup_eisa2_irq(unsigned int irq) Index: linux.prev/arch/mips/sgi-ip22/ip22-int.c =================================================================== --- linux.prev.orig/arch/mips/sgi-ip22/ip22-int.c +++ linux.prev/arch/mips/sgi-ip22/ip22-int.c @@ -44,12 +44,12 @@ static void enable_local0_irq(unsigned i { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); /* don't allow mappable interrupt to be enabled from setup_irq, * we have our own way to do so */ if (irq != SGI_MAP_0_IRQ) sgint->imask0 |= (1 << (irq - SGINT_LOCAL0)); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static unsigned int startup_local0_irq(unsigned int irq) @@ -62,9 +62,9 @@ static void disable_local0_irq(unsigned { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); sgint->imask0 &= ~(1 << (irq - SGINT_LOCAL0)); - local_irq_restore(flags); + raw_local_irq_restore(flags); } #define shutdown_local0_irq disable_local0_irq @@ -90,12 +90,12 @@ static void enable_local1_irq(unsigned i { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); /* don't allow mappable interrupt to be enabled from setup_irq, * we have our own way to do so */ if (irq != SGI_MAP_1_IRQ) sgint->imask1 |= (1 << (irq - SGINT_LOCAL1)); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static unsigned int 
startup_local1_irq(unsigned int irq) @@ -108,9 +108,9 @@ void disable_local1_irq(unsigned int irq { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); sgint->imask1 &= ~(1 << (irq - SGINT_LOCAL1)); - local_irq_restore(flags); + raw_local_irq_restore(flags); } #define shutdown_local1_irq disable_local1_irq @@ -136,10 +136,10 @@ static void enable_local2_irq(unsigned i { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); sgint->imask0 |= (1 << (SGI_MAP_0_IRQ - SGINT_LOCAL0)); sgint->cmeimask0 |= (1 << (irq - SGINT_LOCAL2)); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static unsigned int startup_local2_irq(unsigned int irq) @@ -152,11 +152,11 @@ void disable_local2_irq(unsigned int irq { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); sgint->cmeimask0 &= ~(1 << (irq - SGINT_LOCAL2)); if (!sgint->cmeimask0) sgint->imask0 &= ~(1 << (SGI_MAP_0_IRQ - SGINT_LOCAL0)); - local_irq_restore(flags); + raw_local_irq_restore(flags); } #define shutdown_local2_irq disable_local2_irq @@ -182,10 +182,10 @@ static void enable_local3_irq(unsigned i { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); sgint->imask1 |= (1 << (SGI_MAP_1_IRQ - SGINT_LOCAL1)); sgint->cmeimask1 |= (1 << (irq - SGINT_LOCAL3)); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static unsigned int startup_local3_irq(unsigned int irq) @@ -198,11 +198,11 @@ void disable_local3_irq(unsigned int irq { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); sgint->cmeimask1 &= ~(1 << (irq - SGINT_LOCAL3)); if (!sgint->cmeimask1) sgint->imask1 &= ~(1 << (SGI_MAP_1_IRQ - SGINT_LOCAL1)); - local_irq_restore(flags); + raw_local_irq_restore(flags); } #define shutdown_local3_irq disable_local3_irq Index: linux.prev/arch/mips/sgi-ip22/ip22-reset.c =================================================================== --- linux.prev.orig/arch/mips/sgi-ip22/ip22-reset.c +++ linux.prev/arch/mips/sgi-ip22/ip22-reset.c @@ -66,7 +66,7 @@ static void sgi_machine_power_off(void) { unsigned int tmp; - local_irq_disable(); + raw_local_irq_disable(); /* Disable watchdog */ tmp = hpc3c0->rtcregs[RTC_CMD] & 0xff; Index: linux.prev/arch/mips/sgi-ip27/ip27-smp.c =================================================================== --- linux.prev.orig/arch/mips/sgi-ip27/ip27-smp.c +++ linux.prev/arch/mips/sgi-ip27/ip27-smp.c @@ -179,7 +179,7 @@ void __init prom_boot_secondary(int cpu, void prom_init_secondary(void) { per_cpu_init(); - local_irq_enable(); + raw_local_irq_enable(); } void __init prom_cpus_done(void) Index: linux.prev/arch/mips/sibyte/sb1250/irq.c =================================================================== --- linux.prev.orig/arch/mips/sibyte/sb1250/irq.c +++ linux.prev/arch/mips/sibyte/sb1250/irq.c @@ -86,7 +86,7 @@ static struct hw_interrupt_type sb1250_i /* Store the CPU id (not the logical number) */ int sb1250_irq_owner[SB1250_NR_IRQS]; -DEFINE_SPINLOCK(sb1250_imr_lock); +DEFINE_RAW_SPINLOCK(sb1250_imr_lock); void sb1250_mask_irq(int cpu, int irq) { @@ -267,7 +267,7 @@ static irqreturn_t sb1250_dummy_handler static struct irqaction sb1250_dummy_action = { .handler = sb1250_dummy_handler, - .flags = 0, + .flags = SA_NODELAY, .mask = CPU_MASK_NONE, .name = "sb1250-private", .next = NULL, Index: linux.prev/arch/mips/sibyte/sb1250/smp.c =================================================================== --- linux.prev.orig/arch/mips/sibyte/sb1250/smp.c +++ linux.prev/arch/mips/sibyte/sb1250/smp.c @@ 
-59,7 +59,7 @@ void sb1250_smp_finish(void) { extern void sb1250_time_init(void); sb1250_time_init(); - local_irq_enable(); + raw_local_irq_enable(); } /* Index: linux.prev/arch/mips/sni/reset.c =================================================================== --- linux.prev.orig/arch/mips/sni/reset.c +++ linux.prev/arch/mips/sni/reset.c @@ -30,7 +30,7 @@ void sni_machine_restart(char *command) /* This does a normal via the keyboard controller like a PC. We can do that easier ... */ - local_irq_disable(); + raw_local_irq_disable(); for (;;) { for (i=0; i<100; i++) { kb_wait(); Index: linux.prev/arch/mips/tx4927/toshiba_rbtx4927/toshiba_rbtx4927_irq.c =================================================================== --- linux.prev.orig/arch/mips/tx4927/toshiba_rbtx4927/toshiba_rbtx4927_irq.c +++ linux.prev/arch/mips/tx4927/toshiba_rbtx4927/toshiba_rbtx4927_irq.c @@ -669,7 +669,7 @@ void __init arch_init_irq(void) { extern void tx4927_irq_init(void); - local_irq_disable(); + raw_local_irq_disable(); tx4927_irq_init(); toshiba_rbtx4927_irq_ioc_init(); Index: linux.prev/arch/mips/tx4927/toshiba_rbtx4927/toshiba_rbtx4927_setup.c =================================================================== --- linux.prev.orig/arch/mips/tx4927/toshiba_rbtx4927/toshiba_rbtx4927_setup.c +++ linux.prev/arch/mips/tx4927/toshiba_rbtx4927/toshiba_rbtx4927_setup.c @@ -732,7 +732,7 @@ void toshiba_rbtx4927_restart(char *comm reg_wr08(RBTX4927_SW_RESET_DO, RBTX4927_SW_RESET_DO_SET); /* do something passive while waiting for reset */ - local_irq_disable(); + raw_local_irq_disable(); while (1) asm_wait(); @@ -743,7 +743,7 @@ void toshiba_rbtx4927_restart(char *comm void toshiba_rbtx4927_halt(void) { printk(KERN_NOTICE "System Halted\n"); - local_irq_disable(); + raw_local_irq_disable(); while (1) { asm_wait(); } Index: linux.prev/arch/mips/vr41xx/common/pmu.c =================================================================== --- linux.prev.orig/arch/mips/vr41xx/common/pmu.c +++ linux.prev/arch/mips/vr41xx/common/pmu.c @@ -62,7 +62,7 @@ static inline void software_reset(void) static void vr41xx_restart(char *command) { - local_irq_disable(); + raw_local_irq_disable(); software_reset(); printk(KERN_NOTICE "\nYou can reset your system\n"); while (1) ; @@ -70,14 +70,14 @@ static void vr41xx_restart(char *command static void vr41xx_halt(void) { - local_irq_disable(); + raw_local_irq_disable(); printk(KERN_NOTICE "\nYou can turn off the power supply\n"); while (1) ; } static void vr41xx_power_off(void) { - local_irq_disable(); + raw_local_irq_disable(); printk(KERN_NOTICE "\nYou can turn off the power supply\n"); while (1) ; } Index: linux.prev/arch/powerpc/Kconfig =================================================================== --- linux.prev.orig/arch/powerpc/Kconfig +++ linux.prev/arch/powerpc/Kconfig @@ -33,13 +33,6 @@ config GENERIC_HARDIRQS bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config GENERIC_CALIBRATE_DELAY bool default y @@ -484,6 +477,18 @@ config HIGHMEM source kernel/Kconfig.hz source kernel/Kconfig.preempt + +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + source "fs/Kconfig.binfmt" # We optimistically allocate largepages from the VM, so make the limit Index: linux.prev/arch/powerpc/boot/Makefile =================================================================== --- linux.prev.orig/arch/powerpc/boot/Makefile +++ 
linux.prev/arch/powerpc/boot/Makefile @@ -28,6 +28,14 @@ BOOTAFLAGS := -D__ASSEMBLY__ $(BOOTCFLAG BOOTLFLAGS := -T $(srctree)/$(src)/zImage.lds OBJCOPYFLAGS := contents,alloc,load,readonly,data +ifdef CONFIG_MCOUNT +# do not trace the boot loader +nullstring := +space := $(nullstring) # end of the line +pg_flag = $(nullstring) -pg # end of the line +CFLAGS := $(subst ${pg_flag},${space},${CFLAGS}) +endif + zlib := infblock.c infcodes.c inffast.c inflate.c inftrees.c infutil.c zlibheader := infblock.h infcodes.h inffast.h inftrees.h infutil.h zliblinuxheader := zlib.h zconf.h zutil.h @@ -43,7 +51,7 @@ obj-boot := $(addsuffix .o, $(basename $ BOOTCFLAGS += -I$(obj) -I$(srctree)/$(obj) quiet_cmd_copy_zlib = COPY $@ - cmd_copy_zlib = sed "s@__attribute_used__@@;s@]\+\).*@\"\1\"@" $< > $@ + cmd_copy_zlib = sed "s@__attribute_used__@@;s@.include.@@;s@.include.@@;s@.*spin.*lock.*@@;s@.*SPINLOCK.*@@;s@]\+\).*@\"\1\"@" $< > $@ quiet_cmd_copy_zlibheader = COPY $@ cmd_copy_zlibheader = sed "s@]\+\).*@\"\1\"@" $< > $@ Index: linux.prev/arch/powerpc/kernel/Makefile =================================================================== --- linux.prev.orig/arch/powerpc/kernel/Makefile +++ linux.prev/arch/powerpc/kernel/Makefile @@ -11,9 +11,10 @@ CFLAGS_prom_init.o += -fPIC CFLAGS_btext.o += -fPIC endif -obj-y := semaphore.o cputable.o ptrace.o syscalls.o \ +obj-y := cputable.o ptrace.o syscalls.o \ irq.o align.o signal_32.o pmc.o vdso.o obj-y += vdso32/ +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o obj-$(CONFIG_PPC64) += setup_64.o binfmt_elf32.o sys_ppc32.o \ signal_64.o ptrace32.o systbl.o \ paca.o ioctl32.o cpu_setup_power4.o \ Index: linux.prev/arch/powerpc/kernel/entry_32.S =================================================================== --- linux.prev.orig/arch/powerpc/kernel/entry_32.S +++ linux.prev/arch/powerpc/kernel/entry_32.S @@ -239,7 +239,7 @@ ret_from_syscall: SYNC MTMSRD(r10) lwz r9,TI_FLAGS(r12) - andi. r0,r9,(_TIF_SYSCALL_T_OR_A|_TIF_SIGPENDING|_TIF_NEED_RESCHED) + andi. r0,r9,(_TIF_SYSCALL_T_OR_A|_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne- syscall_exit_work syscall_exit_cont: #if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) @@ -317,7 +317,7 @@ syscall_exit_work: rlwinm r12,r1,0,0,(31-THREAD_SHIFT) /* current_thread_info() */ lwz r9,TI_FLAGS(r12) 5: - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne 1f lwz r5,_MSR(r1) andi. r5,r5,MSR_PR @@ -658,7 +658,7 @@ user_exc_return: /* r10 contains MSR_KE /* Check current_thread_info()->flags */ rlwinm r9,r1,0,0,(31-THREAD_SHIFT) lwz r9,TI_FLAGS(r9) - andi. r0,r9,(_TIF_SIGPENDING|_TIF_NEED_RESCHED) + andi. r0,r9,(_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne do_work restore_user: @@ -876,7 +876,7 @@ load_dbcr0: #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ do_work: /* r10 contains MSR_KERNEL here */ - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ @@ -998,3 +998,85 @@ machine_check_in_rtas: /* XXX load up BATs and panic */ #endif /* CONFIG_PPC_RTAS */ + +#ifdef CONFIG_MCOUNT +/* + * mcount() is not the same as _mcount(). The callers of mcount() have a + * normal context. The callers of _mcount() do not have a stack frame and + * have not saved the "caller saves" registers. 
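
The mcount()/_mcount() stubs introduced here are the PowerPC entry points for the latency tracer: -pg makes the compiler call _mcount() before a function has built its stack frame, so the stub must spill the volatile registers itself before it can call into C. Reduced to its logic, the gate both stubs implement is only a few lines; mcount_enabled and __trace are the symbols the assembly references, and the C rendering below is a sketch whose signature is inferred from the register usage, not code from the patch:

	/* Sketch: the logic of the asm stubs, not actual patch code. */
	extern int mcount_enabled;		/* runtime on/off switch for tracing */
	extern void __trace(unsigned long eip, unsigned long parent_eip);

	static void mcount_logic_sketch(unsigned long eip, unsigned long parent_eip)
	{
		if (!mcount_enabled)
			return;
		__trace(eip, parent_eip);	/* record caller and caller's caller */
	}

This is also why the boot Makefile hunk above filters -pg out of CFLAGS: the standalone boot loader has no __trace to link against.
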
+ */ +_GLOBAL(mcount) + stwu r1,-16(r1) + mflr r3 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + stw r3,20(r1) + cmpwi r5,0 + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,16(r1) + lwz r4,4(r4) + bl __trace +1: + lwz r0,20(r1) + mtlr r0 + addi r1,r1,16 + blr + +/* + * The -pg flag, which is specified in the case of CONFIG_MCOUNT, causes the + * C compiler to add a call to _mcount() at the start of each function + * preamble, before the stack frame is created. An example of this preamble + * code is: + * + * mflr r0 + * lis r12,-16354 + * stw r0,4(r1) + * addi r0,r12,-19652 + * bl 0xc00034c8 <_mcount> + * mflr r0 + * stwu r1,-16(r1) + */ +_GLOBAL(_mcount) +#define M_STK_SIZE 48 + /* Would not expect to need to save cr, but glibc version of */ + /* _mcount() does, so cautiously saving it here too. */ + stwu r1,-M_STK_SIZE(r1) + stw r3, 12(r1) + stw r4, 16(r1) + stw r5, 20(r1) + stw r6, 24(r1) + mflr r3 /* will use as first arg to __trace() */ + mfcr r4 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + cmpwi r5,0 + stw r3, 44(r1) /* lr */ + stw r4, 8(r1) /* cr */ + stw r7, 28(r1) + stw r8, 32(r1) + stw r9, 36(r1) + stw r10,40(r1) + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,M_STK_SIZE+4(r1) + bl __trace +1: + lwz r8, 8(r1) /* cr */ + lwz r9, 44(r1) /* lr */ + lwz r3, 12(r1) + lwz r4, 16(r1) + lwz r5, 20(r1) + mtcrf 0xff,r8 + mtctr r9 + lwz r0, 52(r1) + lwz r6, 24(r1) + lwz r7, 28(r1) + lwz r8, 32(r1) + lwz r9, 36(r1) + lwz r10,40(r1) + addi r1,r1,M_STK_SIZE + mtlr r0 + bctr + +#endif /* CONFIG_MCOUNT */ Index: linux.prev/arch/powerpc/kernel/idle_64.c =================================================================== --- linux.prev.orig/arch/powerpc/kernel/idle_64.c +++ linux.prev/arch/powerpc/kernel/idle_64.c @@ -37,7 +37,7 @@ void default_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while (1) { - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { while (!need_resched() && !cpu_is_offline(cpu)) { ppc64_runlatch_off(); @@ -53,9 +53,11 @@ void default_idle(void) } ppc64_runlatch_on(); - preempt_enable_no_resched(); - schedule(); + raw_local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + raw_local_irq_enable(); if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING) cpu_die(); } @@ -71,9 +73,11 @@ void native_idle(void) if (need_resched()) { ppc64_runlatch_on(); - preempt_enable_no_resched(); - schedule(); + raw_local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + raw_local_irq_enable(); } if (cpu_is_offline(smp_processor_id()) && Index: linux.prev/arch/powerpc/kernel/init_task.c =================================================================== --- linux.prev.orig/arch/powerpc/kernel/init_task.c +++ linux.prev/arch/powerpc/kernel/init_task.c @@ -3,12 +3,12 @@ #include #include #include -#include +#include #include #include -static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; +static struct fs_struct init_fs = INIT_FS(init_fs); +static struct files_struct init_files = INIT_FILES(init_files); static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); Index: linux.prev/arch/powerpc/kernel/irq.c =================================================================== --- linux.prev.orig/arch/powerpc/kernel/irq.c +++ 
linux.prev/arch/powerpc/kernel/irq.c @@ -100,8 +100,6 @@ extern atomic_t ipi_sent; #endif /* CONFIG_PPC32 */ #ifdef CONFIG_PPC64 -EXPORT_SYMBOL(irq_desc); - int distribute_irqs = 1; u64 ppc64_interrupt_controller; #endif /* CONFIG_PPC64 */ Index: linux.prev/arch/powerpc/kernel/ppc_ksyms.c =================================================================== --- linux.prev.orig/arch/powerpc/kernel/ppc_ksyms.c +++ linux.prev/arch/powerpc/kernel/ppc_ksyms.c @@ -17,7 +17,6 @@ #include #include -#include #include #include #include @@ -221,16 +220,11 @@ EXPORT_SYMBOL(screen_info); #ifdef CONFIG_PPC32 EXPORT_SYMBOL(__delay); EXPORT_SYMBOL(timer_interrupt); -EXPORT_SYMBOL(irq_desc); EXPORT_SYMBOL(tb_ticks_per_jiffy); EXPORT_SYMBOL(console_drivers); EXPORT_SYMBOL(cacheable_memcpy); #endif -EXPORT_SYMBOL(__up); -EXPORT_SYMBOL(__down); -EXPORT_SYMBOL(__down_interruptible); - #ifdef CONFIG_8xx EXPORT_SYMBOL(cpm_install_handler); EXPORT_SYMBOL(cpm_free_handler); Index: linux.prev/arch/powerpc/kernel/process.c =================================================================== --- linux.prev.orig/arch/powerpc/kernel/process.c +++ linux.prev/arch/powerpc/kernel/process.c @@ -327,10 +327,10 @@ struct task_struct *__switch_to(struct t } #endif - local_irq_save(flags); + raw_local_irq_save(flags); last = _switch(old_thread, new_thread); - local_irq_restore(flags); + raw_local_irq_restore(flags); return last; } Index: linux.prev/arch/powerpc/kernel/rtas.c =================================================================== --- linux.prev.orig/arch/powerpc/kernel/rtas.c +++ linux.prev/arch/powerpc/kernel/rtas.c @@ -31,7 +31,7 @@ #include struct rtas_t rtas = { - .lock = SPIN_LOCK_UNLOCKED + .lock = SPIN_LOCK_UNLOCKED(rtas.lock) }; EXPORT_SYMBOL(rtas); @@ -620,7 +620,7 @@ void rtas_stop_self(void) { struct rtas_args *rtas_args = &rtas_stop_self_args; - local_irq_disable(); + raw_local_irq_disable(); BUG_ON(rtas_args->token == RTAS_UNKNOWN_SERVICE); Index: linux.prev/arch/powerpc/kernel/semaphore.c =================================================================== --- linux.prev.orig/arch/powerpc/kernel/semaphore.c +++ linux.prev/arch/powerpc/kernel/semaphore.c @@ -31,7 +31,7 @@ * sem->count = tmp; * return old_count; */ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -50,7 +50,7 @@ static inline int __sem_update_count(str return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -63,7 +63,7 @@ void __up(struct semaphore *sem) __sem_update_count(sem, 1); wake_up(&sem->wait); } -EXPORT_SYMBOL(__up); +EXPORT_SYMBOL(__compat_up); /* * Note that when we come in to __down or __down_interruptible, @@ -73,7 +73,7 @@ EXPORT_SYMBOL(__up); * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. 
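
The __compat_* renames carried through this file implement the RT split between lock types: under PREEMPT_RT, struct semaphore becomes a sleeping, mutex-like object, while code that genuinely relies on counting semantics is moved onto compat_semaphore, which keeps the atomic-count protocol shown here. A usage sketch under stated assumptions: the compat_sema_init()/compat_down()/compat_up() wrapper names are assumed, since only the __compat_* slow paths and compat_sem_is_locked() appear in the patch:

	/* Sketch: keeping genuine counting-semaphore behaviour under RT. */
	static struct compat_semaphore dma_channels;

	static void example_init(void)
	{
		compat_sema_init(&dma_channels, 4);	/* assumed name: four holders */
	}

	static void example_use(void)
	{
		compat_down(&dma_channels);	/* assumed name: may sleep, counts down */
		/* ... claim one channel ... */
		compat_up(&dma_channels);	/* wakes a __compat_down() sleeper */
	}
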
*/ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -101,9 +101,9 @@ void __sched __down(struct semaphore *se */ wake_up(&sem->wait); } -EXPORT_SYMBOL(__down); +EXPORT_SYMBOL(__compat_down); -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore *sem) { int retval = 0; struct task_struct *tsk = current; @@ -132,4 +132,10 @@ int __sched __down_interruptible(struct wake_up(&sem->wait); return retval; } -EXPORT_SYMBOL(__down_interruptible); +EXPORT_SYMBOL(__compat_down_interruptible); + +int compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} +EXPORT_SYMBOL(compat_sem_is_locked); Index: linux.prev/arch/powerpc/kernel/setup-common.c =================================================================== --- linux.prev.orig/arch/powerpc/kernel/setup-common.c +++ linux.prev/arch/powerpc/kernel/setup-common.c @@ -105,7 +105,7 @@ void machine_restart(char *cmd) smp_send_stop(); #endif printk(KERN_EMERG "System Halted, OK to turn off power\n"); - local_irq_disable(); + raw_local_irq_disable(); while (1) ; } @@ -117,7 +117,7 @@ void machine_power_off(void) smp_send_stop(); #endif printk(KERN_EMERG "System Halted, OK to turn off power\n"); - local_irq_disable(); + raw_local_irq_disable(); while (1) ; } /* Used by the G5 thermal driver */ @@ -134,7 +134,7 @@ void machine_halt(void) smp_send_stop(); #endif printk(KERN_EMERG "System Halted, OK to turn off power\n"); - local_irq_disable(); + raw_local_irq_disable(); while (1) ; } Index: linux.prev/arch/powerpc/kernel/smp-tbsync.c =================================================================== --- linux.prev.orig/arch/powerpc/kernel/smp-tbsync.c +++ linux.prev/arch/powerpc/kernel/smp-tbsync.c @@ -47,7 +47,7 @@ void __devinit smp_generic_take_timebase int cmd; u64 tb; - local_irq_disable(); + raw_local_irq_disable(); while (!running) barrier(); rmb(); @@ -71,7 +71,7 @@ void __devinit smp_generic_take_timebase set_tb(tb >> 32, tb & 0xfffffffful); enter_contest(tbsync->mark, -1); } - local_irq_enable(); + raw_local_irq_enable(); } static int __devinit start_contest(int cmd, long offset, int num) @@ -82,7 +82,7 @@ static int __devinit start_contest(int c tbsync->cmd = cmd; - local_irq_disable(); + raw_local_irq_disable(); for (i = -3; i < num; ) { tb = get_tb() + 400; tbsync->tb = tb + offset; @@ -105,7 +105,7 @@ static int __devinit start_contest(int c if (i++ > 0) score += tbsync->race_result; } - local_irq_enable(); + raw_local_irq_enable(); return score; } Index: linux.prev/arch/powerpc/kernel/smp.c =================================================================== --- linux.prev.orig/arch/powerpc/kernel/smp.c +++ linux.prev/arch/powerpc/kernel/smp.c @@ -140,6 +140,16 @@ void smp_send_reschedule(int cpu) smp_ops->message_pass(cpu, PPC_MSG_RESCHEDULE); } +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + smp_ops->message_pass(MSG_ALL_BUT_SELF, PPC_MSG_RESCHEDULE); +} + #ifdef CONFIG_DEBUGGER void smp_send_debugger_break(int cpu) { @@ -149,7 +159,7 @@ void smp_send_debugger_break(int cpu) static void stop_this_cpu(void *dummy) { - local_irq_disable(); + raw_local_irq_disable(); while (1) ; } @@ -164,7 +174,7 @@ void smp_send_stop(void) * static memory requirements. 
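
smp_send_reschedule_allbutself() gives the RT balancer a broadcast kick: rather than computing which CPU should pull a starving real-time task, the sender interrupts every other CPU and lets each one re-run its own scheduler pick. A purely illustrative call site, with the overload condition made up for the example:

	/* Illustrative only: broadcast a kick when runnable RT tasks pile up. */
	static void rt_overload_kick(int nr_runnable_rt_here)
	{
		if (nr_runnable_rt_here > 1)	/* another CPU might run one now */
			smp_send_reschedule_allbutself();
	}
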
It also looks cleaner. * Stolen from the i386 version. */ -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_lock); +static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(call_lock); static struct call_data_struct { void (*func) (void *info); @@ -200,7 +210,7 @@ int smp_call_function (void (*func) (voi u64 timeout; /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); + WARN_ON(raw_irqs_disabled()); data.func = func; data.info = info; @@ -529,7 +539,7 @@ int __devinit start_secondary(void *unus cpu_set(cpu, cpu_online_map); spin_unlock(&call_lock); - local_irq_enable(); + raw_local_irq_enable(); cpu_idle(); return 0; Index: linux.prev/arch/powerpc/kernel/time.c =================================================================== --- linux.prev.orig/arch/powerpc/kernel/time.c +++ linux.prev/arch/powerpc/kernel/time.c @@ -72,6 +72,9 @@ #endif #include +unsigned long cpu_khz; /* Detected as we calibrate the TSC */ +EXPORT_SYMBOL(cpu_khz); + /* keep track of when we need to update the rtc */ time_t last_rtc_update; extern int piranha_simulator; @@ -100,7 +103,7 @@ unsigned long tb_ticks_per_sec; u64 tb_to_xs; unsigned tb_to_us; unsigned long processor_freq; -DEFINE_SPINLOCK(rtc_lock); +DEFINE_RAW_SPINLOCK(rtc_lock); EXPORT_SYMBOL_GPL(rtc_lock); u64 tb_to_ns_scale; @@ -335,7 +338,7 @@ static __inline__ void timer_recalc_offs } #ifdef CONFIG_SMP -unsigned long profile_pc(struct pt_regs *regs) +unsigned long notrace profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); @@ -698,6 +701,7 @@ void __init time_init(void) tb_to_us = mulhwu_scale_factor(ppc_tb_freq, 1000000); div128_by_32(1024*1024, 0, tb_ticks_per_sec, &res); tb_to_xs = res.result_low; + cpu_khz = ppc_tb_freq / 1000; #ifdef CONFIG_PPC64 get_paca()->default_decr = tb_ticks_per_jiffy; Index: linux.prev/arch/powerpc/kernel/traps.c =================================================================== --- linux.prev.orig/arch/powerpc/kernel/traps.c +++ linux.prev/arch/powerpc/kernel/traps.c @@ -91,7 +91,7 @@ int register_die_notifier(struct notifie * Trap & Exception support */ -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); int die(const char *str, struct pt_regs *regs, long err) { @@ -182,6 +182,11 @@ void _exception(int signr, struct pt_reg return; } +#ifdef CONFIG_PREEMPT_RT + raw_local_irq_enable(); + preempt_check_resched(); +#endif + memset(&info, 0, sizeof(info)); info.si_signo = signr; info.si_code = code; Index: linux.prev/arch/powerpc/lib/locks.c =================================================================== --- linux.prev.orig/arch/powerpc/lib/locks.c +++ linux.prev/arch/powerpc/lib/locks.c @@ -25,7 +25,7 @@ #include #include -void __spin_yield(raw_spinlock_t *lock) +void __spin_yield(__raw_spinlock_t *lock) { unsigned int lock_value, holder_cpu, yield_count; struct paca_struct *holder_paca; @@ -84,7 +84,7 @@ void __rw_yield(raw_rwlock_t *rw) } #endif -void __raw_spin_unlock_wait(raw_spinlock_t *lock) +void __raw_spin_unlock_wait(__raw_spinlock_t *lock) { while (lock->slock) { HMT_low(); Index: linux.prev/arch/powerpc/mm/fault.c =================================================================== --- linux.prev.orig/arch/powerpc/mm/fault.c +++ linux.prev/arch/powerpc/mm/fault.c @@ -117,8 +117,8 @@ static void do_dabr(struct pt_regs *regs * The return value is 0 if the fault was handled, or the signal * number if this is a kernel fault that can't be handled here. 
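
The notrace annotations (profile_pc() above, do_page_fault() just below) exist for the same CONFIG_MCOUNT machinery: these functions execute inside or underneath the trace path, and letting -pg instrument them would recurse straight back into mcount. A sketch, where the attribute expansion is an assumption about this tree:

	/*
	 * Sketch: notrace excludes a function from -pg instrumentation so
	 * the tracer cannot re-enter itself.  The expansion below is an
	 * assumption, not taken from the patch.
	 */
	#define notrace __attribute__((no_instrument_function))

	static unsigned long notrace profile_pc_sketch(unsigned long pc)
	{
		return pc;	/* runs on the trace path: must not call mcount */
	}
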
*/ -int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code) +int __kprobes notrace do_page_fault(struct pt_regs *regs, + unsigned long address, unsigned long error_code) { struct vm_area_struct * vma; struct mm_struct *mm = current->mm; Index: linux.prev/arch/powerpc/mm/init_32.c =================================================================== --- linux.prev.orig/arch/powerpc/mm/init_32.c +++ linux.prev/arch/powerpc/mm/init_32.c @@ -57,7 +57,7 @@ #endif #define MAX_LOW_MEM CONFIG_LOWMEM_SIZE -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long total_memory; unsigned long total_lowmem; Index: linux.prev/arch/powerpc/mm/tlb_64.c =================================================================== --- linux.prev.orig/arch/powerpc/mm/tlb_64.c +++ linux.prev/arch/powerpc/mm/tlb_64.c @@ -38,7 +38,7 @@ DEFINE_PER_CPU(struct ppc64_tlb_batch, p /* This is declared as we are using the more or less generic * include/asm-ppc64/tlb.h file -- tgall */ -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); unsigned long pte_freelist_forced_free; Index: linux.prev/arch/powerpc/platforms/cell/smp.c =================================================================== --- linux.prev.orig/arch/powerpc/platforms/cell/smp.c +++ linux.prev/arch/powerpc/platforms/cell/smp.c @@ -134,7 +134,7 @@ static void __devinit smp_iic_setup_cpu( iic_setup_cpu(); } -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned long timebase = 0; static void __devinit cell_give_timebase(void) Index: linux.prev/arch/powerpc/platforms/chrp/smp.c =================================================================== --- linux.prev.orig/arch/powerpc/platforms/chrp/smp.c +++ linux.prev/arch/powerpc/platforms/chrp/smp.c @@ -47,7 +47,7 @@ static void __devinit smp_chrp_setup_cpu mpic_setup_this_cpu(); } -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned int timebase_upper = 0, timebase_lower = 0; void __devinit smp_chrp_give_timebase(void) Index: linux.prev/arch/powerpc/platforms/chrp/time.c =================================================================== --- linux.prev.orig/arch/powerpc/platforms/chrp/time.c +++ linux.prev/arch/powerpc/platforms/chrp/time.c @@ -28,7 +28,7 @@ #include #include -extern spinlock_t rtc_lock; +extern raw_spinlock_t rtc_lock; static int nvram_as1 = NVRAM_AS1; static int nvram_as0 = NVRAM_AS0; Index: linux.prev/arch/powerpc/platforms/iseries/setup.c =================================================================== --- linux.prev.orig/arch/powerpc/platforms/iseries/setup.c +++ linux.prev/arch/powerpc/platforms/iseries/setup.c @@ -673,16 +673,18 @@ static void yield_shared_processor(void) static void iseries_shared_idle(void) { while (1) { - while (!need_resched() && !hvlpevent_is_pending()) { - local_irq_disable(); + while (!need_resched() && !need_resched_delayed() + && !hvlpevent_is_pending()) { + raw_local_irq_disable(); ppc64_runlatch_off(); /* Recheck with irqs off */ - if (!need_resched() && !hvlpevent_is_pending()) + if (!need_resched() && !need_resched_delayed() + && !hvlpevent_is_pending()) yield_shared_processor(); HMT_medium(); - local_irq_enable(); + raw_local_irq_enable(); } ppc64_runlatch_on(); Index: linux.prev/arch/powerpc/platforms/powermac/feature.c 
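
DEFINE_PER_CPU_LOCKED pairs each CPU's mmu_gather with a lock because, under PREEMPT_RT, the unmap paths that use the gather run preemptibly and can migrate, so disabled preemption no longer protects "this CPU's" instance. A sketch of the intended access pattern; the get_cpu_var_locked()/put_cpu_var_locked() accessor names are an assumption, since the hunks only show the definition macro:

	/* Sketch: per-CPU data that stays consistent even across preemption. */
	DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers);

	static void example_tlb_batch(void)
	{
		int cpu;
		struct mmu_gather *tlb = &get_cpu_var_locked(mmu_gathers, &cpu);

		/* ... fill the gather; safe even if this task migrates ... */
		put_cpu_var_locked(mmu_gathers, cpu);
	}
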
=================================================================== --- linux.prev.orig/arch/powerpc/platforms/powermac/feature.c +++ linux.prev/arch/powerpc/platforms/powermac/feature.c @@ -63,7 +63,7 @@ extern struct device_node *k2_skiplist[2 * We use a single global lock to protect accesses. Each driver has * to take care of its own locking */ -static DEFINE_SPINLOCK(feature_lock); +static DEFINE_RAW_SPINLOCK(feature_lock); #define LOCK(flags) spin_lock_irqsave(&feature_lock, flags); #define UNLOCK(flags) spin_unlock_irqrestore(&feature_lock, flags); Index: linux.prev/arch/powerpc/platforms/powermac/nvram.c =================================================================== --- linux.prev.orig/arch/powerpc/platforms/powermac/nvram.c +++ linux.prev/arch/powerpc/platforms/powermac/nvram.c @@ -81,7 +81,7 @@ static int is_core_99; static int core99_bank = 0; static int nvram_partitions[3]; // XXX Turn that into a sem -static DEFINE_SPINLOCK(nv_lock); +static DEFINE_RAW_SPINLOCK(nv_lock); extern int pmac_newworld; extern int system_running; Index: linux.prev/arch/powerpc/platforms/powermac/pic.c =================================================================== --- linux.prev.orig/arch/powerpc/platforms/powermac/pic.c +++ linux.prev/arch/powerpc/platforms/powermac/pic.c @@ -69,7 +69,7 @@ static int max_irqs; static int max_real_irqs; static u32 level_mask[4]; -static DEFINE_SPINLOCK(pmac_pic_lock); +static DEFINE_RAW_SPINLOCK(pmac_pic_lock); #define GATWICK_IRQ_POOL_SIZE 10 static struct interrupt_info gatwick_int_pool[GATWICK_IRQ_POOL_SIZE]; Index: linux.prev/arch/powerpc/platforms/powermac/smp.c =================================================================== --- linux.prev.orig/arch/powerpc/platforms/powermac/smp.c +++ linux.prev/arch/powerpc/platforms/powermac/smp.c @@ -436,7 +436,7 @@ struct smp_ops_t psurge_smp_ops = { static struct device_node *pmac_tb_clock_chip_host; static u8 pmac_tb_pulsar_addr; static void (*pmac_tb_freeze)(int freeze); -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned long timebase; static void smp_core99_cypress_tb_freeze(int freeze) Index: linux.prev/arch/powerpc/platforms/pseries/setup.c =================================================================== --- linux.prev.orig/arch/powerpc/platforms/pseries/setup.c +++ linux.prev/arch/powerpc/platforms/pseries/setup.c @@ -336,7 +336,7 @@ static void __init pSeries_discover_pic static void pSeries_mach_cpu_die(void) { - local_irq_disable(); + raw_local_irq_disable(); idle_task_exit(); /* Some hardware requires clearing the CPPR, while other hardware does not * it is safe either way @@ -458,7 +458,7 @@ static inline void dedicated_idle_sleep( /* Only sleep if the other thread is not idle */ if (!(ppaca->lppaca.idle)) { - local_irq_disable(); + raw_local_irq_disable(); /* * We are about to sleep the thread and so wont be polling any @@ -474,10 +474,10 @@ static inline void dedicated_idle_sleep( * a prod occurs. Returning from the cede enables external * interrupts. 
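
These pseries idle changes all close the same race: the decision to cede the processor must be taken with hardware interrupts off, after a final need_resched()/need_resched_delayed() check, or a wakeup landing between the check and the hypervisor call is lost until the next interrupt. The shape of the pattern, reduced to a sketch:

	/* Sketch of the check-then-sleep pattern these idle hunks converge on. */
	static void idle_sleep_once(void)
	{
		raw_local_irq_disable();	/* close the wakeup race window */
		if (!need_resched() && !need_resched_delayed())
			cede_processor();	/* hypervisor sleep; returns with irqs on */
		else
			raw_local_irq_enable();	/* work already pending: go run it */
	}
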
*/ - if (!need_resched()) + if (!need_resched() && !need_resched_delayed()) cede_processor(); else - local_irq_enable(); + raw_local_irq_enable(); set_thread_flag(TIF_POLLING_NRFLAG); } else { /* @@ -552,8 +552,9 @@ static void pseries_shared_idle(void) */ lpaca->lppaca.idle = 1; - while (!need_resched() && !cpu_is_offline(cpu)) { - local_irq_disable(); + while (!need_resched() && !need_resched_delayed() && + !cpu_is_offline(cpu)) { + raw_local_irq_disable(); ppc64_runlatch_off(); /* @@ -569,7 +570,7 @@ static void pseries_shared_idle(void) if (!need_resched()) cede_processor(); else - local_irq_enable(); + raw_local_irq_enable(); HMT_medium(); } @@ -577,8 +578,8 @@ static void pseries_shared_idle(void) lpaca->lppaca.idle = 0; ppc64_runlatch_on(); - preempt_enable_no_resched(); - schedule(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING) Index: linux.prev/arch/powerpc/platforms/pseries/smp.c =================================================================== --- linux.prev.orig/arch/powerpc/platforms/pseries/smp.c +++ linux.prev/arch/powerpc/platforms/pseries/smp.c @@ -345,7 +345,7 @@ static void __devinit smp_xics_setup_cpu } #endif /* CONFIG_XICS */ -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned long timebase = 0; static void __devinit pSeries_give_timebase(void) Index: linux.prev/arch/powerpc/xmon/xmon.c =================================================================== --- linux.prev.orig/arch/powerpc/xmon/xmon.c +++ linux.prev/arch/powerpc/xmon/xmon.c @@ -522,10 +522,10 @@ irqreturn_t xmon_irq(int irq, void *d, struct pt_regs *regs) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); printf("Keyboard interrupt\n"); xmon(regs); - local_irq_restore(flags); + raw_local_irq_restore(flags); return IRQ_HANDLED; } Index: linux.prev/arch/ppc/8260_io/enet.c =================================================================== --- linux.prev.orig/arch/ppc/8260_io/enet.c +++ linux.prev/arch/ppc/8260_io/enet.c @@ -116,7 +116,7 @@ struct scc_enet_private { scc_t *sccp; struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; }; static int scc_enet_open(struct net_device *dev); Index: linux.prev/arch/ppc/8260_io/fcc_enet.c =================================================================== --- linux.prev.orig/arch/ppc/8260_io/fcc_enet.c +++ linux.prev/arch/ppc/8260_io/fcc_enet.c @@ -377,7 +377,7 @@ struct fcc_enet_private { volatile fcc_enet_t *ep; struct net_device_stats stats; uint tx_free; - spinlock_t lock; + raw_spinlock_t lock; #ifdef CONFIG_USE_MDIO uint phy_id; Index: linux.prev/arch/ppc/8xx_io/commproc.c =================================================================== --- linux.prev.orig/arch/ppc/8xx_io/commproc.c +++ linux.prev/arch/ppc/8xx_io/commproc.c @@ -356,7 +356,7 @@ cpm_setbrg(uint brg, uint rate) /* * dpalloc / dpfree bits. */ -static spinlock_t cpm_dpmem_lock; +static raw_spinlock_t cpm_dpmem_lock; /* * 16 blocks should be enough to satisfy all requests * until the memory subsystem goes up... 
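
The spinlock_t members in the 8260/8xx Ethernet drivers become raw_spinlock_t for the same reason as the irq-path locks earlier in the series: they are taken from contexts that remain hard-irq under RT, so they must stay true spinning locks instead of being converted into sleeping RT locks. The resulting driver-side pattern (RT's type-aware lock operations let the ordinary spin_lock_irqsave() API operate on the raw type):

	/* Sketch: a driver-private lock that must not become a sleeping lock. */
	struct example_enet_private {
		raw_spinlock_t lock;	/* taken from hard (non-threaded) IRQ context */
		unsigned int tx_full;
	};

	static void example_tx_stall(struct example_enet_private *priv)
	{
		unsigned long flags;

		spin_lock_irqsave(&priv->lock, flags);	/* resolves to the raw op */
		priv->tx_full = 1;
		spin_unlock_irqrestore(&priv->lock, flags);
	}
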
Index: linux.prev/arch/ppc/8xx_io/enet.c =================================================================== --- linux.prev.orig/arch/ppc/8xx_io/enet.c +++ linux.prev/arch/ppc/8xx_io/enet.c @@ -144,7 +144,7 @@ struct scc_enet_private { unsigned char *rx_vaddr[RX_RING_SIZE]; struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; }; static int scc_enet_open(struct net_device *dev); Index: linux.prev/arch/ppc/8xx_io/fec.c =================================================================== --- linux.prev.orig/arch/ppc/8xx_io/fec.c +++ linux.prev/arch/ppc/8xx_io/fec.c @@ -165,7 +165,7 @@ struct fec_enet_private { struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; #ifdef CONFIG_USE_MDIO uint phy_id; Index: linux.prev/arch/ppc/Kconfig =================================================================== --- linux.prev.orig/arch/ppc/Kconfig +++ linux.prev/arch/ppc/Kconfig @@ -15,13 +15,6 @@ config GENERIC_HARDIRQS bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config GENERIC_CALIBRATE_DELAY bool default y @@ -950,6 +943,18 @@ config HIGHMEM source kernel/Kconfig.hz source kernel/Kconfig.preempt + +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + source "mm/Kconfig" source "fs/Kconfig.binfmt" Index: linux.prev/arch/ppc/boot/Makefile =================================================================== --- linux.prev.orig/arch/ppc/boot/Makefile +++ linux.prev/arch/ppc/boot/Makefile @@ -11,6 +11,15 @@ # CFLAGS += -fno-builtin -D__BOOTER__ -Iarch/$(ARCH)/boot/include + +ifdef CONFIG_MCOUNT +# do not trace the boot loader +nullstring := +space := $(nullstring) # end of the line +pg_flag = $(nullstring) -pg # end of the line +CFLAGS := $(subst ${pg_flag},${space},${CFLAGS}) +endif + HOSTCFLAGS += -Iarch/$(ARCH)/boot/include BOOT_TARGETS = zImage zImage.initrd znetboot znetboot.initrd Index: linux.prev/arch/ppc/boot/lib/Makefile =================================================================== --- linux.prev.orig/arch/ppc/boot/lib/Makefile +++ linux.prev/arch/ppc/boot/lib/Makefile @@ -5,19 +5,49 @@ CFLAGS_kbd.o := -Idrivers/char CFLAGS_vreset.o := -I$(srctree)/arch/ppc/boot/include -zlib := infblock.c infcodes.c inffast.c inflate.c inftrees.c infutil.c - -lib-y += $(zlib:.c=.o) div64.o -lib-$(CONFIG_VGA_CONSOLE) += vreset.o kbd.o - +zlib := infblock.c infcodes.c inffast.c inflate.c inftrees.c infutil.c +zlibheader := infblock.h infcodes.h inffast.h inftrees.h infutil.h +zliblinuxheader := zlib.h zconf.h zutil.h + +$(addprefix $(obj)/,$(zlib)): $(addprefix $(obj)/,$(zliblinuxheader)) $(addprefix $(obj)/,$(zlibheader)) + +src-boot := div64.S +src-boot += $(zlib) +#src-boot := $(addprefix $(obj)/, $(src-boot)) +obj-boot := $(addsuffix .o, $(basename $(src-boot))) -# zlib files needs header from their original place -EXTRA_CFLAGS += -Ilib/zlib_inflate +BOOTCFLAGS += -I$(obj) -I$(srctree)/$(obj) $(CFLAGS) quiet_cmd_copy_zlib = COPY $@ - cmd_copy_zlib = cat $< > $@ + cmd_copy_zlib = sed "s@__attribute_used__@@;s@.include.@@;s@.include.@@;s@.*spin.*lock.*@@;s@.*SPINLOCK.*@@;s@<linux/\([^>]\+\).*@\"\1\"@" $< > $@ + +quiet_cmd_copy_zlibheader = COPY $@ + cmd_copy_zlibheader = sed "s@<linux/\([^>]\+\).*@\"\1\"@" $< > $@ +# stddef.h for NULL +quiet_cmd_copy_zliblinuxheader = COPY $@ + cmd_copy_zliblinuxheader = sed "s@.include.@@;s@.include.@@;s@<linux/kernel.h>@<stddef.h>@;s@<linux/\([^>]\+\).*@\"\1\"@" $< > $@ $(addprefix $(obj)/,$(zlib)): $(obj)/%:
$(srctree)/lib/zlib_inflate/% $(call cmd,copy_zlib) -clean-files := $(zlib) +$(addprefix $(obj)/,$(zlibheader)): $(obj)/%: $(srctree)/lib/zlib_inflate/% + $(call cmd,copy_zlibheader) + +$(addprefix $(obj)/,$(zliblinuxheader)): $(obj)/%: $(srctree)/include/linux/% + $(call cmd,copy_zliblinuxheader) + +clean-files := $(zlib) $(zlibheader) $(zliblinuxheader) + +quiet_cmd_bootcc = BOOTCC $@ + cmd_bootcc = $(CC) -Wp,-MD,$(depfile) $(BOOTCFLAGS) -c -o $@ $< + +quiet_cmd_bootas = BOOTAS $@ + cmd_bootas = $(CC) -Wp,-MD,$(depfile) $(BOOTAFLAGS) -c -o $@ $< + +$(patsubst %.c,%.o, $(filter %.c, $(src-boot))): %.o: %.c + $(call if_changed_dep,bootcc) +$(patsubst %.S,%.o, $(filter %.S, $(src-boot))): %.o: %.S + $(call if_changed_dep,bootas) + +lib-y += $(obj-boot) +lib-$(CONFIG_VGA_CONSOLE) += vreset.o kbd.o Index: linux.prev/arch/ppc/kernel/dma-mapping.c =================================================================== --- linux.prev.orig/arch/ppc/kernel/dma-mapping.c +++ linux.prev/arch/ppc/kernel/dma-mapping.c @@ -71,7 +71,7 @@ int map_page(unsigned long va, phys_addr * This is the page table (2MB) covering uncached, DMA consistent allocations */ static pte_t *consistent_pte; -static DEFINE_SPINLOCK(consistent_lock); +static DEFINE_RAW_SPINLOCK(consistent_lock); /* * VM region handling support. @@ -403,7 +403,7 @@ static inline void __dma_sync_page_highm int nr_segs = 1 + ((size - seg_size) + PAGE_SIZE - 1)/PAGE_SIZE; int seg_nr = 0; - local_irq_save(flags); + raw_local_irq_save(flags); do { start = (unsigned long)kmap_atomic(page + seg_nr, @@ -422,7 +422,7 @@ static inline void __dma_sync_page_highm seg_offset = 0; } while (seg_nr < nr_segs); - local_irq_restore(flags); + raw_local_irq_restore(flags); } #endif /* CONFIG_HIGHMEM */ Index: linux.prev/arch/ppc/kernel/entry.S =================================================================== --- linux.prev.orig/arch/ppc/kernel/entry.S +++ linux.prev/arch/ppc/kernel/entry.S @@ -239,7 +239,7 @@ ret_from_syscall: SYNC MTMSRD(r10) lwz r9,TI_FLAGS(r12) - andi. r0,r9,(_TIF_SYSCALL_T_OR_A|_TIF_SIGPENDING|_TIF_NEED_RESCHED) + andi. r0,r9,(_TIF_SYSCALL_T_OR_A|_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne- syscall_exit_work syscall_exit_cont: #if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) @@ -317,7 +317,7 @@ syscall_exit_work: rlwinm r12,r1,0,0,18 /* current_thread_info() */ lwz r9,TI_FLAGS(r12) 5: - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne 1f lwz r5,_MSR(r1) andi. r5,r5,MSR_PR @@ -658,7 +658,7 @@ user_exc_return: /* r10 contains MSR_KE /* Check current_thread_info()->flags */ rlwinm r9,r1,0,0,18 lwz r9,TI_FLAGS(r9) - andi. r0,r9,(_TIF_SIGPENDING|_TIF_NEED_RESCHED) + andi. r0,r9,(_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne do_work restore_user: @@ -876,7 +876,7 @@ load_dbcr0: #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ do_work: /* r10 contains MSR_KERNEL here */ - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ @@ -890,7 +890,7 @@ recheck: MTMSRD(r10) /* disable interrupts */ rlwinm r9,r1,0,0,18 lwz r9,TI_FLAGS(r9) - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne- do_resched andi. r0,r9,_TIF_SIGPENDING beq restore_user @@ -1000,3 +1000,85 @@ machine_check_in_rtas: /* XXX load up BATs and panic */ #endif /* CONFIG_PPC_OF */ + +#ifdef CONFIG_MCOUNT + +/* + * mcount() is not the same as _mcount(). 
The callers of mcount() have a + * normal context. The callers of _mcount() do not have a stack frame and + * have not saved the "caller saves" registers. + */ +_GLOBAL(mcount) + stwu r1,-16(r1) + mflr r3 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + stw r3,20(r1) + cmpwi r5,0 + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,16(r1) + lwz r4,4(r4) + bl __trace +1: + lwz r0,20(r1) + mtlr r0 + addi r1,r1,16 + blr + +/* + * The -pg flag, which is specified in the case of CONFIG_MCOUNT, causes the + * C compiler to add a call to _mcount() at the start of each function preamble, + * before the stack frame is created. An example of this preamble code is: + * + * mflr r0 + * lis r12,-16354 + * stw r0,4(r1) + * addi r0,r12,-19652 + * bl 0xc00034c8 <_mcount> + * mflr r0 + * stwu r1,-16(r1) + */ +_GLOBAL(_mcount) +#define M_STK_SIZE 48 + /* Would not expect to need to save cr, but glibc version of */ + /* _mcount() does, so cautiously saving it here too. */ + stwu r1,-M_STK_SIZE(r1) + stw r3, 12(r1) + stw r4, 16(r1) + stw r5, 20(r1) + stw r6, 24(r1) + mflr r3 /* will use as first arg to __trace() */ + mfcr r4 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + cmpwi r5,0 + stw r3, 44(r1) /* lr */ + stw r4, 8(r1) /* cr */ + stw r7, 28(r1) + stw r8, 32(r1) + stw r9, 36(r1) + stw r10,40(r1) + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,M_STK_SIZE+4(r1) + bl __trace +1: + lwz r8, 8(r1) /* cr */ + lwz r9, 44(r1) /* lr */ + lwz r3, 12(r1) + lwz r4, 16(r1) + lwz r5, 20(r1) + mtcrf 0xff,r8 + mtctr r9 + lwz r0, 52(r1) + lwz r6, 24(r1) + lwz r7, 28(r1) + lwz r8, 32(r1) + lwz r9, 36(r1) + lwz r10,40(r1) + addi r1,r1,M_STK_SIZE + mtlr r0 + bctr + +#endif /* CONFIG_MCOUNT */ Index: linux.prev/arch/ppc/kernel/idle.c =================================================================== --- linux.prev.orig/arch/ppc/kernel/idle.c +++ linux.prev/arch/ppc/kernel/idle.c @@ -41,7 +41,7 @@ void default_idle(void) powersave = ppc_md.power_save; - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { if (powersave != NULL) powersave(); #ifdef CONFIG_SMP @@ -64,6 +64,10 @@ void cpu_idle(void) for (;;) { while (!need_resched()) { + BUG_ON(raw_irqs_disabled()); + stop_critical_timing(); + propagate_preempt_locks_value(); + if (ppc_md.idle != NULL) ppc_md.idle(); else @@ -72,9 +76,11 @@ void cpu_idle(void) if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING) cpu_die(); - preempt_enable_no_resched(); - schedule(); + raw_local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + raw_local_irq_enable(); } } Index: linux.prev/arch/ppc/kernel/ppc_ksyms.c =================================================================== --- linux.prev.orig/arch/ppc/kernel/ppc_ksyms.c +++ linux.prev/arch/ppc/kernel/ppc_ksyms.c @@ -272,7 +272,6 @@ EXPORT_SYMBOL(screen_info); EXPORT_SYMBOL(__delay); EXPORT_SYMBOL(timer_interrupt); -EXPORT_SYMBOL(irq_desc); EXPORT_SYMBOL(tb_ticks_per_jiffy); EXPORT_SYMBOL(get_wchan); EXPORT_SYMBOL(console_drivers); @@ -280,9 +279,6 @@ EXPORT_SYMBOL(console_drivers); EXPORT_SYMBOL(xmon); EXPORT_SYMBOL(xmon_printf); #endif -EXPORT_SYMBOL(__up); -EXPORT_SYMBOL(__down); -EXPORT_SYMBOL(__down_interruptible); #if defined(CONFIG_KGDB) || defined(CONFIG_XMON) extern void (*debugger)(struct pt_regs *regs); Index: linux.prev/arch/ppc/kernel/process.c =================================================================== --- linux.prev.orig/arch/ppc/kernel/process.c +++ 
linux.prev/arch/ppc/kernel/process.c @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include #include @@ -52,8 +54,8 @@ struct task_struct *last_task_used_math struct task_struct *last_task_used_altivec = NULL; struct task_struct *last_task_used_spe = NULL; -static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; +static struct fs_struct init_fs = INIT_FS(init_fs); +static struct files_struct init_files = INIT_FILES(init_files); static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); @@ -301,7 +303,7 @@ struct task_struct *__switch_to(struct t unsigned long s; struct task_struct *last; - local_irq_save(s); + raw_local_irq_save(s); #ifdef CHECK_STACK check_stack(prev); check_stack(new); @@ -364,7 +366,7 @@ struct task_struct *__switch_to(struct t new_thread = &new->thread; old_thread = ¤t->thread; last = _switch(old_thread, new_thread); - local_irq_restore(s); + raw_local_irq_restore(s); return last; } Index: linux.prev/arch/ppc/kernel/semaphore.c =================================================================== --- linux.prev.orig/arch/ppc/kernel/semaphore.c +++ linux.prev/arch/ppc/kernel/semaphore.c @@ -29,7 +29,7 @@ * sem->count = tmp; * return old_count; */ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -48,7 +48,7 @@ static inline int __sem_update_count(str return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -70,7 +70,7 @@ void __up(struct semaphore *sem) * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. 
*/ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -100,7 +100,7 @@ void __sched __down(struct semaphore *se wake_up(&sem->wait); } -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -129,3 +129,8 @@ int __sched __down_interruptible(struct wake_up(&sem->wait); return retval; } + +int compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} Index: linux.prev/arch/ppc/kernel/smp-tbsync.c =================================================================== --- linux.prev.orig/arch/ppc/kernel/smp-tbsync.c +++ linux.prev/arch/ppc/kernel/smp-tbsync.c @@ -49,7 +49,7 @@ smp_generic_take_timebase( void ) { int cmd, tbl, tbu; - local_irq_disable(); + raw_local_irq_disable(); while( !running ) ; rmb(); @@ -78,7 +78,7 @@ smp_generic_take_timebase( void ) } enter_contest( tbsync->mark, -1 ); } - local_irq_enable(); + raw_local_irq_enable(); } static int __devinit @@ -88,7 +88,7 @@ start_contest( int cmd, int offset, int tbsync->cmd = cmd; - local_irq_disable(); + raw_local_irq_disable(); for( i=-3; itbu = tbu = get_tbu(); @@ -114,7 +114,7 @@ start_contest( int cmd, int offset, int if( i++ > 0 ) score += tbsync->race_result; } - local_irq_enable(); + raw_local_irq_enable(); return score; } Index: linux.prev/arch/ppc/kernel/smp.c =================================================================== --- linux.prev.orig/arch/ppc/kernel/smp.c +++ linux.prev/arch/ppc/kernel/smp.c @@ -138,6 +138,16 @@ void smp_send_reschedule(int cpu) smp_message_pass(cpu, PPC_MSG_RESCHEDULE); } +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + smp_message_pass(MSG_ALL_BUT_SELF, PPC_MSG_RESCHEDULE, 0, 0); +} + #ifdef CONFIG_XMON void smp_send_xmon_break(int cpu) { @@ -147,7 +157,7 @@ void smp_send_xmon_break(int cpu) static void stop_this_cpu(void *dummy) { - local_irq_disable(); + raw_local_irq_disable(); while (1) ; } @@ -162,7 +172,7 @@ void smp_send_stop(void) * static memory requirements. It also looks cleaner. * Stolen from the i386 version. 
*/ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); static struct call_data_struct { void (*func) (void *info); @@ -197,7 +207,7 @@ int smp_call_function(void (*func) (void if (num_online_cpus() <= 1) return 0; /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); + WARN_ON(raw_irqs_disabled()); return __smp_call_function(func, info, wait, MSG_ALL_BUT_SELF); } @@ -358,7 +368,7 @@ int __devinit start_secondary(void *unus cpu_set(cpu, cpu_online_map); spin_unlock(&call_lock); - local_irq_enable(); + raw_local_irq_enable(); cpu_idle(); return 0; Index: linux.prev/arch/ppc/kernel/temp.c =================================================================== --- linux.prev.orig/arch/ppc/kernel/temp.c +++ linux.prev/arch/ppc/kernel/temp.c @@ -142,7 +142,7 @@ static void tau_timeout(void * info) int shrink; /* disabling interrupts *should* be okay */ - local_irq_save(flags); + raw_local_irq_save(flags); cpu = smp_processor_id(); #ifndef CONFIG_TAU_INT @@ -185,7 +185,7 @@ static void tau_timeout(void * info) */ mtspr(SPRN_THRM3, THRM3_SITV(500*60) | THRM3_E); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static void tau_timeout_smp(unsigned long unused) Index: linux.prev/arch/ppc/kernel/time.c =================================================================== --- linux.prev.orig/arch/ppc/kernel/time.c +++ linux.prev/arch/ppc/kernel/time.c @@ -66,6 +66,9 @@ #include +unsigned long cpu_khz; /* Detected as we calibrate the TSC */ +EXPORT_SYMBOL(cpu_khz); + unsigned long disarm_decr[NR_CPUS]; extern struct timezone sys_tz; @@ -86,7 +89,7 @@ extern unsigned long wall_jiffies; /* used for timezone offset */ static long timezone_offset; -DEFINE_SPINLOCK(rtc_lock); +DEFINE_RAW_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); @@ -104,7 +107,7 @@ static inline int tb_delta(unsigned *jif } #ifdef CONFIG_SMP -unsigned long profile_pc(struct pt_regs *regs) +unsigned long notrace profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); Index: linux.prev/arch/ppc/kernel/traps.c =================================================================== --- linux.prev.orig/arch/ppc/kernel/traps.c +++ linux.prev/arch/ppc/kernel/traps.c @@ -77,7 +77,7 @@ void (*debugger_fault_handler)(struct pt * Trap & Exception support */ -DEFINE_SPINLOCK(die_lock); +DEFINE_RAW_SPINLOCK(die_lock); int die(const char * str, struct pt_regs * fp, long err) { @@ -118,6 +118,10 @@ void _exception(int signr, struct pt_reg debugger(regs); die("Exception in kernel mode", regs, signr); } +#ifdef CONFIG_PREEMPT_RT + raw_local_irq_enable(); + preempt_check_resched(); +#endif info.si_signo = signr; info.si_errno = 0; info.si_code = code; Index: linux.prev/arch/ppc/lib/locks.c =================================================================== --- linux.prev.orig/arch/ppc/lib/locks.c +++ linux.prev/arch/ppc/lib/locks.c @@ -43,7 +43,7 @@ static inline unsigned long __spin_trylo return ret; } -void _raw_spin_lock(spinlock_t *lock) +void __raw_spin_lock(raw_spinlock_t *lock) { int cpu = smp_processor_id(); unsigned int stuck = INIT_STUCK; @@ -63,9 +63,9 @@ void _raw_spin_lock(spinlock_t *lock) lock->owner_pc = (unsigned long)__builtin_return_address(0); lock->owner_cpu = cpu; } -EXPORT_SYMBOL(_raw_spin_lock); +EXPORT_SYMBOL(__raw_spin_lock); -int _raw_spin_trylock(spinlock_t *lock) +int __raw_spin_trylock(raw_spinlock_t *lock) { if (__spin_trylock(&lock->lock)) return 0; @@ -73,9 +73,9 @@ int _raw_spin_trylock(spinlock_t *lock) lock->owner_pc = (unsigned 
long)__builtin_return_address(0); return 1; } -EXPORT_SYMBOL(_raw_spin_trylock); +EXPORT_SYMBOL(__raw_spin_trylock); -void _raw_spin_unlock(spinlock_t *lp) +void __raw_spin_unlock(raw_spinlock_t *lp) { if ( !lp->lock ) printk("_spin_unlock(%p): no lock cpu %d curr PC %p %s/%d\n", @@ -89,13 +89,13 @@ void _raw_spin_unlock(spinlock_t *lp) wmb(); lp->lock = 0; } -EXPORT_SYMBOL(_raw_spin_unlock); +EXPORT_SYMBOL(__raw_spin_unlock); /* * For rwlocks, zero is unlocked, -1 is write-locked, * positive is read-locked. */ -static __inline__ int __read_trylock(rwlock_t *rw) +static __inline__ int __read_trylock(raw_rwlock_t *rw) { signed int tmp; @@ -115,13 +115,13 @@ static __inline__ int __read_trylock(rwl return tmp; } -int _raw_read_trylock(rwlock_t *rw) +int __raw_read_trylock(raw_rwlock_t *rw) { return __read_trylock(rw) > 0; } -EXPORT_SYMBOL(_raw_read_trylock); +EXPORT_SYMBOL(__raw_read_trylock); -void _raw_read_lock(rwlock_t *rw) +void __raw_read_lock(rwlock_t *rw) { unsigned int stuck; @@ -136,9 +136,9 @@ void _raw_read_lock(rwlock_t *rw) } } } -EXPORT_SYMBOL(_raw_read_lock); +EXPORT_SYMBOL(__raw_read_lock); -void _raw_read_unlock(rwlock_t *rw) +void __raw_read_unlock(raw_rwlock_t *rw) { if ( rw->lock == 0 ) printk("_read_unlock(): %s/%d (nip %08lX) lock %d\n", @@ -147,9 +147,9 @@ void _raw_read_unlock(rwlock_t *rw) wmb(); atomic_dec((atomic_t *) &(rw)->lock); } -EXPORT_SYMBOL(_raw_read_unlock); +EXPORT_SYMBOL(__raw_read_unlock); -void _raw_write_lock(rwlock_t *rw) +void __raw_write_lock(raw_rwlock_t *rw) { unsigned int stuck; @@ -165,18 +165,18 @@ void _raw_write_lock(rwlock_t *rw) } wmb(); } -EXPORT_SYMBOL(_raw_write_lock); +EXPORT_SYMBOL(__raw_write_lock); -int _raw_write_trylock(rwlock_t *rw) +int __raw_write_trylock(raw_rwlock_t *rw) { if (cmpxchg(&rw->lock, 0, -1) != 0) return 0; wmb(); return 1; } -EXPORT_SYMBOL(_raw_write_trylock); +EXPORT_SYMBOL(__raw_write_trylock); -void _raw_write_unlock(rwlock_t *rw) +void __raw_write_unlock(raw_rwlock_t *rw) { if (rw->lock >= 0) printk("_write_lock(): %s/%d (nip %08lX) lock %d\n", @@ -185,6 +185,6 @@ void _raw_write_unlock(rwlock_t *rw) wmb(); rw->lock = 0; } -EXPORT_SYMBOL(_raw_write_unlock); +EXPORT_SYMBOL(__raw_write_unlock); #endif Index: linux.prev/arch/ppc/mm/fault.c =================================================================== --- linux.prev.orig/arch/ppc/mm/fault.c +++ linux.prev/arch/ppc/mm/fault.c @@ -92,7 +92,7 @@ static int store_updates_sp(struct pt_re * the error_code parameter is ESR for a data fault, 0 for an instruction * fault. 
*/ -int do_page_fault(struct pt_regs *regs, unsigned long address, +int notrace do_page_fault(struct pt_regs *regs, unsigned long address, unsigned long error_code) { struct vm_area_struct * vma; Index: linux.prev/arch/ppc/mm/init.c =================================================================== --- linux.prev.orig/arch/ppc/mm/init.c +++ linux.prev/arch/ppc/mm/init.c @@ -56,7 +56,7 @@ #endif #define MAX_LOW_MEM CONFIG_LOWMEM_SIZE -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long total_memory; unsigned long total_lowmem; Index: linux.prev/arch/ppc/platforms/4xx/xilinx_ml300.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/4xx/xilinx_ml300.c +++ linux.prev/arch/ppc/platforms/4xx/xilinx_ml300.c @@ -62,7 +62,7 @@ static volatile unsigned *powerdown_base static void xilinx_power_off(void) { - local_irq_disable(); + raw_local_irq_disable(); out_be32(powerdown_base, XPAR_POWER_0_POWERDOWN_VALUE); while (1) ; } Index: linux.prev/arch/ppc/platforms/apus_setup.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/apus_setup.c +++ linux.prev/arch/ppc/platforms/apus_setup.c @@ -282,6 +282,7 @@ void apus_calibrate_decr(void) freq/1000000, freq%1000000); tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; __bus_speed = bus_speed; __speed_test_failed = speed_test_failed; @@ -480,7 +481,7 @@ void cache_clear(__u32 addr, int length) void apus_restart(char *cmd) { - local_irq_disable(); + raw_local_irq_disable(); APUS_WRITE(APUS_REG_LOCK, REGLOCK_BLACKMAGICK1|REGLOCK_BLACKMAGICK2); @@ -598,7 +599,7 @@ int __debug_serinit( void ) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); /* turn off Rx and Tx interrupts */ custom.intena = IF_RBF | IF_TBE; @@ -606,7 +607,7 @@ int __debug_serinit( void ) /* clear any pending interrupt */ custom.intreq = IF_RBF | IF_TBE; - local_irq_restore(flags); + raw_local_irq_restore(flags); /* * set the appropriate directions for the modem control flags, Index: linux.prev/arch/ppc/platforms/chestnut.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/chestnut.c +++ linux.prev/arch/ppc/platforms/chestnut.c @@ -455,7 +455,7 @@ chestnut_restart(char *cmd) { volatile ulong i = 10000000; - local_irq_disable(); + raw_local_irq_disable(); /* * Set CPLD Reg 3 bit 0 to 1 to allow MPP signals on reset to work @@ -474,7 +474,7 @@ chestnut_restart(char *cmd) static void chestnut_halt(void) { - local_irq_disable(); + raw_local_irq_disable(); for (;;); /* NOTREACHED */ } Index: linux.prev/arch/ppc/platforms/chrp_smp.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/chrp_smp.c +++ linux.prev/arch/ppc/platforms/chrp_smp.c @@ -58,7 +58,7 @@ smp_chrp_setup_cpu(int cpu_nr) do_openpic_setup_cpu(); } -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned int timebase_upper = 0, timebase_lower = 0; void __devinit Index: linux.prev/arch/ppc/platforms/chrp_time.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/chrp_time.c +++ linux.prev/arch/ppc/platforms/chrp_time.c @@ -28,7 +28,7 @@ #include #include -extern spinlock_t rtc_lock; +extern raw_spinlock_t rtc_lock; static int nvram_as1 = NVRAM_AS1; static int nvram_as0 = 
NVRAM_AS0; @@ -188,4 +188,5 @@ void __init chrp_calibrate_decr(void) freq/1000000, freq%1000000); tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; } Index: linux.prev/arch/ppc/platforms/cpci690.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/cpci690.c +++ linux.prev/arch/ppc/platforms/cpci690.c @@ -322,7 +322,7 @@ cpci690_reset_board(void) { u32 i = 10000; - local_irq_disable(); + raw_local_irq_disable(); out_8((cpci690_br_base + CPCI690_BR_SW_RESET), 0x11); while (i != 0) i++; Index: linux.prev/arch/ppc/platforms/ev64260.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/ev64260.c +++ linux.prev/arch/ppc/platforms/ev64260.c @@ -446,7 +446,7 @@ ev64260_platform_notify(struct device *d static void ev64260_reset_board(void *addr) { - local_irq_disable(); + raw_local_irq_disable(); /* disable and invalidate the L2 cache */ _set_L2CR(0); @@ -514,7 +514,7 @@ ev64260_restart(char *cmd) static void ev64260_halt(void) { - local_irq_disable(); + raw_local_irq_disable(); while (1); /* NOTREACHED */ } @@ -553,6 +553,7 @@ ev64260_calibrate_decr(void) tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; return; } Index: linux.prev/arch/ppc/platforms/gemini_setup.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/gemini_setup.c +++ linux.prev/arch/ppc/platforms/gemini_setup.c @@ -303,7 +303,7 @@ void __init gemini_init_l2(void) void gemini_restart(char *cmd) { - local_irq_disable(); + raw_local_irq_disable(); /* make a clean restart, not via the MPIC */ _gemini_reboot(); for(;;); @@ -462,6 +462,7 @@ void __init gemini_calibrate_decr(void) divisor = 4; tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } unsigned long __init gemini_find_end_of_memory(void) Index: linux.prev/arch/ppc/platforms/hdpu.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/hdpu.c +++ linux.prev/arch/ppc/platforms/hdpu.c @@ -474,7 +474,7 @@ static void hdpu_reset_board(void) hdpu_cpustate_set(CPUSTATE_KERNEL_MAJOR | CPUSTATE_KERNEL_RESET); - local_irq_disable(); + raw_local_irq_disable(); /* Clear all the LEDs */ mv64x60_write(&bh, MV64x60_GPP_VALUE_CLR, ((1 << 4) | @@ -516,7 +516,7 @@ static void hdpu_restart(char *cmd) static void hdpu_halt(void) { - local_irq_disable(); + raw_local_irq_disable(); hdpu_cpustate_set(CPUSTATE_KERNEL_MAJOR | CPUSTATE_KERNEL_HALT); Index: linux.prev/arch/ppc/platforms/lopec.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/lopec.c +++ linux.prev/arch/ppc/platforms/lopec.c @@ -153,7 +153,7 @@ lopec_restart(char *cmd) reg |= 0x80; *((unsigned char *) LOPEC_SYSSTAT1) = reg; - local_irq_disable(); + raw_local_irq_disable(); while(1); #undef LOPEC_SYSSTAT1 } @@ -161,7 +161,7 @@ lopec_restart(char *cmd) static void lopec_halt(void) { - local_irq_disable(); + raw_local_irq_disable(); while(1); } Index: linux.prev/arch/ppc/platforms/mvme5100.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/mvme5100.c +++ linux.prev/arch/ppc/platforms/mvme5100.c @@ -262,7 +262,7 @@ mvme5100_map_io(void) static void mvme5100_reset_board(void) { - local_irq_disable(); + raw_local_irq_disable(); 
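/*
 * Aside on the cpu_khz assignments accumulating in the
 * *_calibrate_decr() hunks around here -- a sketch of the arithmetic,
 * not patch text: despite the x86-flavoured name, cpu_khz on these
 * boards is the timebase/decrementer input frequency in kHz, computed
 * from the same freq/divisor pair as tb_ticks_per_jiffy:
 *
 *	tb_ticks_per_jiffy = freq / HZ / divisor;
 *	cpu_khz            = (freq / divisor) / 1000;
 *
 * Worked example: freq = 66,000,000 and divisor = 4 give 16,500,000
 * timebase ticks per second, i.e. cpu_khz = 16500 and, with HZ = 100,
 * tb_ticks_per_jiffy = 165000. The variable itself is defined and
 * exported from arch/ppc/kernel/time.c earlier in this series, so
 * generic code that expects the x86 symbol (presumably the latency
 * tracer) links unchanged.
 */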
/* Set exception prefix high - to the firmware */ _nmask_and_or_msr(0, MSR_IP); @@ -286,7 +286,7 @@ mvme5100_restart(char *cmd) static void mvme5100_halt(void) { - local_irq_disable(); + raw_local_irq_disable(); while (1); } Index: linux.prev/arch/ppc/platforms/pal4_setup.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/pal4_setup.c +++ linux.prev/arch/ppc/platforms/pal4_setup.c @@ -82,7 +82,7 @@ pal4_show_cpuinfo(struct seq_file *m) static void pal4_restart(char *cmd) { - local_irq_disable(); + raw_local_irq_disable(); __asm__ __volatile__("lis 3,0xfff0\n \ ori 3,3,0x100\n \ mtspr 26,3\n \ @@ -96,7 +96,7 @@ pal4_restart(char *cmd) static void pal4_power_off(void) { - local_irq_disable(); + raw_local_irq_disable(); for(;;); } Index: linux.prev/arch/ppc/platforms/pmac_cpufreq.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/pmac_cpufreq.c +++ linux.prev/arch/ppc/platforms/pmac_cpufreq.c @@ -285,7 +285,7 @@ static int pmu_set_cpu_speed(int low_spe asm volatile("mtdec %0" : : "r" (0x7fffffff)); /* We can now disable MSR_EE */ - local_irq_save(flags); + raw_local_irq_save(flags); /* Giveup the FPU & vec */ enable_kernel_fp(); @@ -341,7 +341,7 @@ static int pmu_set_cpu_speed(int low_spe openpic_set_priority(pic_prio); /* Let interrupts flow again ... */ - local_irq_restore(flags); + raw_local_irq_restore(flags); #ifdef DEBUG_FREQ debug_calc_bogomips(); Index: linux.prev/arch/ppc/platforms/pmac_feature.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/pmac_feature.c +++ linux.prev/arch/ppc/platforms/pmac_feature.c @@ -63,7 +63,7 @@ extern struct device_node *k2_skiplist[2 * We use a single global lock to protect accesses. 
Each driver has * to take care of its own locking */ -static DEFINE_SPINLOCK(feature_lock); +static DEFINE_RAW_SPINLOCK(feature_lock); #define LOCK(flags) spin_lock_irqsave(&feature_lock, flags); #define UNLOCK(flags) spin_unlock_irqrestore(&feature_lock, flags); Index: linux.prev/arch/ppc/platforms/pmac_nvram.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/pmac_nvram.c +++ linux.prev/arch/ppc/platforms/pmac_nvram.c @@ -80,7 +80,7 @@ static volatile unsigned char *nvram_dat static int nvram_mult, is_core_99; static int core99_bank = 0; static int nvram_partitions[3]; -static DEFINE_SPINLOCK(nv_lock); +static DEFINE_RAW_SPINLOCK(nv_lock); extern int pmac_newworld; extern int system_running; Index: linux.prev/arch/ppc/platforms/pmac_pic.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/pmac_pic.c +++ linux.prev/arch/ppc/platforms/pmac_pic.c @@ -69,7 +69,7 @@ static int max_irqs; static int max_real_irqs; static u32 level_mask[4]; -static DEFINE_SPINLOCK(pmac_pic_lock); +static DEFINE_RAW_SPINLOCK(pmac_pic_lock); #define GATWICK_IRQ_POOL_SIZE 10 Index: linux.prev/arch/ppc/platforms/pmac_smp.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/pmac_smp.c +++ linux.prev/arch/ppc/platforms/pmac_smp.c @@ -499,8 +499,8 @@ static void __devinit smp_core99_kick_cp return; if (ppc_md.progress) ppc_md.progress("smp_core99_kick_cpu", 0x346); - local_irq_save(flags); - local_irq_disable(); + raw_local_irq_save(flags); + raw_local_irq_disable(); /* Save reset vector */ save_vector = *vector; @@ -528,7 +528,7 @@ static void __devinit smp_core99_kick_cp *vector = save_vector; flush_icache_range((unsigned long) vector, (unsigned long) vector + 4); - local_irq_restore(flags); + raw_local_irq_restore(flags); if (ppc_md.progress) ppc_md.progress("smp_core99_kick_cpu done", 0x347); } @@ -570,7 +570,7 @@ void smp_core99_take_timebase(void) mb(); /* set our stuff the same as the primary */ - local_irq_save(flags); + raw_local_irq_save(flags); set_dec(1); set_tb(pri_tb_hi, pri_tb_lo); last_jiffy_stamp(smp_processor_id()) = pri_tb_stamp; @@ -579,7 +579,7 @@ void smp_core99_take_timebase(void) /* tell the primary we're done */ sec_tb_reset = 0; mb(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } /* not __init, called in sleep/wakeup code */ @@ -599,7 +599,7 @@ void smp_core99_give_timebase(void) /* freeze the timebase and read it */ /* disable interrupts so the timebase is disabled for the shortest possible time */ - local_irq_save(flags); + raw_local_irq_save(flags); pmac_call_feature(PMAC_FTR_WRITE_GPIO, NULL, core99_tb_gpio, 4); pmac_call_feature(PMAC_FTR_READ_GPIO, NULL, core99_tb_gpio, 0); mb(); @@ -623,7 +623,7 @@ void smp_core99_give_timebase(void) /* Now, restart the timebase by leaving the GPIO to an open collector */ pmac_call_feature(PMAC_FTR_WRITE_GPIO, NULL, core99_tb_gpio, 0); pmac_call_feature(PMAC_FTR_READ_GPIO, NULL, core99_tb_gpio, 0); - local_irq_restore(flags); + raw_local_irq_restore(flags); } Index: linux.prev/arch/ppc/platforms/pmac_time.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/pmac_time.c +++ linux.prev/arch/ppc/platforms/pmac_time.c @@ -197,6 +197,7 @@ via_calibrate_decr(void) tb_ticks_per_jiffy = (dstart - dend) / ((6 * HZ)/100); tb_to_us = mulhwu_scale_factor(dstart - dend, 60000); + cpu_khz = (dstart - dend) / 60; printk(KERN_INFO 
"via_calibrate_decr: ticks per jiffy = %u (%u ticks)\n", tb_ticks_per_jiffy, dstart - dend); @@ -288,4 +289,5 @@ pmac_calibrate_decr(void) freq/1000000, freq%1000000); tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; } Index: linux.prev/arch/ppc/platforms/powerpmc250.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/powerpmc250.c +++ linux.prev/arch/ppc/platforms/powerpmc250.c @@ -166,12 +166,13 @@ powerpmc250_calibrate_decr(void) tb_ticks_per_jiffy = freq / (HZ * divisor); tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } static void powerpmc250_restart(char *cmd) { - local_irq_disable(); + raw_local_irq_disable(); /* Hard reset */ writeb(0x11, 0xfe000332); while(1); @@ -180,7 +181,7 @@ powerpmc250_restart(char *cmd) static void powerpmc250_halt(void) { - local_irq_disable(); + raw_local_irq_disable(); while (1); } Index: linux.prev/arch/ppc/platforms/pplus.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/pplus.c +++ linux.prev/arch/ppc/platforms/pplus.c @@ -607,7 +607,7 @@ static void pplus_restart(char *cmd) { unsigned long i = 10000; - local_irq_disable(); + raw_local_irq_disable(); /* set VIA IDE controller into native mode */ pplus_set_VIA_IDE_native(); Index: linux.prev/arch/ppc/platforms/prep_setup.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/prep_setup.c +++ linux.prev/arch/ppc/platforms/prep_setup.c @@ -464,7 +464,7 @@ static void prep_restart(char *cmd) { #define PREP_SP92 0x92 /* Special Port 92 */ - local_irq_disable(); /* no interrupts */ + raw_local_irq_disable(); /* no interrupts */ /* set exception prefix high - to the prom */ _nmask_and_or_msr(0, MSR_IP); @@ -482,7 +482,7 @@ prep_restart(char *cmd) static void prep_halt(void) { - local_irq_disable(); /* no interrupts */ + raw_local_irq_disable(); /* no interrupts */ /* set exception prefix high - to the prom */ _nmask_and_or_msr(0, MSR_IP); @@ -550,7 +550,7 @@ prep_sig750_poweroff(void) { /* tweak the power manager found in most IBM PRePs (except Thinkpads) */ - local_irq_disable(); + raw_local_irq_disable(); /* set exception prefix high - to the prom */ _nmask_and_or_msr(0, MSR_IP); @@ -944,6 +944,7 @@ prep_calibrate_decr(void) (freq/divisor)/1000000, (freq/divisor)%1000000); tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; tb_ticks_per_jiffy = freq / HZ / divisor; } } Index: linux.prev/arch/ppc/platforms/prpmc750.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/prpmc750.c +++ linux.prev/arch/ppc/platforms/prpmc750.c @@ -271,18 +271,19 @@ static void __init prpmc750_calibrate_de tb_ticks_per_jiffy = freq / (HZ * divisor); tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } static void prpmc750_restart(char *cmd) { - local_irq_disable(); + raw_local_irq_disable(); writeb(PRPMC750_MODRST_MASK, PRPMC750_MODRST_REG); while (1) ; } static void prpmc750_halt(void) { - local_irq_disable(); + raw_local_irq_disable(); while (1) ; } Index: linux.prev/arch/ppc/platforms/prpmc800.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/prpmc800.c +++ linux.prev/arch/ppc/platforms/prpmc800.c @@ -330,6 +330,7 @@ static void __init prpmc800_calibrate_de 
tb_ticks_per_second = 100000000 / 4; tb_ticks_per_jiffy = tb_ticks_per_second / HZ; tb_to_us = mulhwu_scale_factor(tb_ticks_per_second, 1000000); + cpu_khz = tb_ticks_per_second / 1000; return; } @@ -370,13 +371,14 @@ static void __init prpmc800_calibrate_de tb_ticks_per_second = (tbl_end - tbl_start) * 2; tb_ticks_per_jiffy = tb_ticks_per_second / HZ; tb_to_us = mulhwu_scale_factor(tb_ticks_per_second, 1000000); + cpu_khz = tb_ticks_per_second / 1000; } static void prpmc800_restart(char *cmd) { ulong temp; - local_irq_disable(); + raw_local_irq_disable(); temp = in_be32((uint *) HARRIER_MISC_CSR_REG); temp |= HARRIER_RSTOUT; out_be32((uint *) HARRIER_MISC_CSR_REG, temp); @@ -385,7 +387,7 @@ static void prpmc800_restart(char *cmd) static void prpmc800_halt(void) { - local_irq_disable(); + raw_local_irq_disable(); while (1) ; } Index: linux.prev/arch/ppc/platforms/radstone_ppc7d.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/radstone_ppc7d.c +++ linux.prev/arch/ppc/platforms/radstone_ppc7d.c @@ -176,7 +176,7 @@ static void ppc7d_power_off(void) { u32 data; - local_irq_disable(); + raw_local_irq_disable(); /* Ensure that internal MV643XX watchdog is disabled. * The Disco watchdog uses MPP17 on this hardware. Index: linux.prev/arch/ppc/platforms/sandpoint.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/sandpoint.c +++ linux.prev/arch/ppc/platforms/sandpoint.c @@ -527,7 +527,7 @@ sandpoint_map_io(void) static void sandpoint_restart(char *cmd) { - local_irq_disable(); + raw_local_irq_disable(); /* Set exception prefix high - to the firmware */ _nmask_and_or_msr(0, MSR_IP); @@ -541,7 +541,7 @@ sandpoint_restart(char *cmd) static void sandpoint_power_off(void) { - local_irq_disable(); + raw_local_irq_disable(); for(;;); /* No way to shut power off with software */ /* NOTREACHED */ } Index: linux.prev/arch/ppc/platforms/sbc82xx.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/sbc82xx.c +++ linux.prev/arch/ppc/platforms/sbc82xx.c @@ -68,7 +68,7 @@ static void sbc82xx_time_init(void) static volatile char *sbc82xx_i8259_map; static char sbc82xx_i8259_mask = 0xff; -static DEFINE_SPINLOCK(sbc82xx_i8259_lock); +static DEFINE_RAW_SPINLOCK(sbc82xx_i8259_lock); static void sbc82xx_i8259_mask_and_ack_irq(unsigned int irq_nr) { Index: linux.prev/arch/ppc/platforms/spruce.c =================================================================== --- linux.prev.orig/arch/ppc/platforms/spruce.c +++ linux.prev/arch/ppc/platforms/spruce.c @@ -150,6 +150,7 @@ spruce_calibrate_decr(void) freq = SPRUCE_BUS_SPEED; tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } static int @@ -236,7 +237,7 @@ spruce_setup_arch(void) static void spruce_restart(char *cmd) { - local_irq_disable(); + raw_local_irq_disable(); /* SRR0 has system reset vector, SRR1 has default MSR value */ /* rfi restores MSR from SRR1 and sets the PC to the SRR0 value */ Index: linux.prev/arch/ppc/syslib/cpm2_common.c =================================================================== --- linux.prev.orig/arch/ppc/syslib/cpm2_common.c +++ linux.prev/arch/ppc/syslib/cpm2_common.c @@ -114,7 +114,7 @@ cpm2_fastbrg(uint brg, uint rate, int di /* * dpalloc / dpfree bits. 
*/ -static spinlock_t cpm_dpmem_lock; +static raw_spinlock_t cpm_dpmem_lock; /* 16 blocks should be enough to satisfy all requests * until the memory subsystem goes up... */ static rh_block_t cpm_boot_dpmem_rh_block[16]; Index: linux.prev/arch/ppc/syslib/ibm440gx_common.c =================================================================== --- linux.prev.orig/arch/ppc/syslib/ibm440gx_common.c +++ linux.prev/arch/ppc/syslib/ibm440gx_common.c @@ -157,7 +157,7 @@ void __init ibm440gx_l2c_enable(void){ return; } - local_irq_save(flags); + raw_local_irq_save(flags); asm volatile ("sync" ::: "memory"); /* Disable SRAM */ @@ -201,7 +201,7 @@ void __init ibm440gx_l2c_enable(void){ mtdcr(DCRN_L2C0_CFG, r); asm volatile ("sync; isync" ::: "memory"); - local_irq_restore(flags); + raw_local_irq_restore(flags); } /* Disable L2 cache */ @@ -209,7 +209,7 @@ void __init ibm440gx_l2c_disable(void){ u32 r; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); asm volatile ("sync" ::: "memory"); /* Disable L2C mode */ @@ -228,7 +228,7 @@ void __init ibm440gx_l2c_disable(void){ SRAM_SBCR_BAS3 | SRAM_SBCR_BS_64KB | SRAM_SBCR_BU_RW); asm volatile ("sync; isync" ::: "memory"); - local_irq_restore(flags); + raw_local_irq_restore(flags); } void __init ibm440gx_l2c_setup(struct ibm44x_clocks* p) Index: linux.prev/arch/ppc/syslib/ibm44x_common.c =================================================================== --- linux.prev.orig/arch/ppc/syslib/ibm44x_common.c +++ linux.prev/arch/ppc/syslib/ibm44x_common.c @@ -66,6 +66,7 @@ void __init ibm44x_calibrate_decr(unsign { tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; /* Set the time base to zero */ mtspr(SPRN_TBWL, 0); @@ -82,19 +83,19 @@ extern void abort(void); static void ibm44x_restart(char *cmd) { - local_irq_disable(); + raw_local_irq_disable(); abort(); } static void ibm44x_power_off(void) { - local_irq_disable(); + raw_local_irq_disable(); for(;;); } static void ibm44x_halt(void) { - local_irq_disable(); + raw_local_irq_disable(); for(;;); } Index: linux.prev/arch/ppc/syslib/m8260_pci_erratum9.c =================================================================== --- linux.prev.orig/arch/ppc/syslib/m8260_pci_erratum9.c +++ linux.prev/arch/ppc/syslib/m8260_pci_erratum9.c @@ -132,7 +132,7 @@ idma_pci9_read(u8 *dst, u8 *src, int byt volatile idma_bd_t *bd = &idma_dpram->bd; volatile cpm2_map_t *immap = cpm2_immr; - local_irq_save(flags); + raw_local_irq_save(flags); /* initialize IDMA parameter RAM for this transfer */ if (sinc) @@ -161,7 +161,7 @@ idma_pci9_read(u8 *dst, u8 *src, int byt /* wait for transfer to complete */ while(bd->flags & IDMA_BD_V); - local_irq_restore(flags); + raw_local_irq_restore(flags); return; } @@ -184,7 +184,7 @@ idma_pci9_write(u8 *dst, u8 *src, int by volatile idma_bd_t *bd = &idma_dpram->bd; volatile cpm2_map_t *immap = cpm2_immr; - local_irq_save(flags); + raw_local_irq_save(flags); /* initialize IDMA parameter RAM for this transfer */ if (dinc) @@ -213,7 +213,7 @@ idma_pci9_write(u8 *dst, u8 *src, int by /* wait for transfer to complete */ while(bd->flags & IDMA_BD_V); - local_irq_restore(flags); + raw_local_irq_restore(flags); return; } Index: linux.prev/arch/ppc/syslib/m8260_setup.c =================================================================== --- linux.prev.orig/arch/ppc/syslib/m8260_setup.c +++ linux.prev/arch/ppc/syslib/m8260_setup.c @@ -82,6 +82,7 @@ m8260_calibrate_decr(void) divisor = 4; tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = 
mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } /* The 8260 has an internal 1-second timer update register that @@ -132,7 +133,7 @@ m8260_restart(char *cmd) static void m8260_halt(void) { - local_irq_disable(); + raw_local_irq_disable(); while (1); } Index: linux.prev/arch/ppc/syslib/m8xx_setup.c =================================================================== --- linux.prev.orig/arch/ppc/syslib/m8xx_setup.c +++ linux.prev/arch/ppc/syslib/m8xx_setup.c @@ -160,6 +160,7 @@ void __init m8xx_calibrate_decr(void) printk("Decrementer Frequency = %d/%d\n", freq, divisor); tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; /* Perform some more timer/timebase initialization. This used * to be done elsewhere, but other changes caused it to get @@ -231,7 +232,7 @@ m8xx_restart(char *cmd) { __volatile__ unsigned char dummy; - local_irq_disable(); + raw_local_irq_disable(); out_be32(&((immap_t *)IMAP_ADDR)->im_clkrst.car_plprcr, in_be32(&((immap_t *)IMAP_ADDR)->im_clkrst.car_plprcr) | 0x00000080); /* Clear the ME bit in MSR to cause checkstop on machine check Index: linux.prev/arch/ppc/syslib/mpc52xx_setup.c =================================================================== --- linux.prev.orig/arch/ppc/syslib/mpc52xx_setup.c +++ linux.prev/arch/ppc/syslib/mpc52xx_setup.c @@ -40,7 +40,7 @@ mpc52xx_restart(char *cmd) { struct mpc52xx_gpt __iomem *gpt0 = MPC52xx_VA(MPC52xx_GPTx_OFFSET(0)); - local_irq_disable(); + raw_local_irq_disable(); /* Turn on the watchdog and wait for it to expire. It effectively does a reset */ @@ -53,7 +53,7 @@ mpc52xx_restart(char *cmd) void mpc52xx_halt(void) { - local_irq_disable(); + raw_local_irq_disable(); while (1); } @@ -214,6 +214,7 @@ mpc52xx_calibrate_decr(void) tb_ticks_per_jiffy = xlbfreq / HZ / divisor; tb_to_us = mulhwu_scale_factor(xlbfreq / divisor, 1000000); + cpu_khz = (xlbfreq / divisor) / 1000; } int mpc52xx_match_psc_function(int psc_idx, const char *func) Index: linux.prev/arch/ppc/syslib/ocp.c =================================================================== --- linux.prev.orig/arch/ppc/syslib/ocp.c +++ linux.prev/arch/ppc/syslib/ocp.c @@ -45,11 +45,11 @@ #include #include #include +#include #include #include #include -#include #include //#define DBG(x) printk x Index: linux.prev/arch/ppc/syslib/open_pic.c =================================================================== --- linux.prev.orig/arch/ppc/syslib/open_pic.c +++ linux.prev/arch/ppc/syslib/open_pic.c @@ -529,7 +529,7 @@ void openpic_reset_processor_phys(u_int } #if defined(CONFIG_SMP) || defined(CONFIG_PM) -static DEFINE_SPINLOCK(openpic_setup_lock); +static DEFINE_RAW_SPINLOCK(openpic_setup_lock); #endif #ifdef CONFIG_SMP Index: linux.prev/arch/ppc/syslib/open_pic2.c =================================================================== --- linux.prev.orig/arch/ppc/syslib/open_pic2.c +++ linux.prev/arch/ppc/syslib/open_pic2.c @@ -383,7 +383,7 @@ static void openpic2_set_spurious(u_int vec); } -static DEFINE_SPINLOCK(openpic2_setup_lock); +static DEFINE_RAW_SPINLOCK(openpic2_setup_lock); /* * Initialize a timer interrupt (and disable it) Index: linux.prev/arch/ppc/syslib/ppc4xx_setup.c =================================================================== --- linux.prev.orig/arch/ppc/syslib/ppc4xx_setup.c +++ linux.prev/arch/ppc/syslib/ppc4xx_setup.c @@ -142,7 +142,7 @@ static void ppc4xx_power_off(void) { printk("System Halted\n"); - local_irq_disable(); + 
raw_local_irq_disable(); while (1) ; } @@ -150,7 +150,7 @@ static void ppc4xx_halt(void) { printk("System Halted\n"); - local_irq_disable(); + raw_local_irq_disable(); while (1) ; } @@ -173,6 +173,7 @@ ppc4xx_calibrate_decr(void) freq = bip->bi_tbfreq; tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; /* Set the time base to zero. ** At 200 Mhz, time base will rollover in ~2925 years. Index: linux.prev/arch/ppc/syslib/ppc83xx_setup.c =================================================================== --- linux.prev.orig/arch/ppc/syslib/ppc83xx_setup.c +++ linux.prev/arch/ppc/syslib/ppc83xx_setup.c @@ -138,7 +138,7 @@ mpc83xx_restart(char *cmd) reg = ioremap(BCSR_PHYS_ADDR, BCSR_SIZE); - local_irq_disable(); + raw_local_irq_disable(); /* * Unlock the BCSR bits so a PRST will update the contents. @@ -167,14 +167,14 @@ mpc83xx_restart(char *cmd) void mpc83xx_power_off(void) { - local_irq_disable(); + raw_local_irq_disable(); for(;;); } void mpc83xx_halt(void) { - local_irq_disable(); + raw_local_irq_disable(); for(;;); } Index: linux.prev/arch/ppc/syslib/ppc85xx_setup.c =================================================================== --- linux.prev.orig/arch/ppc/syslib/ppc85xx_setup.c +++ linux.prev/arch/ppc/syslib/ppc85xx_setup.c @@ -60,6 +60,7 @@ mpc85xx_calibrate_decr(void) divisor = 8; tb_ticks_per_jiffy = freq / divisor / HZ; tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; /* Set the time base to zero */ mtspr(SPRN_TBWL, 0); @@ -115,21 +116,21 @@ mpc85xx_early_serial_map(void) void mpc85xx_restart(char *cmd) { - local_irq_disable(); + raw_local_irq_disable(); abort(); } void mpc85xx_power_off(void) { - local_irq_disable(); + raw_local_irq_disable(); for(;;); } void mpc85xx_halt(void) { - local_irq_disable(); + raw_local_irq_disable(); for(;;); } Index: linux.prev/arch/ppc/syslib/prom.c =================================================================== --- linux.prev.orig/arch/ppc/syslib/prom.c +++ linux.prev/arch/ppc/syslib/prom.c @@ -1396,7 +1396,7 @@ print_properties(struct device_node *np) } #endif -static DEFINE_SPINLOCK(rtas_lock); +static DEFINE_RAW_SPINLOCK(rtas_lock); /* this can be called after setup -- Cort */ int Index: linux.prev/arch/ppc/syslib/todc_time.c =================================================================== --- linux.prev.orig/arch/ppc/syslib/todc_time.c +++ linux.prev/arch/ppc/syslib/todc_time.c @@ -508,6 +508,7 @@ todc_calibrate_decr(void) tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; return; } Index: linux.prev/arch/ppc/xmon/xmon.c =================================================================== --- linux.prev.orig/arch/ppc/xmon/xmon.c +++ linux.prev/arch/ppc/xmon/xmon.c @@ -297,10 +297,10 @@ irqreturn_t xmon_irq(int irq, void *d, struct pt_regs *regs) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); printf("Keyboard interrupt\n"); xmon(regs); - local_irq_restore(flags); + raw_local_irq_restore(flags); return IRQ_HANDLED; } Index: linux.prev/arch/sh64/kernel/time.c =================================================================== --- linux.prev.orig/arch/sh64/kernel/time.c +++ linux.prev/arch/sh64/kernel/time.c @@ -417,7 +417,7 @@ static __init unsigned int get_cpu_hz(vo /* ** Regardless the toolchain, force the compiler to use the ** arbitrary register r3 as a clock tick counter. 
- ** NOTE: r3 must be in accordance with rtc_interrupt() + ** NOTE: r3 must be in accordance with sh64_rtc_interrupt() */ register unsigned long long __rtc_irq_flag __asm__ ("r3"); @@ -482,7 +482,8 @@ static __init unsigned int get_cpu_hz(vo #endif } -static irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) +static irqreturn_t sh64_rtc_interrupt(int irq, void *dev_id, + struct pt_regs *regs) { ctrl_outb(0, RCR1); /* Disable Carry Interrupts */ regs->regs[3] = 1; /* Using r3 */ @@ -491,7 +492,7 @@ static irqreturn_t rtc_interrupt(int irq } static struct irqaction irq0 = { timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL}; -static struct irqaction irq1 = { rtc_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "rtc", NULL, NULL}; +static struct irqaction irq1 = { sh64_rtc_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "rtc", NULL, NULL}; void __init time_init(void) { Index: linux.prev/arch/x86_64/Kconfig =================================================================== --- linux.prev.orig/arch/x86_64/Kconfig +++ linux.prev/arch/x86_64/Kconfig @@ -24,6 +24,14 @@ config X86 bool default y +config GENERIC_TIME + bool + default y + +config GENERIC_TIME_VSYSCALL + bool + default y + config SEMAPHORE_SLEEPERS bool default y @@ -38,13 +46,6 @@ config ISA config SBUS bool -config RWSEM_GENERIC_SPINLOCK - bool - default y - -config RWSEM_XCHGADD_ALGORITHM - bool - config GENERIC_CALIBRATE_DELAY bool default y @@ -199,6 +200,8 @@ config MTRR See <file:Documentation/mtrr.txt> for more information. +source "kernel/time/Kconfig" + config SMP bool "Symmetric multi-processing support" ---help--- @@ -237,6 +240,14 @@ config NUMA If the system is EM64T, you should say N unless your system is EM64T NUMA. +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + depends on !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT + bool + config K8_NUMA bool "Old style AMD Opteron NUMA detection" depends on NUMA @@ -327,21 +338,6 @@ config HPET_TIMER as it is off-chip. You can find the HPET spec at <http://www.intel.com/hardwaredesign/hpetspec.htm>. -config X86_PM_TIMER - bool "PM timer" - depends on ACPI - default y - help - Support the ACPI PM timer for time keeping. This is slow, - but is useful on some chipsets without HPET on systems with more - than one CPU. On a single processor or single socket multi core - system it is normally not required. - When the PM timer is active 64bit vsyscalls are disabled - and should not be enabled (/proc/sys/kernel/vsyscall64 should - not be changed). - The kernel selects the PM timer only as a last resort, so it is - useful to enable just in case.
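The removal of X86_PM_TIMER above goes hand in hand with the new GENERIC_TIME/GENERIC_TIME_VSYSCALL options and the source "kernel/time/Kconfig" line: instead of one compiled-in timer backend, timekeeping hardware is described by a rating/read/mult/shift tuple and the best registered source is picked at run time. A rough sketch of that abstraction (field and function names are illustrative, not the exact in-tree definitions of this era):

    typedef unsigned long long cycle_t;     /* free-running counter value */

    struct clocksource_sketch {
            const char      *name;          /* "hpet", "acpi_pm", "tsc", ... */
            int             rating;         /* highest registered rating wins */
            cycle_t         (*read)(void);  /* read the free-running counter */
            unsigned int    mult;           /* cycle -> nanosecond multiplier */
            unsigned int    shift;          /* ...with this binary shift */
    };

    /* elapsed nanoseconds for a counter delta, as the generic core computes it */
    static inline unsigned long long
    cyc2ns_sketch(struct clocksource_sketch *cs, cycle_t delta)
    {
            return ((unsigned long long)delta * cs->mult) >> cs->shift;
    }

Under this scheme the ACPI PM timer becomes just another registered source, so the dedicated Kconfig knob (and the pmtimer.o rule in the Makefile hunk below) can go.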
- config HPET_EMULATE_RTC bool "Provide RTC interrupt" depends on HPET_TIMER && RTC=y Index: linux.prev/arch/x86_64/boot/compressed/misc.c =================================================================== --- linux.prev.orig/arch/x86_64/boot/compressed/misc.c +++ linux.prev/arch/x86_64/boot/compressed/misc.c @@ -114,6 +114,7 @@ static char *vidmem = (char *)0xb8000; static int vidport; static int lines, cols; +#define ZLIB_INFLATE_NO_INFLATE_LOCK #include "../../../../lib/inflate.c" static void *malloc(int size) Index: linux.prev/arch/x86_64/ia32/sys_ia32.c =================================================================== --- linux.prev.orig/arch/x86_64/ia32/sys_ia32.c +++ linux.prev/arch/x86_64/ia32/sys_ia32.c @@ -456,6 +456,10 @@ sys32_settimeofday(struct compat_timeval struct timespec kts; struct timezone ktz; + int ret = timeofday_API_hacks(tv, tz); + if (ret != 1) + return ret; + if (tv) { if (get_tv32(&ktv, tv)) return -EFAULT; Index: linux.prev/arch/x86_64/kernel/Makefile =================================================================== --- linux.prev.orig/arch/x86_64/kernel/Makefile +++ linux.prev/arch/x86_64/kernel/Makefile @@ -29,7 +29,7 @@ obj-$(CONFIG_EARLY_PRINTK) += early_prin obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o +obj-$(CONFIG_SYSFS) += switch2poll.o obj-$(CONFIG_MODULES) += module.o Index: linux.prev/arch/x86_64/kernel/apic.c =================================================================== --- linux.prev.orig/arch/x86_64/kernel/apic.c +++ linux.prev/arch/x86_64/kernel/apic.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -38,13 +39,15 @@ int apic_verbosity; int disable_apic_timer __initdata; +/* + * cpu_mask that denotes the CPUs that needs timer interrupt coming in as + * IPIs in place of local APIC timers + */ +static cpumask_t timer_interrupt_broadcast_ipi_mask; + /* Using APIC to generate smp_local_timer_interrupt? 
*/ int using_apic_timer = 0; -static DEFINE_PER_CPU(int, prof_multiplier) = 1; -static DEFINE_PER_CPU(int, prof_old_multiplier) = 1; -static DEFINE_PER_CPU(int, prof_counter) = 1; - static void apic_pm_activate(void); void enable_NMI_through_LVT0 (void * dummy) @@ -485,10 +488,9 @@ static int lapic_suspend(struct sys_devi apic_pm_state.apic_tmict = apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); - local_save_flags(flags); - local_irq_disable(); + raw_local_irq_save(flags); disable_local_APIC(); - local_irq_restore(flags); + raw_local_irq_restore(flags); return 0; } @@ -503,7 +505,7 @@ static int lapic_resume(struct sys_devic /* XXX: Pavel needs this for S3 resume, but can't explain why */ set_fixmap_nocache(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE); - local_irq_save(flags); + raw_local_irq_save(flags); rdmsr(MSR_IA32_APICBASE, l, h); l &= ~MSR_IA32_APICBASE_BASE; l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; @@ -526,7 +528,7 @@ static int lapic_resume(struct sys_devic apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); apic_write(APIC_ESR, 0); apic_read(APIC_ESR); - local_irq_restore(flags); + raw_local_irq_restore(flags); return 0; } @@ -660,9 +662,14 @@ void __init init_apic_mappings(void) static void __setup_APIC_LVTT(unsigned int clocks) { unsigned int lvtt_value, tmp_value, ver; + int cpu = smp_processor_id(); ver = GET_APIC_VERSION(apic_read(APIC_LVR)); lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; + + if (cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) + lvtt_value |= APIC_LVT_MASKED; + apic_write_around(APIC_LVTT, lvtt_value); /* @@ -680,7 +687,7 @@ static void setup_APIC_timer(unsigned in { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); /* For some reasons this doesn't work on Simics, so fake it for now */ if (!strstr(boot_cpu_data.x86_model_id, "Screwdriver")) { @@ -710,7 +717,7 @@ static void setup_APIC_timer(unsigned in __setup_APIC_LVTT(clocks); - local_irq_restore(flags); + raw_local_irq_restore(flags); } /* @@ -767,7 +774,7 @@ void __init setup_boot_APIC_clock (void) printk(KERN_INFO "Using local APIC timer interrupts.\n"); using_apic_timer = 1; - local_irq_disable(); + raw_local_irq_disable(); calibration_result = calibrate_APIC_clock(); /* @@ -775,17 +782,17 @@ void __init setup_boot_APIC_clock (void) */ setup_APIC_timer(calibration_result); - local_irq_enable(); + raw_local_irq_enable(); } void __cpuinit setup_secondary_APIC_clock(void) { - local_irq_disable(); /* FIXME: Do we need this? --RR */ + raw_local_irq_disable(); /* FIXME: Do we need this? --RR */ setup_APIC_timer(calibration_result); - local_irq_enable(); + raw_local_irq_enable(); } -void __cpuinit disable_APIC_timer(void) +void disable_APIC_timer(void) { if (using_apic_timer) { unsigned long v; @@ -797,7 +804,10 @@ void __cpuinit disable_APIC_timer(void) void enable_APIC_timer(void) { - if (using_apic_timer) { + int cpu = smp_processor_id(); + + if (using_apic_timer && + !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) { unsigned long v; v = apic_read(APIC_LVTT); @@ -805,32 +815,45 @@ void enable_APIC_timer(void) } } -/* - * the frequency of the profiling timer can be changed - * by writing a multiplier value into /proc/profile. - */ -int setup_profiling_timer(unsigned int multiplier) +void switch_APIC_timer_to_ipi(void *cpumask) { - int i; + cpumask_t mask = *(cpumask_t *)cpumask; + int cpu = smp_processor_id(); - /* - * Sanity check. 
[at least 500 APIC cycles should be - * between APIC interrupts as a rule of thumb, to avoid - * irqs flooding us] - */ - if ( (!multiplier) || (calibration_result/multiplier < 500)) - return -EINVAL; + if (cpu_isset(cpu, mask) && + !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) { + disable_APIC_timer(); + cpu_set(cpu, timer_interrupt_broadcast_ipi_mask); + } +} +EXPORT_SYMBOL(switch_APIC_timer_to_ipi); - /* - * Set the new multiplier for each CPU. CPUs don't start using the - * new values until the next timer interrupt in which they do process - * accounting. At that time they also adjust their APIC timers - * accordingly. - */ - for (i = 0; i < NR_CPUS; ++i) - per_cpu(prof_multiplier, i) = multiplier; +void smp_send_timer_broadcast_ipi(void) +{ + cpumask_t mask; - return 0; + cpus_and(mask, cpu_online_map, timer_interrupt_broadcast_ipi_mask); + if (!cpus_empty(mask)) { + send_IPI_mask(mask, LOCAL_TIMER_VECTOR); + } +} + +void switch_ipi_to_APIC_timer(void *cpumask) +{ + cpumask_t mask = *(cpumask_t *)cpumask; + int cpu = smp_processor_id(); + + if (cpu_isset(cpu, mask) && + cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) { + cpu_clear(cpu, timer_interrupt_broadcast_ipi_mask); + enable_APIC_timer(); + } +} +EXPORT_SYMBOL(switch_ipi_to_APIC_timer); + +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; } #ifdef CONFIG_X86_MCE_AMD @@ -857,32 +880,10 @@ void setup_threshold_lvt(unsigned long l void smp_local_timer_interrupt(struct pt_regs *regs) { - int cpu = smp_processor_id(); - profile_tick(CPU_PROFILING, regs); - if (--per_cpu(prof_counter, cpu) <= 0) { - /* - * The multiplier may have changed since the last time we got - * to this point as a result of the user writing to - * /proc/profile. In this case we need to adjust the APIC - * timer accordingly. - * - * Interrupts are already masked off at this point. - */ - per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu); - if (per_cpu(prof_counter, cpu) != - per_cpu(prof_old_multiplier, cpu)) { - __setup_APIC_LVTT(calibration_result/ - per_cpu(prof_counter, cpu)); - per_cpu(prof_old_multiplier, cpu) = - per_cpu(prof_counter, cpu); - } - #ifdef CONFIG_SMP - update_process_times(user_mode(regs)); + update_process_times(user_mode(regs)); #endif - } - /* * We take the 'long' return path, and there every subsystem * grabs the appropriate locks (kernel lock/ irq lock). Index: linux.prev/arch/x86_64/kernel/early_printk.c =================================================================== --- linux.prev.orig/arch/x86_64/kernel/early_printk.c +++ linux.prev/arch/x86_64/kernel/early_printk.c @@ -206,7 +206,7 @@ static int early_console_initialized = 0 void early_printk(const char *fmt, ...) { - char buf[512]; + static char buf[512]; int n; va_list ap; Index: linux.prev/arch/x86_64/kernel/entry.S =================================================================== --- linux.prev.orig/arch/x86_64/kernel/entry.S +++ linux.prev/arch/x86_64/kernel/entry.S @@ -48,6 +48,15 @@ #define retint_kernel retint_restore_args #endif +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING +# define CALL_TRACE_IRQS_ON \ + push %rbp; \ + mov %rsp, %rbp; \ + call trace_irqs_on; \ + leaveq +#else +# define CALL_TRACE_IRQS_ON +#endif /* * C code is not supposed to know about undefined top of stack. 
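The switch_APIC_timer_to_ipi()/switch_ipi_to_APIC_timer() hooks above let a CPU trade its local APIC timer tick for a broadcast IPI while the LAPIC timer is unusable, for instance in deep C-states that stop it. A minimal usage sketch, assuming an idle-driver call site along the lines of the ACPI C3 path; the call site itself is not part of this hunk:

/*
 * Hypothetical caller, for illustration only: switch the CPUs in
 * 'mask' to IPI-driven ticks around a LAPIC-stopping idle state.
 */
static void enter_lapic_stopping_idle(cpumask_t mask)
{
	/* each CPU masks its own LVTT and joins the broadcast mask */
	on_each_cpu(switch_APIC_timer_to_ipi, &mask, 1, 1);

	/*
	 * ... idle here: timer_interrupt() now ends with
	 * smp_send_timer_broadcast_ipi(), which ticks these CPUs
	 * via LOCAL_TIMER_VECTOR IPIs ...
	 */

	/* restore the per-CPU LAPIC timers on the way out */
	on_each_cpu(switch_ipi_to_APIC_timer, &mask, 1, 1);
}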
Every time * a C function with an pt_regs argument is called from the SYSCALL based @@ -230,8 +239,8 @@ sysret_check: /* edx: work, edi: workmask */ sysret_careful: CFI_RESTORE_STATE - bt $TIF_NEED_RESCHED,%edx - jnc sysret_signal + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz sysret_signal sti pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -252,7 +261,7 @@ sysret_signal: leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 call ptregscall_common -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi jmp sysret_check badsys: @@ -319,8 +328,8 @@ int_with_check: /* First do a reschedule test. */ /* edx: work, edi: workmask */ int_careful: - bt $TIF_NEED_RESCHED,%edx - jnc int_very_careful + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz int_very_careful sti pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -353,7 +362,7 @@ int_signal: movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 call do_notify_resume -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi int_restore_rest: RESTORE_REST cli @@ -554,8 +563,8 @@ bad_iret: /* edi: workmask, edx: work */ retint_careful: CFI_RESTORE_STATE - bt $TIF_NEED_RESCHED,%edx - jnc retint_signal + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz retint_signal sti pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -577,7 +586,7 @@ retint_signal: call do_notify_resume RESTORE_REST cli - movl $_TIF_NEED_RESCHED,%edi + movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi GET_THREAD_INFO(%rcx) jmp retint_check @@ -593,6 +602,7 @@ retint_kernel: bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ jnc retint_restore_args call preempt_schedule_irq + CALL_TRACE_IRQS_ON jmp exit_intr #endif CFI_ENDPROC @@ -1041,3 +1051,41 @@ ENTRY(call_softirq) CFI_ADJUST_CFA_OFFSET -8 ret CFI_ENDPROC + +#ifdef CONFIG_LATENCY_TRACE + +ENTRY(mcount) + cmpq $0, trace_enabled + jz out + + push %rbp + mov %rsp,%rbp + + push %r9 + push %r8 + push %rdi + push %rsi + push %rdx + push %rcx + push %rax + + mov 0x0(%rbp),%rax + mov 0x8(%rbp),%rdi + mov 0x8(%rax),%rsi + + call __trace + + pop %rax + pop %rcx + pop %rdx + pop %rsi + pop %rdi + pop %r8 + pop %r9 + + leaveq +out: + ret + +#endif + Index: linux.prev/arch/x86_64/kernel/genapic_flat.c =================================================================== --- linux.prev.orig/arch/x86_64/kernel/genapic_flat.c +++ linux.prev/arch/x86_64/kernel/genapic_flat.c @@ -50,8 +50,8 @@ static void flat_send_IPI_mask(cpumask_t unsigned long cfg; unsigned long flags; - local_save_flags(flags); - local_irq_disable(); + raw_local_save_flags(flags); + raw_local_irq_disable(); /* * Wait for idle. @@ -73,7 +73,7 @@ static void flat_send_IPI_mask(cpumask_t * Send the IPI. The write to APIC_ICR fires this off. 
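The mcount trampoline added to entry.S above feeds the latency tracer: with -pg, gcc emits a call to mcount() in every function prologue, and the assembly recovers the instrumented function's address (0x8(%rbp)) plus its caller's address one frame further up before handing both to __trace(). A rough C restatement, assuming frame pointers and a __trace(ip, parent_ip) signature (both assumptions, inferred from the register usage above):

/* Sketch only: what the mcount assembly above passes to __trace(). */
extern long trace_enabled;
extern void __trace(unsigned long ip, unsigned long parent_ip);

static void mcount_in_c(void)
{
	if (!trace_enabled)
		return;

	__trace((unsigned long)__builtin_return_address(0),
		(unsigned long)__builtin_return_address(1));
}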
*/ apic_write(APIC_ICR, cfg); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static void flat_send_IPI_allbutself(int vector) Index: linux.prev/arch/x86_64/kernel/i8259.c =================================================================== --- linux.prev.orig/arch/x86_64/kernel/i8259.c +++ linux.prev/arch/x86_64/kernel/i8259.c @@ -127,7 +127,7 @@ void (*interrupt[NR_IRQS])(void) = { * moves to arch independent land */ -DEFINE_SPINLOCK(i8259A_lock); +DEFINE_RAW_SPINLOCK(i8259A_lock); static void end_8259A_irq (unsigned int irq) { @@ -448,7 +448,7 @@ device_initcall(i8259A_init_sysfs); * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { no_action, SA_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL}; void __init init_ISA_irqs (void) { Index: linux.prev/arch/x86_64/kernel/init_task.c =================================================================== --- linux.prev.orig/arch/x86_64/kernel/init_task.c +++ linux.prev/arch/x86_64/kernel/init_task.c @@ -10,8 +10,8 @@ #include #include -static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; +static struct fs_struct init_fs = INIT_FS(init_fs); +static struct files_struct init_files = INIT_FILES(init_files); static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); Index: linux.prev/arch/x86_64/kernel/io_apic.c =================================================================== --- linux.prev.orig/arch/x86_64/kernel/io_apic.c +++ linux.prev/arch/x86_64/kernel/io_apic.c @@ -46,7 +46,7 @@ static int no_timer_check; int disable_timer_pin_1 __initdata; -static DEFINE_SPINLOCK(ioapic_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); /* * # of IRQ routing registers @@ -94,6 +94,9 @@ int vector_irq[NR_VECTORS] __read_mostly reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ reg ACTION; \ io_apic_modify(entry->apic, reg); \ + /* Force POST flush by reading: */ \ + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ + \ if (!entry->next) \ break; \ entry = irq_2_pin + entry->next; \ @@ -160,10 +163,8 @@ static void add_pin_to_irq(unsigned int static void name##_IO_APIC_irq (unsigned int irq) \ __DO_ACTION(R, ACTION, FINAL) -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) - /* mask = 1 */ -DO_ACTION( __unmask, 0, &= 0xfffeffff, ) - /* mask = 0 */ +DO_ACTION( __mask, 0, |= 0x00010000, ) /* mask = 1 */ +DO_ACTION( __unmask, 0, &= 0xfffeffff, ) /* mask = 0 */ static void mask_IO_APIC_irq (unsigned int irq) { @@ -1338,7 +1339,7 @@ static int __init timer_irq_works(void) { unsigned long t1 = jiffies; - local_irq_enable(); + raw_local_irq_enable(); /* Let ten ticks pass... */ mdelay((10 * 1000) / HZ); @@ -1431,12 +1432,50 @@ static unsigned int startup_level_ioapic return 0; /* don't check for pending */ } +/* + * In the preemptible case mask the IRQ first then handle it and ack it. + * + * (In the non-preemptible case we keep the IRQ unacked in the local APIC + * and dont need to do the masking, because the code executes atomically.) 
+ */ +#ifdef CONFIG_PREEMPT_HARDIRQS + +static void mask_and_ack_level_ioapic_irq(unsigned int irq) +{ + move_irq(irq); + mask_IO_APIC_irq(irq); + ack_APIC_irq(); +} + +static void end_level_ioapic_irq(unsigned int irq) +{ + if (!(irq_desc[irq].status & IRQ_INPROGRESS)) + unmask_IO_APIC_irq(irq); +} + +static void enable_level_ioapic_irq(unsigned int irq) +{ + unmask_IO_APIC_irq(irq); +} + +#else /* !CONFIG_PREEMPT_HARDIRQS */ + +static void mask_and_ack_level_ioapic_irq(unsigned int irq) +{ +} + static void end_level_ioapic_irq (unsigned int irq) { move_irq(irq); ack_APIC_irq(); } +static void enable_level_ioapic_irq(unsigned int irq) +{ + unmask_IO_APIC_irq(irq); +} +#endif /* !CONFIG_PREEMPT_HARDIRQS */ + #ifdef CONFIG_PCI_MSI static unsigned int startup_edge_ioapic_vector(unsigned int vector) { @@ -1460,6 +1499,13 @@ static unsigned int startup_level_ioapic return startup_level_ioapic_irq (irq); } +static void mask_and_ack_level_ioapic_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + mask_and_ack_level_ioapic_irq(irq); +} + static void end_level_ioapic_vector (unsigned int vector) { int irq = vector_to_irq(vector); @@ -1468,6 +1514,11 @@ static void end_level_ioapic_vector (uns end_level_ioapic_irq(irq); } +static void enable_level_ioapic_vector(unsigned int vector) +{ + enable_level_ioapic_irq(vector_to_irq(vector)); +} + static void mask_IO_APIC_vector (unsigned int vector) { int irq = vector_to_irq(vector); Index: linux.prev/arch/x86_64/kernel/irq.c =================================================================== --- linux.prev.orig/arch/x86_64/kernel/irq.c +++ linux.prev/arch/x86_64/kernel/irq.c @@ -129,9 +129,9 @@ void fixup_irqs(cpumask_t map) } /* That doesn't seem sufficient. Give it 1ms. */ - local_irq_enable(); + raw_local_irq_enable(); mdelay(1); - local_irq_disable(); + raw_local_irq_disable(); } #endif @@ -145,11 +145,11 @@ asmlinkage void do_softirq(void) if (in_interrupt()) return; - local_irq_save(flags); + raw_local_irq_save(flags); pending = local_softirq_pending(); /* Switch to interrupt stack */ if (pending) call_softirq(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } EXPORT_SYMBOL(do_softirq); Index: linux.prev/arch/x86_64/kernel/machine_kexec.c =================================================================== --- linux.prev.orig/arch/x86_64/kernel/machine_kexec.c +++ linux.prev/arch/x86_64/kernel/machine_kexec.c @@ -190,7 +190,7 @@ NORET_TYPE void machine_kexec(struct kim relocate_new_kernel_t rnk; /* Interrupts aren't acceptable while we reboot */ - local_irq_disable(); + raw_local_irq_disable(); /* Calculate the offsets */ page_list = image->head; Index: linux.prev/arch/x86_64/kernel/nmi.c =================================================================== --- linux.prev.orig/arch/x86_64/kernel/nmi.c +++ linux.prev/arch/x86_64/kernel/nmi.c @@ -43,7 +43,7 @@ * This is maintained separately from nmi_active because the NMI * watchdog may also be driven from the I/O APIC timer. */ -static DEFINE_SPINLOCK(lapic_nmi_owner_lock); +static DEFINE_RAW_SPINLOCK(lapic_nmi_owner_lock); static unsigned int lapic_nmi_owner; #define LAPIC_NMI_WATCHDOG (1<<0) #define LAPIC_NMI_RESERVED (1<<1) @@ -127,7 +127,7 @@ void __cpuinit nmi_watchdog_default(void static __init void nmi_cpu_busy(void *data) { volatile int *endflag = data; - local_irq_enable(); + raw_local_irq_enable(); /* Intentionally don't use cpu_relax here. 
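With CONFIG_PREEMPT_HARDIRQS the handler runs in a thread, so a level-triggered line has to be masked before the local APIC is acked and unmasked only after the thread finishes; otherwise the still-asserted line would retrigger immediately. The sequence the generic layer drives through the callbacks above, as an illustration (the generic-layer glue itself is not in this hunk):

/* Illustration of one level-triggered interrupt with threaded hardirqs. */
static void level_irq_sequence(unsigned int irq)
{
	mask_and_ack_level_ioapic_irq(irq);	/* mask the pin, ack the APIC */

	/*
	 * ... the handler thread runs the actions; IRQ_INPROGRESS is
	 * set, so end_level_ioapic_irq() will not unmask prematurely ...
	 */

	end_level_ioapic_irq(irq);		/* unmask once really done */
}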
This is to make sure that the performance counter really ticks, even if there is a simulator or similar that catches the @@ -156,7 +156,7 @@ int __init check_nmi_watchdog (void) for (cpu = 0; cpu < NR_CPUS; cpu++) counts[cpu] = cpu_pda[cpu].__nmi_count; - local_irq_enable(); + raw_local_irq_enable(); mdelay((10*1000)/nmi_hz); // wait 10 ticks for (cpu = 0; cpu < NR_CPUS; cpu++) { @@ -466,12 +466,42 @@ void touch_nmi_watchdog (void) touch_softlockup_watchdog(); } +int nmi_show_regs[NR_CPUS]; + +void nmi_show_all_regs(void) +{ + int i; + + if (nmi_watchdog == NMI_NONE) + return; + if (system_state != SYSTEM_RUNNING) { + printk("nmi_show_all_regs(): system state %d, not doing.\n", + system_state); + return; + } + + for_each_online_cpu(i) + nmi_show_regs[i] = 1; + for_each_online_cpu(i) + while (nmi_show_regs[i] == 1) + barrier(); +} + +static DEFINE_RAW_SPINLOCK(nmi_print_lock); + void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) { int sum; int touched = 0; + int cpu = safe_smp_processor_id(); sum = read_pda(apic_timer_irqs); + if (nmi_show_regs[cpu]) { + nmi_show_regs[cpu] = 0; + spin_lock(&nmi_print_lock); + show_regs(regs); + spin_unlock(&nmi_print_lock); + } if (__get_cpu_var(nmi_touch)) { __get_cpu_var(nmi_touch) = 0; touched = 1; @@ -483,6 +513,11 @@ void nmi_watchdog_tick (struct pt_regs * */ local_inc(&__get_cpu_var(alert_counter)); if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) { + int i; + + for (i = 0; i < NR_CPUS; i++) + nmi_show_regs[i] = 1; + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) { local_set(&__get_cpu_var(alert_counter), 0); Index: linux.prev/arch/x86_64/kernel/pmtimer.c =================================================================== --- linux.prev.orig/arch/x86_64/kernel/pmtimer.c +++ /dev/null @@ -1,101 +0,0 @@ -/* Ported over from i386 by AK, original copyright was: - * - * (C) Dominik Brodowski 2003 - * - * Driver to use the Power Management Timer (PMTMR) available in some - * southbridges as primary timing source for the Linux kernel. - * - * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, - * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. - * - * This file is licensed under the GPL v2. - * - * Dropped all the hardware bug workarounds for now. Hopefully they - * are not needed on 64bit chipsets. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* The I/O port the PMTMR resides at. - * The location is detected during setup_arch(), - * in arch/i386/kernel/acpi/boot.c */ -u32 pmtmr_ioport; - -/* value of the Power timer at last timer interrupt */ -static u32 offset_delay; -static u32 last_pmtmr_tick; - -#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ - -static inline u32 cyc2us(u32 cycles) -{ - /* The Power Management Timer ticks at 3.579545 ticks per microsecond. - * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] - * - * Even with HZ = 100, delta is at maximum 35796 ticks, so it can - * easily be multiplied with 286 (=0x11E) without having to fear - * u32 overflows. 
- */ - cycles *= 286; - return (cycles >> 10); -} - -int pmtimer_mark_offset(void) -{ - static int first_run = 1; - unsigned long tsc; - u32 lost; - - u32 tick = inl(pmtmr_ioport); - u32 delta; - - delta = cyc2us((tick - last_pmtmr_tick) & ACPI_PM_MASK); - - last_pmtmr_tick = tick; - monotonic_base += delta * NSEC_PER_USEC; - - delta += offset_delay; - - lost = delta / (USEC_PER_SEC / HZ); - offset_delay = delta % (USEC_PER_SEC / HZ); - - rdtscll(tsc); - vxtime.last_tsc = tsc - offset_delay * cpu_khz; - - /* don't calculate delay for first run, - or if we've got less then a tick */ - if (first_run || (lost < 1)) { - first_run = 0; - offset_delay = 0; - } - - return lost - 1; -} - -unsigned int do_gettimeoffset_pm(void) -{ - u32 now, offset, delta = 0; - - offset = last_pmtmr_tick; - now = inl(pmtmr_ioport); - delta = (now - offset) & ACPI_PM_MASK; - - return offset_delay + cyc2us(delta); -} - - -static int __init nopmtimer_setup(char *s) -{ - pmtmr_ioport = 0; - return 0; -} - -__setup("nopmtimer", nopmtimer_setup); Index: linux.prev/arch/x86_64/kernel/process.c =================================================================== --- linux.prev.orig/arch/x86_64/kernel/process.c +++ linux.prev/arch/x86_64/kernel/process.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -60,6 +61,12 @@ static atomic_t hlt_counter = ATOMIC_INI unsigned long boot_option_idle_override = 0; EXPORT_SYMBOL(boot_option_idle_override); +DEFINE_SPINLOCK(pm_idle_switch_lock); +EXPORT_SYMBOL_GPL(pm_idle_switch_lock); + +int pm_idle_locked = 0; +EXPORT_SYMBOL_GPL(pm_idle_locked); + /* * Powermanagement idle function, if any.. */ @@ -86,21 +93,21 @@ EXPORT_SYMBOL(enable_hlt); */ void default_idle(void) { - local_irq_enable(); + raw_local_irq_enable(); if (!atomic_read(&hlt_counter)) { clear_thread_flag(TIF_POLLING_NRFLAG); smp_mb__after_clear_bit(); - while (!need_resched()) { - local_irq_disable(); - if (!need_resched()) - safe_halt(); + while (!need_resched() && !need_resched_delayed()) { + raw_local_irq_disable(); + if (!need_resched() && !need_resched_delayed()) + raw_safe_halt(); else - local_irq_enable(); + raw_local_irq_enable(); } set_thread_flag(TIF_POLLING_NRFLAG); } else { - while (!need_resched()) + while (!need_resched() && !need_resched_delayed()) cpu_relax(); } } @@ -110,9 +117,9 @@ void default_idle(void) * to poll the ->need_resched flag instead of waiting for the * cross-CPU IPI to arrive. Use this option with caution. 
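The reworked idle loops above poll need_resched_delayed() next to need_resched(). That helper is defined elsewhere in this series; a plausible sketch of it, assuming it simply tests the TIF_NEED_RESCHED_DELAYED flag that the entry.S paths earlier were taught to check:

/* Assumed definition, for illustration only: */
static inline int need_resched_delayed(void)
{
	return unlikely(test_thread_flag(TIF_NEED_RESCHED_DELAYED));
}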
*/ -static void poll_idle (void) +void poll_idle (void) { - local_irq_enable(); + raw_local_irq_enable(); asm volatile( "2:" @@ -188,7 +195,9 @@ void cpu_idle (void) /* endless idle loop with no priority at all */ while (1) { - while (!need_resched()) { + BUG_ON(raw_irqs_disabled()); + + while (!need_resched() && !need_resched_delayed()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) @@ -200,12 +209,15 @@ void cpu_idle (void) idle = default_idle; if (cpu_is_offline(smp_processor_id())) play_dead(); + stop_critical_timing(); + propagate_preempt_locks_value(); idle(); } - - preempt_enable_no_resched(); - schedule(); + raw_local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + raw_local_irq_enable(); } } @@ -218,12 +230,12 @@ void cpu_idle (void) */ static void mwait_idle(void) { - local_irq_enable(); + raw_local_irq_enable(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); - if (need_resched()) + if (need_resched() || need_resched_delayed()) break; __mwait(0, 0); } @@ -314,7 +326,7 @@ void show_regs(struct pt_regs *regs) { printk("CPU %d:", smp_processor_id()); __show_regs(regs); - show_trace(®s->rsp); + show_trace(current, ®s->rsp); } /* @@ -333,13 +345,14 @@ void exit_thread(void) kprobe_flush_task(me); if (me->thread.io_bitmap_ptr) { - struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); + struct tss_struct *tss; kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; /* * Careful, clear this in the TSS too: */ + tss = &per_cpu(init_tss, get_cpu()); memset(tss->io_bitmap, 0xff, t->io_bitmap_max); t->io_bitmap_max = 0; put_cpu(); Index: linux.prev/arch/x86_64/kernel/reboot.c =================================================================== --- linux.prev.orig/arch/x86_64/kernel/reboot.c +++ linux.prev/arch/x86_64/kernel/reboot.c @@ -99,7 +99,7 @@ void machine_shutdown(void) smp_send_stop(); #endif - local_irq_save(flags); + raw_local_irq_save(flags); #ifndef CONFIG_SMP disable_local_APIC(); @@ -107,7 +107,7 @@ void machine_shutdown(void) disable_IO_APIC(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } void machine_emergency_restart(void) Index: linux.prev/arch/x86_64/kernel/setup.c =================================================================== --- linux.prev.orig/arch/x86_64/kernel/setup.c +++ linux.prev/arch/x86_64/kernel/setup.c @@ -993,6 +993,7 @@ static void __cpuinit init_intel(struct c->x86_cache_alignment = c->x86_clflush_size * 2; if (c->x86 >= 15) set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); + set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); c->x86_max_cores = intel_num_cpu_cores(c); srat_detect_node(); Index: linux.prev/arch/x86_64/kernel/signal.c =================================================================== --- linux.prev.orig/arch/x86_64/kernel/signal.c +++ linux.prev/arch/x86_64/kernel/signal.c @@ -434,6 +434,13 @@ int do_signal(struct pt_regs *regs, sigs siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + raw_local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which * is why we may in certain cases get here from Index: linux.prev/arch/x86_64/kernel/smp.c =================================================================== --- linux.prev.orig/arch/x86_64/kernel/smp.c +++ linux.prev/arch/x86_64/kernel/smp.c @@ -297,10 +297,20 @@ void smp_send_reschedule(int cpu) } /* + * this 
function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + send_IPI_allbutself(RESCHEDULE_VECTOR); +} + +/* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. */ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); struct call_data_struct { void (*func) (void *info); @@ -455,9 +465,9 @@ void smp_stop_cpu(void) * Remove this CPU: */ cpu_clear(smp_processor_id(), cpu_online_map); - local_irq_save(flags); + raw_local_irq_save(flags); disable_local_APIC(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static void smp_really_stop_cpu(void *dummy) @@ -481,9 +491,9 @@ void smp_send_stop(void) if (!nolock) spin_unlock(&call_lock); - local_irq_disable(); + raw_local_irq_disable(); disable_local_APIC(); - local_irq_enable(); + raw_local_irq_enable(); } /* Index: linux.prev/arch/x86_64/kernel/smpboot.c =================================================================== --- linux.prev.orig/arch/x86_64/kernel/smpboot.c +++ linux.prev/arch/x86_64/kernel/smpboot.c @@ -200,7 +200,7 @@ static void __cpuinit smp_store_cpu_info latency and low latency is the primary objective here. -AK */ #define no_cpu_relax() barrier() -static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock); +static __cpuinitdata __DEFINE_RAW_SPINLOCK(tsc_sync_lock); static volatile __cpuinitdata unsigned long go[SLAVE + 1]; static int notscsync __cpuinitdata; @@ -216,7 +216,7 @@ static __cpuinit void sync_master(void * go[MASTER] = 0; - local_irq_save(flags); + raw_local_irq_save(flags); { for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) { while (!go[MASTER]) @@ -225,7 +225,7 @@ static __cpuinit void sync_master(void * rdtscll(go[SLAVE]); } } - local_irq_restore(flags); + raw_local_irq_restore(flags); } /* @@ -335,7 +335,13 @@ static __cpuinit void sync_tsc(unsigned static void __cpuinit tsc_sync_wait(void) { - if (notscsync || !cpu_has_tsc) + /* + * When the CPU has synchronized TSCs assume the BIOS + * or the hardware already synced. Otherwise we could + * mess up a possible perfect synchronization with a + * not-quite-perfect algorithm. + */ + if (notscsync || !cpu_has_tsc || !unsynchronized_tsc()) return; sync_tsc(0); } @@ -1080,7 +1086,7 @@ int __cpuinit __cpu_up(unsigned int cpu) int err; int apicid = cpu_present_to_apicid(cpu); - WARN_ON(irqs_disabled()); + WARN_ON(raw_irqs_disabled()); Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu); Index: linux.prev/arch/x86_64/kernel/switch2poll.c =================================================================== --- /dev/null +++ linux.prev/arch/x86_64/kernel/switch2poll.c @@ -0,0 +1,112 @@ +#include +#include +#include +#include +#include + +extern void poll_idle (void); + +#define KERNEL_ATTR_RW(_name) \ +static struct subsys_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static struct idlep_kobject +{ + struct kobject kobj; + int is_poll; + void (*idle)(void); +} idle_kobj; + +static ssize_t idle_poll_show(struct subsystem *subsys, char *page) +{ + return sprintf(page, "%s\n", (idle_kobj.is_poll ? "on" : "off")); +} + +static ssize_t idle_poll_store(struct subsystem *subsys, + const char *buf, size_t len) +{ + unsigned long flags; + + spin_lock_irqsave(&pm_idle_switch_lock, flags); + + /* + * If power management is handling the idle function, + * then leave it be. 
+ */ + if (pm_idle_locked) { + len = -EBUSY; + goto out; + } + + if (strncmp(buf,"1",1)==0 || + (len >=2 && strncmp(buf,"on",2)==0)) { + if (idle_kobj.is_poll != 1) { + idle_kobj.is_poll = 1; + boot_option_idle_override = 1; + idle_kobj.idle = pm_idle; + pm_idle = poll_idle; + } + } else if (strncmp(buf,"0",1)==0 || + (len >= 3 && strncmp(buf,"off",3)==0)) { + if (idle_kobj.is_poll != 0) { + boot_option_idle_override = 0; + idle_kobj.is_poll = 0; + pm_idle = idle_kobj.idle; + } + } + +out: + spin_unlock_irqrestore(&pm_idle_switch_lock, flags); + + return len; +} + + +KERNEL_ATTR_RW(idle_poll); + +static struct attribute * idle_attrs[] = { + &idle_poll_attr.attr, + NULL +}; + +static struct attribute_group idle_attr_group = { + .attrs = idle_attrs, +}; + +static int __init idle_poll_set_init(void) +{ + int err; + + /* + * If the default is alread poll_idle then + * don't even bother with this. + */ + if (pm_idle == poll_idle) + return 0; + + memset(&idle_kobj, 0, sizeof(idle_kobj)); + + idle_kobj.is_poll = 0; + idle_kobj.idle = pm_idle; + + err = kobject_set_name(&idle_kobj.kobj, "%s", "idle"); + if (err) + goto out; + + idle_kobj.kobj.parent = &kernel_subsys.kset.kobj; + err = kobject_register(&idle_kobj.kobj); + if (err) + goto out; + + err = sysfs_create_group(&idle_kobj.kobj, + &idle_attr_group); + if (err) + goto out; + + return 0; +out: + printk(KERN_INFO "Problem setting up sysfs idle_poll\n"); + return 0; +} + +late_initcall(idle_poll_set_init); Index: linux.prev/arch/x86_64/kernel/time.c =================================================================== --- linux.prev.orig/arch/x86_64/kernel/time.c +++ linux.prev/arch/x86_64/kernel/time.c @@ -26,6 +26,7 @@ #include #include #include + #ifdef CONFIG_ACPI #include /* for PM timer frequency */ #endif @@ -38,25 +39,26 @@ #include #include #include +#include +#include #ifdef CONFIG_X86_LOCAL_APIC #include #endif -#ifdef CONFIG_CPU_FREQ -static void cpufreq_delayed_get(void); -#endif extern void i8254_timer_resume(void); extern int using_apic_timer; -DEFINE_SPINLOCK(rtc_lock); -DEFINE_SPINLOCK(i8253_lock); +DEFINE_RAW_SPINLOCK(rtc_lock); +DEFINE_RAW_SPINLOCK(i8253_lock); static int nohpet __initdata = 0; static int notsc __initdata = 0; #undef HPET_HACK_ENABLE_DANGEROUS -unsigned int cpu_khz; /* TSC clocks / usec, not used here */ +unsigned int cpu_khz; /* CPU clocks / usec, not used here */ +unsigned int tsc_khz; /* TSC clocks / usec, not used here */ +unsigned long hpet_address; static unsigned long hpet_period; /* fsecs / HPET clock */ unsigned long hpet_tick; /* HPET clocks / interrupt */ static int hpet_use_timer; /* Use counter of hpet for time keeping, otherwise PIT */ @@ -79,107 +81,6 @@ static inline void rdtscll_sync(unsigned rdtscll(*tsc); } -/* - * do_gettimeoffset() returns microseconds since last timer interrupt was - * triggered by hardware. A memory read of HPET is slower than a register read - * of TSC, but much more reliable. It's also synchronized to the timer - * interrupt. Note that do_gettimeoffset() may return more than hpet_tick, if a - * timer interrupt has happened already, but vxtime.trigger wasn't updated yet. - * This is not a problem, because jiffies hasn't updated either. They are bound - * together by xtime_lock. 
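switch2poll.c registers a kobject named "idle" with kernel_subsys as its parent, so the control file should surface as /sys/kernel/idle/idle_poll and accept "on"/"1" and "off"/"0" (path inferred from the registration above, not spelled out in the patch). A small userspace sketch:

#include <stdio.h>

/* Toggle polling idle at runtime; returns 0 on success. */
int set_idle_poll(int on)
{
	FILE *f = fopen("/sys/kernel/idle/idle_poll", "w");

	if (!f)
		return -1;
	fputs(on ? "on" : "off", f);
	return fclose(f);
}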
- */ - -static inline unsigned int do_gettimeoffset_tsc(void) -{ - unsigned long t; - unsigned long x; - rdtscll_sync(&t); - if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */ - x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32; - return x; -} - -static inline unsigned int do_gettimeoffset_hpet(void) -{ - /* cap counter read to one tick to avoid inconsistencies */ - unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last; - return (min(counter,hpet_tick) * vxtime.quot) >> 32; -} - -unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc; - -/* - * This version of gettimeofday() has microsecond resolution and better than - * microsecond precision, as we're using at least a 10 MHz (usually 14.31818 - * MHz) HPET timer. - */ - -void do_gettimeofday(struct timeval *tv) -{ - unsigned long seq, t; - unsigned int sec, usec; - - do { - seq = read_seqbegin(&xtime_lock); - - sec = xtime.tv_sec; - usec = xtime.tv_nsec / 1000; - - /* i386 does some correction here to keep the clock - monotonous even when ntpd is fixing drift. - But they didn't work for me, there is a non monotonic - clock anyways with ntp. - I dropped all corrections now until a real solution can - be found. Note when you fix it here you need to do the same - in arch/x86_64/kernel/vsyscall.c and export all needed - variables in vmlinux.lds. -AK */ - - t = (jiffies - wall_jiffies) * (1000000L / HZ) + - do_gettimeoffset(); - usec += t; - - } while (read_seqretry(&xtime_lock, seq)); - - tv->tv_sec = sec + usec / 1000000; - tv->tv_usec = usec % 1000000; -} - -EXPORT_SYMBOL(do_gettimeofday); - -/* - * settimeofday() first undoes the correction that gettimeofday would do - * on the time, and then saves it. This is ugly, but has been like this for - * ages already. - */ - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - - nsec -= do_gettimeoffset() * 1000 + - (jiffies - wall_jiffies) * (NSEC_PER_SEC/HZ); - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); @@ -259,8 +160,8 @@ static void set_rtc_mmss(unsigned long n #endif { - BIN_TO_BCD(real_seconds); - BIN_TO_BCD(real_minutes); + BIN_TO_BCD(real_seconds); + BIN_TO_BCD(real_minutes); CMOS_WRITE(real_seconds, RTC_SECONDS); CMOS_WRITE(real_minutes, RTC_MINUTES); } @@ -279,90 +180,8 @@ static void set_rtc_mmss(unsigned long n spin_unlock(&rtc_lock); } - -/* monotonic_clock(): returns # of nanoseconds passed since time_init() - * Note: This function is required to return accurate - * time even in the absence of multiple timer ticks. 
- */ -unsigned long long monotonic_clock(void) -{ - unsigned long seq; - u32 last_offset, this_offset, offset; - unsigned long long base; - - if (vxtime.mode == VXTIME_HPET) { - do { - seq = read_seqbegin(&xtime_lock); - - last_offset = vxtime.last; - base = monotonic_base; - this_offset = hpet_readl(HPET_COUNTER); - - } while (read_seqretry(&xtime_lock, seq)); - offset = (this_offset - last_offset); - offset *=(NSEC_PER_SEC/HZ)/hpet_tick; - return base + offset; - }else{ - do { - seq = read_seqbegin(&xtime_lock); - - last_offset = vxtime.last_tsc; - base = monotonic_base; - } while (read_seqretry(&xtime_lock, seq)); - sync_core(); - rdtscll(this_offset); - offset = (this_offset - last_offset)*1000/cpu_khz; - return base + offset; - } - - -} -EXPORT_SYMBOL(monotonic_clock); - -static noinline void handle_lost_ticks(int lost, struct pt_regs *regs) -{ - static long lost_count; - static int warned; - - if (report_lost_ticks) { - printk(KERN_WARNING "time.c: Lost %d timer " - "tick(s)! ", lost); - print_symbol("rip %s)\n", regs->rip); - } - - if (lost_count == 1000 && !warned) { - printk(KERN_WARNING - "warning: many lost ticks.\n" - KERN_WARNING "Your time source seems to be instable or " - "some driver is hogging interupts\n"); - print_symbol("rip %s\n", regs->rip); - if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) { - printk(KERN_WARNING "Falling back to HPET\n"); - vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; - vxtime.mode = VXTIME_HPET; - do_gettimeoffset = do_gettimeoffset_hpet; - } - /* else should fall back to PIT, but code missing. */ - warned = 1; - } else - lost_count++; - -#ifdef CONFIG_CPU_FREQ - /* In some cases the CPU can change frequency without us noticing - (like going into thermal throttle) - Give cpufreq a change to catch up. */ - if ((lost_count+1) % 25 == 0) { - cpufreq_delayed_get(); - } -#endif -} - static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { - static unsigned long rtc_update = 0; - unsigned long tsc; - int delay, offset = 0, lost = 0; - /* * Here we are in the timer irq handler. We have irqs locally disabled (so we * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running @@ -372,67 +191,6 @@ static irqreturn_t timer_interrupt(int i write_seqlock(&xtime_lock); - if (vxtime.hpet_address) - offset = hpet_readl(HPET_COUNTER); - - if (hpet_use_timer) { - /* if we're using the hpet timer functionality, - * we can more accurately know the counter value - * when the timer interrupt occured. 
- */ - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - delay = hpet_readl(HPET_COUNTER) - offset; - } else { - spin_lock(&i8253_lock); - outb_p(0x00, 0x43); - delay = inb_p(0x40); - delay |= inb(0x40) << 8; - spin_unlock(&i8253_lock); - delay = LATCH - 1 - delay; - } - - rdtscll_sync(&tsc); - - if (vxtime.mode == VXTIME_HPET) { - if (offset - vxtime.last > hpet_tick) { - lost = (offset - vxtime.last) / hpet_tick - 1; - } - - monotonic_base += - (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick; - - vxtime.last = offset; -#ifdef CONFIG_X86_PM_TIMER - } else if (vxtime.mode == VXTIME_PMTMR) { - lost = pmtimer_mark_offset(); -#endif - } else { - offset = (((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ); - - if (offset < 0) - offset = 0; - - if (offset > (USEC_PER_SEC / HZ)) { - lost = offset / (USEC_PER_SEC / HZ); - offset %= (USEC_PER_SEC / HZ); - } - - monotonic_base += (tsc - vxtime.last_tsc)*1000000/cpu_khz ; - - vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot; - - if ((((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> 32) < offset) - vxtime.last_tsc = tsc - - (((long) offset << 32) / vxtime.tsc_quot) - 1; - } - - if (lost > 0) { - handle_lost_ticks(lost, regs); - jiffies += lost; - } - /* * Do the timer stuff. */ @@ -455,22 +213,13 @@ static irqreturn_t timer_interrupt(int i smp_local_timer_interrupt(regs); #endif -/* - * If we have an externally synchronized Linux clock, then update CMOS clock - * accordingly every ~11 minutes. set_rtc_mmss() will be called in the jiffy - * closest to exactly 500 ms before the next second. If the update fails, we - * don't care, as it'll be updated on the next turn, and the problem (time way - * off) isn't likely to go away much sooner anyway. - */ - - if (ntp_synced() && xtime.tv_sec > rtc_update && - abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) { - set_rtc_mmss(xtime.tv_sec); - rtc_update = xtime.tv_sec + 660; - } - write_sequnlock(&xtime_lock); +#ifdef CONFIG_X86_LOCAL_APIC + if (using_apic_timer) + smp_send_timer_broadcast_ipi(); +#endif + return IRQ_HANDLED; } @@ -509,10 +258,23 @@ unsigned long long sched_clock(void) return cycles_2_ns(a); } +static int tsc_unstable; + +static inline int check_tsc_unstable(void) +{ + return tsc_unstable; +} + +void mark_tsc_unstable(void) +{ + tsc_unstable = 1; +} +EXPORT_SYMBOL_GPL(mark_tsc_unstable); + unsigned long get_cmos_time(void) { - unsigned int timeout, year, mon, day, hour, min, sec; - unsigned char last, this; + unsigned int timeout = 1000000, year, mon, day, hour, min, sec; + unsigned char uip = 0, this = 0; unsigned long flags; /* @@ -525,50 +287,70 @@ unsigned long get_cmos_time(void) spin_lock_irqsave(&rtc_lock, flags); - timeout = 1000000; - last = this = 0; - - while (timeout && last && !this) { - last = this; + while (timeout && (!uip || this)) { + uip |= this; this = CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP; timeout--; } -/* - * Here we are safe to assume the registers won't change for a whole second, so - * we just go ahead and read them. - */ - - sec = CMOS_READ(RTC_SECONDS); - min = CMOS_READ(RTC_MINUTES); - hour = CMOS_READ(RTC_HOURS); - day = CMOS_READ(RTC_DAY_OF_MONTH); - mon = CMOS_READ(RTC_MONTH); - year = CMOS_READ(RTC_YEAR); + /* + * Here we are safe to assume the registers won't change for a whole + * second, so we just go ahead and read them. 
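The register reads just below come back in BCD, which the BCD_TO_BIN() calls then undo. As a worked example of that conversion: the RTC reports 59 seconds as 0x59, and (0x59 & 0x0f) + (0x59 >> 4) * 10 = 9 + 50 = 59.

/* The arithmetic behind BCD_TO_BIN(), e.g. 0x59 -> 59: */
static inline unsigned int bcd_to_bin_example(unsigned char bcd)
{
	return (bcd & 0x0f) + (bcd >> 4) * 10;
}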
+ */ + sec = CMOS_READ(RTC_SECONDS); + min = CMOS_READ(RTC_MINUTES); + hour = CMOS_READ(RTC_HOURS); + day = CMOS_READ(RTC_DAY_OF_MONTH); + mon = CMOS_READ(RTC_MONTH); + year = CMOS_READ(RTC_YEAR); spin_unlock_irqrestore(&rtc_lock, flags); -/* - * We know that x86-64 always uses BCD format, no need to check the config - * register. - */ - - BCD_TO_BIN(sec); - BCD_TO_BIN(min); - BCD_TO_BIN(hour); - BCD_TO_BIN(day); - BCD_TO_BIN(mon); - BCD_TO_BIN(year); + /* + * We know that x86-64 always uses BCD format, no need to check the + * config register. + */ + + BCD_TO_BIN(sec); + BCD_TO_BIN(min); + BCD_TO_BIN(hour); + BCD_TO_BIN(day); + BCD_TO_BIN(mon); + BCD_TO_BIN(year); -/* - * x86-64 systems only exists since 2002. - * This will work up to Dec 31, 2100 - */ + /* + * x86-64 systems only exists since 2002. + * This will work up to Dec 31, 2100 + */ year += 2000; return mktime(year, mon, day, hour, min, sec); } +/* arch specific timeofday hooks: */ +u64 read_persistent_clock(void) +{ + return (u64)get_cmos_time() * NSEC_PER_SEC; +} + +void sync_persistent_clock(struct timespec ts) +{ + static unsigned long rtc_update = 0; + /* + * If we have an externally synchronized Linux clock, then update + * CMOS clock accordingly every ~11 minutes. set_rtc_mmss() will + * be called in the jiffy closest to exactly 500 ms before the + * next second. If the update fails, we don't care, as it'll be + * updated on the next turn, and the problem (time way off) isn't + * likely to go away much sooner anyway. + */ + if (ts.tv_sec > rtc_update && + abs(ts.tv_nsec - 500000000) <= tick_nsec / 2) { + set_rtc_mmss(xtime.tv_sec); + rtc_update = xtime.tv_sec + 660; + } +} + #ifdef CONFIG_CPU_FREQ /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency @@ -596,23 +378,6 @@ static void handle_cpufreq_delayed_get(v cpufreq_delayed_issched = 0; } -/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries - * to verify the CPU frequency the timing core thinks the CPU is running - * at is still correct. - */ -static void cpufreq_delayed_get(void) -{ - static int warned; - if (cpufreq_init && !cpufreq_delayed_issched) { - cpufreq_delayed_issched = 1; - if (!warned) { - warned = 1; - printk(KERN_DEBUG "Losing some ticks... 
checking if CPU frequency changed.\n"); - } - schedule_work(&cpufreq_delayed_get_work); - } -} - static unsigned int ref_freq = 0; static unsigned long loops_per_jiffy_ref = 0; @@ -647,8 +412,11 @@ static int time_cpufreq_notifier(struct cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { vxtime.tsc_quot = (1000L << 32) / cpu_khz; + tsc_khz = cpu_khz; + } + } set_cyc2ns_scale(cpu_khz_ref); @@ -686,18 +454,17 @@ static unsigned int __init hpet_calibrat int tsc_now, hpet_now; unsigned long flags; - local_irq_save(flags); - local_irq_disable(); + raw_local_irq_save(flags); + raw_local_irq_disable(); hpet_start = hpet_readl(HPET_COUNTER); rdtscl(tsc_start); do { - local_irq_disable(); + raw_local_irq_disable(); hpet_now = hpet_readl(HPET_COUNTER); - sync_core(); - rdtscl(tsc_now); - local_irq_restore(flags); + tsc_now = get_cycles_sync(); + raw_local_irq_restore(flags); } while ((tsc_now - tsc_start) < TICK_COUNT && (hpet_now - hpet_start) < TICK_COUNT); @@ -726,11 +493,9 @@ static unsigned int __init pit_calibrate outb(0xb0, 0x43); outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42); outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42); - rdtscll(start); - sync_core(); + start = get_cycles_sync(); while ((inb(0x61) & 0x20) == 0); - sync_core(); - rdtscll(end); + end = get_cycles_sync(); spin_unlock_irqrestore(&i8253_lock, flags); @@ -744,7 +509,7 @@ static __init int late_hpet_init(void) unsigned int ntimer; if (!vxtime.hpet_address) - return -1; + return -1; memset(&hd, 0, sizeof (hd)); @@ -875,11 +640,9 @@ int __init time_setup(char *str) } static struct irqaction irq0 = { - timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL + timer_interrupt, SA_INTERRUPT | SA_NODELAY, CPU_MASK_NONE, "timer", NULL, NULL }; -extern void __init config_acpi_tables(void); - void __init time_init(void) { char *timename; @@ -914,18 +677,12 @@ void __init time_init(void) if (hpet_use_timer) { cpu_khz = hpet_calibrate_tsc(); timename = "HPET"; -#ifdef CONFIG_X86_PM_TIMER - } else if (pmtmr_ioport && !vxtime.hpet_address) { - vxtime_hz = PM_TIMER_FREQUENCY; - timename = "PM"; - pit_init(); - cpu_khz = pit_calibrate_tsc(); -#endif } else { pit_init(); cpu_khz = pit_calibrate_tsc(); timename = "PIT"; } + tsc_khz = cpu_khz; printk(KERN_INFO "time.c: Using %ld.%06ld MHz %s timer.\n", vxtime_hz / 1000000, vxtime_hz % 1000000, timename); @@ -934,7 +691,7 @@ void __init time_init(void) vxtime.mode = VXTIME_TSC; vxtime.quot = (1000000L << 32) / vxtime_hz; vxtime.tsc_quot = (1000L << 32) / cpu_khz; - rdtscll_sync(&vxtime.last_tsc); + vxtime.last_tsc = get_cycles_sync(); setup_irq(0, &irq0); set_cyc2ns_scale(cpu_khz); @@ -948,7 +705,7 @@ void __init time_init(void) * Make an educated guess if the TSC is trustworthy and synchronized * over all CPUs. */ -static __init int unsynchronized_tsc(void) +__init int unsynchronized_tsc(void) { #ifdef CONFIG_SMP if (oem_force_hpet_timer()) @@ -959,7 +716,7 @@ static __init int unsynchronized_tsc(voi return 0; #endif /* Assume multi socket systems are not synchronized */ - return num_online_cpus() > 1; + return num_present_cpus() > 1; } /* @@ -967,31 +724,8 @@ static __init int unsynchronized_tsc(voi */ void __init time_init_gtod(void) { - char *timetype; - if (unsynchronized_tsc()) - notsc = 1; - if (vxtime.hpet_address && notsc) { - timetype = hpet_use_timer ? 
"HPET" : "PIT/HPET"; - vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; - vxtime.mode = VXTIME_HPET; - do_gettimeoffset = do_gettimeoffset_hpet; -#ifdef CONFIG_X86_PM_TIMER - /* Using PM for gettimeofday is quite slow, but we have no other - choice because the TSC is too unreliable on some systems. */ - } else if (pmtmr_ioport && !vxtime.hpet_address && notsc) { - timetype = "PM"; - do_gettimeoffset = do_gettimeoffset_pm; - vxtime.mode = VXTIME_PMTMR; - sysctl_vsyscall = 0; - printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n"); -#endif - } else { - timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC"; - vxtime.mode = VXTIME_TSC; - } - - printk(KERN_INFO "time.c: Using %s based timekeeping.\n", timetype); + mark_tsc_unstable(); } __setup("report_lost_ticks", time_setup); @@ -999,6 +733,10 @@ __setup("report_lost_ticks", time_setup) static long clock_cmos_diff; static unsigned long sleep_start; +/* + * sysfs support for the timer. + */ + static int timer_suspend(struct sys_device *dev, pm_message_t state) { /* @@ -1014,7 +752,6 @@ static int timer_suspend(struct sys_devi static int timer_resume(struct sys_device *dev) { - unsigned long flags; unsigned long sec; unsigned long ctime = get_cmos_time(); unsigned long sleep_length = (ctime - sleep_start) * HZ; @@ -1025,10 +762,6 @@ static int timer_resume(struct sys_devic i8254_timer_resume(); sec = ctime + clock_cmos_diff; - write_seqlock_irqsave(&xtime_lock,flags); - xtime.tv_sec = sec; - xtime.tv_nsec = 0; - write_sequnlock_irqrestore(&xtime_lock,flags); jiffies += sleep_length; wall_jiffies += sleep_length; touch_softlockup_watchdog(); @@ -1041,7 +774,6 @@ static struct sysdev_class timer_sysclas set_kset_name("timer"), }; - /* XXX this driverfs stuff should probably go elsewhere later -john */ static struct sys_device device_timer = { .id = 0, @@ -1075,8 +807,6 @@ device_initcall(time_init_device); */ #include -extern irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs); - #define DEFAULT_RTC_INT_FREQ 64 #define RTC_NUM_INTS 1 @@ -1124,12 +854,12 @@ int hpet_rtc_timer_init(void) else hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - local_irq_save(flags); + raw_local_irq_save(flags); cnt = hpet_readl(HPET_COUNTER); cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); hpet_writel(cnt, HPET_T1_CMP); hpet_t1_cmp = cnt; - local_irq_restore(flags); + raw_local_irq_restore(flags); cfg = hpet_readl(HPET_T1_CFG); cfg &= ~HPET_TN_PERIODIC; @@ -1285,8 +1015,6 @@ irqreturn_t hpet_rtc_interrupt(int irq, } #endif - - static int __init nohpet_setup(char *s) { nohpet = 1; @@ -1305,3 +1033,141 @@ static int __init notsc_setup(char *s) __setup("notsc", notsc_setup); +/* clock source code: */ + +static unsigned long current_tsc_khz = 0; + +static int tsc_update_callback(void); + +static cycle_t read_tsc(void) +{ + cycle_t ret; + + rdtscll(ret); + + return ret; +} + +static cycle_t __vsyscall_fn vread_tsc(void* unused) +{ + cycle_t ret; + + rdtscll(ret); + + return ret; +} + +static struct clocksource clocksource_tsc = { + .name = "tsc", + .rating = 300, + .read = read_tsc, + .vread = vread_tsc, + .mask = (cycle_t)-1, + .mult = 0, /* to be set */ + .shift = 22, + .update_callback = tsc_update_callback, + .is_continuous = 1, +}; + +static int tsc_update_callback(void) +{ + int change = 0; + + /* check to see if we should switch to the safe clocksource: */ + if (clocksource_tsc.rating != 50 && check_tsc_unstable()) { + clocksource_tsc.rating = 50; + reselect_clocksource(); + change = 1; + } + + /* only update if tsc_khz has changed: */ + if 
(current_tsc_khz != tsc_khz){ + current_tsc_khz = tsc_khz; + clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, + clocksource_tsc.shift); + change = 1; + } + return change; +} + +static int __init init_tsc_clocksource(void) +{ + if (!notsc && tsc_khz) { + current_tsc_khz = tsc_khz; + clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, + clocksource_tsc.shift); + register_clocksource(&clocksource_tsc); + } + return 0; +} + +module_init(init_tsc_clocksource); + + +#define HPET_MASK 0xFFFFFFFF +#define HPET_SHIFT 22 + +/* FSEC = 10^-15 NSEC = 10^-9 */ +#define FSEC_PER_NSEC 1000000 + +static void *hpet_ptr; + +static cycle_t read_hpet(void) +{ + return (cycle_t)readl(hpet_ptr); +} + +static cycle_t __vsyscall_fn vread_hpet(void* ptr) +{ + return (cycle_t)readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0); +} + +struct clocksource clocksource_hpet = { + .name = "hpet", + .rating = 250, + .read = read_hpet, + .vread = vread_hpet, + .mask = (cycle_t)HPET_MASK, + .mult = 0, /* set below */ + .shift = HPET_SHIFT, + .is_continuous = 1, +}; + +static int __init init_hpet_clocksource(void) +{ + unsigned long hpet_period; + void __iomem *hpet_base; + u64 tmp; + + if (!hpet_address) + return -ENODEV; + + /* calculate the hpet address: */ + hpet_base = + (void __iomem*)ioremap_nocache(hpet_address, HPET_MMAP_SIZE); + hpet_ptr = hpet_base + HPET_COUNTER; + + /* calculate the frequency: */ + hpet_period = readl(hpet_base + HPET_PERIOD); + + /* + * hpet period is in femto seconds per cycle + * so we need to convert this to ns/cyc units + * aproximated by mult/2^shift + * + * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift + * fsec/cyc * 1ns/1000000fsec * 2^shift = mult + * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult + * (fsec/cyc << shift)/1000000 = mult + * (hpet_period << shift)/FSEC_PER_NSEC = mult + */ + tmp = (u64)hpet_period << HPET_SHIFT; + do_div(tmp, FSEC_PER_NSEC); + clocksource_hpet.mult = (u32)tmp; + + register_clocksource(&clocksource_hpet); + + return 0; +} + +module_init(init_hpet_clocksource); Index: linux.prev/arch/x86_64/kernel/traps.c =================================================================== --- linux.prev.orig/arch/x86_64/kernel/traps.c +++ linux.prev/arch/x86_64/kernel/traps.c @@ -88,7 +88,7 @@ int register_die_notifier(struct notifie static inline void conditional_sti(struct pt_regs *regs) { if (regs->eflags & X86_EFLAGS_IF) - local_irq_enable(); + raw_local_irq_enable(); } static int kstack_depth_to_print = 10; @@ -154,7 +154,7 @@ static unsigned long *in_exception_stack * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -void show_trace(unsigned long *stack) +void show_trace(struct task_struct *task, unsigned long *stack) { unsigned long addr; const unsigned cpu = safe_smp_processor_id(); @@ -219,6 +219,7 @@ void show_trace(unsigned long *stack) HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0); #undef HANDLE_STACK printk("\n"); + print_traces(task); } void show_stack(struct task_struct *tsk, unsigned long * rsp) @@ -255,7 +256,7 @@ void show_stack(struct task_struct *tsk, printk("%016lx ", *stack++); touch_nmi_watchdog(); } - show_trace((unsigned long *)rsp); + show_trace(tsk, (unsigned long *)rsp); } /* @@ -264,7 +265,7 @@ void show_stack(struct task_struct *tsk, void dump_stack(void) { unsigned long dummy; - show_trace(&dummy); + show_trace(current, &dummy); } EXPORT_SYMBOL(dump_stack); @@ -337,7 +338,7 @@ void out_of_line_bug(void) } #endif -static DEFINE_SPINLOCK(die_lock); +static 
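To make the mult/shift derivation in init_hpet_clocksource() above concrete: a 14.31818 MHz HPET reports a period of about 69841279 fs per cycle, so mult = (69841279 << 22) / 1000000 = 292935555, and converting one cycle back gives (1 * 292935555) >> 22, roughly 69.8 ns, the period we started from. A standalone check with those illustrative numbers (the kernel computes them at boot):

#include <stdio.h>

int main(void)
{
	unsigned long long hpet_period = 69841279ULL; /* fs/cycle at 14.31818 MHz */
	unsigned long long mult = (hpet_period << 22) / 1000000ULL;

	/* one cycle should convert back to ~69.8 ns (prints 69, truncated) */
	printf("mult=%llu, 1 cycle -> %llu ns\n", mult, mult >> 22);
	return 0;
}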
DEFINE_RAW_SPINLOCK(die_lock); static int die_owner = -1; unsigned long oops_begin(void) @@ -346,7 +347,7 @@ unsigned long oops_begin(void) unsigned long flags; /* racy, but better than risking deadlock. */ - local_irq_save(flags); + raw_local_irq_save(flags); if (!spin_trylock(&die_lock)) { if (cpu == die_owner) /* nested oops. should stop eventually */; Index: linux.prev/arch/x86_64/kernel/vmlinux.lds.S =================================================================== --- linux.prev.orig/arch/x86_64/kernel/vmlinux.lds.S +++ linux.prev/arch/x86_64/kernel/vmlinux.lds.S @@ -99,6 +99,18 @@ SECTIONS .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } jiffies = VVIRT(.jiffies); + .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } + .vsyscall_data : AT(VLOAD(.vsyscall_data)) { *(.vsyscall_data) } + + .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { *(.vsyscall_gtod_data) } + vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + + .vsyscall_gtod_lock : AT(VLOAD(.vsyscall_gtod_lock)) { *(.vsyscall_gtod_lock) } + vsyscall_gtod_lock = VVIRT(.vsyscall_gtod_lock); + + .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } + .vsyscall_data : AT(VLOAD(.vsyscall_data)) { *(.vsyscall_data) } + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) } .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) } .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) } Index: linux.prev/arch/x86_64/kernel/vsyscall.c =================================================================== --- linux.prev.orig/arch/x86_64/kernel/vsyscall.c +++ linux.prev/arch/x86_64/kernel/vsyscall.c @@ -19,6 +19,8 @@ * want per guest time just set the kernel.vsyscall64 sysctl to 0. */ +#include +#include #include #include #include @@ -27,22 +29,34 @@ #include #include + #include #include +#include #include #include #include #include -#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) -#define force_inline __attribute__((always_inline)) inline +#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) notrace int __sysctl_vsyscall __section_sysctl_vsyscall = 1; -seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; +raw_seqlock_t __xtime_lock __section_xtime_lock = RAW_SEQLOCK_UNLOCKED; -#include +struct vsyscall_gtod_data_t { + struct timeval wall_time_tv; + struct timezone sys_tz; + cycle_t offset_base; + struct clocksource clock; +}; + +extern struct vsyscall_gtod_data_t vsyscall_gtod_data; +struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data; -static force_inline void timeval_normalize(struct timeval * tv) +extern raw_seqlock_t vsyscall_gtod_lock; +raw_seqlock_t __vsyscall_gtod_lock __section_vsyscall_gtod_lock = RAW_SEQLOCK_UNLOCKED; + +static __always_inline void timeval_normalize(struct timeval * tv) { time_t __sec; @@ -53,43 +67,71 @@ static force_inline void timeval_normali } } -static force_inline void do_vgettimeofday(struct timeval * tv) +/* + * XXX - this is ugly. gettimeofday() has a label in it so we can't + * call it twice. 
+ */ +static __always_inline int syscall_gtod(struct timeval *tv, struct timezone *tz) { - long sequence, t; - unsigned long sec, usec; + int ret; + + asm volatile("syscall" + : "=a" (ret) + : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) + : __syscall_clobber); + + return ret; +} + +static __always_inline void do_vgettimeofday(struct timeval * tv) +{ + cycle_t now, base, mask, cycle_delta; + unsigned long mult, shift, seq; + nsec_t nsec_delta; do { - sequence = read_seqbegin(&__xtime_lock); - - sec = __xtime.tv_sec; - usec = (__xtime.tv_nsec / 1000) + - (__jiffies - __wall_jiffies) * (1000000 / HZ); - - if (__vxtime.mode != VXTIME_HPET) { - sync_core(); - rdtscll(t); - if (t < __vxtime.last_tsc) - t = __vxtime.last_tsc; - usec += ((t - __vxtime.last_tsc) * - __vxtime.tsc_quot) >> 32; - /* See comment in x86_64 do_gettimeofday. */ - } else { - usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) - - __vxtime.last) * __vxtime.quot) >> 32; + seq = read_seqbegin(&__vsyscall_gtod_lock); + + if (!__vsyscall_gtod_data.clock.vread) { + syscall_gtod(tv, NULL); + return; } - } while (read_seqretry(&__xtime_lock, sequence)); - tv->tv_sec = sec + usec / 1000000; - tv->tv_usec = usec % 1000000; + /* read the timeosurce and store state values */ + now = __vsyscall_gtod_data.clock.vread( + __vsyscall_gtod_data.clock.vdata); + + base = __vsyscall_gtod_data.offset_base; + mask = __vsyscall_gtod_data.clock.mask; + + mult = __vsyscall_gtod_data.clock.mult; + shift = __vsyscall_gtod_data.clock.shift; + + *tv = __vsyscall_gtod_data.wall_time_tv; + } while (read_seqretry(&__vsyscall_gtod_lock, seq)); + + /* calculate interval: */ + cycle_delta = (now - base) & mask; + /* convert to nsecs: */ + nsec_delta = (cycle_delta * mult) >> shift; + + /* convert to usecs and add to timespec: */ + do_div(nsec_delta, NSEC_PER_USEC); + tv->tv_usec += (unsigned long) nsec_delta; + + while (tv->tv_usec > USEC_PER_SEC) { + tv->tv_sec += 1; + tv->tv_usec -= USEC_PER_SEC; + } } /* RED-PEN may want to readd seq locking, but then the variable should be write-once. */ -static force_inline void do_get_tz(struct timezone * tz) +static __always_inline void do_get_tz(struct timezone * tz) { - *tz = __sys_tz; + *tz = __vsyscall_gtod_data.sys_tz; } -static force_inline int gettimeofday(struct timeval *tv, struct timezone *tz) +static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) { int ret; asm volatile("vsysc2: syscall" @@ -98,7 +140,7 @@ static force_inline int gettimeofday(str return ret; } -static force_inline long time_syscall(long *t) +static __always_inline long time_syscall(long *t) { long secs; asm volatile("vsysc1: syscall" @@ -122,11 +164,16 @@ int __vsyscall(0) vgettimeofday(struct t * unlikely */ time_t __vsyscall(1) vtime(time_t *t) { + struct timeval tv; + if (unlikely(!__sysctl_vsyscall)) return time_syscall(t); - else if (t) - *t = __xtime.tv_sec; - return __xtime.tv_sec; + + vgettimeofday(&tv, 0); + if (t) + *t = tv.tv_sec; + + return tv.tv_sec; } long __vsyscall(2) venosys_0(void) @@ -139,6 +186,38 @@ long __vsyscall(3) venosys_1(void) return -ENOSYS; } +struct clocksource *curr_clock; + +void arch_update_vsyscall_gtod(struct timespec wall_time, cycle_t offset_base, + struct clocksource *clock, int ntp_adj) +{ + unsigned long flags; + + write_seqlock_irqsave(&vsyscall_gtod_lock, flags); + + /* XXX - hackitty hack hack. this is terrible! 
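The conversion in do_vgettimeofday() above, restated as one helper for readability (same locals and helpers as the patch uses; this is a restatement, not additional patch code):

/* Masked cycle delta -> microseconds, as done in do_vgettimeofday(). */
static inline unsigned long cycles_to_usecs(cycle_t now, cycle_t base,
					    cycle_t mask, unsigned long mult,
					    unsigned long shift)
{
	cycle_t cycle_delta = (now - base) & mask;
	nsec_t nsec_delta = (cycle_delta * mult) >> shift;

	do_div(nsec_delta, NSEC_PER_USEC);
	return (unsigned long)nsec_delta;
}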
*/ + if (curr_clock != clock) + curr_clock = clock; + + /* save off wall time as timeval: */ + vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time.tv_sec; + vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time.tv_nsec/1000; + + /* save offset_base: */ + vsyscall_gtod_data.offset_base = offset_base; + + /* copy current clocksource: */ + vsyscall_gtod_data.clock = *clock; + + /* apply ntp adjustment to clocksource mult: */ + vsyscall_gtod_data.clock.mult += ntp_adj; + + /* save off current timezone: */ + vsyscall_gtod_data.sys_tz = sys_tz; + + write_sequnlock_irqrestore(&vsyscall_gtod_lock, flags); +} + #ifdef CONFIG_SYSCTL #define SYSCALL 0x050f @@ -217,6 +296,7 @@ static int __init vsyscall_init(void) BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); map_vsyscall(); + sysctl_vsyscall = 1; #ifdef CONFIG_SYSCTL register_sysctl_table(kernel_root_table2, 0); #endif Index: linux.prev/arch/x86_64/kernel/x8664_ksyms.c =================================================================== --- linux.prev.orig/arch/x86_64/kernel/x8664_ksyms.c +++ linux.prev/arch/x86_64/kernel/x8664_ksyms.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -33,8 +34,6 @@ #include #include -extern spinlock_t rtc_lock; - #ifdef CONFIG_SMP extern void __write_lock_failed(rwlock_t *rw); extern void __read_lock_failed(rwlock_t *rw); @@ -45,8 +44,6 @@ extern struct drive_info_struct drive_in EXPORT_SYMBOL(drive_info); #endif -extern unsigned long get_cmos_time(void); - /* platform dependent support */ EXPORT_SYMBOL(boot_cpu_data); //EXPORT_SYMBOL(dump_fpu); @@ -60,12 +57,13 @@ EXPORT_SYMBOL(probe_irq_mask); EXPORT_SYMBOL(kernel_thread); EXPORT_SYMBOL(pm_idle); EXPORT_SYMBOL(pm_power_off); -EXPORT_SYMBOL(get_cmos_time); -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK +EXPORT_SYMBOL(__compat_down_failed); +EXPORT_SYMBOL(__compat_down_failed_interruptible); +EXPORT_SYMBOL(__compat_down_failed_trylock); +EXPORT_SYMBOL(__compat_up_wakeup); +#endif /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_nocheck); EXPORT_SYMBOL(ip_compute_csum); Index: linux.prev/arch/x86_64/lib/thunk.S =================================================================== --- linux.prev.orig/arch/x86_64/lib/thunk.S +++ linux.prev/arch/x86_64/lib/thunk.S @@ -43,11 +43,13 @@ thunk rwsem_downgrade_thunk,rwsem_downgrade_wake #endif thunk do_softirq_thunk,do_softirq - - thunk __down_failed,__down - thunk_retrax __down_failed_interruptible,__down_interruptible - thunk_retrax __down_failed_trylock,__down_trylock - thunk __up_wakeup,__up + +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK + thunk __compat_down_failed,__compat_down + thunk_retrax __compat_down_failed_interruptible,__compat_down_interruptible + thunk_retrax __compat_down_failed_trylock,__compat_down_trylock + thunk __compat_up_wakeup,__compat_up +#endif /* SAVE_ARGS below is used only for the .cfi directives it contains. 
*/ CFI_STARTPROC Index: linux.prev/arch/x86_64/mm/fault.c =================================================================== --- linux.prev.orig/arch/x86_64/mm/fault.c +++ linux.prev/arch/x86_64/mm/fault.c @@ -39,6 +39,7 @@ void bust_spinlocks(int yes) { int loglevel_save = console_loglevel; if (yes) { + stop_trace(); oops_in_progress = 1; } else { #ifdef CONFIG_VT @@ -315,7 +316,7 @@ asmlinkage void __kprobes do_page_fault( return; if (likely(regs->eflags & X86_EFLAGS_IF)) - local_irq_enable(); + raw_local_irq_enable(); if (unlikely(page_fault_trace)) printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n", Index: linux.prev/arch/x86_64/mm/init.c =================================================================== --- linux.prev.orig/arch/x86_64/mm/init.c +++ linux.prev/arch/x86_64/mm/init.c @@ -45,7 +45,7 @@ static unsigned long dma_reserve __initdata; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); /* * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the Index: linux.prev/block/cfq-iosched.c =================================================================== --- linux.prev.orig/block/cfq-iosched.c +++ linux.prev/block/cfq-iosched.c @@ -1241,7 +1241,7 @@ static void cfq_exit_single_io_context(s struct cfq_data *cfqd = cic->cfqq->cfqd; request_queue_t *q = cfqd->queue; - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); spin_lock(q->queue_lock); @@ -1265,7 +1265,9 @@ static void cfq_exit_io_context(struct c struct list_head *entry; unsigned long flags; - local_irq_save(flags); + // FIXME: i dont think this code is safe, upstream! + + local_irq_save_nort(flags); /* * put the reference this task is holding to the various queues @@ -1276,7 +1278,7 @@ static void cfq_exit_io_context(struct c } cfq_exit_single_io_context(cic); - local_irq_restore(flags); + local_irq_restore_nort(flags); } static struct cfq_io_context * Index: linux.prev/block/ll_rw_blk.c =================================================================== --- linux.prev.orig/block/ll_rw_blk.c +++ linux.prev/block/ll_rw_blk.c @@ -1412,7 +1412,7 @@ static int ll_merge_requests_fn(request_ */ void blk_plug_device(request_queue_t *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); /* * don't plug a stopped queue, it must be paired with blk_start_queue() @@ -1433,7 +1433,7 @@ EXPORT_SYMBOL(blk_plug_device); */ int blk_remove_plug(request_queue_t *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) return 0; @@ -3270,13 +3270,15 @@ void exit_io_context(void) unsigned long flags; struct io_context *ioc; - local_irq_save(flags); + // FIXME: unsafe upstream too? 
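+ // (Editor's note: the *_nort() variants are expected to disable
+ // interrupts only on !PREEMPT_RT builds; on PREEMPT_RT the region
+ // stays preemptible, which is what the FIXME above questions.)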
+ + local_irq_save_nort(flags); task_lock(current); ioc = current->io_context; current->io_context = NULL; ioc->task = NULL; task_unlock(current); - local_irq_restore(flags); + local_irq_restore_nort(flags); if (ioc->aic && ioc->aic->exit) ioc->aic->exit(ioc->aic); Index: linux.prev/drivers/Makefile =================================================================== --- linux.prev.orig/drivers/Makefile +++ linux.prev/drivers/Makefile @@ -70,3 +70,4 @@ obj-$(CONFIG_SGI_IOC4) += sn/ obj-y += firmware/ obj-$(CONFIG_CRYPTO) += crypto/ obj-$(CONFIG_SUPERH) += sh/ +obj-$(CONFIG_GENERIC_TIME) += clocksource/ Index: linux.prev/drivers/acpi/Kconfig =================================================================== --- linux.prev.orig/drivers/acpi/Kconfig +++ linux.prev/drivers/acpi/Kconfig @@ -287,24 +287,6 @@ config ACPI_SYSTEM This driver will enable your system to shut down using ACPI, and dump your ACPI DSDT table using /proc/acpi/dsdt. -config X86_PM_TIMER - bool "Power Management Timer Support" - depends on X86 - depends on !X86_64 - default y - help - The Power Management Timer is available on all ACPI-capable, - in most cases even if ACPI is unusable or blacklisted. - - This timing source is not affected by powermanagement features - like aggressive processor idling, throttling, frequency and/or - voltage scaling, unlike the commonly used Time Stamp Counter - (TSC) timing source. - - So, if you see messages like 'Losing too many ticks!' in the - kernel logs, and/or you are using this on a notebook which - does not yet have an HPET, you should say "Y" here. - config ACPI_CONTAINER tristate "ACPI0004,PNP0A05 and PNP0A06 Container Driver (EXPERIMENTAL)" depends on EXPERIMENTAL Index: linux.prev/drivers/acpi/events/evgpe.c =================================================================== --- linux.prev.orig/drivers/acpi/events/evgpe.c +++ linux.prev/drivers/acpi/events/evgpe.c @@ -377,7 +377,7 @@ u32 acpi_ev_gpe_detect(struct acpi_gpe_x struct acpi_gpe_register_info *gpe_register_info; u32 status_reg; u32 enable_reg; - u32 flags; + unsigned long flags; acpi_status status; struct acpi_gpe_block_info *gpe_block; acpi_native_uint i; Index: linux.prev/drivers/acpi/events/evgpeblk.c =================================================================== --- linux.prev.orig/drivers/acpi/events/evgpeblk.c +++ linux.prev/drivers/acpi/events/evgpeblk.c @@ -136,7 +136,7 @@ acpi_status acpi_ev_walk_gpe_list(ACPI_G struct acpi_gpe_block_info *gpe_block; struct acpi_gpe_xrupt_info *gpe_xrupt_info; acpi_status status = AE_OK; - u32 flags; + unsigned long flags; ACPI_FUNCTION_TRACE("ev_walk_gpe_list"); @@ -479,7 +479,7 @@ static struct acpi_gpe_xrupt_info *acpi_ struct acpi_gpe_xrupt_info *next_gpe_xrupt; struct acpi_gpe_xrupt_info *gpe_xrupt; acpi_status status; - u32 flags; + unsigned long flags; ACPI_FUNCTION_TRACE("ev_get_gpe_xrupt_block"); @@ -553,7 +553,7 @@ static acpi_status acpi_ev_delete_gpe_xrupt(struct acpi_gpe_xrupt_info *gpe_xrupt) { acpi_status status; - u32 flags; + unsigned long flags; ACPI_FUNCTION_TRACE("ev_delete_gpe_xrupt"); @@ -610,7 +610,7 @@ acpi_ev_install_gpe_block(struct acpi_gp struct acpi_gpe_block_info *next_gpe_block; struct acpi_gpe_xrupt_info *gpe_xrupt_block; acpi_status status; - u32 flags; + unsigned long flags; ACPI_FUNCTION_TRACE("ev_install_gpe_block"); @@ -663,7 +663,7 @@ acpi_ev_install_gpe_block(struct acpi_gp acpi_status acpi_ev_delete_gpe_block(struct acpi_gpe_block_info *gpe_block) { acpi_status status; - u32 flags; + unsigned long flags; 
ACPI_FUNCTION_TRACE("ev_install_gpe_block"); Index: linux.prev/drivers/acpi/events/evxface.c =================================================================== --- linux.prev.orig/drivers/acpi/events/evxface.c +++ linux.prev/drivers/acpi/events/evxface.c @@ -562,7 +562,7 @@ acpi_install_gpe_handler(acpi_handle gpe struct acpi_gpe_event_info *gpe_event_info; struct acpi_handler_info *handler; acpi_status status; - u32 flags; + unsigned long flags; ACPI_FUNCTION_TRACE("acpi_install_gpe_handler"); @@ -653,7 +653,7 @@ acpi_remove_gpe_handler(acpi_handle gpe_ struct acpi_gpe_event_info *gpe_event_info; struct acpi_handler_info *handler; acpi_status status; - u32 flags; + unsigned long flags; ACPI_FUNCTION_TRACE("acpi_remove_gpe_handler"); Index: linux.prev/drivers/acpi/osl.c =================================================================== --- linux.prev.orig/drivers/acpi/osl.c +++ linux.prev/drivers/acpi/osl.c @@ -346,9 +346,7 @@ u64 acpi_os_get_timer(void) /* TBD: use HPET if available */ #endif -#ifdef CONFIG_X86_PM_TIMER /* TBD: default to PM timer if HPET was not available */ -#endif if (!t) printk(KERN_ERR PREFIX "acpi_os_get_timer() TBD\n"); @@ -728,14 +726,14 @@ void acpi_os_delete_lock(acpi_handle han acpi_status acpi_os_create_semaphore(u32 max_units, u32 initial_units, acpi_handle * handle) { - struct semaphore *sem = NULL; + struct compat_semaphore *sem = NULL; ACPI_FUNCTION_TRACE("os_create_semaphore"); - sem = acpi_os_allocate(sizeof(struct semaphore)); + sem = acpi_os_allocate(sizeof(struct compat_semaphore)); if (!sem) return_ACPI_STATUS(AE_NO_MEMORY); - memset(sem, 0, sizeof(struct semaphore)); + memset(sem, 0, sizeof(struct compat_semaphore)); sema_init(sem, initial_units); @@ -758,7 +756,7 @@ EXPORT_SYMBOL(acpi_os_create_semaphore); acpi_status acpi_os_delete_semaphore(acpi_handle handle) { - struct semaphore *sem = (struct semaphore *)handle; + struct compat_semaphore *sem = (struct compat_semaphore *)handle; ACPI_FUNCTION_TRACE("os_delete_semaphore"); @@ -787,7 +785,7 @@ EXPORT_SYMBOL(acpi_os_delete_semaphore); acpi_status acpi_os_wait_semaphore(acpi_handle handle, u32 units, u16 timeout) { acpi_status status = AE_OK; - struct semaphore *sem = (struct semaphore *)handle; + struct compat_semaphore *sem = (struct compat_semaphore *)handle; int ret = 0; ACPI_FUNCTION_TRACE("os_wait_semaphore"); @@ -868,7 +866,7 @@ EXPORT_SYMBOL(acpi_os_wait_semaphore); */ acpi_status acpi_os_signal_semaphore(acpi_handle handle, u32 units) { - struct semaphore *sem = (struct semaphore *)handle; + struct compat_semaphore *sem = (struct compat_semaphore *)handle; ACPI_FUNCTION_TRACE("os_signal_semaphore"); Index: linux.prev/drivers/acpi/processor_idle.c =================================================================== --- linux.prev.orig/drivers/acpi/processor_idle.c +++ linux.prev/drivers/acpi/processor_idle.c @@ -37,6 +37,7 @@ #include #include #include +#include #include /* need_resched() */ #include @@ -172,7 +173,7 @@ static void acpi_safe_halt(void) clear_thread_flag(TIF_POLLING_NRFLAG); smp_mb__after_clear_bit(); if (!need_resched()) - safe_halt(); + raw_safe_halt(); set_thread_flag(TIF_POLLING_NRFLAG); } @@ -194,14 +195,14 @@ static void acpi_processor_idle(void) * Interrupts must be disabled during bus mastering calculations and * for C2/C3 transitions. 
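 * (Editor's note: the raw_local_irq_* calls substituted below act on
 * the real hardware interrupt flag, bypassing whatever the plain
 * local_irq_* operations may be redefined to under PREEMPT_RT;
 * C-state entry genuinely needs hard interrupts off.)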
*/ - local_irq_disable(); + raw_local_irq_disable(); /* * Check whether we truly need to go idle, or should * reschedule: */ if (unlikely(need_resched())) { - local_irq_enable(); + raw_local_irq_enable(); return; } @@ -268,7 +269,7 @@ static void acpi_processor_idle(void) * issues (e.g. floppy DMA transfer overrun/underrun). */ if (pr->power.bm_activity & cx->demotion.threshold.bm) { - local_irq_enable(); + raw_local_irq_enable(); next_state = cx->demotion.state; goto end; } @@ -297,7 +298,7 @@ static void acpi_processor_idle(void) smp_mb__after_clear_bit(); if (need_resched()) { set_thread_flag(TIF_POLLING_NRFLAG); - local_irq_enable(); + raw_local_irq_enable(); return; } } @@ -333,7 +334,7 @@ static void acpi_processor_idle(void) /* Get end time (ticks) */ t2 = inl(acpi_fadt.xpm_tmr_blk.address); /* Re-enable interrupts */ - local_irq_enable(); + raw_local_irq_enable(); set_thread_flag(TIF_POLLING_NRFLAG); /* Compute time (ticks) that we were actually asleep */ sleep_ticks = @@ -372,8 +373,12 @@ static void acpi_processor_idle(void) ACPI_MTX_DO_NOT_LOCK); } +#ifdef CONFIG_GENERIC_TIME + /* TSC halts in C3, so notify users */ + mark_tsc_unstable(); +#endif /* Re-enable interrupts */ - local_irq_enable(); + raw_local_irq_enable(); set_thread_flag(TIF_POLLING_NRFLAG); /* Compute time (ticks) that we were actually asleep */ sleep_ticks = @@ -381,7 +386,7 @@ static void acpi_processor_idle(void) break; default: - local_irq_enable(); + raw_local_irq_enable(); return; } @@ -1027,6 +1032,7 @@ int acpi_processor_power_init(struct acp static int first_run = 0; struct proc_dir_entry *entry = NULL; unsigned int i; + unsigned long flags; ACPI_FUNCTION_TRACE("acpi_processor_power_init"); @@ -1060,6 +1066,7 @@ int acpi_processor_power_init(struct acp * Note that we use previously set idle handler will be used on * platforms that only support C1. */ + spin_lock_irqsave(&pm_idle_switch_lock, flags); if ((pr->flags.power) && (!boot_option_idle_override)) { printk(KERN_INFO PREFIX "CPU%d (power states:", pr->id); for (i = 1; i <= pr->power.count; i++) @@ -1071,8 +1078,13 @@ int acpi_processor_power_init(struct acp if (pr->id == 0) { pm_idle_save = pm_idle; pm_idle = acpi_processor_idle; + /* + * Don't allow switching of the pm_idle to poll. + */ + pm_idle_locked = 1; } } + spin_unlock_irqrestore(&pm_idle_switch_lock, flags); /* 'power' [R] */ entry = create_proc_entry(ACPI_PROCESSOR_FILE_POWER, @@ -1115,5 +1127,7 @@ int acpi_processor_power_exit(struct acp cpu_idle_wait(); } + pm_idle_locked = 0; + return_VALUE(0); } Index: linux.prev/drivers/acpi/processor_throttling.c =================================================================== --- linux.prev.orig/drivers/acpi/processor_throttling.c +++ linux.prev/drivers/acpi/processor_throttling.c @@ -69,7 +69,7 @@ static int acpi_processor_get_throttling duty_mask <<= pr->throttling.duty_offset; - local_irq_disable(); + raw_local_irq_disable(); value = inl(pr->throttling.address); @@ -87,7 +87,7 @@ static int acpi_processor_get_throttling pr->throttling.state = state; - local_irq_enable(); + raw_local_irq_enable(); ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Throttling state is T%d (%d%% throttling applied)\n", @@ -131,7 +131,7 @@ int acpi_processor_set_throttling(struct duty_mask = ~duty_mask; } - local_irq_disable(); + raw_local_irq_disable(); /* * Disable throttling by writing a 0 to bit 4. 
Note that we must @@ -158,7 +158,7 @@ int acpi_processor_set_throttling(struct pr->throttling.state = state; - local_irq_enable(); + raw_local_irq_enable(); ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Throttling state set to T%d (%d%%)\n", state, Index: linux.prev/drivers/acpi/sleep/main.c =================================================================== --- linux.prev.orig/drivers/acpi/sleep/main.c +++ linux.prev/drivers/acpi/sleep/main.c @@ -82,7 +82,7 @@ static int acpi_pm_enter(suspend_state_t return error; } - local_irq_save(flags); + raw_local_irq_save(flags); acpi_enable_wakeup_device(acpi_state); switch (pm_state) { case PM_SUSPEND_STANDBY: @@ -105,7 +105,7 @@ static int acpi_pm_enter(suspend_state_t default: return -EINVAL; } - local_irq_restore(flags); + raw_local_irq_restore(flags); printk(KERN_DEBUG "Back to C!\n"); /* restore processor state Index: linux.prev/drivers/acpi/sleep/poweroff.c =================================================================== --- linux.prev.orig/drivers/acpi/sleep/poweroff.c +++ linux.prev/drivers/acpi/sleep/poweroff.c @@ -46,7 +46,7 @@ void acpi_power_off(void) { /* acpi_sleep_prepare(ACPI_STATE_S5) should have already been called */ printk("%s called\n", __FUNCTION__); - local_irq_disable(); + raw_local_irq_disable(); /* Some SMP machines only can poweroff in boot CPU */ acpi_enter_sleep_state(ACPI_STATE_S5); } Index: linux.prev/drivers/atm/atmtcp.c =================================================================== --- linux.prev.orig/drivers/atm/atmtcp.c +++ linux.prev/drivers/atm/atmtcp.c @@ -352,7 +352,7 @@ static struct atm_dev atmtcp_control_dev .ops = &atmtcp_c_dev_ops, .type = "atmtcp", .number = 999, - .lock = SPIN_LOCK_UNLOCKED + .lock = SPIN_LOCK_UNLOCKED(atmtcp_control_dev.lock) }; Index: linux.prev/drivers/base/class.c =================================================================== --- linux.prev.orig/drivers/base/class.c +++ linux.prev/drivers/base/class.c @@ -555,8 +555,10 @@ int class_device_add(struct class_device class_name = make_class_name(class_dev); sysfs_create_link(&class_dev->kobj, &class_dev->dev->kobj, "device"); + /* sysfs_create_link(&class_dev->dev->kobj, &class_dev->kobj, class_name); + */ } kobject_hotplug(&class_dev->kobj, KOBJ_ADD); @@ -667,7 +669,9 @@ void class_device_del(struct class_devic if (class_dev->dev) { class_name = make_class_name(class_dev); sysfs_remove_link(&class_dev->kobj, "device"); + /* sysfs_remove_link(&class_dev->dev->kobj, class_name); + */ } class_device_remove_file(class_dev, &class_dev->uevent_attr); if (class_dev->devt_attr) Index: linux.prev/drivers/block/loop.c =================================================================== --- linux.prev.orig/drivers/block/loop.c +++ linux.prev/drivers/block/loop.c @@ -514,12 +514,12 @@ static int loop_make_request(request_que lo->lo_pending++; loop_add_bio(lo, old_bio); spin_unlock_irq(&lo->lo_lock); - up(&lo->lo_bh_mutex); + complete(&lo->lo_bh_done); return 0; out: if (lo->lo_pending == 0) - up(&lo->lo_bh_mutex); + complete(&lo->lo_bh_done); spin_unlock_irq(&lo->lo_lock); bio_io_error(old_bio, old_bio->bi_size); return 0; @@ -580,23 +580,20 @@ static int loop_thread(void *data) lo->lo_pending = 1; /* - * up sem, we are running + * complete it, we are running */ - up(&lo->lo_sem); + complete(&lo->lo_done); for (;;) { int pending; - /* - * interruptible just to not contribute to load avg - */ - if (down_interruptible(&lo->lo_bh_mutex)) + if (wait_for_completion_interruptible(&lo->lo_bh_done)) continue; spin_lock_irq(&lo->lo_lock); /* - * could be 
upped because of tear-down, not pending work + * could be completed because of tear-down, not pending work */ if (unlikely(!lo->lo_pending)) { spin_unlock_irq(&lo->lo_lock); @@ -619,7 +616,7 @@ static int loop_thread(void *data) break; } - up(&lo->lo_sem); + complete(&lo->lo_done); return 0; } @@ -830,7 +827,7 @@ static int loop_set_fd(struct loop_devic set_blocksize(bdev, lo_blocksize); kernel_thread(loop_thread, lo, CLONE_KERNEL); - down(&lo->lo_sem); + wait_for_completion(&lo->lo_done); return 0; out_putf: @@ -896,10 +893,10 @@ static int loop_clr_fd(struct loop_devic lo->lo_state = Lo_rundown; lo->lo_pending--; if (!lo->lo_pending) - up(&lo->lo_bh_mutex); + complete(&lo->lo_bh_done); spin_unlock_irq(&lo->lo_lock); - down(&lo->lo_sem); + wait_for_completion(&lo->lo_done); lo->lo_backing_file = NULL; @@ -1276,8 +1273,8 @@ static int __init loop_init(void) if (!lo->lo_queue) goto out_mem4; init_MUTEX(&lo->lo_ctl_mutex); - init_MUTEX_LOCKED(&lo->lo_sem); - init_MUTEX_LOCKED(&lo->lo_bh_mutex); + init_completion(&lo->lo_done); + init_completion(&lo->lo_bh_done); lo->lo_number = i; spin_lock_init(&lo->lo_lock); disk->major = LOOP_MAJOR; Index: linux.prev/drivers/block/paride/pseudo.h =================================================================== --- linux.prev.orig/drivers/block/paride/pseudo.h +++ linux.prev/drivers/block/paride/pseudo.h @@ -43,7 +43,7 @@ static unsigned long ps_timeout; static int ps_tq_active = 0; static int ps_nice = 0; -static DEFINE_SPINLOCK(ps_spinlock __attribute__((unused))); +static __attribute__((unused)) DEFINE_SPINLOCK(ps_spinlock); static DECLARE_WORK(ps_tq, ps_tq_int, NULL); Index: linux.prev/drivers/block/sx8.c =================================================================== --- linux.prev.orig/drivers/block/sx8.c +++ linux.prev/drivers/block/sx8.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -303,7 +304,7 @@ struct carm_host { struct work_struct fsm_task; - struct semaphore probe_sem; + struct completion probe_comp; }; struct carm_response { @@ -1365,7 +1366,7 @@ static void carm_fsm_task (void *_data) } case HST_PROBE_FINISHED: - up(&host->probe_sem); + complete(&host->probe_comp); break; case HST_ERROR: @@ -1641,7 +1642,7 @@ static int carm_init_one (struct pci_dev host->flags = pci_dac ? FL_DAC : 0; spin_lock_init(&host->lock); INIT_WORK(&host->fsm_task, carm_fsm_task, host); - init_MUTEX_LOCKED(&host->probe_sem); + init_completion(&host->probe_comp); for (i = 0; i < ARRAY_SIZE(host->req); i++) host->req[i].tag = i; @@ -1710,8 +1711,8 @@ static int carm_init_one (struct pci_dev if (rc) goto err_out_free_irq; - DPRINTK("waiting for probe_sem\n"); - down(&host->probe_sem); + DPRINTK("waiting for probe_comp\n"); + wait_for_completion(&host->probe_comp); printk(KERN_INFO "%s: pci %s, ports %d, io %lx, irq %u, major %d\n", host->name, pci_name(pdev), (int) CARM_MAX_PORTS, Index: linux.prev/drivers/char/Kconfig =================================================================== --- linux.prev.orig/drivers/char/Kconfig +++ linux.prev/drivers/char/Kconfig @@ -711,6 +711,45 @@ config RTC To compile this driver as a module, choose M here: the module will be called rtc. +config RTC_HISTOGRAM + bool "Real Time Clock Histogram Support" + default n + depends on RTC + ---help--- + If you say Y here then the kernel will track the delivery and + wakeup latency of /dev/rtc using tasks and will report a + histogram to the kernel log when the application closes /dev/rtc. 
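Editor's note: a minimal user-space exerciser for this histogram, sketched for illustration; it is not part of the patch. It assumes the standard /dev/rtc character-device interface (RTC_IRQP_SET, RTC_PIE_ON, RTC_PIE_OFF from <linux/rtc.h>) and must run as root, since periodic rates above 64Hz require CAP_SYS_RESOURCE. On a CONFIG_RTC_HISTOGRAM kernel the histogram is printed to the kernel log when the program closes /dev/rtc:

	#include <stdio.h>
	#include <unistd.h>
	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <linux/rtc.h>

	int main(void)
	{
		unsigned long data;
		int i, fd = open("/dev/rtc", O_RDONLY);

		if (fd < 0) {
			perror("/dev/rtc");
			return 1;
		}
		/* 1024 Hz periodic interrupts, then block in read() */
		if (ioctl(fd, RTC_IRQP_SET, 1024) < 0 ||
		    ioctl(fd, RTC_PIE_ON, 0) < 0) {
			perror("ioctl");
			return 1;
		}
		for (i = 0; i < 10240; i++)	/* ~10 seconds of samples */
			if (read(fd, &data, sizeof(data)) != sizeof(data))
				break;
		ioctl(fd, RTC_PIE_OFF, 0);
		close(fd);	/* histogram hits the kernel log here */
		return 0;
	}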
+ +config BLOCKER + tristate "Priority Inheritance Debugging (Blocker) Device Support" + default y + ---help--- + If you say Y here then a device will be created that the userspace + pi_test suite uses to test and measure kernel locking primitives. + +config LPPTEST + tristate "Parallel Port Based Latency Measurement Device" + depends on !PARPORT && X86 + default y + ---help--- + If you say Y here then a device will be created that the userspace + testlpp utility uses to measure IRQ latencies of a target system + from an independent measurement system. + + NOTE: this code assumes x86 PCs and that the parallel port is + bidirectional and is on IRQ 7. + + To use the device, both the target and the source system need to + run a kernel with CONFIG_LPPTEST enabled. To measure latencies, + use the scripts/testlpp utility in your kernel source directory, + and run it (as root) on the source system - it will start printing + out the latencies it took to get a response from the target system: + + Latency of response: 12.2 usecs (121265 cycles) + + Then generate various workloads on the target system to see how + (worst-case-) latencies are impacted. + config SGI_DS1286 tristate "SGI DS1286 RTC support" depends on SGI_IP22 Index: linux.prev/drivers/char/Makefile =================================================================== --- linux.prev.orig/drivers/char/Makefile +++ linux.prev/drivers/char/Makefile @@ -57,6 +57,8 @@ obj-$(CONFIG_R3964) += n_r3964.o obj-$(CONFIG_APPLICOM) += applicom.o obj-$(CONFIG_SONYPI) += sonypi.o obj-$(CONFIG_RTC) += rtc.o +obj-$(CONFIG_BLOCKER) += blocker.o +obj-$(CONFIG_LPPTEST) += lpptest.o obj-$(CONFIG_HPET) += hpet.o obj-$(CONFIG_GEN_RTC) += genrtc.o obj-$(CONFIG_EFI_RTC) += efirtc.o Index: linux.prev/drivers/char/blocker.c =================================================================== --- /dev/null +++ linux.prev/drivers/char/blocker.c @@ -0,0 +1,108 @@ +/* + * priority inheritance testing device + */ + +#include +#include +#include + +#define BLOCKER_MINOR 221 + +#define BLOCK_IOCTL 4245 +#define BLOCK_SET_DEPTH 4246 + +#define MAX_LOCK_DEPTH 10 + +void loop(int loops) +{ + int i; + + for (i = 0; i < loops; i++) + get_cycles(); +} + +static spinlock_t blocker_lock[MAX_LOCK_DEPTH]; + +static unsigned int lock_depth = 1; + +void do_the_lock_and_loop(unsigned int args) +{ + int i, max; + + if (rt_task(current)) + max = lock_depth; + else if (lock_depth > 1) + max = (current->pid % lock_depth) + 1; + else + max = 1; + + /* Always lock from the top down */ + for (i = max-1; i >= 0; i--) + spin_lock(&blocker_lock[i]); + loop(args); + for (i = 0; i < max; i++) + spin_unlock(&blocker_lock[i]); } + +static int blocker_open(struct inode *in, struct file *file) +{ + printk(KERN_INFO "blocker_open called\n"); + + return 0; +} + +static long blocker_ioctl(struct file *file, + unsigned int cmd, unsigned long args) +{ + switch(cmd) { + case BLOCK_IOCTL: + do_the_lock_and_loop(args); + return 0; + case BLOCK_SET_DEPTH: + if (args >= MAX_LOCK_DEPTH) + return -EINVAL; + lock_depth = args; + return 0; + default: + return -EINVAL; + } +} + +static struct file_operations blocker_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .unlocked_ioctl = blocker_ioctl, + .open = blocker_open, +}; + +static struct miscdevice blocker_dev = +{ + BLOCKER_MINOR, + "blocker", + &blocker_fops +}; + +static int __init blocker_init(void) +{ + int i; + + if (misc_register(&blocker_dev)) + return -ENODEV; + + for (i = 0; i < MAX_LOCK_DEPTH; i++) + spin_lock_init(blocker_lock + i); + + 
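+	/*
+	 * (Editor's note: the device registers as misc minor 221,
+	 * i.e. /dev/blocker; user space picks a nesting depth with
+	 * BLOCK_SET_DEPTH and then issues BLOCK_IOCTL to take the
+	 * locks top-down and spin, building the lock chains the
+	 * pi_test suite measures - on PREEMPT_RT these spinlocks are
+	 * PI-aware sleeping locks.)
+	 */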
return 0; +} + +void __exit blocker_exit(void) +{ + printk(KERN_INFO "blocker device uninstalled\n"); + misc_deregister(&blocker_dev); +} + +module_init(blocker_init); +module_exit(blocker_exit); + +MODULE_LICENSE("GPL"); + Index: linux.prev/drivers/char/epca.c =================================================================== --- linux.prev.orig/drivers/char/epca.c +++ linux.prev/drivers/char/epca.c @@ -80,7 +80,7 @@ static int invalid_lilo_config; /* The ISA boards do window flipping into the same spaces so its only sane with a single lock. It's still pretty efficient */ -static spinlock_t epca_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(epca_lock); /* ----------------------------------------------------------------------- MAXBOARDS is typically 12, but ISA and EISA cards are restricted to Index: linux.prev/drivers/char/hangcheck-timer.c =================================================================== --- linux.prev.orig/drivers/char/hangcheck-timer.c +++ linux.prev/drivers/char/hangcheck-timer.c @@ -49,6 +49,7 @@ #include #include #include +#include #define VERSION_STR "0.9.0" @@ -130,8 +131,12 @@ __setup("hcheck_dump_tasks", hangcheck_p #endif #ifdef HAVE_MONOTONIC +#ifndef CONFIG_GENERIC_TIME extern unsigned long long monotonic_clock(void); #else +#define monotonic_clock() ktime_to_ns(get_monotonic_clock()) +#endif +#else static inline unsigned long long monotonic_clock(void) { # ifdef __s390__ Index: linux.prev/drivers/char/ipmi/ipmi_si_intf.c =================================================================== --- linux.prev.orig/drivers/char/ipmi/ipmi_si_intf.c +++ linux.prev/drivers/char/ipmi/ipmi_si_intf.c @@ -54,7 +54,7 @@ #include #include #include -#ifdef CONFIG_HIGH_RES_TIMERS +#ifdef CONFIG_HIGH_RES_TIMERS_OLD #include # if defined(schedule_next_int) /* Old high-res timer code, do translations. */ @@ -824,7 +824,7 @@ static int initialized = 0; /* Must be called with interrupts off and with the si_lock held. */ static void si_restart_short_timer(struct smi_info *smi_info) { -#if defined(CONFIG_HIGH_RES_TIMERS) +#if defined(CONFIG_HIGH_RES_TIMERS_OLD) unsigned long flags; unsigned long jiffies_now; unsigned long seq; @@ -892,13 +892,13 @@ static void smi_timeout(unsigned long da /* If the state machine asks for a short delay, then shorten the timer timeout. */ if (smi_result == SI_SM_CALL_WITH_DELAY) { -#if defined(CONFIG_HIGH_RES_TIMERS) +#if defined(CONFIG_HIGH_RES_TIMERS_OLD) unsigned long seq; #endif spin_lock_irqsave(&smi_info->count_lock, flags); smi_info->short_timeouts++; spin_unlock_irqrestore(&smi_info->count_lock, flags); -#if defined(CONFIG_HIGH_RES_TIMERS) +#if defined(CONFIG_HIGH_RES_TIMERS_OLD) do { seq = read_seqbegin_irqsave(&xtime_lock, flags); smi_info->si_timer.expires = jiffies; @@ -914,7 +914,7 @@ static void smi_timeout(unsigned long da smi_info->long_timeouts++; spin_unlock_irqrestore(&smi_info->count_lock, flags); smi_info->si_timer.expires = jiffies + SI_TIMEOUT_JIFFIES; -#if defined(CONFIG_HIGH_RES_TIMERS) +#if defined(CONFIG_HIGH_RES_TIMERS_OLD) smi_info->si_timer.arch_cycle_expires = 0; #endif } Index: linux.prev/drivers/char/ipmi/ipmi_watchdog.c =================================================================== --- linux.prev.orig/drivers/char/ipmi/ipmi_watchdog.c +++ linux.prev/drivers/char/ipmi/ipmi_watchdog.c @@ -459,7 +459,8 @@ static void panic_halt_ipmi_set_timeout( when both messages are free. 
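 * (Editor's note: COMPAT_DECLARE_MUTEX_LOCKED keeps the classic
 * counting semaphore; on PREEMPT_RT a plain semaphore becomes an
 * rtmutex-style lock with strict owner semantics, which must not be
 * initialized locked and then released from another context - hence
 * the suggestion below that this should really be a completion.)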
*/ static atomic_t heartbeat_tofree = ATOMIC_INIT(0); static DECLARE_MUTEX(heartbeat_lock); -static DECLARE_MUTEX_LOCKED(heartbeat_wait_lock); +/* PREEMPT_RT: should be a completion instead */ +static COMPAT_DECLARE_MUTEX_LOCKED(heartbeat_wait_lock); static void heartbeat_free_smi(struct ipmi_smi_msg *msg) { if (atomic_dec_and_test(&heartbeat_tofree)) Index: linux.prev/drivers/char/lpptest.c =================================================================== --- /dev/null +++ linux.prev/drivers/char/lpptest.c @@ -0,0 +1,163 @@ +/* + * /dev/lpptest device: test IRQ handling latencies over parallel port + * + * Copyright (C) 2005 Thomas Gleixner, Ingo Molnar + * + * licensed under the GPL + * + * You need to have CONFIG_PARPORT disabled for this device, it is a + * completely self-contained device that assumes sole ownership of the + * parallel port. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LPPTEST_CHAR_MAJOR 245 +#define LPPTEST_DEVICE_NAME "lpptest" + +#define LPPTEST_IRQ 7 + +#define LPPTEST_TEST _IOR (LPPTEST_CHAR_MAJOR, 1, unsigned long long) +#define LPPTEST_DISABLE _IOR (LPPTEST_CHAR_MAJOR, 2, unsigned long long) +#define LPPTEST_ENABLE _IOR (LPPTEST_CHAR_MAJOR, 3, unsigned long long) + +static char dev_id[] = "lpptest"; + +#define INIT_PORT() outb(0x04, 0x37a) +#define ENABLE_IRQ() outb(0x10, 0x37a) +#define DISABLE_IRQ() outb(0, 0x37a) + +static unsigned char out = 0x5a; + +/** + * Interrupt handler. Flip a bit in the reply. + */ +static int lpptest_irq (int irq, void *dev_id, struct pt_regs *regs) +{ + out ^= 0xff; + outb(out, 0x378); + + return IRQ_HANDLED; +} + +static cycles_t test_response(void) +{ + cycles_t now, end; + unsigned char in; + int timeout = 0; + + raw_local_irq_disable(); + in = inb(0x379); + inb(0x378); + outb(0x08, 0x378); + now = get_cycles(); + while(1) { + if (inb(0x379) != in) + break; + if (timeout++ > 1000000) { + outb(0x00, 0x378); + raw_local_irq_enable(); + + return 0; + } + } + end = get_cycles(); + outb(0x00, 0x378); + raw_local_irq_enable(); + + return end - now; +} + +static int lpptest_open(struct inode *inode, struct file *file) +{ + return 0; +} + +static int lpptest_close(struct inode *inode, struct file *file) +{ + return 0; +} + +int lpptest_ioctl(struct inode *inode, struct file *file, unsigned int ioctl_num, unsigned long ioctl_param) +{ + int retval = 0; + + switch (ioctl_num) { + + case LPPTEST_DISABLE: + DISABLE_IRQ(); + break; + + case LPPTEST_ENABLE: + ENABLE_IRQ(); + break; + + case LPPTEST_TEST: { + + cycles_t diff = test_response(); + if (copy_to_user((void *)ioctl_param, (void*) &diff, sizeof(diff))) + goto errcpy; + break; + } + default: retval = -EINVAL; + } + + return retval; + + errcpy: + return -EFAULT; +} + +static struct file_operations lpptest_dev_fops = { + .ioctl = lpptest_ioctl, + .open = lpptest_open, + .release = lpptest_close, +}; + +static int __init lpptest_init (void) +{ + if (register_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME, &lpptest_dev_fops)) + { + printk(KERN_NOTICE "Can't allocate major number %d for lpptest.\n", + LPPTEST_CHAR_MAJOR); + return -EAGAIN; + } + + if (request_irq (LPPTEST_IRQ, lpptest_irq, 0, "lpptest", dev_id)) { + printk (KERN_WARNING "lpptest: irq %d in use. 
Unload parport module!\n", LPPTEST_IRQ); + unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME); + return -EAGAIN; + } + irq_desc[LPPTEST_IRQ].status |= IRQ_NODELAY; + irq_desc[LPPTEST_IRQ].action->flags |= SA_NODELAY | SA_INTERRUPT; + + INIT_PORT(); + ENABLE_IRQ(); + + return 0; +} +module_init (lpptest_init); + +static void __exit lpptest_exit (void) +{ + DISABLE_IRQ(); + + free_irq(LPPTEST_IRQ, dev_id); + unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME); +} +module_exit (lpptest_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("lpp test module"); + Index: linux.prev/drivers/char/random.c =================================================================== --- linux.prev.orig/drivers/char/random.c +++ linux.prev/drivers/char/random.c @@ -417,7 +417,7 @@ static struct entropy_store input_pool = .poolinfo = &poolinfo_table[0], .name = "input", .limit = 1, - .lock = SPIN_LOCK_UNLOCKED, + .lock = SPIN_LOCK_UNLOCKED(input_pool.lock), .pool = input_pool_data }; @@ -426,7 +426,7 @@ static struct entropy_store blocking_poo .name = "blocking", .limit = 1, .pull = &input_pool, - .lock = SPIN_LOCK_UNLOCKED, + .lock = SPIN_LOCK_UNLOCKED(blocking_pool.lock), .pool = blocking_pool_data }; @@ -434,7 +434,7 @@ static struct entropy_store nonblocking_ .poolinfo = &poolinfo_table[1], .name = "nonblocking", .pull = &input_pool, - .lock = SPIN_LOCK_UNLOCKED, + .lock = SPIN_LOCK_UNLOCKED(nonblocking_pool.lock), .pool = nonblocking_pool_data }; @@ -581,8 +581,11 @@ static void add_timer_randomness(struct preempt_disable(); /* if over the trickle threshold, use only 1 in 4096 samples */ if (input_pool.entropy_count > trickle_thresh && - (__get_cpu_var(trickle_count)++ & 0xfff)) - goto out; + (__get_cpu_var(trickle_count)++ & 0xfff)) { + preempt_enable(); + return; + } + preempt_enable(); sample.jiffies = jiffies; sample.cycles = get_cycles(); @@ -627,9 +630,6 @@ static void add_timer_randomness(struct if(input_pool.entropy_count >= random_read_wakeup_thresh) wake_up_interruptible(&random_read_wait); - -out: - preempt_enable(); } extern void add_input_randomness(unsigned int type, unsigned int code, Index: linux.prev/drivers/char/rtc.c =================================================================== --- linux.prev.orig/drivers/char/rtc.c +++ linux.prev/drivers/char/rtc.c @@ -84,10 +84,36 @@ #include #include +#ifdef CONFIG_MIPS +# include +#endif + #if defined(__i386__) #include #endif +#ifdef CONFIG_RTC_HISTOGRAM + +static cycles_t last_interrupt_time; + +#include + +#define CPU_MHZ (cpu_khz / 1000) + +#define HISTSIZE 10000 +static int histogram[HISTSIZE]; + +static int rtc_state; + +enum rtc_states { + S_STARTUP, /* First round - let the application start */ + S_IDLE, /* Waiting for an interrupt */ + S_WAITING_FOR_READ, /* Signal delivered. 
waiting for rtc_read() */ + S_READ_MISSED, /* Signal delivered, read() deadline missed */ +}; + +#endif + #ifdef __sparc__ #include #include @@ -149,22 +175,8 @@ static void get_rtc_alm_time (struct rtc #ifdef RTC_IRQ static void rtc_dropped_irq(unsigned long data); -static void set_rtc_irq_bit_locked(unsigned char bit); -static void mask_rtc_irq_bit_locked(unsigned char bit); - -static inline void set_rtc_irq_bit(unsigned char bit) -{ - spin_lock_irq(&rtc_lock); - set_rtc_irq_bit_locked(bit); - spin_unlock_irq(&rtc_lock); -} - -static void mask_rtc_irq_bit(unsigned char bit) -{ - spin_lock_irq(&rtc_lock); - mask_rtc_irq_bit_locked(bit); - spin_unlock_irq(&rtc_lock); -} +static void set_rtc_irq_bit(unsigned char bit); +static void mask_rtc_irq_bit(unsigned char bit); #endif static int rtc_proc_open(struct inode *inode, struct file *file); @@ -193,6 +205,7 @@ static unsigned long rtc_max_user_freq = * rtc_task_lock nests inside rtc_lock. */ static DEFINE_SPINLOCK(rtc_task_lock); +static DEFINE_SPINLOCK(rtc_timer_lock); static rtc_task_t *rtc_callback = NULL; #endif @@ -219,7 +232,146 @@ static inline unsigned char rtc_is_updat return uip; } +#ifndef RTC_IRQ +# undef CONFIG_RTC_HISTOGRAM +#endif + +static inline void rtc_open_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i; + + last_interrupt_time = 0; + rtc_state = S_STARTUP; + rtc_irq_data = 0; + + for (i = 0; i < HISTSIZE; i++) + histogram[i] = 0; +#endif +} + +static inline void rtc_wake_event(void) +{ +#ifndef CONFIG_RTC_HISTOGRAM + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); +#else + if (!(rtc_status & RTC_IS_OPEN)) + return; + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + break; + /* Waiting for an interrupt */ + case S_IDLE: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + last_interrupt_time = get_cycles(); + rtc_state = S_WAITING_FOR_READ; + break; + + /* Signal has been delivered. waiting for rtc_read() */ + case S_WAITING_FOR_READ: + /* + * Well foo. The usermode application didn't + * schedule and read in time. + */ + last_interrupt_time = get_cycles(); + rtc_state = S_READ_MISSED; + printk("Read missed before next interrupt\n"); + break; + /* Signal has been delivered, read() deadline was missed */ + case S_READ_MISSED: + /* + * Not much we can do here. We're waiting for the usermode + * application to read the rtc + */ + last_interrupt_time = get_cycles(); + break; + } +#endif +} + +static inline void rtc_read_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + cycles_t now = get_cycles(); + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + rtc_state = S_IDLE; + break; + + /* Waiting for an interrupt */ + case S_IDLE: + printk("bug in rtc_read(): called in state S_IDLE!\n"); + break; + case S_WAITING_FOR_READ: /* + * Signal has been delivered. + * waiting for rtc_read() + */ + /* + * Well done + */ + case S_READ_MISSED: /* + * Signal has been delivered, read() + * deadline was missed + */ + /* + * So, you finally got here. + */ + if (!last_interrupt_time) + printk("bug in rtc_read(): last_interrupt_time = 0\n"); + rtc_state = S_IDLE; + { + cycles_t latency = now - last_interrupt_time; + unsigned long delta; /* Microseconds */ + + delta = latency; + delta /= CPU_MHZ; + + if (delta > 1000 * 1000) { + printk("rtc: eek\n"); + } else { + unsigned long slot = delta; + if (slot >= HISTSIZE) + slot = HISTSIZE - 1; + histogram[slot]++; + if (delta > 2000) + printk("wow! 
That was a " + "%ld millisec bump\n", + delta / 1000); + } + } + rtc_state = S_IDLE; + break; + } +#endif +} + +static inline void rtc_close_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i = 0; + unsigned long total = 0; + + for (i = 0; i < HISTSIZE; i++) + total += histogram[i]; + if (!total) + return; + + printk("\nrtc latency histogram of {%s/%d, %lu samples}:\n", + current->comm, current->pid, total); + for (i = 0; i < HISTSIZE; i++) { + if (histogram[i]) + printk("%d %d\n", i, histogram[i]); + } +#endif +} + #ifdef RTC_IRQ + /* * A very tiny interrupt handler. It runs with SA_INTERRUPT set, * but there is possibility of conflicting with the set_rtc_mmss() @@ -232,6 +384,8 @@ static inline unsigned char rtc_is_updat irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) { + int mod; + /* * Can be an alarm interrupt, update complete interrupt, * or a periodic interrupt. We store the status in the @@ -239,7 +393,8 @@ irqreturn_t rtc_interrupt(int irq, void * the last read in the remainder of rtc_irq_data. */ - spin_lock (&rtc_lock); + spin_lock(&rtc_timer_lock); + spin_lock(&rtc_lock); rtc_irq_data += 0x100; rtc_irq_data &= ~0xff; if (is_hpet_enabled()) { @@ -253,19 +408,23 @@ irqreturn_t rtc_interrupt(int irq, void rtc_irq_data |= (CMOS_READ(RTC_INTR_FLAGS) & 0xF0); } + mod = 0; if (rtc_status & RTC_TIMER_ON) - mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100); + mod = 1; - spin_unlock (&rtc_lock); + spin_unlock(&rtc_lock); + if (mod) + mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100); + spin_unlock(&rtc_timer_lock); /* Now do the rest of the actions */ spin_lock(&rtc_task_lock); if (rtc_callback) rtc_callback->func(rtc_callback->private_data); spin_unlock(&rtc_task_lock); - wake_up_interruptible(&rtc_wait); - kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + rtc_wake_event(); + wake_up_interruptible(&rtc_wait); return IRQ_HANDLED; } @@ -350,10 +509,10 @@ static ssize_t rtc_read(struct file *fil __set_current_state(TASK_INTERRUPTIBLE); - spin_lock_irq (&rtc_lock); + spin_lock_irq(&rtc_lock); data = rtc_irq_data; rtc_irq_data = 0; - spin_unlock_irq (&rtc_lock); + spin_unlock_irq(&rtc_lock); if (data != 0) break; @@ -369,6 +528,8 @@ static ssize_t rtc_read(struct file *fil schedule(); } while (1); + rtc_read_event(); + if (count < sizeof(unsigned long)) retval = put_user(data, (unsigned int __user *)buf) ?: sizeof(int); else @@ -383,7 +544,7 @@ static ssize_t rtc_read(struct file *fil static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel) { - struct rtc_time wtime; + struct rtc_time wtime; #ifdef RTC_IRQ if (rtc_has_irq == 0) { @@ -415,19 +576,24 @@ static int rtc_do_ioctl(unsigned int cmd } case RTC_PIE_OFF: /* Mask periodic int. enab. bit */ { - unsigned long flags; /* can be called from isr via rtc_control() */ - spin_lock_irqsave (&rtc_lock, flags); - mask_rtc_irq_bit_locked(RTC_PIE); + int del = 0; + mask_rtc_irq_bit(RTC_PIE); + spin_lock_irq(&rtc_timer_lock); + spin_lock(&rtc_lock); if (rtc_status & RTC_TIMER_ON) { + del = 1; rtc_status &= ~RTC_TIMER_ON; - del_timer(&rtc_irq_timer); } - spin_unlock_irqrestore (&rtc_lock, flags); + spin_unlock(&rtc_lock); + if (del) + del_timer(&rtc_irq_timer); + spin_unlock_irq(&rtc_timer_lock); return 0; } case RTC_PIE_ON: /* Allow periodic ints */ { - unsigned long flags; /* can be called from isr via rtc_control() */ + int add = 0; + /* * We don't really want Joe User enabling more * than 64Hz of interrupts on a multi-user machine. 
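Editor's note: the rtc.c hunks above and below repeat one locking pattern; here is a condensed sketch of it, distilled for illustration rather than taken verbatim from the patch (it reuses the driver's rtc_status, rtc_irq_timer, rtc_freq, rtc_lock and new rtc_timer_lock globals, and the helper name is hypothetical). The decision is made under the inner rtc_lock, but the timer call happens only after that lock is dropped, still under the new outer rtc_timer_lock, so add_timer()/mod_timer()/del_timer() never run inside rtc_lock's critical section and the rtc_timer_lock -> rtc_lock ordering stays fixed:

	static void rtc_periodic_on(void)
	{
		int add = 0;

		spin_lock_irq(&rtc_timer_lock);	/* outer: serializes timer ops */
		spin_lock(&rtc_lock);		/* inner: guards RTC state */
		if (!(rtc_status & RTC_TIMER_ON)) {
			rtc_irq_timer.expires = jiffies + HZ/rtc_freq + 2*HZ/100;
			rtc_status |= RTC_TIMER_ON;
			add = 1;		/* decide under the inner lock... */
		}
		spin_unlock(&rtc_lock);
		if (add)			/* ...act after dropping it */
			add_timer(&rtc_irq_timer);
		spin_unlock_irq(&rtc_timer_lock);
	}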
@@ -436,14 +602,18 @@ static int rtc_do_ioctl(unsigned int cmd (!capable(CAP_SYS_RESOURCE))) return -EACCES; - spin_lock_irqsave (&rtc_lock, flags); + spin_lock_irq(&rtc_timer_lock); + spin_lock(&rtc_lock); if (!(rtc_status & RTC_TIMER_ON)) { + add = 1; rtc_irq_timer.expires = jiffies + HZ/rtc_freq + 2*HZ/100; - add_timer(&rtc_irq_timer); rtc_status |= RTC_TIMER_ON; } - set_rtc_irq_bit_locked(RTC_PIE); - spin_unlock_irqrestore (&rtc_lock, flags); + spin_unlock(&rtc_lock); + if (add) + add_timer(&rtc_irq_timer); + spin_unlock_irq(&rtc_timer_lock); + set_rtc_irq_bit(RTC_PIE); return 0; } case RTC_UIE_OFF: /* Mask ints from RTC updates. */ @@ -599,6 +769,11 @@ static int rtc_do_ioctl(unsigned int cmd save_freq_select = CMOS_READ(RTC_FREQ_SELECT); CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); + /* + * Make CMOS date writes nonpreemptible even on PREEMPT_RT. + * There's a limit to everything! =B-) + */ + preempt_disable(); #ifdef CONFIG_MACH_DECSTATION CMOS_WRITE(real_yrs, RTC_DEC_YEAR); #endif @@ -608,6 +783,7 @@ static int rtc_do_ioctl(unsigned int cmd CMOS_WRITE(hrs, RTC_HOURS); CMOS_WRITE(min, RTC_MINUTES); CMOS_WRITE(sec, RTC_SECONDS); + preempt_enable(); CMOS_WRITE(save_control, RTC_CONTROL); CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); @@ -624,7 +800,6 @@ static int rtc_do_ioctl(unsigned int cmd { int tmp = 0; unsigned char val; - unsigned long flags; /* can be called from isr via rtc_control() */ /* * The max we can do is 8192Hz. @@ -647,9 +822,9 @@ static int rtc_do_ioctl(unsigned int cmd if (arg != (1<f_flags & FASYNC) { rtc_fasync (-1, file, 0); @@ -757,10 +939,11 @@ static int rtc_release(struct inode *ino no_irq: #endif - spin_lock_irq (&rtc_lock); + spin_lock_irq(&rtc_lock); rtc_irq_data = 0; rtc_status &= ~RTC_IS_OPEN; - spin_unlock_irq (&rtc_lock); + spin_unlock_irq(&rtc_lock); + rtc_close_event(); return 0; } @@ -775,9 +958,9 @@ static unsigned int rtc_poll(struct file poll_wait(file, &rtc_wait, wait); - spin_lock_irq (&rtc_lock); + spin_lock_irq(&rtc_lock); l = rtc_irq_data; - spin_unlock_irq (&rtc_lock); + spin_unlock_irq(&rtc_lock); if (l != 0) return POLLIN | POLLRDNORM; @@ -825,12 +1008,15 @@ int rtc_unregister(rtc_task_t *task) return -EIO; #else unsigned char tmp; + int del; - spin_lock_irq(&rtc_lock); + spin_lock_irq(&rtc_timer_lock); + spin_lock(&rtc_lock); spin_lock(&rtc_task_lock); if (rtc_callback != task) { spin_unlock(&rtc_task_lock); - spin_unlock_irq(&rtc_lock); + spin_unlock(&rtc_lock); + spin_unlock_irq(&rtc_timer_lock); return -ENXIO; } rtc_callback = NULL; @@ -844,13 +1030,17 @@ int rtc_unregister(rtc_task_t *task) CMOS_WRITE(tmp, RTC_CONTROL); CMOS_READ(RTC_INTR_FLAGS); } + del = 0; if (rtc_status & RTC_TIMER_ON) { rtc_status &= ~RTC_TIMER_ON; - del_timer(&rtc_irq_timer); + del = 1; } rtc_status &= ~RTC_IS_OPEN; spin_unlock(&rtc_task_lock); - spin_unlock_irq(&rtc_lock); + spin_unlock(&rtc_lock); + if (del) + del_timer(&rtc_irq_timer); + spin_unlock_irq(&rtc_timer_lock); return 0; #endif } @@ -860,15 +1050,12 @@ int rtc_control(rtc_task_t *task, unsign #ifndef RTC_IRQ return -EIO; #else - unsigned long flags; - if (cmd != RTC_PIE_ON && cmd != RTC_PIE_OFF && cmd != RTC_IRQP_SET) - return -EINVAL; - spin_lock_irqsave(&rtc_task_lock, flags); + spin_lock_irq(&rtc_task_lock); if (rtc_callback != task) { - spin_unlock_irqrestore(&rtc_task_lock, flags); + spin_unlock_irq(&rtc_task_lock); return -ENXIO; } - spin_unlock_irqrestore(&rtc_task_lock, flags); + spin_unlock_irq(&rtc_task_lock); return rtc_do_ioctl(cmd, arg, 1); #endif } @@ -1111,17 
+1298,21 @@ module_exit(rtc_exit); static void rtc_dropped_irq(unsigned long data) { unsigned long freq; + int mod; - spin_lock_irq (&rtc_lock); + spin_lock_irq(&rtc_timer_lock); + spin_lock(&rtc_lock); if (hpet_rtc_dropped_irq()) { - spin_unlock_irq(&rtc_lock); + spin_unlock(&rtc_lock); + spin_unlock_irq(&rtc_timer_lock); return; } /* Just in case someone disabled the timer from behind our back... */ + mod = 0; if (rtc_status & RTC_TIMER_ON) - mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100); + mod = 1; rtc_irq_data += ((rtc_freq/HZ)<<8); rtc_irq_data &= ~0xff; @@ -1129,7 +1320,10 @@ static void rtc_dropped_irq(unsigned lon freq = rtc_freq; - spin_unlock_irq(&rtc_lock); + spin_unlock(&rtc_lock); + if (mod) + mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100); + spin_unlock_irq(&rtc_timer_lock); printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", freq); @@ -1325,32 +1519,40 @@ static void get_rtc_alm_time(struct rtc_ * meddles with the interrupt enable/disable bits. */ -static void mask_rtc_irq_bit_locked(unsigned char bit) +static void mask_rtc_irq_bit(unsigned char bit) { unsigned char val; - if (hpet_mask_rtc_irq_bit(bit)) + spin_lock_irq(&rtc_lock); + if (hpet_mask_rtc_irq_bit(bit)) { + spin_unlock_irq(&rtc_lock); return; + } val = CMOS_READ(RTC_CONTROL); val &= ~bit; CMOS_WRITE(val, RTC_CONTROL); CMOS_READ(RTC_INTR_FLAGS); rtc_irq_data = 0; + spin_unlock_irq(&rtc_lock); } -static void set_rtc_irq_bit_locked(unsigned char bit) +static void set_rtc_irq_bit(unsigned char bit) { unsigned char val; - if (hpet_set_rtc_irq_bit(bit)) + spin_lock_irq(&rtc_lock); + if (hpet_set_rtc_irq_bit(bit)) { + spin_unlock_irq(&rtc_lock); return; + } val = CMOS_READ(RTC_CONTROL); val |= bit; CMOS_WRITE(val, RTC_CONTROL); CMOS_READ(RTC_INTR_FLAGS); rtc_irq_data = 0; + spin_unlock_irq(&rtc_lock); } #endif Index: linux.prev/drivers/char/s3c2410-rtc.c =================================================================== --- linux.prev.orig/drivers/char/s3c2410-rtc.c +++ linux.prev/drivers/char/s3c2410-rtc.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include Index: linux.prev/drivers/char/specialix.c =================================================================== --- linux.prev.orig/drivers/char/specialix.c +++ linux.prev/drivers/char/specialix.c @@ -2488,7 +2488,7 @@ static int __init specialix_init(void) #endif for (i = 0; i < SX_NBOARD; i++) - sx_board[i].lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&sx_board[i].lock); if (sx_init_drivers()) { func_exit(); Index: linux.prev/drivers/char/sx.c =================================================================== --- linux.prev.orig/drivers/char/sx.c +++ linux.prev/drivers/char/sx.c @@ -2321,7 +2321,7 @@ static int sx_init_portstructs (int nboa #ifdef NEW_WRITE_LOCKING port->gs.port_write_sem = MUTEX; #endif - port->gs.driver_lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&port->gs.driver_lock); /* * Initializing wait queue */ Index: linux.prev/drivers/char/sysrq.c =================================================================== --- linux.prev.orig/drivers/char/sysrq.c +++ linux.prev/drivers/char/sysrq.c @@ -114,7 +114,7 @@ static struct sysrq_key_op sysrq_crashdu static void sysrq_handle_reboot(int key, struct pt_regs *pt_regs, struct tty_struct *tty) { - local_irq_enable(); + raw_local_irq_enable(); emergency_restart(); } @@ -169,6 +169,38 @@ static struct sysrq_key_op sysrq_showreg .enable_mask = SYSRQ_ENABLE_DUMP, }; +#ifdef CONFIG_DEBUG_DEADLOCKS + +static void sysrq_handle_showlocks(int key, struct pt_regs 
*pt_regs, + struct tty_struct *tty) +{ + show_all_locks(); +} + +static struct sysrq_key_op sysrq_showlocks_op = { + .handler = sysrq_handle_showlocks, + .help_msg = "show-all-locks(D)", + .action_msg = "Show Locks Held", +}; + +#endif + +#if defined(__i386__) + +static void sysrq_handle_showallregs(int key, struct pt_regs *pt_regs, + struct tty_struct *tty) +{ + nmi_show_all_regs(); +} + +static struct sysrq_key_op sysrq_showallregs_op = { + .handler = sysrq_handle_showallregs, + .help_msg = "showalLcpupc", + .action_msg = "Show Regs On All CPUs", +}; + +#endif + static void sysrq_handle_showstate(int key, struct pt_regs *pt_regs, struct tty_struct *tty) @@ -294,7 +326,11 @@ static struct sysrq_key_op *sysrq_key_ta #else /* c */ NULL, #endif +#ifdef CONFIG_DEBUG_DEADLOCKS +/* d */ &sysrq_showlocks_op, +#else /* d */ NULL, +#endif /* e */ &sysrq_term_op, /* f */ &sysrq_moom_op, /* g */ NULL, @@ -306,7 +342,11 @@ static struct sysrq_key_op *sysrq_key_ta #else /* k */ NULL, #endif +#if defined(__i386__) +/* l */ &sysrq_showallregs_op, +#else /* l */ NULL, +#endif /* m */ &sysrq_showmem_op, /* n */ &sysrq_unrt_op, /* o */ NULL, /* This will often be registered Index: linux.prev/drivers/char/tty_io.c =================================================================== --- linux.prev.orig/drivers/char/tty_io.c +++ linux.prev/drivers/char/tty_io.c @@ -224,6 +224,7 @@ static int check_tty_count(struct tty_st printk(KERN_WARNING "Warning: dev (%s) tty->count(%d) " "!= #fd's(%d) in %s\n", tty->name, tty->count, count, routine); + dump_stack(); return count; } #endif @@ -867,8 +868,8 @@ static void do_tty_hangup(void *data) p->signal->tty = NULL; if (!p->signal->leader) continue; - send_group_sig_info(SIGHUP, SEND_SIG_PRIV, p); - send_group_sig_info(SIGCONT, SEND_SIG_PRIV, p); + group_send_sig_info(SIGHUP, SEND_SIG_PRIV, p); + group_send_sig_info(SIGCONT, SEND_SIG_PRIV, p); if (tty->pgrp > 0) p->signal->tty_old_pgrp = tty->pgrp; } while_each_task_pid(tty->session, PIDTYPE_SID, p); Index: linux.prev/drivers/char/watchdog/cpu5wdt.c =================================================================== --- linux.prev.orig/drivers/char/watchdog/cpu5wdt.c +++ linux.prev/drivers/char/watchdog/cpu5wdt.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -57,7 +58,7 @@ static int ticks = 10000; /* some device data */ static struct { - struct semaphore stop; + struct completion stop; volatile int running; struct timer_list timer; volatile int queue; @@ -85,7 +86,7 @@ static void cpu5wdt_trigger(unsigned lon } else { /* ticks doesn't matter anyway */ - up(&cpu5wdt_device.stop); + complete(&cpu5wdt_device.stop); } } @@ -239,7 +240,7 @@ static int __devinit cpu5wdt_init(void) if ( !val ) printk(KERN_INFO PFX "sorry, was my fault\n"); - init_MUTEX_LOCKED(&cpu5wdt_device.stop); + init_completion(&cpu5wdt_device.stop); cpu5wdt_device.queue = 0; clear_bit(0, &cpu5wdt_device.inuse); @@ -269,7 +270,7 @@ static void __devexit cpu5wdt_exit(void) { if ( cpu5wdt_device.queue ) { cpu5wdt_device.queue = 0; - down(&cpu5wdt_device.stop); + wait_for_completion(&cpu5wdt_device.stop); } misc_deregister(&cpu5wdt_misc); Index: linux.prev/drivers/clocksource/Makefile =================================================================== --- /dev/null +++ linux.prev/drivers/clocksource/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_X86_CYCLONE_TIMER) += cyclone.o +obj-$(CONFIG_ACPI) += acpi_pm.o Index: linux.prev/drivers/clocksource/acpi_pm.c =================================================================== --- 
/dev/null +++ linux.prev/drivers/clocksource/acpi_pm.c @@ -0,0 +1,123 @@ +/* + * linux/drivers/clocksource/acpi_pm.c + * + * This file contains the ACPI PM based clocksource. + * + * This code was largely moved from the i386 timer_pm.c file + * which was (C) Dominik Brodowski 2003 + * and contained the following comments: + * + * Driver to use the Power Management Timer (PMTMR) available in some + * southbridges as primary timing source for the Linux kernel. + * + * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, + * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. + * + * This file is licensed under the GPL v2. + */ + +#include +#include +#include +#include + +/* Number of PMTMR ticks expected during calibration run */ +#define PMTMR_TICKS_PER_SEC 3579545 + +#if (defined(CONFIG_X86) && (!defined(CONFIG_X86_64))) +# include "mach_timer.h" +# define PMTMR_EXPECTED_RATE ((PMTMR_TICKS_PER_SEC*CALIBRATE_TIME_MSEC)/1000) +#endif + +/* + * The I/O port the PMTMR resides at. + * The location is detected during setup_arch(), + * in arch/i386/acpi/boot.c + */ +extern u32 acpi_pmtmr_ioport; +extern int acpi_pmtmr_buggy; + +#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ + +static inline u32 read_pmtmr(void) +{ + /* mask the output to 24 bits */ + return inl(acpi_pmtmr_ioport) & ACPI_PM_MASK; +} + +static cycle_t acpi_pm_read_verified(void) +{ + u32 v1 = 0, v2 = 0, v3 = 0; + + /* + * It has been reported that on various broken + * chipsets (ICH4, PIIX4 and PIIX4E) the ACPI PM clock + * source is not latched, so you must read it multiple + * times to ensure a safe value is read: + */ + do { + v1 = read_pmtmr(); + v2 = read_pmtmr(); + v3 = read_pmtmr(); + } while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1) + || (v3 > v1 && v3 < v2)); + + return (cycle_t)v2; +} + +static cycle_t acpi_pm_read(void) +{ + return (cycle_t)read_pmtmr(); +} + +struct clocksource clocksource_acpi_pm = { + .name = "acpi_pm", + .rating = 200, + .read = acpi_pm_read, + .mask = (cycle_t)ACPI_PM_MASK, + .mult = 0, /* to be calculated */ + .shift = 22, + .is_continuous = 1, +}; + +static int __init init_acpi_pm_clocksource(void) +{ + u32 value1, value2; + unsigned int i; + + if (!acpi_pmtmr_ioport) + return -ENODEV; + + clocksource_acpi_pm.mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, + clocksource_acpi_pm.shift); + + /* "verify" this timing source: */ + value1 = read_pmtmr(); + for (i = 0; i < 10000; i++) { + value2 = read_pmtmr(); + if (value2 == value1) + continue; + if (value2 > value1) + goto pm_good; + if ((value2 < value1) && ((value2) < 0xFFF)) + goto pm_good; + printk(KERN_INFO "PM-Timer had inconsistent results: %#x, %#x - aborting.\n", value1, value2); + return -EINVAL; + } + printk(KERN_INFO "PM-Timer had no reasonable result: %#x - aborting.\n", value1); + return -ENODEV; + +pm_good: + + /* check to see if pmtmr is known buggy: */ + if (acpi_pmtmr_buggy) { + clocksource_acpi_pm.read = acpi_pm_read_verified; + clocksource_acpi_pm.rating = 110; + } + + register_clocksource(&clocksource_acpi_pm); + + return 0; +} + +module_init(init_acpi_pm_clocksource); Index: linux.prev/drivers/clocksource/cyclone.c =================================================================== --- /dev/null +++ linux.prev/drivers/clocksource/cyclone.c @@ -0,0 +1,121 @@ +#include +#include +#include +#include +#include + +#include +#include + +#include "mach_timer.h" + +#define CYCLONE_CBAR_ADDR 0xFEB00CD0 /* base address ptr */ +#define CYCLONE_PMCC_OFFSET 0x51A0 /* offset to
control register */ +#define CYCLONE_MPCS_OFFSET 0x51A8 /* offset to select register */ +#define CYCLONE_MPMC_OFFSET 0x51D0 /* offset to count register */ +#define CYCLONE_TIMER_FREQ 99780000 /* 100MHz, but not really */ +#define CYCLONE_TIMER_MASK 0xFFFFFFFF /* 32 bit mask */ + +int use_cyclone = 0; +static void __iomem *cyclone_ptr; + +static cycle_t read_cyclone(void) +{ + return (cycle_t)readl(cyclone_ptr); +} + +struct clocksource clocksource_cyclone = { + .name = "cyclone", + .rating = 250, + .read = read_cyclone, + .mask = (cycle_t)CYCLONE_TIMER_MASK, + .mult = 10, + .shift = 0, + .is_continuous = 1, +}; + +static int __init init_cyclone_clocksource(void) +{ + unsigned long base; /* saved value from CBAR */ + unsigned long offset; + u32 __iomem* volatile cyclone_timer; /* Cyclone MPMC0 register */ + u32 __iomem* reg; + int i; + + /* make sure we're on a summit box: */ + if (!use_cyclone) + return -ENODEV; + + printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n"); + + /* find base address: */ + offset = CYCLONE_CBAR_ADDR; + reg = ioremap_nocache(offset, sizeof(reg)); + if (!reg) { + printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n"); + return -ENODEV; + } + /* even on 64bit systems, this is only 32bits: */ + base = readl(reg); + if (!base) { + printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n"); + return -ENODEV; + } + iounmap(reg); + + /* setup PMCC: */ + offset = base + CYCLONE_PMCC_OFFSET; + reg = ioremap_nocache(offset, sizeof(reg)); + if (!reg) { + printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n"); + return -ENODEV; + } + writel(0x00000001, reg); + iounmap(reg); + + /* setup MPCS: */ + offset = base + CYCLONE_MPCS_OFFSET; + reg = ioremap_nocache(offset, sizeof(reg)); + if (!reg) { + printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n"); + return -ENODEV; + } + writel(0x00000001, reg); + iounmap(reg); + + /* map in cyclone_timer: */ + offset = base + CYCLONE_MPMC_OFFSET; + cyclone_timer = ioremap_nocache(offset, sizeof(u64)); + if (!cyclone_timer) { + printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n"); + return -ENODEV; + } + + /* quick test to make sure it's ticking: */ + for (i = 0; i < 3; i++) { + u32 old = readl(cyclone_timer); + int stall = 100; + + while (stall--) + barrier(); + + if (readl(cyclone_timer) == old) { + printk(KERN_ERR "Summit chipset: Counter not counting! 
DISABLED\n"); + iounmap(cyclone_timer); + cyclone_timer = NULL; + return -ENODEV; + } + } + cyclone_ptr = cyclone_timer; + + /* sort out mult/shift values: */ + clocksource_cyclone.shift = 22; + clocksource_cyclone.mult = clocksource_hz2mult(CYCLONE_TIMER_FREQ, + clocksource_cyclone.shift); + + register_clocksource(&clocksource_cyclone); + + return 0; +} + +module_init(init_cyclone_clocksource); Index: linux.prev/drivers/connector/cn_proc.c =================================================================== --- linux.prev.orig/drivers/connector/cn_proc.c +++ linux.prev/drivers/connector/cn_proc.c @@ -24,6 +24,7 @@ #include #include +#include #include #include @@ -56,7 +57,7 @@ void proc_fork_connector(struct task_str msg = (struct cn_msg*)buffer; ev = (struct proc_event*)msg->data; get_seq(&msg->seq, &ev->cpu); - getnstimestamp(&ev->timestamp); + ktime_get_ts(&ev->timestamp); /* get high res monotonic timestamp */ ev->what = PROC_EVENT_FORK; ev->event_data.fork.parent_pid = task->real_parent->pid; ev->event_data.fork.parent_tgid = task->real_parent->tgid; @@ -82,7 +83,7 @@ void proc_exec_connector(struct task_str msg = (struct cn_msg*)buffer; ev = (struct proc_event*)msg->data; get_seq(&msg->seq, &ev->cpu); - getnstimestamp(&ev->timestamp); + ktime_get_ts(&ev->timestamp); ev->what = PROC_EVENT_EXEC; ev->event_data.exec.process_pid = task->pid; ev->event_data.exec.process_tgid = task->tgid; @@ -116,7 +117,7 @@ void proc_id_connector(struct task_struc } else return; get_seq(&msg->seq, &ev->cpu); - getnstimestamp(&ev->timestamp); + ktime_get_ts(&ev->timestamp); memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ @@ -136,7 +137,7 @@ void proc_exit_connector(struct task_str msg = (struct cn_msg*)buffer; ev = (struct proc_event*)msg->data; get_seq(&msg->seq, &ev->cpu); - getnstimestamp(&ev->timestamp); + ktime_get_ts(&ev->timestamp); ev->what = PROC_EVENT_EXIT; ev->event_data.exit.process_pid = task->pid; ev->event_data.exit.process_tgid = task->tgid; @@ -169,7 +170,7 @@ static void cn_proc_ack(int err, int rcv msg = (struct cn_msg*)buffer; ev = (struct proc_event*)msg->data; msg->seq = rcvd_seq; - getnstimestamp(&ev->timestamp); + ktime_get_ts(&ev->timestamp); ev->cpu = -1; ev->what = PROC_EVENT_NONE; ev->event_data.ack.err = err; Index: linux.prev/drivers/cpufreq/cpufreq.c =================================================================== --- linux.prev.orig/drivers/cpufreq/cpufreq.c +++ linux.prev/drivers/cpufreq/cpufreq.c @@ -601,7 +601,8 @@ static int cpufreq_add_dev (struct sys_d policy->cpu = cpu; policy->cpus = cpumask_of_cpu(cpu); - init_MUTEX_LOCKED(&policy->lock); + init_MUTEX(&policy->lock); + down(&policy->lock); init_completion(&policy->kobj_unregister); INIT_WORK(&policy->update, handle_update, (void *)(long)cpu); @@ -610,6 +611,7 @@ static int cpufreq_add_dev (struct sys_d */ ret = cpufreq_driver->init(policy); if (ret) { + up(&policy->lock); dprintk("initialization failed\n"); goto err_out; } @@ -622,8 +624,10 @@ static int cpufreq_add_dev (struct sys_d strlcpy(policy->kobj.name, "cpufreq", KOBJ_NAME_LEN); ret = kobject_register(&policy->kobj); - if (ret) + if (ret) { + up(&policy->lock); goto err_out_driver_exit; + } /* set up files for this cpu device */ drv_attr = cpufreq_driver->attr; Index: linux.prev/drivers/i2c/busses/i2c-pxa.c =================================================================== --- linux.prev.orig/drivers/i2c/busses/i2c-pxa.c +++ linux.prev/drivers/i2c/busses/i2c-pxa.c @@ -926,7 +926,7 @@ static struct i2c_algorithm 
i2c_pxa_algo }; static struct pxa_i2c i2c_pxa = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = SPIN_LOCK_UNLOCKED(i2c_pxa.lock), .wait = __WAIT_QUEUE_HEAD_INITIALIZER(i2c_pxa.wait), .adap = { .owner = THIS_MODULE, Index: linux.prev/drivers/i2c/busses/i2c-s3c2410.c =================================================================== --- linux.prev.orig/drivers/i2c/busses/i2c-s3c2410.c +++ linux.prev/drivers/i2c/busses/i2c-s3c2410.c @@ -573,7 +573,7 @@ static struct i2c_algorithm s3c24xx_i2c_ }; static struct s3c24xx_i2c s3c24xx_i2c = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = SPIN_LOCK_UNLOCKED(s3c24xx_i2c.lock), .wait = __WAIT_QUEUE_HEAD_INITIALIZER(s3c24xx_i2c.wait), .adap = { .name = "s3c2410-i2c", Index: linux.prev/drivers/i2c/chips/tps65010.c =================================================================== --- linux.prev.orig/drivers/i2c/chips/tps65010.c +++ linux.prev/drivers/i2c/chips/tps65010.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -33,7 +34,6 @@ #include #include -#include #include #include Index: linux.prev/drivers/ide/ide-floppy.c =================================================================== --- linux.prev.orig/drivers/ide/ide-floppy.c +++ linux.prev/drivers/ide/ide-floppy.c @@ -838,7 +838,7 @@ static ide_startstop_t idefloppy_pc_intr "transferred\n", pc->actually_transferred); clear_bit(PC_DMA_IN_PROGRESS, &pc->flags); - local_irq_enable(); + local_irq_enable_nort(); if (status.b.check || test_bit(PC_DMA_ERROR, &pc->flags)) { /* Error detected */ @@ -1670,9 +1670,9 @@ static int idefloppy_get_format_progress atapi_status_t status; unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); status.all = HWIF(drive)->INB(IDE_STATUS_REG); - local_irq_restore(flags); + local_irq_restore_nort(flags); progress_indication = !status.b.dsc ? 
0 : 0x10000; } Index: linux.prev/drivers/ide/ide-io.c =================================================================== --- linux.prev.orig/drivers/ide/ide-io.c +++ linux.prev/drivers/ide/ide-io.c @@ -636,7 +636,7 @@ static ide_startstop_t drive_cmd_intr (i u8 stat = hwif->INB(IDE_STATUS_REG); int retries = 10; - local_irq_enable(); + local_irq_enable_nort(); if ((stat & DRQ_STAT) && args && args[3]) { u8 io_32bit = drive->io_32bit; drive->io_32bit = 0; @@ -1107,7 +1107,7 @@ static void ide_do_request (ide_hwgroup_ ide_get_lock(ide_intr, hwgroup); /* caller must own ide_lock */ - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); while (!hwgroup->busy) { hwgroup->busy = 1; @@ -1219,8 +1219,7 @@ static void ide_do_request (ide_hwgroup_ */ if (masked_irq != IDE_NO_IRQ && hwif->irq != masked_irq) disable_irq_nosync(hwif->irq); - spin_unlock(&ide_lock); - local_irq_enable(); + spin_unlock_irq(&ide_lock); /* allow other IRQs while we start this request */ startstop = start_request(drive, rq); spin_lock_irq(&ide_lock); @@ -1368,7 +1367,7 @@ void ide_timer_expiry (unsigned long dat #endif /* DISABLE_IRQ_NOSYNC */ /* local CPU only, * as if we were handling an interrupt */ - local_irq_disable(); + local_irq_disable_nort(); if (hwgroup->polling) { startstop = handler(drive); } else if (drive_is_ready(drive)) { @@ -1565,7 +1564,7 @@ irqreturn_t ide_intr (int irq, void *dev spin_unlock(&ide_lock); if (drive->unmask) - local_irq_enable(); + local_irq_enable_nort(); /* service this interrupt, may set handler for next interrupt */ startstop = handler(drive); spin_lock_irq(&ide_lock); Index: linux.prev/drivers/ide/ide-iops.c =================================================================== --- linux.prev.orig/drivers/ide/ide-iops.c +++ linux.prev/drivers/ide/ide-iops.c @@ -244,10 +244,10 @@ static void ata_input_data(ide_drive_t * if (io_32bit) { if (io_32bit & 2) { unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(drive, IDE_NSECTOR_REG); hwif->INSL(IDE_DATA_REG, buffer, wcount); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else hwif->INSL(IDE_DATA_REG, buffer, wcount); } else { @@ -266,10 +266,10 @@ static void ata_output_data(ide_drive_t if (io_32bit) { if (io_32bit & 2) { unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(drive, IDE_NSECTOR_REG); hwif->OUTSL(IDE_DATA_REG, buffer, wcount); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else hwif->OUTSL(IDE_DATA_REG, buffer, wcount); } else { @@ -564,12 +564,12 @@ int ide_wait_stat (ide_startstop_t *star if (!(stat & BUSY_STAT)) break; - local_irq_restore(flags); + local_irq_restore_nort(flags); *startstop = ide_error(drive, "status timeout", stat); return 1; } } - local_irq_restore(flags); + local_irq_restore_nort(flags); } /* * Allow status to settle, then read it again. 
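A note on the local_irq_*_nort() operations used throughout these IDE hunks: they are the -rt tree's "no-RT" variants of the interrupt-disable primitives. On a stock kernel they behave exactly like the plain local_irq_*() forms; on PREEMPT_RT, where this code runs in preemptible process context and interrupt handlers are threaded, they compile down to almost nothing. A minimal sketch of the intended mapping, assuming the usual -rt definitions (the exact macros live elsewhere in this patch series and may differ in detail):

#ifdef CONFIG_PREEMPT_RT
/* RT: hard interrupts stay enabled; keep only the flags bookkeeping */
# define local_irq_disable_nort()		do { } while (0)
# define local_irq_enable_nort()		do { } while (0)
# define local_irq_save_nort(flags)		local_save_flags(flags)
# define local_irq_restore_nort(flags)		(void)(flags)
#else
/* !RT: identical to the plain local_irq_*() primitives */
# define local_irq_disable_nort()		local_irq_disable()
# define local_irq_enable_nort()		local_irq_enable()
# define local_irq_save_nort(flags)		local_irq_save(flags)
# define local_irq_restore_nort(flags)		local_irq_restore(flags)
#endif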
@@ -727,17 +727,15 @@ int ide_driveid_update (ide_drive_t *dri printk("%s: CHECK for good STATUS\n", drive->name); return 0; } - local_irq_save(flags); - SELECT_MASK(drive, 0); id = kmalloc(SECTOR_WORDS*4, GFP_ATOMIC); - if (!id) { - local_irq_restore(flags); + if (!id) return 0; - } + local_irq_save_nort(flags); + SELECT_MASK(drive, 0); ata_input_data(drive, id, SECTOR_WORDS); (void) hwif->INB(IDE_STATUS_REG); /* clear drive IRQ */ - local_irq_enable(); - local_irq_restore(flags); + local_irq_enable_nort(); + local_irq_restore_nort(flags); ide_fix_driveid(id); if (id) { drive->id->dma_ultra = id->dma_ultra; @@ -817,7 +815,7 @@ int ide_config_drive_speed (ide_drive_t if (time_after(jiffies, timeout)) break; } - local_irq_restore(flags); + local_irq_restore_nort(flags); } /* @@ -1243,6 +1241,7 @@ int ide_wait_not_busy(ide_hwif_t *hwif, */ if (stat == 0xff) return -ENODEV; + touch_softlockup_watchdog(); } return -EBUSY; } Index: linux.prev/drivers/ide/ide-lib.c =================================================================== --- linux.prev.orig/drivers/ide/ide-lib.c +++ linux.prev/drivers/ide/ide-lib.c @@ -447,15 +447,16 @@ EXPORT_SYMBOL_GPL(ide_set_xfer_rate); static void ide_dump_opcode(ide_drive_t *drive) { + unsigned long flags; struct request *rq; u8 opcode = 0; int found = 0; - spin_lock(&ide_lock); + spin_lock_irqsave(&ide_lock, flags); rq = NULL; if (HWGROUP(drive)) rq = HWGROUP(drive)->rq; - spin_unlock(&ide_lock); + spin_unlock_irqrestore(&ide_lock, flags); if (!rq) return; if (rq->flags & (REQ_DRIVE_CMD | REQ_DRIVE_TASK)) { @@ -483,10 +484,8 @@ static void ide_dump_opcode(ide_drive_t static u8 ide_dump_ata_status(ide_drive_t *drive, const char *msg, u8 stat) { ide_hwif_t *hwif = HWIF(drive); - unsigned long flags; u8 err = 0; - local_irq_set(flags); printk("%s: %s: status=0x%02x { ", drive->name, msg, stat); if (stat & BUSY_STAT) printk("Busy "); @@ -546,7 +545,7 @@ static u8 ide_dump_ata_status(ide_drive_ printk("\n"); } ide_dump_opcode(drive); - local_irq_restore(flags); + return err; } @@ -561,14 +560,12 @@ static u8 ide_dump_ata_status(ide_drive_ static u8 ide_dump_atapi_status(ide_drive_t *drive, const char *msg, u8 stat) { - unsigned long flags; - atapi_status_t status; atapi_error_t error; status.all = stat; error.all = 0; - local_irq_set(flags); + printk("%s: %s: status=0x%02x { ", drive->name, msg, stat); if (status.b.bsy) printk("Busy "); @@ -594,7 +591,7 @@ static u8 ide_dump_atapi_status(ide_driv printk("}\n"); } ide_dump_opcode(drive); - local_irq_restore(flags); + return error.all; } Index: linux.prev/drivers/ide/ide-probe.c =================================================================== --- linux.prev.orig/drivers/ide/ide-probe.c +++ linux.prev/drivers/ide/ide-probe.c @@ -184,7 +184,7 @@ static inline void do_identify (ide_driv hwif->ata_input_data(drive, id, SECTOR_WORDS); drive->id_read = 1; - local_irq_enable(); + local_irq_enable_nort(); ide_fix_driveid(id); #if defined (CONFIG_SCSI_EATA_DMA) || defined (CONFIG_SCSI_EATA_PIO) || defined (CONFIG_SCSI_EATA) @@ -362,14 +362,14 @@ static int actual_try_to_identify (ide_d unsigned long flags; /* local CPU only; some systems need this */ - local_irq_save(flags); + local_irq_save_nort(flags); /* drive returned ID */ do_identify(drive, cmd); /* drive responded with ID */ rc = 0; /* clear drive IRQ */ (void) hwif->INB(IDE_STATUS_REG); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else { /* drive refused ID */ rc = 2; @@ -655,7 +655,7 @@ static void hwif_release_dev (struct dev { ide_hwif_t *hwif = 
container_of(dev, ide_hwif_t, gendev); - up(&hwif->gendev_rel_sem); + complete(&hwif->gendev_rel_comp); } static void hwif_register (ide_hwif_t *hwif) @@ -841,7 +841,7 @@ static void probe_hwif(ide_hwif_t *hwif) } while ((stat & BUSY_STAT) && time_after(timeout, jiffies)); } - local_irq_restore(flags); + local_irq_restore_nort(flags); /* * Use cached IRQ number. It might be (and is...) changed by probe * code above @@ -1325,7 +1325,7 @@ static void drive_release_dev (struct de drive->queue = NULL; spin_unlock_irq(&ide_lock); - up(&drive->gendev_rel_sem); + complete(&drive->gendev_rel_comp); } /* Index: linux.prev/drivers/ide/ide-taskfile.c =================================================================== --- linux.prev.orig/drivers/ide/ide-taskfile.c +++ linux.prev/drivers/ide/ide-taskfile.c @@ -223,7 +223,7 @@ ide_startstop_t task_no_data_intr (ide_d ide_hwif_t *hwif = HWIF(drive); u8 stat; - local_irq_enable(); + local_irq_enable_nort(); if (!OK_STAT(stat = hwif->INB(IDE_STATUS_REG),READY_STAT,BAD_STAT)) { return ide_error(drive, "task_no_data_intr", stat); /* calls ide_end_drive_cmd */ @@ -275,7 +275,7 @@ static void ide_pio_sector(ide_drive_t * offset %= PAGE_SIZE; #ifdef CONFIG_HIGHMEM - local_irq_save(flags); + local_irq_save_nort(flags); #endif buf = kmap_atomic(page, KM_BIO_SRC_IRQ) + offset; @@ -295,7 +295,7 @@ static void ide_pio_sector(ide_drive_t * kunmap_atomic(buf, KM_BIO_SRC_IRQ); #ifdef CONFIG_HIGHMEM - local_irq_restore(flags); + local_irq_restore_nort(flags); #endif } @@ -453,7 +453,7 @@ ide_startstop_t pre_task_out_intr (ide_d } if (!drive->unmask) - local_irq_disable(); + local_irq_disable_nort(); ide_set_handler(drive, &task_out_intr, WAIT_WORSTCASE, NULL); ide_pio_datablock(drive, rq, 1); Index: linux.prev/drivers/ide/ide.c =================================================================== --- linux.prev.orig/drivers/ide/ide.c +++ linux.prev/drivers/ide/ide.c @@ -222,7 +222,7 @@ static void init_hwif_data(ide_hwif_t *h hwif->mwdma_mask = 0x80; /* disable all mwdma */ hwif->swdma_mask = 0x80; /* disable all swdma */ - sema_init(&hwif->gendev_rel_sem, 0); + init_completion(&hwif->gendev_rel_comp); default_hwif_iops(hwif); default_hwif_transport(hwif); @@ -245,7 +245,7 @@ static void init_hwif_data(ide_hwif_t *h drive->is_flash = 0; drive->vdma = 0; INIT_LIST_HEAD(&drive->list); - sema_init(&drive->gendev_rel_sem, 0); + init_completion(&drive->gendev_rel_comp); } } @@ -602,7 +602,7 @@ void ide_unregister(unsigned int index) } spin_unlock_irq(&ide_lock); device_unregister(&drive->gendev); - down(&drive->gendev_rel_sem); + wait_for_completion(&drive->gendev_rel_comp); spin_lock_irq(&ide_lock); } hwif->present = 0; @@ -662,7 +662,7 @@ void ide_unregister(unsigned int index) /* More messed up locking ... 
*/ spin_unlock_irq(&ide_lock); device_unregister(&hwif->gendev); - down(&hwif->gendev_rel_sem); + wait_for_completion(&hwif->gendev_rel_comp); /* * Remove us from the kernel's knowledge @@ -1048,15 +1048,13 @@ int ide_spin_wait_hwgroup (ide_drive_t * spin_lock_irq(&ide_lock); while (hwgroup->busy) { - unsigned long lflags; spin_unlock_irq(&ide_lock); - local_irq_set(lflags); + if (time_after(jiffies, timeout)) { - local_irq_restore(lflags); printk(KERN_ERR "%s: channel busy\n", drive->name); return -EBUSY; } - local_irq_restore(lflags); + spin_lock_irq(&ide_lock); } return 0; Index: linux.prev/drivers/ide/pci/alim15x3.c =================================================================== --- linux.prev.orig/drivers/ide/pci/alim15x3.c +++ linux.prev/drivers/ide/pci/alim15x3.c @@ -296,7 +296,6 @@ static void ali15x3_tune_drive (ide_driv struct pci_dev *dev = hwif->pci_dev; int s_time, a_time, c_time; u8 s_clc, a_clc, r_clc; - unsigned long flags; int bus_speed = system_bus_clock(); int port = hwif->channel ? 0x5c : 0x58; int portFIFO = hwif->channel ? 0x55 : 0x54; @@ -323,7 +322,6 @@ static void ali15x3_tune_drive (ide_driv if (r_clc >= 16) r_clc = 0; } - local_irq_save(flags); /* * PIO mode => ATA FIFO on, ATAPI FIFO off @@ -345,7 +343,6 @@ static void ali15x3_tune_drive (ide_driv pci_write_config_byte(dev, port, s_clc); pci_write_config_byte(dev, port+drive->select.b.unit+2, (a_clc << 4) | r_clc); - local_irq_restore(flags); /* * setup active rec @@ -585,7 +582,6 @@ static int ali15x3_dma_setup(ide_drive_t static unsigned int __devinit init_chipset_ali15x3 (struct pci_dev *dev, const char *name) { - unsigned long flags; u8 tmpbyte; struct pci_dev *north = pci_find_slot(0, PCI_DEVFN(0,0)); @@ -601,7 +597,6 @@ static unsigned int __devinit init_chips } #endif /* defined(DISPLAY_ALI_TIMINGS) && defined(CONFIG_PROC_FS) */ - local_irq_save(flags); if (m5229_revision < 0xC2) { /* @@ -614,7 +609,6 @@ static unsigned int __devinit init_chips * clear bit 7 */ pci_write_config_byte(dev, 0x4b, tmpbyte & 0x7F); - local_irq_restore(flags); return 0; } @@ -639,7 +633,6 @@ static unsigned int __devinit init_chips * 0:0.0 so if we didn't find one we know what is cooking. */ if (north && north->vendor != PCI_VENDOR_ID_AL) { - local_irq_restore(flags); return 0; } @@ -662,7 +655,6 @@ static unsigned int __devinit init_chips pci_write_config_byte(isa_dev, 0x79, tmpbyte | 0x02); } } - local_irq_restore(flags); return 0; } @@ -683,10 +675,8 @@ static unsigned int __devinit ata66_ali1 unsigned int ata66 = 0; u8 cable_80_pin[2] = { 0, 0 }; - unsigned long flags; u8 tmpbyte; - local_irq_save(flags); if (m5229_revision >= 0xC2) { /* @@ -736,7 +726,6 @@ static unsigned int __devinit ata66_ali1 pci_write_config_byte(dev, 0x53, tmpbyte); - local_irq_restore(flags); return(ata66); } Index: linux.prev/drivers/ide/pci/hpt366.c =================================================================== --- linux.prev.orig/drivers/ide/pci/hpt366.c +++ linux.prev/drivers/ide/pci/hpt366.c @@ -1481,7 +1481,6 @@ static void __devinit init_dma_hpt366(id u8 dma_new = 0, dma_old = 0; u8 primary = hwif->channel ? 0x4b : 0x43; u8 secondary = hwif->channel ? 
0x4f : 0x47; - unsigned long flags; if (!dmabase) return; @@ -1493,8 +1492,6 @@ static void __devinit init_dma_hpt366(id dma_old = hwif->INB(dmabase+2); - local_irq_save(flags); - dma_new = dma_old; pci_read_config_byte(hwif->pci_dev, primary, &masterdma); pci_read_config_byte(hwif->pci_dev, secondary, &slavedma); @@ -1504,8 +1501,6 @@ static void __devinit init_dma_hpt366(id if (dma_new != dma_old) hwif->OUTB(dma_new, dmabase+2); - local_irq_restore(flags); - ide_setup_dma(hwif, dmabase, 8); } Index: linux.prev/drivers/ide/setup-pci.c =================================================================== --- linux.prev.orig/drivers/ide/setup-pci.c +++ linux.prev/drivers/ide/setup-pci.c @@ -665,8 +665,11 @@ static int do_ide_setup_pci_device(struc { static ata_index_t ata_index = { .b = { .low = 0xff, .high = 0xff } }; int tried_config = 0; + unsigned long flags; int pciirq, ret; + spin_lock_irqsave(&ide_lock, flags); + ret = ide_setup_pci_controller(dev, d, noisy, &tried_config); if (ret < 0) goto out; @@ -721,6 +724,8 @@ static int do_ide_setup_pci_device(struc *index = ata_index; ide_pci_setup_ports(dev, d, pciirq, index); out: + spin_unlock_irqrestore(&ide_lock, flags); + return ret; } Index: linux.prev/drivers/ieee1394/ieee1394_types.h =================================================================== --- linux.prev.orig/drivers/ieee1394/ieee1394_types.h +++ linux.prev/drivers/ieee1394/ieee1394_types.h @@ -19,7 +19,7 @@ struct hpsb_tlabel_pool { spinlock_t lock; u8 next; u32 allocations; - struct semaphore count; + struct compat_semaphore count; }; #define HPSB_TPOOL_INIT(_tp) \ Index: linux.prev/drivers/ieee1394/nodemgr.c =================================================================== --- linux.prev.orig/drivers/ieee1394/nodemgr.c +++ linux.prev/drivers/ieee1394/nodemgr.c @@ -114,7 +114,7 @@ struct host_info { struct hpsb_host *host; struct list_head list; struct completion exited; - struct semaphore reset_sem; + struct compat_semaphore reset_sem; int pid; char daemon_name[15]; int kill_me; Index: linux.prev/drivers/ieee1394/raw1394-private.h =================================================================== --- linux.prev.orig/drivers/ieee1394/raw1394-private.h +++ linux.prev/drivers/ieee1394/raw1394-private.h @@ -29,7 +29,7 @@ struct file_info { struct list_head req_pending; struct list_head req_complete; - struct semaphore complete_sem; + struct compat_semaphore complete_sem; spinlock_t reqlists_lock; wait_queue_head_t poll_wait_complete; Index: linux.prev/drivers/input/gameport/gameport.c =================================================================== --- linux.prev.orig/drivers/input/gameport/gameport.c +++ linux.prev/drivers/input/gameport/gameport.c @@ -21,6 +21,7 @@ #include #include #include +#include #include /* HZ */ /*#include */ @@ -101,12 +102,12 @@ static int gameport_measure_speed(struct tx = 1 << 30; for(i = 0; i < 50; i++) { - local_irq_save(flags); + local_irq_save_nort(flags); GET_TIME(t1); for (t = 0; t < 50; t++) gameport_read(gameport); GET_TIME(t2); GET_TIME(t3); - local_irq_restore(flags); + local_irq_restore_nort(flags); udelay(i * 10); if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t; } @@ -125,11 +126,11 @@ static int gameport_measure_speed(struct tx = 1 << 30; for(i = 0; i < 50; i++) { - local_irq_save(flags); + local_irq_save_nort(flags); rdtscl(t1); for (t = 0; t < 50; t++) gameport_read(gameport); rdtscl(t2); - local_irq_restore(flags); + local_irq_restore_nort(flags); udelay(i * 10); if (t2 - t1 < tx) tx = t2 - t1; } Index: 
linux.prev/drivers/input/serio/sa1111ps2.c =================================================================== --- linux.prev.orig/drivers/input/serio/sa1111ps2.c +++ linux.prev/drivers/input/serio/sa1111ps2.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include Index: linux.prev/drivers/media/dvb/dvb-core/dvb_frontend.c =================================================================== --- linux.prev.orig/drivers/media/dvb/dvb-core/dvb_frontend.c +++ linux.prev/drivers/media/dvb/dvb-core/dvb_frontend.c @@ -95,7 +95,7 @@ struct dvb_frontend_private { struct dvb_device *dvbdev; struct dvb_frontend_parameters parameters; struct dvb_fe_events events; - struct semaphore sem; + struct compat_semaphore sem; struct list_head list_head; wait_queue_head_t wait_queue; pid_t thread_pid; Index: linux.prev/drivers/media/dvb/dvb-core/dvb_frontend.h =================================================================== --- linux.prev.orig/drivers/media/dvb/dvb-core/dvb_frontend.h +++ linux.prev/drivers/media/dvb/dvb-core/dvb_frontend.h @@ -86,7 +86,7 @@ struct dvb_fe_events { int eventr; int overflow; wait_queue_head_t wait_queue; - struct semaphore sem; + struct compat_semaphore sem; }; struct dvb_frontend { Index: linux.prev/drivers/media/video/zr36120_i2c.c =================================================================== --- linux.prev.orig/drivers/media/video/zr36120_i2c.c +++ linux.prev/drivers/media/video/zr36120_i2c.c @@ -120,7 +120,7 @@ struct i2c_bus zoran_i2c_bus_template = I2C_BUSID_ZORAN, NULL, - SPIN_LOCK_UNLOCKED, + SPIN_LOCK_UNLOCKED(zoran_i2c_bus_template.lock), attach_inform, detach_inform, Index: linux.prev/drivers/message/i2o/exec-osm.c =================================================================== --- linux.prev.orig/drivers/message/i2o/exec-osm.c +++ linux.prev/drivers/message/i2o/exec-osm.c @@ -209,7 +209,7 @@ static int i2o_msg_post_wait_complete(st { struct i2o_exec_wait *wait, *tmp; unsigned long flags; - static spinlock_t lock = SPIN_LOCK_UNLOCKED; + static DEFINE_SPINLOCK(lock); int rc = 1; /* Index: linux.prev/drivers/misc/ibmasm/module.c =================================================================== --- linux.prev.orig/drivers/misc/ibmasm/module.c +++ linux.prev/drivers/misc/ibmasm/module.c @@ -85,7 +85,7 @@ static int __devinit ibmasm_init_one(str } memset(sp, 0, sizeof(struct service_processor)); - sp->lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&sp->lock); INIT_LIST_HEAD(&sp->command_queue); pci_set_drvdata(pdev, (void *)sp); Index: linux.prev/drivers/net/3c527.c =================================================================== --- linux.prev.orig/drivers/net/3c527.c +++ linux.prev/drivers/net/3c527.c @@ -182,7 +182,7 @@ struct mc32_local u16 rx_ring_tail; /* index to rx de-queue end */ - struct semaphore cmd_mutex; /* Serialises issuing of execute commands */ + struct compat_semaphore cmd_mutex; /* Serialises issuing of execute commands */ struct completion execution_cmd; /* Card has completed an execute command */ struct completion xceiver_cmd; /* Card has completed a tx or rx command */ }; Index: linux.prev/drivers/net/3c59x.c =================================================================== --- linux.prev.orig/drivers/net/3c59x.c +++ linux.prev/drivers/net/3c59x.c @@ -963,9 +963,9 @@ static void poll_vortex(struct net_devic struct vortex_private *vp = netdev_priv(dev); unsigned long flags; local_save_flags(flags); - local_irq_disable(); + local_irq_disable_nort(); (vp->full_bus_master_rx ? 
boomerang_interrupt:vortex_interrupt)(dev->irq,dev,NULL); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #endif @@ -2035,13 +2035,17 @@ static void vortex_tx_timeout(struct net /* * Block interrupts because vortex_interrupt does a bare spin_lock() */ +#ifndef CONFIG_PREEMPT_RT unsigned long flags; local_irq_save(flags); +#endif if (vp->full_bus_master_tx) boomerang_interrupt(dev->irq, dev, NULL); else vortex_interrupt(dev->irq, dev, NULL); +#ifndef CONFIG_PREEMPT_RT local_irq_restore(flags); +#endif } } Index: linux.prev/drivers/net/8139too.c =================================================================== --- linux.prev.orig/drivers/net/8139too.c +++ linux.prev/drivers/net/8139too.c @@ -2130,10 +2130,10 @@ static int rtl8139_poll(struct net_devic * Order is important since data can get interrupted * again when we think we are done. */ - local_irq_disable(); + raw_local_irq_disable(); RTL_W16_F(IntrMask, rtl8139_intr_mask); __netif_rx_complete(dev); - local_irq_enable(); + raw_local_irq_enable(); } spin_unlock(&tp->rx_lock); Index: linux.prev/drivers/net/e1000/e1000_main.c =================================================================== --- linux.prev.orig/drivers/net/e1000/e1000_main.c +++ linux.prev/drivers/net/e1000/e1000_main.c @@ -2736,10 +2736,8 @@ e1000_xmit_frame(struct sk_buff *skb, st if(adapter->hw.tx_pkt_filtering && (adapter->hw.mac_type == e1000_82573) ) e1000_transfer_dhcp_info(adapter, skb); - local_irq_save(flags); - if (!spin_trylock(&tx_ring->tx_lock)) { + if (!spin_trylock_irqsave(&tx_ring->tx_lock, flags)) { /* Collision - tell upper layer to requeue */ - local_irq_restore(flags); return NETDEV_TX_LOCKED; } Index: linux.prev/drivers/net/hamradio/6pack.c =================================================================== --- linux.prev.orig/drivers/net/hamradio/6pack.c +++ linux.prev/drivers/net/hamradio/6pack.c @@ -124,7 +124,7 @@ struct sixpack { struct timer_list tx_t; struct timer_list resync_t; atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; spinlock_t lock; }; Index: linux.prev/drivers/net/hamradio/mkiss.c =================================================================== --- linux.prev.orig/drivers/net/hamradio/mkiss.c +++ linux.prev/drivers/net/hamradio/mkiss.c @@ -85,7 +85,7 @@ struct mkiss { #define CRC_MODE_SMACK_TEST 4 atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; }; /*---------------------------------------------------------------------------*/ Index: linux.prev/drivers/net/netconsole.c =================================================================== --- linux.prev.orig/drivers/net/netconsole.c +++ linux.prev/drivers/net/netconsole.c @@ -74,16 +74,22 @@ static void write_msg(struct console *co if (!np.dev) return; - local_irq_save(flags); + /* + * A bit hairy. 
Netconsole uses mutexes (indirectly) and + * thus must have interrupts enabled: + */ + local_irq_save_nort(flags); for(left = len; left; ) { frag = min(left, MAX_PRINT_CHUNK); + WARN_ON_RT(irqs_disabled()); netpoll_send_udp(&np, msg, frag); + WARN_ON_RT(irqs_disabled()); msg += frag; left -= frag; } - local_irq_restore(flags); + local_irq_restore_nort(flags); } static struct console netconsole = { Index: linux.prev/drivers/net/ns83820.c =================================================================== --- linux.prev.orig/drivers/net/ns83820.c +++ linux.prev/drivers/net/ns83820.c @@ -1012,8 +1012,6 @@ static void do_tx_done(struct net_device struct ns83820 *dev = PRIV(ndev); u32 cmdsts, tx_done_idx, *desc; - spin_lock_irq(&dev->tx_lock); - dprintk("do_tx_done(%p)\n", ndev); tx_done_idx = dev->tx_done_idx; desc = dev->tx_descs + (tx_done_idx * DESC_SIZE); @@ -1069,7 +1067,6 @@ static void do_tx_done(struct net_device netif_start_queue(ndev); netif_wake_queue(ndev); } - spin_unlock_irq(&dev->tx_lock); } static void ns83820_cleanup_tx(struct ns83820 *dev) @@ -1370,7 +1367,9 @@ static void ns83820_do_isr(struct net_de * work has accumulated */ if ((ISR_TXDESC | ISR_TXIDLE | ISR_TXOK | ISR_TXERR) & isr) { + spin_lock_irq(&dev->tx_lock); do_tx_done(ndev); + spin_unlock_irq(&dev->tx_lock); /* Disable TxOk if there are no outstanding tx packets. */ @@ -1455,7 +1454,7 @@ static void ns83820_tx_timeout(struct ne u32 tx_done_idx, *desc; unsigned long flags; - local_irq_save(flags); + spin_lock_irqsave(&dev->tx_lock, flags); tx_done_idx = dev->tx_done_idx; desc = dev->tx_descs + (tx_done_idx * DESC_SIZE); @@ -1482,7 +1481,7 @@ static void ns83820_tx_timeout(struct ne ndev->name, tx_done_idx, dev->tx_free_idx, le32_to_cpu(desc[DESC_CMDSTS])); - local_irq_restore(flags); + spin_unlock_irqrestore(&dev->tx_lock, flags); } static void ns83820_tx_watch(unsigned long data) Index: linux.prev/drivers/net/plip.c =================================================================== --- linux.prev.orig/drivers/net/plip.c +++ linux.prev/drivers/net/plip.c @@ -229,7 +229,10 @@ struct net_local { struct hh_cache *hh); spinlock_t lock; atomic_t kill_timer; - struct semaphore killed_timer_sem; + /* + * PREEMPT_RT: this isn't a mutex, it should be struct completion. 
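+ * (struct completion is used exactly that way for cpu5wdt earlier + * in this series; compat_semaphore keeps the old semaphore + * semantics on -rt for now.)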
+ */ + struct compat_semaphore killed_timer_sem; }; static inline void enable_parport_interrupts (struct net_device *dev) Index: linux.prev/drivers/net/ppp_async.c =================================================================== --- linux.prev.orig/drivers/net/ppp_async.c +++ linux.prev/drivers/net/ppp_async.c @@ -66,7 +66,7 @@ struct asyncppp { struct tasklet_struct tsk; atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; struct ppp_channel chan; /* interface to generic ppp layer */ unsigned char obuf[OBUFSIZE]; }; Index: linux.prev/drivers/net/ppp_synctty.c =================================================================== --- linux.prev.orig/drivers/net/ppp_synctty.c +++ linux.prev/drivers/net/ppp_synctty.c @@ -70,7 +70,7 @@ struct syncppp { struct tasklet_struct tsk; atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; struct ppp_channel chan; /* interface to generic ppp layer */ }; Index: linux.prev/drivers/net/skge.c =================================================================== --- linux.prev.orig/drivers/net/skge.c +++ linux.prev/drivers/net/skge.c @@ -2272,12 +2272,9 @@ static int skge_xmit_frame(struct sk_buf if (!skb) return NETDEV_TX_OK; - local_irq_save(flags); - if (!spin_trylock(&skge->tx_lock)) { + if (!spin_trylock_irqsave(&skge->tx_lock, flags)) /* Collision - tell upper layer to requeue */ - local_irq_restore(flags); return NETDEV_TX_LOCKED; - } if (unlikely(skge->tx_avail < skb_shinfo(skb)->nr_frags +1)) { if (!netif_queue_stopped(dev)) { @@ -2823,10 +2820,10 @@ static void skge_extirq(unsigned long da } spin_unlock(&hw->phy_lock); - local_irq_disable(); + spin_lock_irq(&hw->hw_lock); hw->intr_mask |= IS_EXT_REG; skge_write32(hw, B0_IMSK, hw->intr_mask); - local_irq_enable(); + spin_unlock_irq(&hw->hw_lock); } static inline void skge_wakeup(struct net_device *dev) @@ -2845,6 +2842,8 @@ static irqreturn_t skge_intr(int irq, vo if (status == 0 || status == ~0) /* hotplug or shared irq */ return IRQ_NONE; + spin_lock(&hw->hw_lock); + status &= hw->intr_mask; if (status & IS_R1_F) { hw->intr_mask &= ~IS_R1_F; @@ -2896,6 +2895,8 @@ static irqreturn_t skge_intr(int irq, vo skge_write32(hw, B0_IMSK, hw->intr_mask); + spin_unlock(&hw->hw_lock); + return IRQ_HANDLED; } @@ -3252,6 +3253,7 @@ static int __devinit skge_probe(struct p } hw->pdev = pdev; + spin_lock_init(&hw->hw_lock); spin_lock_init(&hw->phy_lock); tasklet_init(&hw->ext_tasklet, skge_extirq, (unsigned long) hw); Index: linux.prev/drivers/net/skge.h =================================================================== --- linux.prev.orig/drivers/net/skge.h +++ linux.prev/drivers/net/skge.h @@ -2472,6 +2472,7 @@ struct skge_hw { u16 phy_addr; struct tasklet_struct ext_tasklet; + spinlock_t hw_lock; spinlock_t phy_lock; }; Index: linux.prev/drivers/net/smc91x.c =================================================================== --- linux.prev.orig/drivers/net/smc91x.c +++ linux.prev/drivers/net/smc91x.c @@ -74,6 +74,7 @@ static const char version[] = #include #include #include +#include #include #include #include @@ -2011,7 +2012,7 @@ static int __init smc_probe(struct net_d if (retval) goto err_out; - set_irq_type(dev->irq, SMC_IRQ_TRIGGER_TYPE); + SMC_SET_IRQ_TYPE(dev->irq, SMC_IRQ_TRIGGER_TYPE); #ifdef SMC_USE_PXA_DMA { Index: linux.prev/drivers/net/smc91x.h =================================================================== --- linux.prev.orig/drivers/net/smc91x.h +++ linux.prev/drivers/net/smc91x.h @@ -90,7 +90,7 @@ __l--; \ } \ } while (0) -#define 
set_irq_type(irq, type) +#define SMC_SET_IRQ_TYPE(irq, type) #elif defined(CONFIG_SA1100_PLEB) /* We can only do 16-bit reads and writes in the static memory space. */ @@ -109,7 +109,7 @@ #define SMC_outw(v, a, r) writew(v, (a) + (r)) #define SMC_outsw(a, r, p, l) writesw((a) + (r), p, l) -#define set_irq_type(irq, type) do {} while (0) +#define SMC_SET_IRQ_TYPE(irq, type) do {} while (0) #elif defined(CONFIG_SA1100_ASSABET) @@ -209,7 +209,7 @@ SMC_outw(u16 val, void __iomem *ioaddr, #define SMC_insw(a, r, p, l) insw((a) + (r) - 0xa0000000, p, l) #define SMC_outsw(a, r, p, l) outsw((a) + (r) - 0xa0000000, p, l) -#define set_irq_type(irq, type) do {} while(0) +#define SMC_SET_IRQ_TYPE(irq, type) do {} while(0) #elif defined(CONFIG_ISA) @@ -237,7 +237,7 @@ SMC_outw(u16 val, void __iomem *ioaddr, #define SMC_insw(a, r, p, l) insw(((u32)a) + (r), p, l) #define SMC_outsw(a, r, p, l) outsw(((u32)a) + (r), p, l) -#define set_irq_type(irq, type) do {} while(0) +#define SMC_SET_IRQ_TYPE(irq, type) do {} while(0) #define RPC_LSA_DEFAULT RPC_LED_TX_RX #define RPC_LSB_DEFAULT RPC_LED_100_10 @@ -342,6 +342,10 @@ static inline void SMC_outsw (unsigned l #endif +#ifndef SMC_SET_IRQ_TYPE +#define SMC_SET_IRQ_TYPE set_irq_type +#endif + #ifndef SMC_IRQ_TRIGGER_TYPE #define SMC_IRQ_TRIGGER_TYPE IRQT_RISING #endif Index: linux.prev/drivers/net/tulip/tulip_core.c =================================================================== --- linux.prev.orig/drivers/net/tulip/tulip_core.c +++ linux.prev/drivers/net/tulip/tulip_core.c @@ -1809,6 +1809,7 @@ static void __devexit tulip_remove_one ( pci_iounmap(pdev, tp->base_addr); free_netdev (dev); pci_release_regions (pdev); + pci_disable_device (pdev); pci_set_drvdata (pdev, NULL); /* pci_power_off (pdev, -1); */ Index: linux.prev/drivers/oprofile/buffer_sync.c =================================================================== --- linux.prev.orig/drivers/oprofile/buffer_sync.c +++ linux.prev/drivers/oprofile/buffer_sync.c @@ -43,13 +43,16 @@ static void process_task_mortuary(void); * list for processing. Only after two full buffer syncs * does the task eventually get freed, because by then * we are sure we will not reference it again. + * Can be invoked from softirq via RCU callback due to + * call_rcu() of the task struct, hence the _irqsave. 
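+ * Without _irqsave, the softirq could fire while this CPU + * already holds task_mortuary and deadlock on the spinlock.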
*/ static int task_free_notify(struct notifier_block * self, unsigned long val, void * data) { + unsigned long flags; struct task_struct * task = data; - spin_lock(&task_mortuary); + spin_lock_irqsave(&task_mortuary, flags); list_add(&task->tasks, &dying_tasks); - spin_unlock(&task_mortuary); + spin_unlock_irqrestore(&task_mortuary, flags); return NOTIFY_OK; } @@ -431,25 +434,22 @@ static void increment_tail(struct oprofi */ static void process_task_mortuary(void) { - struct list_head * pos; - struct list_head * pos2; + unsigned long flags; + LIST_HEAD(local_dead_tasks); struct task_struct * task; + struct task_struct * ttask; - spin_lock(&task_mortuary); + spin_lock_irqsave(&task_mortuary, flags); - list_for_each_safe(pos, pos2, &dead_tasks) { - task = list_entry(pos, struct task_struct, tasks); - list_del(&task->tasks); - free_task(task); - } + list_splice_init(&dead_tasks, &local_dead_tasks); + list_splice_init(&dying_tasks, &dead_tasks); - list_for_each_safe(pos, pos2, &dying_tasks) { - task = list_entry(pos, struct task_struct, tasks); + spin_unlock_irqrestore(&task_mortuary, flags); + + list_for_each_entry_safe(task, ttask, &local_dead_tasks, tasks) { list_del(&task->tasks); - list_add_tail(&task->tasks, &dead_tasks); + free_task(task); } - - spin_unlock(&task_mortuary); } Index: linux.prev/drivers/oprofile/oprofilefs.c =================================================================== --- linux.prev.orig/drivers/oprofile/oprofilefs.c +++ linux.prev/drivers/oprofile/oprofilefs.c @@ -21,7 +21,7 @@ #define OPROFILEFS_MAGIC 0x6f70726f -DEFINE_SPINLOCK(oprofilefs_lock); +DEFINE_RAW_SPINLOCK(oprofilefs_lock); static struct inode * oprofilefs_get_inode(struct super_block * sb, int mode) { Index: linux.prev/drivers/pci/hotplug/cpci_hotplug_core.c =================================================================== --- linux.prev.orig/drivers/pci/hotplug/cpci_hotplug_core.c +++ linux.prev/drivers/pci/hotplug/cpci_hotplug_core.c @@ -60,8 +60,8 @@ static int slots; static atomic_t extracting; int cpci_debug; static struct cpci_hp_controller *controller; -static struct semaphore event_semaphore; /* mutex for process loop (up if something to process) */ -static struct semaphore thread_exit; /* guard ensure thread has exited before calling it quits */ +static struct compat_semaphore event_semaphore; /* mutex for process loop (up if something to process) */ +static struct compat_semaphore thread_exit; /* guard ensure thread has exited before calling it quits */ static int thread_finished = 1; static int enable_slot(struct hotplug_slot *slot); Index: linux.prev/drivers/pci/hotplug/cpqphp_ctrl.c =================================================================== --- linux.prev.orig/drivers/pci/hotplug/cpqphp_ctrl.c +++ linux.prev/drivers/pci/hotplug/cpqphp_ctrl.c @@ -45,8 +45,8 @@ static int configure_new_function(struct u8 behind_bridge, struct resource_lists *resources); static void interrupt_event_handler(struct controller *ctrl); -static struct semaphore event_semaphore; /* mutex for process loop (up if something to process) */ -static struct semaphore event_exit; /* guard ensure thread has exited before calling it quits */ +static struct compat_semaphore event_semaphore; /* mutex for process loop (up if something to process) */ +static struct compat_semaphore event_exit; /* guard ensure thread has exited before calling it quits */ static int event_finished; static unsigned long pushbutton_pending; /* = 0 */ Index: linux.prev/drivers/pci/hotplug/ibmphp_hpc.c 
=================================================================== --- linux.prev.orig/drivers/pci/hotplug/ibmphp_hpc.c +++ linux.prev/drivers/pci/hotplug/ibmphp_hpc.c @@ -104,7 +104,7 @@ static int tid_poll; static struct semaphore sem_hpcaccess; // lock access to HPC static struct semaphore semOperations; // lock all operations and // access to data structures -static struct semaphore sem_exit; // make sure polling thread goes away +static struct compat_semaphore sem_exit; // make sure polling thread goes away //---------------------------------------------------------------------------- // local function prototypes //---------------------------------------------------------------------------- Index: linux.prev/drivers/pci/hotplug/pciehp_ctrl.c =================================================================== --- linux.prev.orig/drivers/pci/hotplug/pciehp_ctrl.c +++ linux.prev/drivers/pci/hotplug/pciehp_ctrl.c @@ -37,8 +37,8 @@ static void interrupt_event_handler(struct controller *ctrl); -static struct semaphore event_semaphore; /* mutex for process loop (up if something to process) */ -static struct semaphore event_exit; /* guard ensure thread has exited before calling it quits */ +static struct compat_semaphore event_semaphore; /* mutex for process loop (up if something to process) */ +static struct compat_semaphore event_exit; /* guard ensure thread has exited before calling it quits */ static int event_finished; static unsigned long pushbutton_pending; /* = 0 */ static unsigned long surprise_rm_pending; /* = 0 */ Index: linux.prev/drivers/pci/hotplug/shpchp_ctrl.c =================================================================== --- linux.prev.orig/drivers/pci/hotplug/shpchp_ctrl.c +++ linux.prev/drivers/pci/hotplug/shpchp_ctrl.c @@ -37,8 +37,8 @@ static void interrupt_event_handler(struct controller *ctrl); -static struct semaphore event_semaphore; /* mutex for process loop (up if something to process) */ -static struct semaphore event_exit; /* guard ensure thread has exited before calling it quits */ +static struct compat_semaphore event_semaphore; /* mutex for process loop (up if something to process) */ +static struct compat_semaphore event_exit; /* guard ensure thread has exited before calling it quits */ static int event_finished; static unsigned long pushbutton_pending; /* = 0 */ Index: linux.prev/drivers/pcmcia/soc_common.c =================================================================== --- linux.prev.orig/drivers/pcmcia/soc_common.c +++ linux.prev/drivers/pcmcia/soc_common.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include Index: linux.prev/drivers/s390/char/vmlogrdr.c =================================================================== --- linux.prev.orig/drivers/s390/char/vmlogrdr.c +++ linux.prev/drivers/s390/char/vmlogrdr.c @@ -145,7 +145,7 @@ static struct vmlogrdr_priv_t sys_ser[] .recording_name = "EREP", .minor_num = 0, .buffer_free = 1, - .priv_lock = SPIN_LOCK_UNLOCKED, + .priv_lock = SPIN_LOCK_UNLOCKED(sys_ser[0].priv_lock), .autorecording = 1, .autopurge = 1, }, @@ -154,7 +154,7 @@ static struct vmlogrdr_priv_t sys_ser[] .recording_name = "ACCOUNT", .minor_num = 1, .buffer_free = 1, - .priv_lock = SPIN_LOCK_UNLOCKED, + .priv_lock = SPIN_LOCK_UNLOCKED(sys_ser[1].priv_lock), .autorecording = 1, .autopurge = 1, }, @@ -163,7 +163,7 @@ static struct vmlogrdr_priv_t sys_ser[] .recording_name = "SYMPTOM", .minor_num = 2, .buffer_free = 1, - .priv_lock = SPIN_LOCK_UNLOCKED, + .priv_lock = SPIN_LOCK_UNLOCKED(sys_ser[2].priv_lock), 
.autorecording = 1, .autopurge = 1, } Index: linux.prev/drivers/s390/cio/cmf.c =================================================================== --- linux.prev.orig/drivers/s390/cio/cmf.c +++ linux.prev/drivers/s390/cio/cmf.c @@ -300,7 +300,7 @@ struct cmb_area { }; static struct cmb_area cmb_area = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = SPIN_LOCK_UNLOCKED(cmb_area.lock), .list = LIST_HEAD_INIT(cmb_area.list), .num_channels = 1024, }; Index: linux.prev/drivers/sbus/char/cpwatchdog.c =================================================================== --- linux.prev.orig/drivers/sbus/char/cpwatchdog.c +++ linux.prev/drivers/sbus/char/cpwatchdog.c @@ -156,7 +156,7 @@ struct wd_device { }; static struct wd_device wd_dev = { - 0, SPIN_LOCK_UNLOCKED, 0, 0, 0, 0, + 0, SPIN_LOCK_UNLOCKED(wd_dev.lock), 0, 0, 0, 0, }; static struct timer_list wd_timer; Index: linux.prev/drivers/scsi/aacraid/aacraid.h =================================================================== --- linux.prev.orig/drivers/scsi/aacraid/aacraid.h +++ linux.prev/drivers/scsi/aacraid/aacraid.h @@ -735,7 +735,7 @@ struct aac_fib_context { u32 unique; // unique value representing this context ulong jiffies; // used for cleanup - dmb changed to ulong struct list_head next; // used to link contexts into a linked list - struct semaphore wait_sem; // this is used to wait for the next fib to arrive. + struct compat_semaphore wait_sem; // this is used to wait for the next fib to arrive. int wait; // Set to true when thread is in WaitForSingleObject unsigned long count; // total number of FIBs on FibList struct list_head fib_list; // this holds fibs and their attached hw_fibs @@ -804,7 +804,7 @@ struct fib { * This is the event the sendfib routine will wait on if the * caller did not pass one and this is synch io. 
*/ - struct semaphore event_wait; + struct compat_semaphore event_wait; spinlock_t event_lock; u32 done; /* gets set to 1 when fib is complete */ Index: linux.prev/drivers/scsi/aic7xxx/aic79xx_osm.h =================================================================== --- linux.prev.orig/drivers/scsi/aic7xxx/aic79xx_osm.h +++ linux.prev/drivers/scsi/aic7xxx/aic79xx_osm.h @@ -390,7 +390,7 @@ struct ahd_platform_data { spinlock_t spin_lock; u_int qfrozen; struct timer_list reset_timer; - struct semaphore eh_sem; + struct compat_semaphore eh_sem; struct Scsi_Host *host; /* pointer to scsi host */ #define AHD_LINUX_NOIRQ ((uint32_t)~0) uint32_t irq; /* IRQ for this adapter */ Index: linux.prev/drivers/scsi/aic7xxx/aic7xxx_osm.h =================================================================== --- linux.prev.orig/drivers/scsi/aic7xxx/aic7xxx_osm.h +++ linux.prev/drivers/scsi/aic7xxx/aic7xxx_osm.h @@ -394,7 +394,7 @@ struct ahc_platform_data { spinlock_t spin_lock; u_int qfrozen; struct timer_list reset_timer; - struct semaphore eh_sem; + struct compat_semaphore eh_sem; struct Scsi_Host *host; /* pointer to scsi host */ #define AHC_LINUX_NOIRQ ((uint32_t)~0) uint32_t irq; /* IRQ for this adapter */ Index: linux.prev/drivers/scsi/libata-core.c =================================================================== --- linux.prev.orig/drivers/scsi/libata-core.c +++ linux.prev/drivers/scsi/libata-core.c @@ -4882,7 +4882,7 @@ module_init(ata_init); module_exit(ata_exit); static unsigned long ratelimit_time; -static spinlock_t ata_ratelimit_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(ata_ratelimit_lock); int ata_ratelimit(void) { Index: linux.prev/drivers/scsi/qla2xxx/qla_def.h =================================================================== --- linux.prev.orig/drivers/scsi/qla2xxx/qla_def.h +++ linux.prev/drivers/scsi/qla2xxx/qla_def.h @@ -2411,7 +2411,7 @@ typedef struct scsi_qla_host { spinlock_t mbx_reg_lock; /* Mbx Cmd Register Lock */ struct semaphore mbx_cmd_sem; /* Serialize mbx access */ - struct semaphore mbx_intr_sem; /* Used for completion notification */ + struct compat_semaphore mbx_intr_sem; /* Used for completion notification */ uint32_t mbx_flags; #define MBX_IN_PROGRESS BIT_0 Index: linux.prev/drivers/scsi/qla2xxx/qla_os.c =================================================================== --- linux.prev.orig/drivers/scsi/qla2xxx/qla_os.c +++ linux.prev/drivers/scsi/qla2xxx/qla_os.c @@ -2082,12 +2082,13 @@ qla2x00_free_sp_pool( scsi_qla_host_t *h static int qla2x00_do_dpc(void *data) { - DECLARE_MUTEX_LOCKED(sem); + DECLARE_MUTEX(sem); scsi_qla_host_t *ha; fc_port_t *fcport; uint8_t status; uint16_t next_loopid; + down(&sem); ha = (scsi_qla_host_t *)data; lock_kernel(); Index: linux.prev/drivers/scsi/scsi.c =================================================================== --- linux.prev.orig/drivers/scsi/scsi.c +++ linux.prev/drivers/scsi/scsi.c @@ -768,10 +768,10 @@ void __scsi_done(struct scsi_cmnd *cmd) * It is a per-CPU queue, so we just disable local interrupts * and need no spinlock. 
*/ + /* raw_local_irq_*() disables hard interrupts even on PREEMPT_RT: */ - local_irq_save(flags); + raw_local_irq_save(flags); list_add_tail(&cmd->eh_entry, &__get_cpu_var(scsi_done_q)); raise_softirq_irqoff(SCSI_SOFTIRQ); - local_irq_restore(flags); + raw_local_irq_restore(flags); } /** @@ -788,9 +788,9 @@ static void scsi_softirq(struct softirq_ int disposition; LIST_HEAD(local_q); - local_irq_disable(); + raw_local_irq_disable(); list_splice_init(&__get_cpu_var(scsi_done_q), &local_q); - local_irq_enable(); + raw_local_irq_enable(); while (!list_empty(&local_q)) { struct scsi_cmnd *cmd = list_entry(local_q.next, @@ -1282,11 +1282,11 @@ static int scsi_cpu_notify(struct notifi switch(action) { case CPU_DEAD: /* Drain scsi_done_q. */ - local_irq_disable(); + raw_local_irq_disable(); list_splice_init(&per_cpu(scsi_done_q, cpu), &__get_cpu_var(scsi_done_q)); raise_softirq_irqoff(SCSI_SOFTIRQ); - local_irq_enable(); + raw_local_irq_enable(); break; default: break; Index: linux.prev/drivers/serial/8250.c =================================================================== --- linux.prev.orig/drivers/serial/8250.c +++ linux.prev/drivers/serial/8250.c @@ -1344,6 +1344,17 @@ static irqreturn_t serial8250_interrupt( "irq%d\n", irq); break; } + /* + * If we have a buggy TX line that doesn't + * notify us via the IIR that we need to + * transmit, force the call: + */ + if (!handled && (up->bugs & UART_BUG_TXEN)) { + spin_lock(&up->port.lock); + serial8250_handle_port(up, regs); + spin_unlock(&up->port.lock); + } + } while (l != end); spin_unlock(&i->lock); Index: linux.prev/drivers/serial/cpm_uart/cpm_uart_core.c =================================================================== --- linux.prev.orig/drivers/serial/cpm_uart/cpm_uart_core.c +++ linux.prev/drivers/serial/cpm_uart/cpm_uart_core.c @@ -909,7 +909,7 @@ struct uart_cpm_port cpm_uart_ports[UART .irq = SMC1_IRQ, .ops = &cpm_uart_pops, .iotype = SERIAL_IO_MEM, - .lock = SPIN_LOCK_UNLOCKED, + .lock = SPIN_LOCK_UNLOCKED(cpm_uart_ports[UART_SMC1].port.lock), }, .flags = FLAG_SMC, .tx_nrfifos = TX_NUM_FIFO, @@ -923,7 +923,7 @@ struct uart_cpm_port cpm_uart_ports[UART .irq = SMC2_IRQ, .ops = &cpm_uart_pops, .iotype = SERIAL_IO_MEM, - .lock = SPIN_LOCK_UNLOCKED, + .lock = SPIN_LOCK_UNLOCKED(cpm_uart_ports[UART_SMC2].port.lock), }, .flags = FLAG_SMC, .tx_nrfifos = TX_NUM_FIFO, @@ -940,7 +940,7 @@ struct uart_cpm_port cpm_uart_ports[UART .irq = SCC1_IRQ, .ops = &cpm_uart_pops, .iotype = SERIAL_IO_MEM, - .lock = SPIN_LOCK_UNLOCKED, + .lock = SPIN_LOCK_UNLOCKED(cpm_uart_ports[UART_SCC1].port.lock), }, .tx_nrfifos = TX_NUM_FIFO, .tx_fifosize = TX_BUF_SIZE, @@ -954,7 +954,7 @@ struct uart_cpm_port cpm_uart_ports[UART .irq = SCC2_IRQ, .ops = &cpm_uart_pops, .iotype = SERIAL_IO_MEM, - .lock = SPIN_LOCK_UNLOCKED, + .lock = SPIN_LOCK_UNLOCKED(cpm_uart_ports[UART_SCC2].port.lock), }, .tx_nrfifos = TX_NUM_FIFO, .tx_fifosize = TX_BUF_SIZE, @@ -968,7 +968,7 @@ struct uart_cpm_port cpm_uart_ports[UART .irq = SCC3_IRQ, .ops = &cpm_uart_pops, .iotype = SERIAL_IO_MEM, - .lock = SPIN_LOCK_UNLOCKED, + .lock = SPIN_LOCK_UNLOCKED(cpm_uart_ports[UART_SCC3].port.lock), }, .tx_nrfifos = TX_NUM_FIFO, .tx_fifosize = TX_BUF_SIZE, @@ -982,7 +982,7 @@ struct uart_cpm_port cpm_uart_ports[UART .irq = SCC4_IRQ, .ops = &cpm_uart_pops, .iotype = SERIAL_IO_MEM, - .lock = SPIN_LOCK_UNLOCKED, + .lock = SPIN_LOCK_UNLOCKED(cpm_uart_ports[UART_SCC4].port.lock), }, .tx_nrfifos = TX_NUM_FIFO, .tx_fifosize = TX_BUF_SIZE, Index: linux.prev/drivers/serial/s3c2410.c =================================================================== --- 
linux.prev.orig/drivers/serial/s3c2410.c +++ linux.prev/drivers/serial/s3c2410.c @@ -966,7 +966,7 @@ static struct uart_driver s3c24xx_uart_d static struct s3c24xx_uart_port s3c24xx_serial_ports[NR_PORTS] = { [0] = { .port = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = SPIN_LOCK_UNLOCKED(s3c24xx_serial_ports[0].port.lock), .iotype = UPIO_MEM, .irq = IRQ_S3CUART_RX0, .uartclk = 0, @@ -978,7 +978,7 @@ static struct s3c24xx_uart_port s3c24xx_ }, [1] = { .port = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = SPIN_LOCK_UNLOCKED(s3c24xx_serial_ports[1].port.lock), .iotype = UPIO_MEM, .irq = IRQ_S3CUART_RX1, .uartclk = 0, @@ -992,7 +992,7 @@ static struct s3c24xx_uart_port s3c24xx_ [2] = { .port = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = SPIN_LOCK_UNLOCKED(s3c24xx_serial_ports[2].port.lock), .iotype = UPIO_MEM, .irq = IRQ_S3CUART_RX2, .uartclk = 0, Index: linux.prev/drivers/usb/core/devio.c =================================================================== --- linux.prev.orig/drivers/usb/core/devio.c +++ linux.prev/drivers/usb/core/devio.c @@ -307,10 +307,11 @@ static void async_completed(struct urb * struct async *as = (struct async *)urb->context; struct dev_state *ps = as->ps; struct siginfo sinfo; + unsigned long flags; - spin_lock(&ps->lock); - list_move_tail(&as->asynclist, &ps->async_completed); - spin_unlock(&ps->lock); + spin_lock_irqsave(&ps->lock, flags); + list_move_tail(&as->asynclist, &ps->async_completed); + spin_unlock_irqrestore(&ps->lock, flags); if (as->signr) { sinfo.si_signo = as->signr; sinfo.si_errno = as->urb->status; Index: linux.prev/drivers/usb/core/hcd.c =================================================================== --- linux.prev.orig/drivers/usb/core/hcd.c +++ linux.prev/drivers/usb/core/hcd.c @@ -497,13 +497,11 @@ error: } /* any errors get returned through the urb completion */ - local_irq_save (flags); - spin_lock (&urb->lock); + spin_lock_irqsave(&urb->lock, flags); if (urb->status == -EINPROGRESS) urb->status = status; - spin_unlock (&urb->lock); + spin_unlock_irqrestore(&urb->lock, flags); usb_hcd_giveback_urb (hcd, urb, NULL); - local_irq_restore (flags); return 0; } @@ -531,8 +529,7 @@ void usb_hcd_poll_rh_status(struct usb_h if (length > 0) { /* try to complete the status urb */ - local_irq_save (flags); - spin_lock(&hcd_root_hub_lock); + spin_lock_irqsave(&hcd_root_hub_lock, flags); urb = hcd->status_urb; if (urb) { spin_lock(&urb->lock); @@ -548,14 +545,13 @@ void usb_hcd_poll_rh_status(struct usb_h spin_unlock(&urb->lock); } else length = 0; - spin_unlock(&hcd_root_hub_lock); + spin_unlock_irqrestore(&hcd_root_hub_lock, flags); /* local irqs are always blocked in completions */ if (length > 0) usb_hcd_giveback_urb (hcd, urb, NULL); else hcd->poll_pending = 1; - local_irq_restore (flags); } /* The USB 2.0 spec says 256 ms. 
This is close enough and won't @@ -638,17 +634,15 @@ static int usb_rh_urb_dequeue (struct us } else { /* Status URB */ if (!hcd->uses_new_polling) del_timer_sync (&hcd->rh_timer); - local_irq_disable (); - spin_lock (&hcd_root_hub_lock); + spin_lock_irq(&hcd_root_hub_lock); if (urb == hcd->status_urb) { hcd->status_urb = NULL; urb->hcpriv = NULL; } else urb = NULL; /* wasn't fully queued */ - spin_unlock (&hcd_root_hub_lock); + spin_unlock_irq(&hcd_root_hub_lock); if (urb) usb_hcd_giveback_urb (hcd, urb, NULL); - local_irq_enable (); } return 0; @@ -1361,15 +1355,13 @@ hcd_endpoint_disable (struct usb_device WARN_ON (!HC_IS_RUNNING (hcd->state) && hcd->state != HC_STATE_HALT && udev->state != USB_STATE_NOTATTACHED); - local_irq_disable (); - /* FIXME move most of this into message.c as part of its * endpoint disable logic */ /* ep is already gone from udev->ep_{in,out}[]; no more submits */ rescan: - spin_lock (&hcd_data_lock); + spin_lock_irq(&hcd_data_lock); list_for_each_entry (urb, &ep->urb_list, urb_list) { int tmp; @@ -1382,13 +1374,13 @@ rescan: if (urb->status != -EINPROGRESS) continue; usb_get_urb (urb); - spin_unlock (&hcd_data_lock); + spin_unlock_irq(&hcd_data_lock); - spin_lock (&urb->lock); + spin_lock_irq(&urb->lock); tmp = urb->status; if (tmp == -EINPROGRESS) urb->status = -ESHUTDOWN; - spin_unlock (&urb->lock); + spin_unlock_irq(&urb->lock); /* kick hcd unless it's already returning this */ if (tmp == -EINPROGRESS) { @@ -1411,8 +1403,7 @@ rescan: /* list contents may have changed */ goto rescan; } - spin_unlock (&hcd_data_lock); - local_irq_enable (); + spin_unlock_irq(&hcd_data_lock); /* synchronize with the hardware, so old configuration state * clears out immediately (and will be freed). Index: linux.prev/drivers/usb/core/message.c =================================================================== --- linux.prev.orig/drivers/usb/core/message.c +++ linux.prev/drivers/usb/core/message.c @@ -233,8 +233,9 @@ static void sg_clean (struct usb_sg_requ static void sg_complete (struct urb *urb, struct pt_regs *regs) { struct usb_sg_request *io = (struct usb_sg_request *) urb->context; + unsigned long flags; - spin_lock (&io->lock); + spin_lock_irqsave (&io->lock, flags); /* In 2.5 we require hcds' endpoint queues not to progress after fault * reports, until the completion callback (this!) returns. That lets @@ -268,7 +269,7 @@ static void sg_complete (struct urb *urb * unlink pending urbs so they won't rx/tx bad data. * careful: unlink can sometimes be synchronous... 
Index: linux.prev/drivers/usb/core/message.c
===================================================================
--- linux.prev.orig/drivers/usb/core/message.c
+++ linux.prev/drivers/usb/core/message.c
@@ -233,8 +233,9 @@ static void sg_clean (struct usb_sg_requ
 static void sg_complete (struct urb *urb, struct pt_regs *regs)
 {
 	struct usb_sg_request	*io = (struct usb_sg_request *) urb->context;
+	unsigned long flags;
 
-	spin_lock (&io->lock);
+	spin_lock_irqsave (&io->lock, flags);
 
 	/* In 2.5 we require hcds' endpoint queues not to progress after fault
 	 * reports, until the completion callback (this!) returns.  That lets
@@ -268,7 +269,7 @@ static void sg_complete (struct urb *urb
 		 * unlink pending urbs so they won't rx/tx bad data.
 		 * careful: unlink can sometimes be synchronous...
 		 */
-		spin_unlock (&io->lock);
+		spin_unlock_irqrestore (&io->lock, flags);
 		for (i = 0, found = 0; i < io->entries; i++) {
 			if (!io->urbs [i] || !io->urbs [i]->dev)
 				continue;
@@ -283,7 +284,7 @@ static void sg_complete (struct urb *urb
 			} else if (urb == io->urbs [i])
 				found = 1;
 		}
-		spin_lock (&io->lock);
+		spin_lock_irqsave (&io->lock, flags);
 	}
 
 	urb->dev = NULL;
@@ -293,7 +294,7 @@ static void sg_complete (struct urb *urb
 	if (!io->count)
 		complete (&io->complete);
 
-	spin_unlock (&io->lock);
+	spin_unlock_irqrestore (&io->lock, flags);
 }
Index: linux.prev/drivers/usb/net/usbnet.c
===================================================================
--- linux.prev.orig/drivers/usb/net/usbnet.c
+++ linux.prev/drivers/usb/net/usbnet.c
@@ -819,6 +819,8 @@ static void tx_complete (struct urb *urb
 
 	urb->dev = NULL;
 	entry->state = tx_done;
+	spin_lock_rt(&dev->txq.lock);
+	spin_unlock_rt(&dev->txq.lock);
 	defer_bh(dev, skb, &dev->txq);
 }
 
Index: linux.prev/drivers/usb/storage/usb.c
===================================================================
--- linux.prev.orig/drivers/usb/storage/usb.c
+++ linux.prev/drivers/usb/storage/usb.c
@@ -327,6 +327,7 @@ static int usb_stor_control_thread(void
 		if (test_bit(US_FLIDX_DISCONNECTING, &us->flags)) {
 			US_DEBUGP("-- exiting\n");
 			up(&(us->dev_semaphore));
+			up(&us->sema);
 			break;
 		}
 
Index: linux.prev/drivers/usb/storage/usb.h
===================================================================
--- linux.prev.orig/drivers/usb/storage/usb.h
+++ linux.prev/drivers/usb/storage/usb.h
@@ -171,7 +171,7 @@ struct us_data {
 	dma_addr_t		iobuf_dma;
 
 	/* mutual exclusion and synchronization structures */
-	struct semaphore	sema;		/* to sleep thread on	*/
+	struct compat_semaphore	sema;		/* to sleep thread on	*/
 	struct completion	notify;		/* thread begin/end	*/
 	wait_queue_head_t	delay_wait;	/* wait during scan, reset */
 
Index: linux.prev/drivers/video/backlight/corgi_bl.c
===================================================================
--- linux.prev.orig/drivers/video/backlight/corgi_bl.c
+++ linux.prev/drivers/video/backlight/corgi_bl.c
@@ -28,7 +28,7 @@ static int corgibl_powermode = FB_BLANK_
 static int current_intensity = 0;
 static int corgibl_limit = 0;
 static void (*corgibl_mach_set_intensity)(int intensity);
-static spinlock_t bl_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(bl_lock);
 static struct backlight_properties corgibl_data;
 
 static void corgibl_send_intensity(int intensity)
Index: linux.prev/drivers/video/console/fbcon.c
===================================================================
--- linux.prev.orig/drivers/video/console/fbcon.c
+++ linux.prev/drivers/video/console/fbcon.c
@@ -1187,7 +1187,6 @@ static void fbcon_clear(struct vc_data *
 {
 	struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]];
 	struct fbcon_ops *ops = info->fbcon_par;
-	struct display *p = &fb_display[vc->vc_num];
 	u_int y_break;
 
@@ -1216,10 +1215,11 @@ static void fbcon_putcs(struct vc_data *
 	struct display *p = &fb_display[vc->vc_num];
 	struct fbcon_ops *ops = info->fbcon_par;
 
-	if (!fbcon_is_inactive(vc, info))
+	if (!fbcon_is_inactive(vc, info)) {
 		ops->putcs(vc, info, s, count, real_y(p, ypos), xpos,
 			   get_color(vc, info, scr_readw(s), 1),
 			   get_color(vc, info, scr_readw(s), 0));
+	}
 }
 
 static void fbcon_putc(struct vc_data *vc, int c, int ypos, int xpos)
@@ -2990,6 +2990,7 @@ static const struct consw fb_con = {
 	.con_screen_pos		= fbcon_screen_pos,
 	.con_getxy		= fbcon_getxy,
 	.con_resize		= fbcon_resize,
+	.con_preemptible	= 1,
 };
 
 static struct notifier_block fbcon_event_notifier = {
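Two initializer conversions run through the hunks above. The corgi_bl.c hunk replaces a SPIN_LOCK_UNLOCKED static initializer with DEFINE_SPINLOCK(), and the s3c2410.c hunks at the top of this patch pass the lock variable to SPIN_LOCK_UNLOCKED() where the lock is a field inside a larger static initializer; both forms give the lock-debugging infrastructure the identity of each individual lock. The storage driver's switch from struct semaphore to struct compat_semaphore is related: on PREEMPT_RT plain semaphores are converted to RT locks, and compat_semaphore preserves the classic counting-semaphore semantics the control thread depends on. A small sketch of the two initializer forms, assuming the RT tree's SPIN_LOCK_UNLOCKED(lock) macro shown in this patch (bl_demo_lock and port_demo are made-up names; in later mainline kernels the per-lock static initializer became __SPIN_LOCK_UNLOCKED):

#include <linux/spinlock.h>

/* Standalone static lock: DEFINE_SPINLOCK() embeds the variable name,
 * so lock validation and RT debugging can report the lock by name. */
static DEFINE_SPINLOCK(bl_demo_lock);

/* Lock embedded in a larger static initializer, where DEFINE_SPINLOCK()
 * cannot be used: the RT tree's SPIN_LOCK_UNLOCKED() takes the lock
 * itself as an argument for the same reason. */
struct port_demo {
	spinlock_t lock;
	int irq;
};

static struct port_demo port_demo = {
	.lock	= SPIN_LOCK_UNLOCKED(port_demo.lock),
	.irq	= 0,
};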
Index: linux.prev/drivers/video/console/vgacon.c
===================================================================
--- linux.prev.orig/drivers/video/console/vgacon.c
+++ linux.prev/drivers/video/console/vgacon.c
@@ -53,7 +53,7 @@
 #include