Index: linux-2.6.16/Documentation/DocBook/Makefile =================================================================== --- linux-2.6.16.orig/Documentation/DocBook/Makefile 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/Documentation/DocBook/Makefile 2006-10-19 16:52:29.000000000 +0200 @@ -10,7 +10,8 @@ DOCBOOKS := wanbook.xml z8530book.xml mc kernel-hacking.xml kernel-locking.xml deviceiobook.xml \ procfs-guide.xml writing_usb_driver.xml \ sis900.xml kernel-api.xml journal-api.xml lsm.xml usb.xml \ - gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml + gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \ + genericirq.xml ### # The build process is as follows (targets): Index: linux-2.6.16/Documentation/DocBook/genericirq.tmpl =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.16/Documentation/DocBook/genericirq.tmpl 2006-10-19 16:52:29.000000000 +0200 @@ -0,0 +1,560 @@ + + + + + + Linux generic IRQ handling + + + + Thomas + Gleixner + +
+ tglx@linutronix.de +
+
+
+ + Ingo + Molnar + +
+ mingo@elte.hu +
+
+
+
+ + + 2005 + Thomas Gleixner + + + 2005 + Ingo Molnar + + + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License version 2 as published by the Free Software Foundation. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + +
+
+
+  Introduction
+    The generic interrupt handling layer is designed to provide a
+    complete abstraction of interrupt handling for device drivers
+    and is able to handle all different types of interrupt controller
+    hardware. Device drivers use generic API functions to request, enable,
+    disable and free interrupts. The drivers do not have to know anything
+    about interrupt hardware, so they can be used on different hardware
+    platforms without code changes.
+
+    This documentation is provided for developers who want to implement
+    architecture interrupt support based on the Generic IRQ handling layer.
+
+  Rationale
+    The original implementation of interrupt handling in Linux uses
+    the __do_IRQ() super-handler, which must be able to deal with every
+    type of interrupt logic. This is achieved by an 'interrupt type'
+    structure and runtime flags to handle special cases.
+    Furthermore, the super-handler assumed a certain type of interrupt
+    handling hardware and turned out not to be capable of handling all
+    kinds of interrupt controller hardware found across
+    the architectures. The all-in-one approach also adds unnecessary
+    complexity for every user.
+
+    Originally, Russell King identified different types of handlers to
+    build a quite universal set for the ARM interrupt handler
+    implementation in Linux 2.5/2.6. He distinguished between:
+      Level type
+      Edge type
+      Simple type
+    In the SMP world of the __do_IRQ() super-handler another type
+    was identified:
+      Per CPU type
+
+    This split implementation of handlers makes it possible to optimize the
+    flow of the interrupt handling for each specific interrupt type.
+    This reduces complexity in that particular code path and allows
+    the optimized handling of a given type.
+
+    The original general implementation uses interrupt_type structures
+    to differentiate the flow control in the super-handler. This
+    leads to a mix of flow logic and code related to hardware details.
+    Russell King's ARM implementation, which replaced the type with a chip
+    abstraction, mixed them the other way around.
+
+    The natural conclusion was a clean separation of the 'type flow'
+    and the 'chip'. Analysing a couple of architecture implementations
+    reveals that many of them can use a generic set of 'type flow'
+    implementations and only need to add the chip level specific code.
+    The separation is also valuable for the (sub)architectures
+    which need specific quirks in the type flow itself, because it
+    provides a more transparent design.
+
+    Each interrupt type implementation is assigned its own flow
+    handler, which should normally be one of the generic
+    implementations. The flow handler implementation makes it
+    simple to provide demultiplexing handlers, which can be found in
+    embedded platforms on various architectures.
+
+    The separation makes the generic interrupt handling more flexible
+    and extensible. A (sub)architecture can use a generic type flow
+    implementation for e.g. 'level type' interrupts and add a
+    (sub)architecture specific 'edge type' implementation.
+
+    To make the transition to the new model easier and prevent the
+    breakage of existing implementations, the __do_IRQ() super-handler
+    is still available. This leads to a kind of duality for the time
+    being. Over time the new model should achieve a homogeneous
+    implementation scheme over all architectures, with enhanced
+    maintainability and cleanliness.
+
+  Known Bugs And Assumptions
+    None (hopefully).
+
+  Abstraction layers
+    There are three main levels of abstraction in the interrupt code:
+      Highlevel driver API
+      Abstract interrupt type
+      Chiplevel hardware encapsulation
+
+    The separation of interrupt type and chip level functionality
+    provides the most flexible design. This implementation can handle
+    all kinds of interrupt hardware and the necessary workarounds for
+    the interrupt types without the need for redundant implementations.
+    The separation also handles edge and level type interrupts
+    on the same hardware chip.
+
+    Interrupt control flow
+      Each interrupt is described by an interrupt description structure,
+      irq_desc. The interrupt is referenced by an 'unsigned int' numeric
+      value which selects the corresponding interrupt description structure
+      in the description structures array.
+      The description structure contains status information and pointers
+      to the interrupt type structure and the interrupt chip structure
+      which are assigned to this interrupt.
+
+      Whenever an interrupt triggers, the lowlevel arch code calls into
+      the generic interrupt code by calling desc->handler->handle_irq().
+      This highlevel IRQ handling function only uses other
+      desc->handler primitives which describe the control flow operations
+      necessary for the interrupt type. These operations call
+      the chip primitives referenced by the assigned chip description
+      structure.
+
+    Highlevel Driver API
+      The highlevel Driver API consists of the following functions:
+        request_irq()
+        free_irq()
+        disable_irq()
+        enable_irq()
+        disable_irq_nosync() (SMP only)
+        synchronize_irq() (SMP only)
+        set_irq_type()
+        set_irq_wake()
+        set_irq_data()
+        set_irq_chip()
+        set_irq_chip_data()
+      See the autogenerated function documentation for details. A usage
+      sketch appears at the end of this document.
+
+    Abstract interrupt type
+      The 'interrupt type' (struct irq_type) abstraction mainly consists of
+      methods which implement the 'interrupt handling flow'. The generic
+      layer provides a set of pre-defined types:
+        default_level_type
+        default_edge_type
+        default_simple_type
+        default_percpu_type
+      The default type implementations use the generic type handlers:
+        handle_level_type
+        handle_edge_type
+        handle_simple_type
+        handle_percpu_type
+      The interrupt types (either predefined or architecture specific) are
+      assigned to specific interrupts by the architecture, either during
+      bootup or during device initialization.
+
+    Default type implementations
+      Helper functions
+        The helper functions call the chip primitives and
+        are used by the default type implementations.
+        The following helper functions are implemented (simplified excerpt):
+
+default_enable(irq)
+{
+	desc->chip->unmask(irq);
+}
+
+default_disable(irq)
+{
+	desc->chip->mask(irq);
+}
+
+default_ack(irq)
+{
+	chip->ack(irq);
+}
+
+default_mask_ack(irq)
+{
+	if (chip->mask_ack) {
+		chip->mask_ack(irq);
+	} else {
+		chip->mask(irq);
+		chip->ack(irq);
+	}
+}
+
+noop(irq)
+{
+}
+
+default_set_type(irq, type)
+{
+	if (desc->chip->set_type) {
+		if (desc->chip->set_type(irq, type))
+			return NULL;
+	}
+
+	return default_handler for type;
+}
+
+      Default Level IRQ type
+        The default Level IRQ type implements the functions
+          enable:     default_enable
+          disable:    default_disable
+          start:      default_mask_ack
+          end:        default_enable
+          handle_irq: handle_level_irq
+          set_type:   default_set_type
+
+      Default Edge IRQ type
+        The default Edge IRQ type implements the functions
+          enable:     default_enable
+          disable:    default_disable
+          start:      default_ack
+          hold:       default_mask_ack
+          end:        noop
+          handle_irq: handle_edge_irq
+          set_type:   default_set_type
+
+      Default simple IRQ type
+        The default simple IRQ type implements the functions
+          enable:     noop
+          disable:    noop
+          handle_irq: handle_simple_irq
+
+      Default per CPU IRQ type
+        The default per CPU IRQ type implements the functions
+          enable:     default_enable
+          disable:    default_disable
+          start:      default_ack
+          end:        default_enable
+          handle_irq: handle_percpu_irq
+
+    Default type handler implementations
+      Default Level IRQ type handler
+        handle_level_type provides a generic implementation
+        for level type interrupts.
+
+        The following control flow is implemented (simplified excerpt):
+desc->handler->start();
+handle_IRQ_event(desc->action);
+desc->handler->end();
+
+      Default Edge IRQ type handler
+        handle_edge_type provides a generic implementation
+        for edge type interrupts.
+
+        The following control flow is implemented (simplified excerpt):
+if (desc->status & running) {
+	desc->handler->hold();
+	desc->status |= pending | masked;
+	return;
+}
+desc->handler->start();
+desc->status |= running;
+do {
+	if (desc->status & masked)
+		desc->handler->enable();
+	desc->status &= ~pending;
+	handle_IRQ_event(desc->action);
+} while (desc->status & pending);
+desc->status &= ~running;
+desc->handler->end();
+
+      Default simple IRQ type handler
+        handle_simple_type provides a generic implementation
+        for simple type interrupts.
+
+        Note: The simple type handler does not call any
+        handler/chip primitives.
+
+        The following control flow is implemented (simplified excerpt):
+handle_IRQ_event(desc->action);
+
+      Default per CPU type handler
+        handle_percpu_type provides a generic implementation
+        for per CPU type interrupts.
+
+        Per CPU interrupts are only available on SMP, and
+        the handler provides a simplified version without
+        locking.
+
+        The following control flow is implemented (simplified excerpt):
+desc->handler->start();
+handle_IRQ_event(desc->action);
+desc->handler->end();
+
+    Architecture specific type implementation
+      If an architecture needs to implement its own type structures, then
+      the following primitives have to be implemented:
+        handle_irq() - The handle_irq function pointer should preferably point to
+                       one of the generic type handler functions
+        startup() - Optional
+        shutdown() - Optional
+        enable()
+        disable()
+        start()
+        hold() - For edge type interrupts only
+        end()
+        set_type - Optional
+        set_affinity - SMP only
+
+    Quirks and optimizations
+      The generic functions are intended for 'clean' architectures and chips,
+      which have no platform-specific IRQ handling quirks. If an architecture
+      needs to implement quirks on the 'flow' level then it can do so by
+      overriding the irqtype. This is also done for compatibility reasons, as
+      most architectures use irqtypes only at the moment.
+
+      An architecture could implement all of its IRQ logic by pushing
+      chip handling details into the irqtype's ->start()/->end()/->hold()
+      functions. This is only recommended when the underlying primitives
+      are pure chip primitives without additional quirks. The direct pointer
+      to the chip functions reduces the indirection level by one.
+
+    Chiplevel hardware encapsulation
+      The chip level hardware description structure irq_chip
+      contains all the direct chip relevant functions which
+      can be utilized by the irq_type implementations:
+        ack()
+        mask_ack() - Optional, recommended for performance
+        mask()
+        unmask()
+        retrigger() - Optional
+        set_type() - Optional
+        set_wake() - Optional
+      These primitives are strictly intended to mean what they say: ack means
+      ACK, masking means masking of an IRQ line, etc. It is up to the flow
+      handler(s) to use these basic units of lowlevel functionality.
+
+  __do_IRQ entry point
+    The original implementation __do_IRQ() is an alternative entry
+    point for all types of interrupts.
+
+    This handler turned out not to be suitable for all
+    interrupt hardware and was therefore reimplemented with split
+    functionality for edge/level/simple/percpu interrupts. This is not
+    only a functional optimization. It also shortens code paths for
+    interrupts.
+
+    To make use of the split implementation, replace the call to
+    __do_IRQ by a call to desc->handler->handle_irq() and assign
+    the appropriate handler function to desc->handler->handle_irq.
+    In most cases the generic type and handler implementations should
+    be sufficient.
+
+  Locking on SMP
+    The locking of chip registers is up to the architecture that
+    defines the chip primitives. There is a chip->lock field that can be used
+    for serialization, but the generic layer does not touch it. The per-irq
+    structure is protected via desc->lock, by the generic layer.
+
+  Structures
+    This chapter contains the autogenerated documentation of the structures
+    which are used in the generic IRQ layer.
+!Iinclude/linux/irq.h
+
+  Public Functions Provided
+    This chapter contains the autogenerated documentation of the kernel API
+    functions which are exported.
+!Ekernel/irq/manage.c
+
+  Internal Functions Provided
+    This chapter contains the autogenerated documentation of the internal
+    functions.
+!Ikernel/irq/handle.c
+
+  Credits
+    The following people have contributed to this document:
+      Thomas Gleixner  tglx@linutronix.de
+      Ingo Molnar      mingo@elte.hu
+
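As a closing illustration of the highlevel driver API chapter above, here is a
minimal, hedged sketch of a driver requesting and releasing an interrupt line.
It assumes the 2.6.16-era request_irq() signature (handlers still receive a
struct pt_regs pointer, and SA_SHIRQ marks a shared line); the IRQ number and
device structure are hypothetical.

	#include <linux/interrupt.h>

	#define MYDEV_IRQ 9			/* hypothetical interrupt line */

	struct mydev { int dummy; };		/* hypothetical device state */

	static irqreturn_t mydev_interrupt(int irq, void *dev_id,
					   struct pt_regs *regs)
	{
		struct mydev *dev = dev_id;

		/* handle the device event; no interrupt chip details needed */
		(void)dev;
		return IRQ_HANDLED;
	}

	static int mydev_start(struct mydev *dev)
	{
		/* SA_SHIRQ allows sharing the line with other handlers */
		return request_irq(MYDEV_IRQ, mydev_interrupt, SA_SHIRQ,
				   "mydev", dev);
	}

	static void mydev_stop(struct mydev *dev)
	{
		free_irq(MYDEV_IRQ, dev);
	}

Note how the driver never touches mask/ack/unmask: the assigned flow handler
and the chip implementation take care of those.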
Index: linux-2.6.16/Documentation/RCU/proc.txt
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.16/Documentation/RCU/proc.txt	2006-10-19 16:52:29.000000000 +0200
@@ -0,0 +1,207 @@
+/proc Filesystem Entries for RCU
+
+
+CONFIG_RCU_STATS
+
+The CONFIG_RCU_STATS config option is available only in conjunction with
+CONFIG_PREEMPT_RCU. It makes four /proc entries available, namely: rcuctrs,
+rcuptrs, rcugp, and rcustats.
+
+/proc/rcuctrs
+
+	CPU last cur
+	  0    1   1
+	  1    1   1
+	  2    1   1
+	  3    0   2
+	ggp = 230725
+
+This displays the number of processes that started RCU read-side critical
+sections on each CPU. In the absence of preemption, the "last" and "cur"
+counts for a given CPU will always sum to one. Therefore, in the example
+output above, each CPU has started one RCU read-side critical section
+that was later preempted. The "last" column counts RCU read-side critical
+sections that started prior to the last counter flip, while the "cur"
+column counts critical sections that started after the last counter flip.
+
+The "ggp" count is a count of the number of counter flips since boot.
+Since this is shown as an odd number, the "cur" counts are stored in
+the zero-th element of each of the per-CPU arrays, and the "last" counts
+are stored in the first element of each of the per-CPU arrays.
+
+
+/proc/rcuptrs
+
+	nl=c04c7160/c04c7960 nt=c04c72d0
+	wl=c04c7168/c04c794c wt=c04c72bc dl=c04c7170/00000000 dt=c04c7170
+
+This displays the head and tail of each of CONFIG_PREEMPT_RCU's three
+callback lists. This will soon change to display this on a per-CPU
+basis, since each CPU will soon have its own set of callback lists.
+In the example above, the "next" list header is located at hex address
+0xc04c7160, the first element on the list at hex address 0xc04c7960,
+and the last element on the list at hex address 0xc04c72d0. The "wl="
+and "wt=" output is similar for the "wait" list, and the "dl=" and "dt="
+output for the "done" list. The "done" list is normally emptied very
+quickly after being filled, so it will usually be empty as shown above.
+Note that the tail pointer points into the list header in this case.
+
+Callbacks are placed in the "next" list by call_rcu(), moved to the
+"wait" list after the next counter flip, and moved to the "done" list
+on the counter flip after that. Once on the "done" list, the callbacks
+are invoked.
+
+
+/proc/rcugp
+
+	oldggp=241419  newggp=241421
+
+This entry invokes synchronize_rcu() and prints out the number of counter
+flips since boot before and after the synchronize_rcu(). These two
+numbers will always differ by at least two. Unless RCU is broken. ;-)
+
+
+/proc/rcustats
+
+	ggp=242416 lgp=242416 sr=0 rcc=396233
+	na=2090938 nl=9 wa=2090929 wl=9 dl=0 dr=2090920 di=2090920
+	rtf1=22230730 rtf2=20139162 rtf3=242416 rtfe1=2085911 rtfe2=5657 rtfe3=19896746
+
+The quantities printed are as follows:
+
+o	"ggp=": The number of flips since boot.
+
+o	"lgp=": The number of flips sensed by the local structure since
+	boot. This will soon be per-CPU.
+
+o	"sr=": The number of explicit calls to synchronize_rcu().
+	Except that this is currently broken, so always reads as zero.
+	It is likely to be removed...
+
+o	"rcc=": The number of calls to rcu_check_callbacks().
+
+o	"na=": The number of callbacks that call_rcu() has registered
+	since boot.
+
+o	"nl=": The number of callbacks currently on the "next" list.
+
+o	"wa=": The number of callbacks that have moved to the "wait"
+	list since boot.
+
+o	"wl=": The number of callbacks currently on the "wait" list.
+
+o	"da=": The number of callbacks that have been moved to the
+	"done" list since boot.
+
+o	"dl=": The number of callbacks currently on the "done" list.
+
+o	"dr=": The number of callbacks that have been removed from the
+	"done" list since boot.
+
+o	"di=": The number of callbacks that have been invoked after being
+	removed from the "done" list.
+
+o	"rtf1=": The number of attempts to flip the counters.
+
+o	"rtf2=": The number of attempts to flip the counters that successfully
+	acquired the fliplock.
+
+o	"rtf3=": The number of successful counter flips.
+
+o	"rtfe1=": The number of attempts to flip the counters that failed
+	due to the lock being held by someone else.
+
+o	"rtfe2=": The number of attempts to flip the counters that were
+	abandoned due to someone else doing the job for us.
+
+o	"rtfe3=": The number of attempts to flip the counters that failed
+	due to some task still being in an RCU read-side critical section
+	starting from before the last successful counter flip.
+
+
+CONFIG_RCU_TORTURE_TEST
+
+The CONFIG_RCU_TORTURE_TEST config option is available for all RCU
+implementations. It makes three /proc entries available, namely: rcutw,
+rcutr, and rcuts.
+
+
+/proc/rcutw
+
+Reading this entry starts a new torture test, or ends an earlier one
+if one is already in progress (in other words, there can be only one
+writer at a time). This sleeps uninterruptibly, so be sure to run
+it in the background. One could argue that it would be good to have
+multiple writers, but Linux uses RCU heavily enough that you will get
+write-side contention whether you want it or not. If you want additional
+write-side contention, repeatedly create and destroy several large file
+trees in parallel. Or use some other RCU-protected update.
+
+
+/proc/rcutr
+
+Reading this entry starts a new torture reader, which runs until sent
+a signal (e.g., control-C). If testing an RCU implementation with
+preemptible read-side critical sections, make sure to spawn at least
+two /proc/rcutr instances for each CPU.
+
+
+/proc/rcuts
+
+Displays the current state of the torture test:
+
+	ggp = 20961
+	rtc: c04496f4 ver: 8734 tfle: 0 rta: 8734 rtaf: 0 rtf: 8715
+	Reader Pipe: 88024120 63914 0 0 0 0 0 0 0 0 0
+	Reader Batch: 88024097 63937 0 0 0 0 0 0 0 0
+	Free-Block Circulation: 8733 8731 8729 8727 8725 8723 8721 8719 8717 8715 0
+
+The entries are as follows:
+
+o	"ggp": The number of counter flips (or batches) since boot.
+
+o	"rtc": The hexadecimal address of the structure currently visible
+	to readers.
+
+o	"ver": The number of times since boot that the rcutw writer task
+	has changed the structure visible to readers.
+
+o	"tfle": If non-zero, indicates that the "torture freelist"
+	containing structures to be placed into the "rtc" area is empty.
+	This condition is important, since it can fool you into thinking
+	that RCU is working when it is not. :-/
+
+o	"rta": Number of structures allocated from the torture freelist.
+
+o	"rtaf": Number of allocations from the torture freelist that have
+	failed due to the list being empty.
+
+o	"rtf": Number of frees into the torture freelist.
+
+o	"Reader Pipe": Histogram of "ages" of structures seen by readers.
+	If any entries past the first two are non-zero, RCU is broken.
+	And /proc/rcuts prints "!!!" to make sure you notice.
+	The age
+	of a newly allocated structure is zero; it becomes one when
+	removed from reader visibility, and is incremented once per
+	grace period subsequently -- and is freed after passing through
+	(RCU_TORTURE_PIPE_LEN-2) grace periods.
+
+	The output displayed above was taken from a correctly working
+	RCU. If you want to see what it looks like when broken, break
+	it yourself. ;-)
+
+o	"Reader Batch": Another histogram of "ages" of structures seen
+	by readers, but in terms of counter flips (or batches) rather
+	than in terms of grace periods. The legal number of non-zero
+	entries is again two. The reason for this separate view is
+	that it is easier to get the third entry to show up in the
+	"Reader Batch" list than in the "Reader Pipe" list.
+
+o	"Free-Block Circulation": Shows the number of torture structures
+	that have reached a given point in the pipeline. The first element
+	should closely correspond to the number of structures allocated,
+	the second to the number that have been removed from reader view,
+	and all but the last of the remaining entries to the corresponding
+	number of passes through a grace period. The last entry should be
+	zero, as it is only incremented if a torture structure's counter
+	somehow gets incremented farther than it should.
Index: linux-2.6.16/Documentation/RCU/whatisRCU.txt
===================================================================
--- linux-2.6.16.orig/Documentation/RCU/whatisRCU.txt	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/Documentation/RCU/whatisRCU.txt	2006-10-19 16:52:29.000000000 +0200
@@ -790,7 +790,6 @@ RCU pointer update:
 
 RCU grace period:
 
-	synchronize_kernel (deprecated)
 	synchronize_net
 	synchronize_sched
 	synchronize_rcu
Index: linux-2.6.16/Documentation/feature-removal-schedule.txt
===================================================================
--- linux-2.6.16.orig/Documentation/feature-removal-schedule.txt	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/Documentation/feature-removal-schedule.txt	2006-10-19 16:52:29.000000000 +0200
@@ -32,21 +32,6 @@ Who: Adrian Bunk
 
 ---------------------------
 
-What:	RCU API moves to EXPORT_SYMBOL_GPL
-When:	April 2006
-Files:	include/linux/rcupdate.h, kernel/rcupdate.c
-Why:	Outside of Linux, the only implementations of anything even
-	vaguely resembling RCU that I am aware of are in DYNIX/ptx,
-	VM/XA, Tornado, and K42. I do not expect anyone to port binary
-	drivers or kernel modules from any of these, since the first two
-	are owned by IBM and the last two are open-source research OSes.
-	So these will move to GPL after a grace period to allow
-	people, who might be using implementations that I am not aware
-	of, to adjust to this upcoming change.
-Who:	Paul E. McKenney
-
----------------------------
-
 What:	raw1394: requests of type RAW1394_REQ_ISO_SEND, RAW1394_REQ_ISO_LISTEN
 When:	November 2005
 Why:	Deprecated in favour of the new ioctl-based rawiso interface, which is
Index: linux-2.6.16/Documentation/kernel-parameters.txt
===================================================================
--- linux-2.6.16.orig/Documentation/kernel-parameters.txt	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/Documentation/kernel-parameters.txt	2006-10-19 16:52:29.000000000 +0200
@@ -52,6 +52,7 @@ restrictions referred to are that the re
 	MTD	MTD support is enabled.
 	NET	Appropriate network support is enabled.
 	NUMA	NUMA support is enabled.
+	GENERIC_TIME	The generic timeofday code is enabled.
 	NFS	Appropriate NFS support is enabled.
 	OSS	OSS sound support is enabled.
 	PARIDE	The ParIDE subsystem is enabled.
@@ -329,10 +330,11 @@ running once the system is up.
 			Value can be changed at runtime via
 				/selinux/checkreqprot.
 
-	clock=		[BUGS=IA-32,HW] gettimeofday timesource override.
-			Forces specified timesource (if avaliable) to be used
-			when calculating gettimeofday(). If specicified
-			timesource is not avalible, it defaults to PIT.
+	clock=		[BUGS=IA-32, HW] gettimeofday clocksource override.
+			[Deprecated]
+			Forces specified clocksource (if available) to be used
+			when calculating gettimeofday(). If specified
+			clocksource is not available, it defaults to PIT.
 			Format: { pit | tsc | cyclone | pmtmr }
 
 	disable_8254_timer
@@ -1579,6 +1581,10 @@ running once the system is up.
 			time	Show timing data prefixed to each printk message line
 
+	clocksource=	[GENERIC_TIME] Override the default clocksource
+			Override the default clocksource and use the clocksource
+			with the name specified.
+
 	tipar.timeout=	[HW,PPT]
 			Set communications timeout in tenths of a second
 			(default 15).
Index: linux-2.6.16/Documentation/pi-futex.txt
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.16/Documentation/pi-futex.txt	2006-10-19 16:52:29.000000000 +0200
@@ -0,0 +1,121 @@
+Lightweight PI-futexes
+----------------------
+
+We are calling them lightweight for 3 reasons:
+
+ - in the user-space fastpath a PI-enabled futex involves no kernel work
+   (or any other PI complexity) at all. No registration, no extra kernel
+   calls - just pure fast atomic ops in userspace.
+
+ - even in the slowpath, the system call and scheduling pattern is very
+   similar to normal futexes.
+
+ - the in-kernel PI implementation is streamlined around the mutex
+   abstraction, with strict rules that keep the implementation
+   relatively simple: only a single owner may own a lock (i.e. no
+   read-write lock support), only the owner may unlock a lock, no
+   recursive locking, etc.
+
+Priority Inheritance - why?
+---------------------------
+
+The short reply: user-space PI helps achieve/improve determinism for
+user-space applications. In the best case, it can help achieve
+determinism and well-bound latencies. Even in the worst case, PI will
+improve the statistical distribution of locking-related application
+delays.
+
+The longer reply:
+-----------------
+
+Firstly, sharing locks between multiple tasks is a common programming
+technique that often cannot be replaced with lockless algorithms. As we
+can see in the kernel [which is a quite complex program in itself],
+lockless structures are more the exception than the norm - the current
+ratio of lockless vs. locky code for shared data structures is somewhere
+between 1:10 and 1:100. Lockless is hard, and the complexity of lockless
+algorithms often endangers the ability to do robust reviews of said code.
+That is, critical RT apps often choose lock structures to protect critical
+data structures, instead of lockless algorithms. Furthermore, there are
+cases (like shared hardware, or other resource limits) where lockless
+access is mathematically impossible.
+
+Media players (such as Jack) are an example of reasonable application
+design with multiple tasks (with multiple priority levels) sharing
+short-held locks: for example, a highprio audio playback thread is
+combined with medium-prio construct-audio-data threads and low-prio
+display-colory-stuff threads. Add video and decoding to the mix and
+we've got even more priority levels.
+
+So once we accept that synchronization objects (locks) are an
+unavoidable fact of life, and once we accept that multi-task userspace
+apps have a very fair expectation of being able to use locks, we've got
+to think about how to offer the option of a deterministic locking
+implementation to user-space.
+
+Most of the technical counter-arguments against doing priority
+inheritance only apply to kernel-space locks. But user-space locks are
+different: there we cannot disable interrupts or make the task
+non-preemptible in a critical section, so the 'use spinlocks' argument
+does not apply (user-space spinlocks have the same priority inversion
+problems as other user-space locking constructs). Fact is, pretty much
+the only technique that currently enables good determinism for userspace
+locks (such as futex-based pthread mutexes) is priority inheritance:
+
+Currently (without PI), if a high-prio and a low-prio task share a lock
+[this is a quite common scenario for most non-trivial RT applications],
+even if all critical sections are coded carefully to be deterministic
+(i.e. all critical sections are short in duration and only execute a
+limited number of instructions), the kernel cannot guarantee any
+deterministic execution of the high-prio task: any medium-priority task
+could preempt the low-prio task while it holds the shared lock and
+executes the critical section, and could delay it indefinitely.
+
+Implementation:
+---------------
+
+As mentioned before, the userspace fastpath of PI-enabled pthread
+mutexes involves no kernel work at all - they behave quite similarly to
+normal futex-based locks: a 0 value means unlocked, and a value==TID
+means locked. (This is the same method as used by list-based robust
+futexes.) Userspace uses atomic ops to lock/unlock these mutexes without
+entering the kernel.
+
+To handle the slowpath, we have added two new futex ops:
+
+	FUTEX_LOCK_PI
+	FUTEX_UNLOCK_PI
+
+If the lock-acquire fastpath fails, [i.e. an atomic transition from 0 to
+TID fails], then FUTEX_LOCK_PI is called. The kernel does all the
+remaining work: if there is no futex-queue attached to the futex address
+yet then the code looks up the task that owns the futex [it has put its
+own TID into the futex value], and attaches a 'PI state' structure to
+the futex-queue. The pi_state includes an rt-mutex, which is a PI-aware,
+kernel-based synchronization object. The 'other' task is made the owner
+of the rt-mutex, and the FUTEX_WAITERS bit is atomically set in the
+futex value. Then this task tries to lock the rt-mutex, on which it
+blocks. Once it returns, it has the mutex acquired, and it sets the
+futex value to its own TID and returns. Userspace has no other work to
+perform - it now owns the lock, and the futex value contains
+FUTEX_WAITERS|TID.
+
+If the unlock side fastpath succeeds, [i.e. userspace manages to do a
+TID -> 0 atomic transition of the futex value], then no kernel work is
+triggered.
+
+If the unlock fastpath fails (because the FUTEX_WAITERS bit is set),
+then FUTEX_UNLOCK_PI is called, and the kernel unlocks the futex on
+behalf of userspace - and it also unlocks the attached
+pi_state->rt_mutex and thus wakes up any potential waiters.
+
+Note that under this approach, contrary to previous PI-futex approaches,
+there is no prior 'registration' of a PI-futex. [which is not quite
+possible anyway, due to existing ABI properties of pthread mutexes.]
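To make the fastpath/slowpath split concrete, here is a hedged user-space
sketch of lock and unlock. It assumes GCC's __atomic builtins and the
FUTEX_LOCK_PI/FUTEX_UNLOCK_PI ops from <linux/futex.h>; the wrapper names are
hypothetical and all error handling is omitted:

	#include <stdint.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/futex.h>

	/* futex word: 0 = unlocked, TID = locked,
	 * FUTEX_WAITERS|TID = locked with kernel-managed waiters */
	static void pi_lock(uint32_t *futex, uint32_t tid)
	{
		uint32_t expected = 0;

		/* fastpath: atomic 0 -> TID transition, no kernel work */
		if (__atomic_compare_exchange_n(futex, &expected, tid, 0,
						__ATOMIC_ACQUIRE,
						__ATOMIC_RELAXED))
			return;
		/* slowpath: the kernel queues us on the pi_state rt-mutex */
		syscall(SYS_futex, futex, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
	}

	static void pi_unlock(uint32_t *futex, uint32_t tid)
	{
		uint32_t expected = tid;

		/* fastpath: TID -> 0; fails if FUTEX_WAITERS got set */
		if (__atomic_compare_exchange_n(futex, &expected, 0, 0,
						__ATOMIC_RELEASE,
						__ATOMIC_RELAXED))
			return;
		/* slowpath: kernel unlocks the rt-mutex and wakes a waiter */
		syscall(SYS_futex, futex, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
	}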
+
+Also, under this scheme, 'robustness' and 'PI' are two orthogonal
+properties of futexes, and all four combinations are possible: futex,
+robust-futex, PI-futex, robust+PI-futex.
+
+More details about priority inheritance can be found in
+Documentation/rt-mutex.txt.
Index: linux-2.6.16/Documentation/robust-futex-ABI.txt
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.16/Documentation/robust-futex-ABI.txt	2006-10-19 16:52:29.000000000 +0200
@@ -0,0 +1,182 @@
+Started by Paul Jackson
+
+The robust futex ABI
+--------------------
+
+Robust_futexes provide a mechanism that is used in addition to normal
+futexes, for kernel assist of cleanup of held locks on task exit.
+
+The interesting data as to what futexes a thread is holding is kept on a
+linked list in user space, where it can be updated efficiently as locks
+are taken and dropped, without kernel intervention. The only additional
+kernel intervention required for robust_futexes above and beyond what is
+required for futexes is:
+
+ 1) a one time call, per thread, to tell the kernel where its list of
+    held robust_futexes begins, and
+ 2) internal kernel code at exit, to handle any listed locks held
+    by the exiting thread.
+
+The existing normal futexes already provide a "Fast Userspace Locking"
+mechanism, which handles uncontested locking without needing a system
+call, and handles contested locking by maintaining a list of waiting
+threads in the kernel. Options on the sys_futex(2) system call support
+waiting on a particular futex, and waking up the next waiter on a
+particular futex.
+
+For robust_futexes to work, the user code (typically in a library such
+as glibc linked with the application) has to manage and place the
+necessary list elements exactly as the kernel expects them. If it fails
+to do so, then improperly listed locks will not be cleaned up on exit,
+probably causing deadlock or other such failure of the other threads
+waiting on the same locks.
+
+A thread that anticipates possibly using robust_futexes should first
+issue the system call:
+
+    asmlinkage long
+    sys_set_robust_list(struct robust_list_head __user *head, size_t len);
+
+The pointer 'head' points to a structure in the thread's address space
+consisting of three words. Each word is 32 bits on 32 bit archs, or 64
+bits on 64 bit archs, in local byte order. Each thread should have
+its own thread private 'head'.
+
+If a thread is running in 32 bit compatibility mode on a 64 bit native
+arch kernel, then it can actually have two such structures - one using
+32 bit words for 32 bit compatibility mode, and one using 64 bit words
+for 64 bit native mode. The kernel, if it is a 64 bit kernel supporting
+32 bit compatibility mode, will attempt to process both lists on each
+task exit, if the corresponding sys_set_robust_list() call has been made
+to set up that list.
+
+  The first word in the memory structure at 'head' contains a
+  pointer to a singly linked list of 'lock entries', one per lock,
+  as described below. If the list is empty, the pointer will point
+  to itself, 'head'. The last 'lock entry' points back to the 'head'.
+
+  The second word, called 'offset', specifies the offset, plus or
+  minus, from the address of the associated 'lock entry' to the
+  address of what will be called the 'lock word'. The 'lock word'
+  is always a 32 bit word, unlike the other words above.
+  The 'lock word'
+  holds 3 flag bits in the upper 3 bits, and the thread id (TID)
+  of the thread holding the lock in the bottom 29 bits. See further
+  below for a description of the flag bits.
+
+  The third word, called 'list_op_pending', contains a transient copy of
+  the address of the 'lock entry', during list insertion and removal,
+  and is needed to correctly resolve races should a thread exit while
+  in the middle of a locking or unlocking operation.
+
+Each 'lock entry' on the singly linked list starting at 'head' consists
+of just a single word, pointing to the next 'lock entry', or back to
+'head' if there are no more entries. In addition, nearby to each 'lock
+entry', at an offset from the 'lock entry' specified by the 'offset'
+word, is one 'lock word'.
+
+The 'lock word' is always 32 bits, and is intended to be the same 32 bit
+lock variable used by the futex mechanism, in conjunction with
+robust_futexes. The kernel will only be able to wake up the next thread
+waiting for a lock on a thread's exit if that next thread used the futex
+mechanism to register the address of that 'lock word' with the kernel.
+
+For each futex lock currently held by a thread, if it wants this
+robust_futex support for exit cleanup of that lock, it should have one
+'lock entry' on this list, with its associated 'lock word' at the
+specified 'offset'. Should a thread die while holding any such locks,
+the kernel will walk this list, mark any such locks with a bit
+indicating their holder died, and wake up the next thread waiting for
+that lock using the futex mechanism.
+
+When a thread has invoked the above system call to indicate it
+anticipates using robust_futexes, the kernel stores the passed in 'head'
+pointer for that task. The task may retrieve that value later on by
+using the system call:
+
+    asmlinkage long
+    sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
+                        size_t __user *len_ptr);
+
+It is anticipated that threads will use robust_futexes embedded in
+larger, user level locking structures, one per lock. The kernel
+robust_futex mechanism doesn't care what else is in that structure, so
+long as the 'offset' to the 'lock word' is the same for all
+robust_futexes used by that thread. The thread should link those locks
+it currently holds using the 'lock entry' pointers. It may also have
+other links between the locks, such as the reverse side of a doubly
+linked list, but that doesn't matter to the kernel.
+
+By keeping its locks linked this way, on a list starting with a 'head'
+pointer known to the kernel, the kernel can provide to a thread the
+essential service available for robust_futexes, which is to help clean
+up locks held at the time of (a perhaps unexpected) exit.
+
+Actual locking and unlocking, during normal operations, is handled
+entirely by user level code in the contending threads, and by the
+existing futex mechanism to wait for, and wake up, locks. The kernel's
+only essential involvement in robust_futexes is to remember where the
+list 'head' is, and to walk the list on thread exit, handling locks
+still held by the departing thread, as described below.
+
+There may exist thousands of futex lock structures in a thread's shared
+memory, on various data structures, at a given point in time. Only those
+lock structures for locks currently held by that thread should be on
+that thread's robust_futex linked lock list at a given time.
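As a sketch of the registration step described above, the fragment below
declares a hypothetical user-level lock type and registers a per-thread list
head; struct robust_list_head here mirrors the three words ('lock entry'
pointer, 'offset', 'list_op_pending') described above, matching the
definition in <linux/futex.h>:

	#include <stddef.h>
	#include <stdint.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/futex.h>

	/* hypothetical user-level lock: a 'lock entry' plus its 'lock word' */
	struct my_lock {
		struct robust_list list;	/* the 'lock entry' */
		uint32_t futex;			/* the 32 bit 'lock word' */
	};

	static struct robust_list_head head;	/* one instance per thread */

	static void register_robust_list(void)
	{
		head.list.next = &head.list;	/* empty list points to itself */
		/* 'offset' from a 'lock entry' to its 'lock word' */
		head.futex_offset = offsetof(struct my_lock, futex)
				  - offsetof(struct my_lock, list);
		head.list_op_pending = NULL;
		syscall(SYS_set_robust_list, &head, sizeof(head));
	}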
+
+A given futex lock structure in a user shared memory region may be held
+at different times by any of the threads with access to that region. The
+thread currently holding such a lock, if any, is marked with the thread's
+TID in the lower 29 bits of the 'lock word'.
+
+When adding or removing a lock from its list of held locks, in order for
+the kernel to correctly handle lock cleanup regardless of when the task
+exits (perhaps it gets an unexpected signal 9 in the middle of
+manipulating this list), the user code must observe the following
+protocol on 'lock entry' insertion and removal:
+
+On insertion:
+ 1) set the 'list_op_pending' word to the address of the 'lock word'
+    to be inserted,
+ 2) acquire the futex lock,
+ 3) add the lock entry, with its thread id (TID) in the bottom 29 bits
+    of the 'lock word', to the linked list starting at 'head', and
+ 4) clear the 'list_op_pending' word.
+
+On removal:
+ 1) set the 'list_op_pending' word to the address of the 'lock word'
+    to be removed,
+ 2) remove the lock entry for this lock from the 'head' list,
+ 3) release the futex lock, and
+ 4) clear the 'list_op_pending' word.
+
+On exit, the kernel will consider the address stored in
+'list_op_pending' and the address of each 'lock word' found by walking
+the list starting at 'head'. For each such address, if the bottom 29
+bits of the 'lock word' at offset 'offset' from that address equals the
+exiting thread's TID, then the kernel will do two things:
+
+ 1) if bit 31 (0x80000000) is set in that word, then attempt a futex
+    wakeup on that address, which will wake the next thread that has
+    used the futex mechanism to wait on that address, and
+ 2) atomically set bit 30 (0x40000000) in the 'lock word'.
+
+In the above, bit 31 was set by futex waiters on that lock to indicate
+they were waiting, and bit 30 is set by the kernel to indicate that the
+lock owner died holding the lock.
+
+The kernel exit code will silently stop scanning the list further if at
+any point:
+
+ 1) the 'head' pointer or a subsequent linked list pointer
+    is not a valid address of a user space word
+ 2) the calculated location of the 'lock word' (address plus
+    'offset') is not the valid address of a 32 bit user space
+    word
+ 3) the list contains more than 1 million (subject to
+    future kernel configuration changes) elements.
+
+When the kernel sees a list entry whose 'lock word' doesn't have the
+current thread's TID in the lower 29 bits, it does nothing with that
+entry, and goes on to the next entry.
+
+Bit 29 (0x20000000) of the 'lock word' is reserved for future use.
Index: linux-2.6.16/Documentation/robust-futexes.txt
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.16/Documentation/robust-futexes.txt	2006-10-19 16:52:29.000000000 +0200
@@ -0,0 +1,218 @@
+Started by: Ingo Molnar
+
+Background
+----------
+
+What are robust futexes? To answer that, we first need to understand
+what futexes are: normal futexes are special types of locks that in the
+noncontended case can be acquired/released from userspace without having
+to enter the kernel.
+
+A futex is in essence a user-space address, e.g. a 32-bit lock variable
+field. If userspace notices contention (the lock is already owned and
+someone else wants to grab it too) then the lock is marked with a value
+that says "there's a waiter pending", and the sys_futex(FUTEX_WAIT)
+syscall is used to wait for the other guy to release it.
+The kernel
+creates a 'futex queue' internally, so that it can later on match up the
+waiter with the waker - without them having to know about each other.
+When the owner thread releases the futex, it notices (via the variable
+value) that there were waiter(s) pending, and does the
+sys_futex(FUTEX_WAKE) syscall to wake them up. Once all waiters have
+taken and released the lock, the futex is again back to 'uncontended'
+state, and there's no in-kernel state associated with it. The kernel
+completely forgets that there ever was a futex at that address. This
+method makes futexes very lightweight and scalable.
+
+"Robustness" is about dealing with crashes while holding a lock: if a
+process exits prematurely while holding a pthread_mutex_t lock that is
+also shared with some other process (e.g. yum segfaults while holding a
+pthread_mutex_t, or yum is kill -9-ed), then waiters for that lock need
+to be notified that the last owner of the lock exited in some irregular
+way.
+
+To solve such types of problems, "robust mutex" userspace APIs were
+created: pthread_mutex_lock() returns an error value if the owner exits
+prematurely - and the new owner can decide whether the data protected by
+the lock can be recovered safely.
+
+There is a big conceptual problem with futex based mutexes though: it is
+the kernel that destroys the owner task (e.g. due to a SEGFAULT), but
+the kernel cannot help with the cleanup: if there is no 'futex queue'
+(and in most cases there is none, futexes being fast lightweight locks)
+then the kernel has no information to clean up after the held lock!
+Userspace has no chance to clean up after the lock either - userspace is
+the one that crashes, so it has no opportunity to clean up. Catch-22.
+
+In practice, when e.g. yum is kill -9-ed (or segfaults), a system reboot
+is needed to release that futex based lock. This is one of the leading
+bugreports against yum.
+
+To solve this problem, the traditional approach was to extend the vma
+(virtual memory area descriptor) concept to have a notion of 'pending
+robust futexes attached to this area'. This approach requires 3 new
+syscall variants to sys_futex(): FUTEX_REGISTER, FUTEX_DEREGISTER and
+FUTEX_RECOVER. At do_exit() time, all vmas are searched to see whether
+they have a robust_head set. This approach has two fundamental problems
+left:
+
+ - it has quite complex locking and race scenarios. The vma-based
+   patches had been pending for years, but they are still not completely
+   reliable.
+
+ - they have to scan _every_ vma at sys_exit() time, per thread!
+
+The second disadvantage is a real killer: pthread_exit() takes around 1
+microsecond on Linux, but with thousands (or tens of thousands) of vmas
+every pthread_exit() takes a millisecond or more, also totally
+destroying the CPU's L1 and L2 caches!
+
+This is very much noticeable even for normal process sys_exit_group()
+calls: the kernel has to do the vma scanning unconditionally! (this is
+because the kernel has no knowledge about how many robust futexes there
+are to be cleaned up, because a robust futex might have been registered
+in another task, and the futex variable might have been simply mmap()-ed
+into this process's address space).
+
+This huge overhead forced the creation of CONFIG_FUTEX_ROBUST so that
+normal kernels can turn it off, but worse than that: the overhead makes
+robust futexes impractical for any type of generic Linux distribution.
+
+So something had to be done.
+
+New approach to robust futexes
+------------------------------
+
+At the heart of this new approach there is a per-thread private list of
+robust locks that userspace is holding (maintained by glibc) - which
+userspace list is registered with the kernel via a new syscall [this
+registration happens at most once per thread lifetime]. At do_exit()
+time, the kernel checks this user-space list: are there any robust futex
+locks to be cleaned up?
+
+In the common case, at do_exit() time, there is no list registered, so
+the cost of robust futexes is just a simple current->robust_list != NULL
+comparison. If the thread has registered a list, then normally the list
+is empty. If the thread/process crashed or terminated in some incorrect
+way then the list might be non-empty: in this case the kernel carefully
+walks the list [not trusting it], and marks all locks that are owned by
+this thread with the FUTEX_OWNER_DIED bit, and wakes up one waiter (if
+any).
+
+The list is guaranteed to be private and per-thread at do_exit() time,
+so it can be accessed by the kernel in a lockless way.
+
+There is one race possible though: since adding to and removing from the
+list is done after the futex is acquired by glibc, there is a window of a
+few instructions in which the thread (or process) could die there, leaving
+the futex hung. To protect against this possibility, userspace (glibc)
+also maintains a simple per-thread 'list_op_pending' field, to allow the
+kernel to clean up if the thread dies after acquiring the lock, but just
+before it could have added itself to the list. Glibc sets this
+list_op_pending field before it tries to acquire the futex, and clears
+it after the list-add (or list-remove) has finished.
+
+That's all that is needed - all the rest of robust-futex cleanup is done
+in userspace [just like with the previous patches].
+
+Ulrich Drepper has implemented the necessary glibc support for this new
+mechanism, which fully enables robust mutexes.
+
+Key differences of this userspace-list based approach, compared to the
+vma based method:
+
+ - it's much, much faster: at thread exit time, there's no need to loop
+   over every vma (!), which the VM-based method has to do. Only a very
+   simple 'is the list empty' op is done.
+
+ - no VM changes are needed - 'struct address_space' is left alone.
+
+ - no registration of individual locks is needed: robust mutexes don't
+   need any extra per-lock syscalls. Robust mutexes thus become a very
+   lightweight primitive - so they don't force the application designer
+   to make a hard choice between performance and robustness - robust
+   mutexes are just as fast.
+
+ - no per-lock kernel allocation happens.
+
+ - no resource limits are needed.
+
+ - no kernel-space recovery call (FUTEX_RECOVER) is needed.
+
+ - the implementation and the locking is "obvious", and there are no
+   interactions with the VM.
+
+Performance
+-----------
+
+I have benchmarked the time needed for the kernel to process a list of 1
+million (!) held locks, using the new method [on a 2GHz CPU]:
+
+ - with FUTEX_WAITERS set [contended mutex]: 130 msecs
+ - without FUTEX_WAITERS set [uncontended mutex]: 30 msecs
+
+I have also measured an approach where glibc does the lock notification
+[which it currently does for !pshared robust mutexes], and that took 256
+msecs - clearly slower, due to the 1 million FUTEX_WAKE syscalls
+userspace had to do.
+
+(1 million held locks are unheard of - we expect at most a handful of
+locks to be held at a time.
+Nevertheless it's nice to know that this
+approach scales nicely.)
+
+Implementation details
+----------------------
+
+The patch adds two new syscalls: one to register the userspace list, and
+one to query the registered list pointer:
+
+ asmlinkage long
+ sys_set_robust_list(struct robust_list_head __user *head,
+                     size_t len);
+
+ asmlinkage long
+ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
+                     size_t __user *len_ptr);
+
+List registration is very fast: the pointer is simply stored in
+current->robust_list. [Note that in the future, if robust futexes become
+widespread, we could extend sys_clone() to register a robust-list head
+for new threads, without the need of another syscall.]
+
+So there is virtually zero overhead for tasks not using robust futexes,
+and even for robust futex users, there is only one extra syscall per
+thread lifetime, and the cleanup operation, if it happens, is fast and
+straightforward. The kernel doesn't have any internal distinction between
+robust and normal futexes.
+
+If a futex is found to be held at exit time, the kernel sets the
+following bit of the futex word:
+
+	#define FUTEX_OWNER_DIED	0x40000000
+
+and wakes up the next futex waiter (if any). User-space does the rest of
+the cleanup.
+
+Otherwise, robust futexes are acquired by glibc by putting the TID into
+the futex field atomically. Waiters set the FUTEX_WAITERS bit:
+
+	#define FUTEX_WAITERS		0x80000000
+
+and the remaining bits are for the TID.
+
+Testing, architecture support
+-----------------------------
+
+I've tested the new syscalls on x86 and x86_64, and have made sure the
+parsing of the userspace list is robust [ ;-) ] even if the list is
+deliberately corrupted.
+
+i386 and x86_64 syscalls are wired up at the moment, and Ulrich has
+tested the new glibc code (on x86_64 and i386), and it works for his
+robust-mutex testcases.
+
+All other architectures should build just fine too - but they won't have
+the new syscalls yet.
+
+Architectures need to implement the new futex_atomic_cmpxchg_inatomic()
+inline function before wiring up the syscalls (that function returns
+-ENOSYS right now).
Index: linux-2.6.16/Documentation/rt-mutex.txt
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.16/Documentation/rt-mutex.txt	2006-10-19 16:52:29.000000000 +0200
@@ -0,0 +1,74 @@
+RT-mutex subsystem with PI support
+----------------------------------
+
+RT-mutexes with priority inheritance are used to support PI-futexes,
+which enable pthread_mutex_t priority inheritance attributes
+(PTHREAD_PRIO_INHERIT). [See Documentation/pi-futex.txt for more details
+about PI-futexes.]
+
+This technology was developed in the -rt tree and streamlined for
+pthread_mutex support.
+
+Basic principles:
+-----------------
+
+RT-mutexes extend the semantics of simple mutexes by the priority
+inheritance protocol.
+
+A low priority owner of an rt-mutex inherits the priority of a higher
+priority waiter until the rt-mutex is released. If the temporarily
+boosted owner blocks on an rt-mutex itself, it propagates the priority
+boosting to the owner of the other rt_mutex it gets blocked on. The
+priority boosting is immediately removed once the rt_mutex has been
+unlocked.
+
+This approach allows us to shorten the block of high-prio tasks on
+mutexes which protect shared resources.
+Priority inheritance is not a
+magic bullet for poorly designed applications, but it allows
+well-designed applications to use userspace locks in critical parts of
+a high priority thread, without losing determinism.
+
+The enqueueing of the waiters into the rtmutex waiter list is done in
+priority order. For equal priorities, FIFO order is chosen. For each
+rtmutex, only the top priority waiter is enqueued into the owner's
+priority waiters list. This list too queues in priority order. Whenever
+the top priority waiter of a task changes (for example it timed out or
+got a signal), the priority of the owner task is readjusted. [The
+priority enqueueing is handled by "plists", see include/linux/plist.h
+for more details.]
+
+RT-mutexes are optimized for fastpath operations and have no internal
+locking overhead when locking an uncontended mutex or unlocking a mutex
+without waiters. The optimized fastpath operations require cmpxchg
+support. [If that is not available then the rt-mutex internal spinlock
+is used.]
+
+The state of the rt-mutex is tracked via the owner field of the rt-mutex
+structure:
+
+rt_mutex->owner holds the task_struct pointer of the owner. Bits 0 and 1
+are used to keep track of the "owner is pending" and "rtmutex has
+waiters" state.
+
+ owner		bit1	bit0
+ NULL		0	0	mutex is free (fast acquire possible)
+ NULL		0	1	invalid state
+ NULL		1	0	invalid state
+ NULL		1	1	invalid state
+ taskpointer	0	0	mutex is held (fast release possible)
+ taskpointer	0	1	task is pending owner
+ taskpointer	1	0	mutex is held and has waiters
+ taskpointer	1	1	task is pending owner and mutex has waiters
+
+Pending-ownership handling is a performance optimization:
+pending-ownership is assigned to the first (highest priority) waiter of
+the mutex, when the mutex is released. The thread is woken up and once
+it starts executing it can acquire the mutex. Until the mutex is taken
+by it (bit 0 is cleared), a competing higher priority thread can "steal"
+the mutex, which puts the woken up thread back on the waiters list.
+
+The pending-ownership optimization is especially important for the
+uninterrupted workflow of high-prio tasks which repeatedly
+take/release locks that have lower-prio waiters. Without this
+optimization the higher-prio thread would ping-pong to the lower-prio
+task [because at unlock time we always assign a new owner].
Index: linux-2.6.16/Documentation/timekeeping.txt
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.16/Documentation/timekeeping.txt	2006-10-19 16:52:29.000000000 +0200
@@ -0,0 +1,347 @@
+How timekeeping works with CONFIG_GENERIC_TIME
+========================================================================
+
+The generic timekeeping code maintains and allows access to the system's
+understanding of how much time has passed since a certain point. However, in
+order to measure the passing of time, the generic timekeeping code relies on
+the clocksource abstraction. A clocksource abstracts a free running counter
+whose value increases at a known frequency.
+
+In the generic timekeeping code, we use a pointer to a selected clocksource to
+measure the passing of time.
+
+struct clocksource *clock
+
+The clocksource has some limitations, however. Since it is likely of fixed
+width, it will not increment forever and will overflow. In order to still
+properly keep time, we must occasionally accumulate an interval of time.
+In the generic
+timekeeping code, we accumulate the amount of time since the system booted
+into the value system_time, which keeps nanosecond resolution in a ktime_t
+storage.
+
+ktime_t system_time
+
+Since it is likely your system has not been running continually since midnight
+on the 1st of January in 1970, we must provide an offset from that time in
+accordance with conventions. This offset, which is only occasionally changed
+(via settimeofday()), is the wall_time_offset value, which is also stored as a
+ktime_t.
+
+ktime_t wall_time_offset
+
+
+Since we accumulate time in intervals, we need a base cycle value that we can
+use to generate an offset from the time value kept in system_time. We store
+this value in cycle_last.
+
+cycle_t cycle_last;
+
+
+Further, since all clocks drift somewhat from each other, we use the adjustment
+values provided via adjtimex() to correct our clocksource frequency for each
+interval. This frequency adjustment value is stored in ntp_adj.
+
+long ntp_adj;
+
+Now that we've covered the core global variables for timekeeping, let's look at
+how we maintain these values.
+
+As stated above, we want to keep the clocksource from overflowing on us, so we
+accumulate a time interval periodically. This periodic accumulation function is
+called timeofday_periodic_hook(). In simplified pseudo code, it logically is
+presented as:
+
+timeofday_periodic_hook():
+	cycle_now = read_clocksource(clock)
+	cycle_delta = (cycle_now - cycle_last) & clock->mask
+	nsec = cyc2ns(clock, cycle_delta, ntp_adj)
+	system_time += nsec
+	cycle_last = cycle_now
+
+	/* do other stuff */
+
+You can see we read the cycle value from the clocksource, calculate a cycle
+delta for the interval since we last called timeofday_periodic_hook(), convert
+that cycle delta to a nanosecond interval (for now ignore ntp_adj), add it to
+the system time and finally set our cycle_last value to cycle_now for the next
+interval. Using this simple algorithm we can correctly measure and record the
+passing of time.
+
+But just storing this info isn't very useful; we also want to make it available
+to be used elsewhere. So how do we provide a notion of how much time has passed
+in between calls to timeofday_periodic_hook()?
+
+First, let's create a function that calculates the time since the last call to
+timeofday_periodic_hook().
+
+get_nsec_offset():
+	cycle_now = read_clocksource(clock)
+	cycle_delta = (cycle_now - cycle_last) & clock->mask
+	nsec = cyc2ns(clock, cycle_delta, ntp_adj)
+	return nsec
+
+Here you can see, we read the clocksource, calculate a cycle interval, and
+convert that to a nanosecond interval. Just like how it is done in
+timeofday_periodic_hook()!
+
+Now let's use this function to provide the number of nanoseconds that the
+system has been running:
+
+do_monotonic_clock():
+	return system_time + get_nsec_offset()
+
+Here we trivially add the nanosecond offset since the last
+timeofday_periodic_hook() to the value of system_time, which was stored at the
+last timeofday_periodic_hook().
+
+Note that since we use the same method to calculate time intervals, assuming
+each function is atomic and the clocksource functions as it should, time cannot
+go backward!
+
+Now to get the time of day using the standard convention:
+
+do_gettimeofday():
+	return do_monotonic_clock() + wall_time_offset
+
+We simply add the wall_time_offset, and we have the number of nanoseconds since
+1970 began!
+
+
+Of course, in real life, things are not so static.
Of course, in real life, things are not so static. We have to handle a
number of dynamic values that may change and affect timekeeping. In order to
do this safely, we must only change values in between intervals. This means
the periodic_hook call must handle these changes.

Since clocksources can be changed while the system is running, we need to
check for and possibly switch to using new clocksources in the periodic_hook
call. Further, clocksources may change their frequency. Since this must be
done only at a safe point, we use the update_callback function pointer (for
more details, see "How to write a clocksource driver" below); this too must
be done in between intervals in the periodic_hook call. Finally, since the
ntp adjustment made in the cyc2ns conversion is not static, we need to
update the ntp state machine and calculate a new adjustment value.

This adds some extra pseudo code to the timeofday_periodic_hook function:

timeofday_periodic_hook():
        cycle_now = read_clocksource(clock)
        cycle_delta = (cycle_now - cycle_last) & clock->mask
        nsec = cyc2ns(clock, cycle_delta, ntp_adj)
        system_time += nsec
        cycle_last = cycle_now

        next = get_next_clocksource()
        if(next != clock):
                cycle_last = read_clocksource(next)
                clock = next

        if(clock->update_callback):
                clock->update_callback()

        ntp_advance(nsec)
        ppm = ntp_get_ppm_adjustment()
        ntp_adj = ppm_to_mult_adj(clock, ppm)


Unfortunately, the actual timeofday_periodic_hook code is not as simple as
this pseudo code. For performance concerns, much has been done to
pre-calculate values and use them repeatedly. Thus be aware that the code in
timeofday.c is more complex; the functional logic, however, is the same.


How to port an architecture to GENERIC_TIME
========================================================================
Porting an architecture to the GENERIC_TIME timekeeping code consists of
moving a little bit of code around and then deleting a fair amount. It is my
hope that this will reduce the arch-specific maintenance work around
timekeeping.

Porting an arch usually requires the following steps.

1. Define CONFIG_GENERIC_TIME in the arch's Kconfig
2. Implement the following functions (a sketch follows this list)
        s64 read_persistent_clock(void)
        void sync_persistent_clock(struct timespec ts)
3. Remove all of the arch-specific timekeeping code
        do_gettimeofday()
        do_settimeofday()
        etc
4. Implement clocksource drivers
        See "How to write a clocksource driver" for more details

The exceptions to the above are:

5. If the arch has no continuous clocksource
        A) Implement 1-3 in the above list.
        B) Define CONFIG_IS_TICK_BASED in the arch's Kconfig
        C) Implement the "long arch_getoffset(void)" function

6. If the arch supports vsyscall gettimeofday (see x86_64 for reference)
        A) Implement 1-4 in the above list
        B) Define GENERIC_TIME_VSYSCALL
        C) Implement arch_update_vsyscall_gtod()
        D) Implement vsyscall gettimeofday (similar to __get_realtime_clock_ts)
        E) Implement vread functions for supported clocksources

For step 2, the following is a minimal sketch of the two persistent-clock
hooks. It assumes a battery-backed RTC; arch_rtc_read_seconds() and
arch_rtc_set_seconds() are invented stand-ins for whatever RTC access the
architecture actually provides:
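s64 read_persistent_clock(void)
{
        /* wall time in nanoseconds, as kept across reboots */
        return (s64)arch_rtc_read_seconds() * NSEC_PER_SEC;
}

void sync_persistent_clock(struct timespec ts)
{
        /* periodically write the system time back to the RTC */
        arch_rtc_set_seconds(ts.tv_sec);
}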
How to write a clocksource driver.
========================================================================
First, a quick summary of what a clocksource driver provides.

Simply put, a clocksource is an abstraction of a free running increasing
counter. The abstraction provides the minimal amount of info for that
counter to be usable for timekeeping. Those required values are:
        1. Its name
        2. A rating value for selection priority
        3. A read function pointer
        4. A mask value for correct twos-complement subtraction
        5. A mult and shift pair that approximate the counter frequency
                mult/(2^shift) ~= nanoseconds per cycle

Additionally, there are other optionally set values that allow for advanced
functionality. Those values are:
        6. The update_callback function.
        7. The is_continuous flag.
        8. The vread function pointer
        9. The vdata pointer value


Now let's go over these values in detail.

1. Name.
        The clocksource's name should be unique since it is used for both
identification as well as for manually overriding the default clocksource
selection. The name must be shorter than 32 characters in order for the
clocksource to be properly overridden.

2. Rating value
        This rating value is used as a priority value for clocksource
selection. It has no direct connection to the quality or physical properties
of the clocksource; it is set and manipulated to guarantee that the best (by
no specific metric) clocksource that will provide correct timekeeping is
automatically selected. Rating suggestions can be found in
include/linux/clocksource.h

3. Read function pointer
        This pointer should point to a function that returns an unsigned
increasing cycle value from the clocksource. The value should cover the
range from zero to the maximum cycle value the clocksource can provide. This
does not have to be a direct hardware value and can also be a software
counter. An example of a software counter is the jiffies clocksource.

4. The mask value
        This value should have all bits of the counter's width set; in other
words, it is the counter's maximum cycle value (one less than a power of
two). This allows twos complement subtraction to work on overflow boundary
conditions if the max value is less than (cycle_t)-1. So for example, if we
have a 16 bit counter (ie: one that loops to zero after 0x0000FFFF), the
mask would be 0xFFFF. So then when finding the cycle difference around an
overflow, where now = 0x0013 and then = 0xFFEE, we can compute the cycle
delta properly using the equation:
        delta = (now - then) & mask
        delta = (0x0013 - 0xFFEE) & 0xFFFF
        delta = 0xFFFF0025 & 0xFFFF /* note the unmasked negative value */
        delta = 0x25

5. The mult and shift pair
        These 32bit values approximate the nanoseconds-per-cycle ratio of
the clocksource using the equation: mult/(2^shift). If you have a kHz or Hz
frequency value, the mult value for a given shift value can be easily
calculated using the clocksource_hz2mult() and clocksource_khz2mult() helper
functions. When selecting a shift value, it is important to be careful.
Larger shift values give a finer precision in the cycle to nanosecond
conversion and allow for more exact NTP adjustments. However, if you select
too large a shift value, the resulting mult value might overflow a
cycle_t * mult computation.

To sanity-check the arithmetic, here is a small standalone userspace demo
(hypothetical 16-bit counter running at 100MHz, i.e. 10ns per cycle) that
combines the mask rule from item 4 with the mult/shift conversion from
item 5; it is illustrative only:
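#include <stdio.h>

typedef unsigned long long cycle_t;

int main(void)
{
        const cycle_t mask = 0xFFFF;            /* 16-bit counter */
        const int shift = 10;
        const cycle_t mult = 10 << shift;       /* 10ns/cycle * 2^10 */

        cycle_t then = 0xFFEE, now = 0x0013;    /* counter wrapped */
        cycle_t delta = (now - then) & mask;    /* 0x25 == 37 cycles */
        cycle_t nsec = (delta * mult) >> shift; /* 37 * 10ns == 370ns */

        printf("delta=0x%llx nsec=%llu\n", delta, nsec);
        return 0;
}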
So if you have a simple hardware counter that does not change frequency,
filling in the above should be sufficient for a functional clocksource. But
read on for details on implementing a more complex clocksource.

6. The update_callback function pointer.
        If this function pointer is non-NULL, it will be called at every
periodic hook, when it is safe for the clocksource to change its state. This
would be necessary in the case where the counter frequency changes, for
example. One user of this function pointer is the TSC clocksource. When the
TSC frequency changes (which may occur if the cpu changes frequency), we
need to notify the clocksource at a safe point where that state may change.
Thus, if the TSC has changed frequency, we set the new mult/shift values in
the update_callback function.

7. The is_continuous flag.
        This flag variable (0 if false, 1 if true) denotes that the
clocksource is continuous. This means that it is a purely hardware driven
clocksource and is not dependent on any software code running for it to
increment properly. This denotation will be useful in the future when timer
ticks may be disabled for long periods of time; doing so using software
clocksources, like the jiffies clocksource, would cause timekeeping
problems.

8. The vread function pointer.
        This function pointer points to a user-space accessible function
that reads the clocksource. This is used in userspace gettimeofday
implementations to improve performance. See the x86-64 TSC clocksource
implementation for an example.

9. The vdata pointer.
        This pointer is passed to the vread function pointer in a userspace
gettimeofday implementation. Its usage is dependent on the vread
implementation, but if the pointer points to data, that data must be
readable from userspace.


Now let's write a quick clocksource for an imaginary bit of hardware. Here
are the specs:

        A 32bit counter can be found at the MMIO address 0xFEEDF000. It runs
at 100MHz. To enable it, the low bit of the address 0xFEEDF0F0 must be set
to one.

So let's start with an empty cool-counter.c file and define the clocksource.

#include <linux/clocksource.h>
#include <linux/module.h>
#include <asm/io.h>

#define COOL_READ_PTR   0xFEEDF000
#define COOL_START_PTR  0xFEEDF0F0

static __iomem void *cool_ptr = (void *)COOL_READ_PTR;

static struct clocksource clocksource_cool = {
        .name = "cool",
        .rating = 200,          /* it's a pretty decent clock */
        .mask = 0xFFFFFFFF,     /* 32 bits */
        .mult = 0,              /* to be computed */
        .shift = 10,
};

/* Now let's create the read function: */

static cycle_t cool_counter_read(void)
{
        return (cycle_t)readl(cool_ptr);
}

/* Finally, let's create the init function: */

static int __init cool_counter_init(void)
{
        __iomem void *ptr = (void *)COOL_START_PTR;
        u32 val;

        /* start the counter */
        val = readl(ptr);
        val |= 0x1;
        writel(val, ptr);

        /* finish initializing the clocksource */
        clocksource_cool.read = cool_counter_read;
        clocksource_cool.mult = clocksource_khz2mult(100000,
                                        clocksource_cool.shift);

        /* register the clocksource */
        return register_clocksource(&clocksource_cool);
}
module_init(cool_counter_init);


Now wasn't that easy!

If the cool counter's frequency could change at runtime, items 6 and 7 above
would come into play. A hypothetical extension might look like the following
sketch, where read_cool_freq_khz() is an invented helper, not a real kernel
function:
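/* Recompute mult at the safe point provided by update_callback;
 * hooked up in the init function with
 *      clocksource_cool.update_callback = cool_counter_update; */
static void cool_counter_update(void)
{
        clocksource_cool.mult = clocksource_khz2mult(read_cool_freq_khz(),
                                        clocksource_cool.shift);
}

Since the counter keeps running in hardware regardless of software activity,
the example could also set .is_continuous = 1.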
Index: linux-2.6.16/Makefile =================================================================== --- linux-2.6.16.orig/Makefile 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/Makefile 2006-11-22 17:50:43.000000000 +0100 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 16 -EXTRAVERSION = +EXTRAVERSION =-rt29-tglx4 NAME=Sliding Snow Leopard # *DOCUMENTATION* @@ -511,10 +511,14 @@ CFLAGS += $(call add-align,CONFIG_CC_AL CFLAGS += $(call add-align,CONFIG_CC_ALIGN_LOOPS,-loops) CFLAGS += $(call add-align,CONFIG_CC_ALIGN_JUMPS,-jumps) -ifdef CONFIG_FRAME_POINTER -CFLAGS += -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,) +ifdef CONFIG_MCOUNT +CFLAGS += -pg -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,) else -CFLAGS += -fomit-frame-pointer + ifdef CONFIG_FRAME_POINTER + CFLAGS += -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,) + else + CFLAGS += -fomit-frame-pointer + endif endif ifdef CONFIG_DEBUG_INFO Index: linux-2.6.16/arch/arm/Kconfig =================================================================== --- linux-2.6.16.orig/arch/arm/Kconfig 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/Kconfig 2006-10-19 16:52:29.000000000 +0200 @@ -46,6 +46,10 @@ config MCA (and especially the web page given there) before attempting to build an MCA bus kernel. +config GENERIC_HARDIRQS + bool + default y + config RWSEM_GENERIC_SPINLOCK bool default y @@ -401,18 +405,7 @@ config LOCAL_TIMERS accounting to be spread across the timer interval, preventing a "thundering herd" at every timer tick. -config PREEMPT - bool "Preemptible Kernel (EXPERIMENTAL)" - depends on EXPERIMENTAL - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. - - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. 
+source kernel/Kconfig.preempt config NO_IDLE_HZ bool "Dynamic tick timer" Index: linux-2.6.16/arch/arm/boot/compressed/head.S =================================================================== --- linux-2.6.16.orig/arch/arm/boot/compressed/head.S 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/boot/compressed/head.S 2006-10-19 16:52:29.000000000 +0200 @@ -714,6 +714,19 @@ memdump: mov r12, r0 mov pc, r10 #endif +#ifdef CONFIG_MCOUNT +/* CONFIG_MCOUNT causes boot header to be built with -pg requiring this + * trampoline + */ + .text + .align 0 + .type mcount %function + .global mcount +mcount: + mov pc, lr @ just return +#endif + + reloc_end: .align Index: linux-2.6.16/arch/arm/boot/compressed/misc.c =================================================================== --- linux-2.6.16.orig/arch/arm/boot/compressed/misc.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/boot/compressed/misc.c 2006-10-19 16:52:29.000000000 +0200 @@ -199,6 +199,7 @@ static ulg free_mem_ptr_end; #define HEAP_SIZE 0x2000 +#define ZLIB_INFLATE_NO_INFLATE_LOCK #include "../../../../lib/inflate.c" #ifndef STANDALONE_DEBUG Index: linux-2.6.16/arch/arm/common/locomo.c =================================================================== --- linux-2.6.16.orig/arch/arm/common/locomo.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/common/locomo.c 2006-10-19 16:52:29.000000000 +0200 @@ -425,6 +425,12 @@ static struct irqchip locomo_spi_chip = .unmask = locomo_spi_unmask_irq, }; +static DEFINE_IRQ_CHAINED_TYPE(locomo_handler); +static DEFINE_IRQ_CHAINED_TYPE(locomo_key_handler); +static DEFINE_IRQ_CHAINED_TYPE(locomo_gpio_handler); +static DEFINE_IRQ_CHAINED_TYPE(locomo_lt_handler); +static DEFINE_IRQ_CHAINED_TYPE(locomo_spi_handler); + static void locomo_setup_irq(struct locomo *lchip) { int irq; Index: linux-2.6.16/arch/arm/common/sa1111.c =================================================================== --- linux-2.6.16.orig/arch/arm/common/sa1111.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/common/sa1111.c 2006-10-19 16:52:29.000000000 +0200 @@ -153,7 +153,7 @@ static void sa1111_irq_handler(unsigned int irq, struct irqdesc *desc, struct pt_regs *regs) { unsigned int stat0, stat1, i; - void __iomem *base = desc->data; + void __iomem *base = desc->handler_data; stat0 = sa1111_readl(base + SA1111_INTSTATCLR0); stat1 = sa1111_readl(base + SA1111_INTSTATCLR1); @@ -171,11 +171,11 @@ sa1111_irq_handler(unsigned int irq, str for (i = IRQ_SA1111_START; stat0; i++, stat0 >>= 1) if (stat0 & 1) - do_edge_IRQ(i, irq_desc + i, regs); + handle_edge_irq(i, irq_desc + i, regs); for (i = IRQ_SA1111_START + 32; stat1; i++, stat1 >>= 1) if (stat1 & 1) - do_edge_IRQ(i, irq_desc + i, regs); + handle_edge_irq(i, irq_desc + i, regs); /* For level-based interrupts */ desc->chip->unmask(irq); @@ -380,6 +380,8 @@ static struct irqchip sa1111_high_chip = .set_wake = sa1111_wake_highirq, }; +static DEFINE_IRQ_CHAINED_TYPE(sa1111_irq_handler); + static void sa1111_setup_irq(struct sa1111 *sachip) { void __iomem *irqbase = sachip->base + SA1111_INTC; Index: linux-2.6.16/arch/arm/common/time-acorn.c =================================================================== --- linux-2.6.16.orig/arch/arm/common/time-acorn.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/common/time-acorn.c 2006-10-19 16:52:29.000000000 +0200 @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -76,7 +77,7 @@ ioc_timer_interrupt(int irq, void *dev_i static struct irqaction 
ioc_timer_irq = { .name = "timer", - .flags = SA_INTERRUPT, + .flags = SA_INTERRUPT | SA_NODELAY, .handler = ioc_timer_interrupt }; Index: linux-2.6.16/arch/arm/kernel/calls.S =================================================================== --- linux-2.6.16.orig/arch/arm/kernel/calls.S 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/kernel/calls.S 2006-10-19 16:52:29.000000000 +0200 @@ -332,7 +332,7 @@ /* 320 */ CALL(sys_get_mempolicy) CALL(sys_set_mempolicy) #ifndef syscalls_counted -.equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls +.equ syscalls_padding, ((__NR_syscalls + 3) & ~3) - __NR_syscalls #define syscalls_counted #endif .rept syscalls_padding Index: linux-2.6.16/arch/arm/kernel/dma.c =================================================================== --- linux-2.6.16.orig/arch/arm/kernel/dma.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/kernel/dma.c 2006-10-19 16:52:29.000000000 +0200 @@ -20,7 +20,7 @@ #include -DEFINE_SPINLOCK(dma_spin_lock); +DEFINE_RAW_SPINLOCK(dma_spin_lock); EXPORT_SYMBOL(dma_spin_lock); static dma_t dma_chan[MAX_DMA_CHANNELS]; Index: linux-2.6.16/arch/arm/kernel/ecard.c =================================================================== --- linux-2.6.16.orig/arch/arm/kernel/ecard.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/kernel/ecard.c 2006-10-19 16:52:29.000000000 +0200 @@ -620,7 +620,7 @@ ecard_irqexp_handler(unsigned int irq, s ecard_t *ec = slot_to_ecard(slot); if (ec->claimed) { - struct irqdesc *d = irqdesc + ec->irq; + struct irqdesc *d = irq_desc + ec->irq; /* * this ugly code is so that we can operate a * prioritorising system: @@ -1053,6 +1053,9 @@ ecard_probe(int slot, card_type_t type) return rc; } +static DEFINE_IRQ_CHAINED_TYPE(ecard_irqexp_handler); +static DEFINE_IRQ_CHAINED_TYPE(ecard_irq_handler); + /* * Initialise the expansion card system. * Locate all hardware - interrupt management and @@ -1082,8 +1085,10 @@ static int __init ecard_init(void) irqhw = ecard_probeirqhw(); - set_irq_chained_handler(IRQ_EXPANSIONCARD, - irqhw ? 
ecard_irqexp_handler : ecard_irq_handler); + if (irqhw) + set_irq_chained_handler(IRQ_EXPANSIONCARD, ecard_irqexp_handler); + else + set_irq_chained_handler(IRQ_EXPANSIONCARD, ecard_irq_handler); ecard_proc_init(); Index: linux-2.6.16/arch/arm/kernel/entry-armv.S =================================================================== --- linux-2.6.16.orig/arch/arm/kernel/entry-armv.S 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/kernel/entry-armv.S 2006-10-19 16:52:29.000000000 +0200 @@ -201,7 +201,7 @@ __irq_svc: irq_handler #ifdef CONFIG_PREEMPT ldr r0, [tsk, #TI_FLAGS] @ get flags - tst r0, #_TIF_NEED_RESCHED + tst r0, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED blne svc_preempt preempt_return: ldr r0, [tsk, #TI_PREEMPT] @ read preempt value @@ -228,7 +228,7 @@ svc_preempt: str r7, [tsk, #TI_PREEMPT] @ expects preempt_count == 0 1: bl preempt_schedule_irq @ irq en/disable is done inside ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS - tst r0, #_TIF_NEED_RESCHED + tst r0, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED beq preempt_return @ go again b 1b #endif Index: linux-2.6.16/arch/arm/kernel/entry-common.S =================================================================== --- linux-2.6.16.orig/arch/arm/kernel/entry-common.S 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/kernel/entry-common.S 2006-10-19 16:52:29.000000000 +0200 @@ -3,6 +3,8 @@ * * Copyright (C) 2000 Russell King * + * LATENCY_TRACE/mcount support (C) 2005 Timesys john.cooper@timesys.com + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. @@ -41,7 +43,7 @@ ret_fast_syscall: fast_work_pending: str r0, [sp, #S_R0+S_OFF]! @ returned r0 work_pending: - tst r1, #_TIF_NEED_RESCHED + tst r1, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED bne work_resched tst r1, #_TIF_NOTIFY_RESUME | _TIF_SIGPENDING beq no_work_pending @@ -51,7 +53,8 @@ work_pending: b ret_slow_syscall @ Check work again work_resched: - bl schedule + bl __schedule + /* * "slow" syscall return path. "why" tells us if this was a real syscall. */ @@ -87,8 +90,8 @@ ENTRY(ret_from_fork) b ret_slow_syscall - .equ NR_syscalls,0 -#define CALL(x) .equ NR_syscalls,NR_syscalls+1 + .equ __NR_syscalls,0 @ Used to determine syscall table padding. +#define CALL(x) .equ __NR_syscalls,__NR_syscalls+1 #include "calls.S" #undef CALL #define CALL(x) .long x @@ -202,7 +205,7 @@ ENTRY(vector_swi) tst ip, #_TIF_SYSCALL_TRACE @ are we tracing syscalls? 
bne __sys_trace - cmp scno, #NR_syscalls @ check upper syscall limit + cmp scno, #__NR_syscalls @ check upper syscall limit adr lr, ret_fast_syscall @ return address ldrcc pc, [tbl, scno, lsl #2] @ call sys_* routine @@ -226,7 +229,7 @@ __sys_trace: adr lr, __sys_trace_return @ return address mov scno, r0 @ syscall number (possibly new) add r1, sp, #S_R0 + S_OFF @ pointer to regs - cmp scno, #NR_syscalls @ check upper syscall limit + cmp scno, #__NR_syscalls @ check upper syscall limit ldmccia r1, {r0 - r3} @ have to reload r0 - r3 ldrcc pc, [tbl, scno, lsl #2] @ call sys_* routine b 2b @@ -273,7 +276,7 @@ ENTRY(sys_call_table) sys_syscall: eor scno, r0, #__NR_OABI_SYSCALL_BASE cmp scno, #__NR_syscall - __NR_SYSCALL_BASE - cmpne scno, #NR_syscalls @ check range + cmpne scno, #__NR_syscalls @ check range stmloia sp, {r5, r6} @ shuffle args movlo r0, r1 movlo r1, r2 @@ -388,6 +391,112 @@ ENTRY(sys_oabi_call_table) #include "calls.S" #undef ABI #undef OBSOLETE +#endif + +#ifdef CONFIG_FRAME_POINTER + +#ifdef CONFIG_MCOUNT +/* + * At the point where we are in mcount() we maintain the + * frame of the prologue code and keep the call to mcount() + * out of the stack frame list: + + saved pc <---\ caller of instrumented routine + saved lr | + ip/prev_sp | + fp -----^ | + : | + | + -> saved pc | instrumented routine + | saved lr | + | ip/prev_sp | + | fp ---------/ + | : + | + | mcount + | saved pc + | saved lr + | ip/prev sp + -- fp + r3 + r2 + r1 + sp-> r0 + : + */ + + .text + .align 0 + .type mcount %function + .global mcount + +/* gcc -pg generated FUNCTION_PROLOGUE references mcount() + * and has already created the stack frame invocation for + * the routine we have been called to instrument. We create + * a complete frame nevertheless, as we want to use the same + * call to mcount() from c code. + */ +mcount: + + ldr ip, =mcount_enabled @ leave early, if disabled + ldr ip, [ip] + cmp ip, #0 + moveq pc, lr + + mov ip, sp + stmdb sp!, {r0 - r3, fp, ip, lr, pc} @ create stack frame + + ldr r1, [fp, #-4] @ get lr (the return address + @ of the caller of the + @ instrumented function) + mov r0, lr @ get lr - (the return address + @ of the instrumented function) + + sub fp, ip, #4 @ point fp at this frame + + bl __trace +1: + ldmdb fp, {r0 - r3, fp, sp, pc} @ pop entry frame and return + +#endif + +/* ARM replacement for unsupported gcc __builtin_return_address(n) + * where 0 < n. n == 0 is supported here as well. + * + * Walk up the stack frame until the desired frame is found or a NULL + * fp is encountered, return NULL in the latter case. + * + * Note: it is possible under code optimization for the stack invocation + * of an ancestor function (level N) to be removed before calling a + * descendant function (level N+1). No easy means is available to deduce + * this scenario with the result being [for example] caller_addr(0) when + * called from level N+1 returning level N-1 rather than the expected + * level N. This optimization issue appears isolated to the case of + * a call to a level N+1 routine made at the tail end of a level N + * routine -- the level N frame is deleted and a simple branch is made + * to the level N+1 routine. 
+ */ + + .text + .align 0 + .type arm_return_addr %function + .global arm_return_addr + +arm_return_addr: + mov ip, r0 + mov r0, fp +3: + cmp r0, #0 + beq 1f @ frame list hit end, bail + cmp ip, #0 + beq 2f @ reached desired frame + ldr r0, [r0, #-12] @ else continue, get next fp + sub ip, ip, #1 + b 3b +2: + ldr r0, [r0, #-4] @ get target return address +1: + mov pc, lr #endif Index: linux-2.6.16/arch/arm/kernel/fiq.c =================================================================== --- linux-2.6.16.orig/arch/arm/kernel/fiq.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/kernel/fiq.c 2006-10-19 16:52:29.000000000 +0200 @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -88,7 +89,7 @@ void set_fiq_handler(void *start, unsign * disable irqs for the duration. Note - these functions are almost * entirely coded in assembly. */ -void __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) +void notrace __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( @@ -106,7 +107,7 @@ void __attribute__((naked)) set_fiq_regs : "r" (®s->ARM_r8), "I" (PSR_I_BIT | PSR_F_BIT | FIQ_MODE)); } -void __attribute__((naked)) get_fiq_regs(struct pt_regs *regs) +void notrace __attribute__((naked)) get_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( Index: linux-2.6.16/arch/arm/kernel/irq.c =================================================================== --- linux-2.6.16.orig/arch/arm/kernel/irq.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/kernel/irq.c 2006-10-19 16:52:29.000000000 +0200 @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -38,193 +39,11 @@ #include #include -#include #include -#include #include -/* - * Maximum IRQ count. Currently, this is arbitary. However, it should - * not be set too low to prevent false triggering. Conversely, if it - * is set too high, then you could miss a stuck IRQ. - * - * Maybe we ought to set a timer and re-enable the IRQ at a later time? - */ -#define MAX_IRQ_CNT 100000 - -static int noirqdebug; -static volatile unsigned long irq_err_count; -static DEFINE_SPINLOCK(irq_controller_lock); -static LIST_HEAD(irq_pending); - -struct irqdesc irq_desc[NR_IRQS]; void (*init_arch_irq)(void) __initdata = NULL; -/* - * No architecture-specific irq_finish function defined in arm/arch/irqs.h. 
- */ -#ifndef irq_finish -#define irq_finish(irq) do { } while (0) -#endif - -/* - * Dummy mask/unmask handler - */ -void dummy_mask_unmask_irq(unsigned int irq) -{ -} - -irqreturn_t no_action(int irq, void *dev_id, struct pt_regs *regs) -{ - return IRQ_NONE; -} - -void do_bad_IRQ(unsigned int irq, struct irqdesc *desc, struct pt_regs *regs) -{ - irq_err_count += 1; - printk(KERN_ERR "IRQ: spurious interrupt %d\n", irq); -} - -static struct irqchip bad_chip = { - .ack = dummy_mask_unmask_irq, - .mask = dummy_mask_unmask_irq, - .unmask = dummy_mask_unmask_irq, -}; - -static struct irqdesc bad_irq_desc = { - .chip = &bad_chip, - .handle = do_bad_IRQ, - .pend = LIST_HEAD_INIT(bad_irq_desc.pend), - .disable_depth = 1, -}; - -#ifdef CONFIG_SMP -void synchronize_irq(unsigned int irq) -{ - struct irqdesc *desc = irq_desc + irq; - - while (desc->running) - barrier(); -} -EXPORT_SYMBOL(synchronize_irq); - -#define smp_set_running(desc) do { desc->running = 1; } while (0) -#define smp_clear_running(desc) do { desc->running = 0; } while (0) -#else -#define smp_set_running(desc) do { } while (0) -#define smp_clear_running(desc) do { } while (0) -#endif - -/** - * disable_irq_nosync - disable an irq without waiting - * @irq: Interrupt to disable - * - * Disable the selected interrupt line. Enables and disables - * are nested. We do this lazily. - * - * This function may be called from IRQ context. - */ -void disable_irq_nosync(unsigned int irq) -{ - struct irqdesc *desc = irq_desc + irq; - unsigned long flags; - - spin_lock_irqsave(&irq_controller_lock, flags); - desc->disable_depth++; - list_del_init(&desc->pend); - spin_unlock_irqrestore(&irq_controller_lock, flags); -} -EXPORT_SYMBOL(disable_irq_nosync); - -/** - * disable_irq - disable an irq and wait for completion - * @irq: Interrupt to disable - * - * Disable the selected interrupt line. Enables and disables - * are nested. This functions waits for any pending IRQ - * handlers for this interrupt to complete before returning. - * If you use this function while holding a resource the IRQ - * handler may need you will deadlock. - * - * This function may be called - with care - from IRQ context. - */ -void disable_irq(unsigned int irq) -{ - struct irqdesc *desc = irq_desc + irq; - - disable_irq_nosync(irq); - if (desc->action) - synchronize_irq(irq); -} -EXPORT_SYMBOL(disable_irq); - -/** - * enable_irq - enable interrupt handling on an irq - * @irq: Interrupt to enable - * - * Re-enables the processing of interrupts on this IRQ line. - * Note that this may call the interrupt handler, so you may - * get unexpected results if you hold IRQs disabled. - * - * This function may be called from IRQ context. - */ -void enable_irq(unsigned int irq) -{ - struct irqdesc *desc = irq_desc + irq; - unsigned long flags; - - spin_lock_irqsave(&irq_controller_lock, flags); - if (unlikely(!desc->disable_depth)) { - printk("enable_irq(%u) unbalanced from %p\n", irq, - __builtin_return_address(0)); - } else if (!--desc->disable_depth) { - desc->probing = 0; - desc->chip->unmask(irq); - - /* - * If the interrupt is waiting to be processed, - * try to re-run it. We can't directly run it - * from here since the caller might be in an - * interrupt-protected region. 
- */ - if (desc->pending && list_empty(&desc->pend)) { - desc->pending = 0; - if (!desc->chip->retrigger || - desc->chip->retrigger(irq)) - list_add(&desc->pend, &irq_pending); - } - } - spin_unlock_irqrestore(&irq_controller_lock, flags); -} -EXPORT_SYMBOL(enable_irq); - -/* - * Enable wake on selected irq - */ -void enable_irq_wake(unsigned int irq) -{ - struct irqdesc *desc = irq_desc + irq; - unsigned long flags; - - spin_lock_irqsave(&irq_controller_lock, flags); - if (desc->chip->set_wake) - desc->chip->set_wake(irq, 1); - spin_unlock_irqrestore(&irq_controller_lock, flags); -} -EXPORT_SYMBOL(enable_irq_wake); - -void disable_irq_wake(unsigned int irq) -{ - struct irqdesc *desc = irq_desc + irq; - unsigned long flags; - - spin_lock_irqsave(&irq_controller_lock, flags); - if (desc->chip->set_wake) - desc->chip->set_wake(irq, 0); - spin_unlock_irqrestore(&irq_controller_lock, flags); -} -EXPORT_SYMBOL(disable_irq_wake); - int show_interrupts(struct seq_file *p, void *v) { int i = *(loff_t *) v, cpu; @@ -243,7 +62,7 @@ int show_interrupts(struct seq_file *p, } if (i < NR_IRQS) { - spin_lock_irqsave(&irq_controller_lock, flags); + spin_lock_irqsave(&irq_desc[i].lock, flags); action = irq_desc[i].action; if (!action) goto unlock; @@ -257,7 +76,7 @@ int show_interrupts(struct seq_file *p, seq_putc(p, '\n'); unlock: - spin_unlock_irqrestore(&irq_controller_lock, flags); + spin_unlock_irqrestore(&irq_desc[i].lock, flags); } else if (i == NR_IRQS) { #ifdef CONFIG_ARCH_ACORN show_fiq_list(p, v); @@ -266,374 +85,83 @@ unlock: show_ipi_list(p); show_local_irqs(p); #endif +#ifdef FIXME_TGLX seq_printf(p, "Err: %10lu\n", irq_err_count); - } - return 0; -} - -/* - * IRQ lock detection. - * - * Hopefully, this should get us out of a few locked situations. - * However, it may take a while for this to happen, since we need - * a large number if IRQs to appear in the same jiffie with the - * same instruction pointer (or within 2 instructions). 
- */ -static int check_irq_lock(struct irqdesc *desc, int irq, struct pt_regs *regs) -{ - unsigned long instr_ptr = instruction_pointer(regs); - - if (desc->lck_jif == jiffies && - desc->lck_pc >= instr_ptr && desc->lck_pc < instr_ptr + 8) { - desc->lck_cnt += 1; - - if (desc->lck_cnt > MAX_IRQ_CNT) { - printk(KERN_ERR "IRQ LOCK: IRQ%d is locking the system, disabled\n", irq); - return 1; - } - } else { - desc->lck_cnt = 0; - desc->lck_pc = instruction_pointer(regs); - desc->lck_jif = jiffies; - } - return 0; -} - -static void -report_bad_irq(unsigned int irq, struct pt_regs *regs, struct irqdesc *desc, int ret) -{ - static int count = 100; - struct irqaction *action; - - if (!count || noirqdebug) - return; - - count--; - - if (ret != IRQ_HANDLED && ret != IRQ_NONE) { - printk("irq%u: bogus retval mask %x\n", irq, ret); - } else { - printk("irq%u: nobody cared\n", irq); - } - show_regs(regs); - dump_stack(); - printk(KERN_ERR "handlers:"); - action = desc->action; - do { - printk("\n" KERN_ERR "[<%p>]", action->handler); - print_symbol(" (%s)", (unsigned long)action->handler); - action = action->next; - } while (action); - printk("\n"); -} - -static int -__do_irq(unsigned int irq, struct irqaction *action, struct pt_regs *regs) -{ - unsigned int status; - int ret, retval = 0; - - spin_unlock(&irq_controller_lock); - -#ifdef CONFIG_NO_IDLE_HZ - if (!(action->flags & SA_TIMER) && system_timer->dyn_tick != NULL) { - write_seqlock(&xtime_lock); - if (system_timer->dyn_tick->state & DYN_TICK_ENABLED) - system_timer->dyn_tick->handler(irq, 0, regs); - write_sequnlock(&xtime_lock); - } #endif - - if (!(action->flags & SA_INTERRUPT)) - local_irq_enable(); - - status = 0; - do { - ret = action->handler(irq, action->dev_id, regs); - if (ret == IRQ_HANDLED) - status |= action->flags; - retval |= ret; - action = action->next; - } while (action); - - if (status & SA_SAMPLE_RANDOM) - add_interrupt_randomness(irq); - - spin_lock_irq(&irq_controller_lock); - - return retval; -} - -/* - * This is for software-decoded IRQs. The caller is expected to - * handle the ack, clear, mask and unmask issues. - */ -void -do_simple_IRQ(unsigned int irq, struct irqdesc *desc, struct pt_regs *regs) -{ - struct irqaction *action; - const unsigned int cpu = smp_processor_id(); - - desc->triggered = 1; - - kstat_cpu(cpu).irqs[irq]++; - - smp_set_running(desc); - - action = desc->action; - if (action) { - int ret = __do_irq(irq, action, regs); - if (ret != IRQ_HANDLED) - report_bad_irq(irq, regs, desc, ret); - } - - smp_clear_running(desc); -} - -/* - * Most edge-triggered IRQ implementations seem to take a broken - * approach to this. Hence the complexity. - */ -void -do_edge_IRQ(unsigned int irq, struct irqdesc *desc, struct pt_regs *regs) -{ - const unsigned int cpu = smp_processor_id(); - - desc->triggered = 1; - - /* - * If we're currently running this IRQ, or its disabled, - * we shouldn't process the IRQ. Instead, turn on the - * hardware masks. - */ - if (unlikely(desc->running || desc->disable_depth)) - goto running; - - /* - * Acknowledge and clear the IRQ, but don't mask it. - */ - desc->chip->ack(irq); - - /* - * Mark the IRQ currently in progress. 
- */ - desc->running = 1; - - kstat_cpu(cpu).irqs[irq]++; - - do { - struct irqaction *action; - - action = desc->action; - if (!action) - break; - - if (desc->pending && !desc->disable_depth) { - desc->pending = 0; - desc->chip->unmask(irq); - } - - __do_irq(irq, action, regs); - } while (desc->pending && !desc->disable_depth); - - desc->running = 0; - - /* - * If we were disabled or freed, shut down the handler. - */ - if (likely(desc->action && !check_irq_lock(desc, irq, regs))) - return; - - running: - /* - * We got another IRQ while this one was masked or - * currently running. Delay it. - */ - desc->pending = 1; - desc->chip->mask(irq); - desc->chip->ack(irq); -} - -/* - * Level-based IRQ handler. Nice and simple. - */ -void -do_level_IRQ(unsigned int irq, struct irqdesc *desc, struct pt_regs *regs) -{ - struct irqaction *action; - const unsigned int cpu = smp_processor_id(); - - desc->triggered = 1; - - /* - * Acknowledge, clear _AND_ disable the interrupt. - */ - desc->chip->ack(irq); - - if (likely(!desc->disable_depth)) { - kstat_cpu(cpu).irqs[irq]++; - - smp_set_running(desc); - - /* - * Return with this interrupt masked if no action - */ - action = desc->action; - if (action) { - int ret = __do_irq(irq, desc->action, regs); - - if (ret != IRQ_HANDLED) - report_bad_irq(irq, regs, desc, ret); - - if (likely(!desc->disable_depth && - !check_irq_lock(desc, irq, regs))) - desc->chip->unmask(irq); - } - - smp_clear_running(desc); } + return 0; } -static void do_pending_irqs(struct pt_regs *regs) -{ - struct list_head head, *l, *n; - - do { - struct irqdesc *desc; - - /* - * First, take the pending interrupts off the list. - * The act of calling the handlers may add some IRQs - * back onto the list. - */ - head = irq_pending; - INIT_LIST_HEAD(&irq_pending); - head.next->prev = &head; - head.prev->next = &head; - - /* - * Now run each entry. We must delete it from our - * list before calling the handler. - */ - list_for_each_safe(l, n, &head) { - desc = list_entry(l, struct irqdesc, pend); - list_del_init(&desc->pend); - desc_handle_irq(desc - irq_desc, desc, regs); - } - - /* - * The list must be empty. - */ - BUG_ON(!list_empty(&head)); - } while (!list_empty(&irq_pending)); -} +/* Handle bad interrupts */ +static struct irq_desc bad_irq = { + .handler = &no_irq_type, + .lock = RAW_SPIN_LOCK_UNLOCKED +}; /* - * do_IRQ handles all hardware IRQ's. Decoded IRQs should not + * asm_do_IRQ handles all hardware IRQ's. Decoded IRQs should not * come via this function. Instead, they should provide their * own 'handler' */ -asmlinkage void asm_do_IRQ(unsigned int irq, struct pt_regs *regs) +asmlinkage notrace void asm_do_IRQ(unsigned int irq, struct pt_regs *regs) { struct irqdesc *desc = irq_desc + irq; + trace_special(instruction_pointer(regs), irq, 0); + /* * Some hardware gives randomly wrong interrupts. Rather * than crashing, do something sensible. */ if (irq >= NR_IRQS) - desc = &bad_irq_desc; + desc = &bad_irq; irq_enter(); - spin_lock(&irq_controller_lock); - desc_handle_irq(irq, desc, regs); - - /* - * Now re-run any pending interrupts. 
- */ - if (!list_empty(&irq_pending)) - do_pending_irqs(regs); - irq_finish(irq); + desc_handle_irq(irq, desc, regs); - spin_unlock(&irq_controller_lock); irq_exit(); } -void __set_irq_handler(unsigned int irq, irq_handler_t handle, int is_chained) +void __set_irq_handler(unsigned int irq, struct irq_type *type, int is_chained) { struct irqdesc *desc; unsigned long flags; if (irq >= NR_IRQS) { - printk(KERN_ERR "Trying to install handler for IRQ%d\n", irq); + printk(KERN_ERR "Trying to install type control for IRQ%d\n", irq); return; } - if (handle == NULL) - handle = do_bad_IRQ; - desc = irq_desc + irq; - if (is_chained && desc->chip == &bad_chip) - printk(KERN_WARNING "Trying to install chained handler for IRQ%d\n", irq); - - spin_lock_irqsave(&irq_controller_lock, flags); - if (handle == do_bad_IRQ) { - desc->chip->mask(irq); - desc->chip->ack(irq); - desc->disable_depth = 1; - } - desc->handle = handle; - if (handle != do_bad_IRQ && is_chained) { - desc->valid = 0; - desc->probe_ok = 0; - desc->disable_depth = 0; - desc->chip->unmask(irq); + /* Uninstall ? */ + if (type == NULL || type == &no_irq_type) { + spin_lock_irqsave(&desc->lock, flags); + if (desc->chip) { + desc->chip->mask(irq); + desc->chip->ack(irq); + } + desc->depth = 1; + spin_unlock_irqrestore(&desc->lock, flags); } - spin_unlock_irqrestore(&irq_controller_lock, flags); -} - -void set_irq_chip(unsigned int irq, struct irqchip *chip) -{ - struct irqdesc *desc; - unsigned long flags; - if (irq >= NR_IRQS) { - printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq); + /* Install the irq_type */ + if (generic_set_irq_type(irq, type)) return; - } - - if (chip == NULL) - chip = &bad_chip; - - desc = irq_desc + irq; - spin_lock_irqsave(&irq_controller_lock, flags); - desc->chip = chip; - spin_unlock_irqrestore(&irq_controller_lock, flags); -} - -int set_irq_type(unsigned int irq, unsigned int type) -{ - struct irqdesc *desc; - unsigned long flags; - int ret = -ENXIO; - if (irq >= NR_IRQS) { - printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); - return -ENODEV; - } + spin_lock_irqsave(&desc->lock, flags); + if (is_chained && (desc->handler == &no_irq_type || !desc->chip)) + printk(KERN_WARNING "Trying to install chained interrupt type for IRQ%d\n", irq); - desc = irq_desc + irq; - if (desc->chip->set_type) { - spin_lock_irqsave(&irq_controller_lock, flags); - ret = desc->chip->set_type(irq, type); - spin_unlock_irqrestore(&irq_controller_lock, flags); + if (type != NULL && is_chained) { + desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; + desc->depth = 0; + if (desc->chip) + desc->chip->unmask(irq); } - - return ret; + spin_unlock_irqrestore(&desc->lock, flags); } -EXPORT_SYMBOL(set_irq_type); void set_irq_flags(unsigned int irq, unsigned int iflags) { @@ -646,421 +174,30 @@ void set_irq_flags(unsigned int irq, uns } desc = irq_desc + irq; - spin_lock_irqsave(&irq_controller_lock, flags); - desc->valid = (iflags & IRQF_VALID) != 0; - desc->probe_ok = (iflags & IRQF_PROBE) != 0; - desc->noautoenable = (iflags & IRQF_NOAUTOEN) != 0; - spin_unlock_irqrestore(&irq_controller_lock, flags); -} - -int setup_irq(unsigned int irq, struct irqaction *new) -{ - int shared = 0; - struct irqaction *old, **p; - unsigned long flags; - struct irqdesc *desc; - - /* - * Some drivers like serial.c use request_irq() heavily, - * so we have to be careful not to interfere with a - * running system. - */ - if (new->flags & SA_SAMPLE_RANDOM) { - /* - * This function might sleep, we want to call it first, - * outside of the atomic block. 
- * Yes, this might clear the entropy pool if the wrong - * driver is attempted to be loaded, without actually - * installing a new handler, but is this really a problem, - * only the sysadmin is able to do this. - */ - rand_initialize_irq(irq); - } - - /* - * The following block of code has to be executed atomically - */ - desc = irq_desc + irq; - spin_lock_irqsave(&irq_controller_lock, flags); - p = &desc->action; - if ((old = *p) != NULL) { - /* - * Can't share interrupts unless both agree to and are - * the same type. - */ - if (!(old->flags & new->flags & SA_SHIRQ) || - (~old->flags & new->flags) & SA_TRIGGER_MASK) { - spin_unlock_irqrestore(&irq_controller_lock, flags); - return -EBUSY; - } - - /* add new interrupt at end of irq queue */ - do { - p = &old->next; - old = *p; - } while (old); - shared = 1; - } - - *p = new; - - if (!shared) { - desc->probing = 0; - desc->running = 0; - desc->pending = 0; - desc->disable_depth = 1; - - if (new->flags & SA_TRIGGER_MASK && - desc->chip->set_type) { - unsigned int type = new->flags & SA_TRIGGER_MASK; - desc->chip->set_type(irq, type); - } - - if (!desc->noautoenable) { - desc->disable_depth = 0; - desc->chip->unmask(irq); - } - } - - spin_unlock_irqrestore(&irq_controller_lock, flags); - return 0; -} - -/** - * request_irq - allocate an interrupt line - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs - * @irqflags: Interrupt type flags - * @devname: An ascii name for the claiming device - * @dev_id: A cookie passed back to the handler function - * - * This call allocates interrupt resources and enables the - * interrupt line and IRQ handling. From the point this - * call is made your handler function may be invoked. Since - * your handler function must clear any interrupt the board - * raises, you must take care both to initialise your hardware - * and to set up the interrupt handler in the right order. - * - * Dev_id must be globally unique. Normally the address of the - * device data structure is used as the cookie. Since the handler - * receives this value it makes sense to use it. - * - * If your interrupt is shared you must pass a non NULL dev_id - * as this is required when freeing the interrupt. - * - * Flags: - * - * SA_SHIRQ Interrupt is shared - * - * SA_INTERRUPT Disable local interrupts while processing - * - * SA_SAMPLE_RANDOM The interrupt can be used for entropy - * - */ -int request_irq(unsigned int irq, irqreturn_t (*handler)(int, void *, struct pt_regs *), - unsigned long irq_flags, const char * devname, void *dev_id) -{ - unsigned long retval; - struct irqaction *action; - - if (irq >= NR_IRQS || !irq_desc[irq].valid || !handler || - (irq_flags & SA_SHIRQ && !dev_id)) - return -EINVAL; - - action = (struct irqaction *)kmalloc(sizeof(struct irqaction), GFP_KERNEL); - if (!action) - return -ENOMEM; - - action->handler = handler; - action->flags = irq_flags; - cpus_clear(action->mask); - action->name = devname; - action->next = NULL; - action->dev_id = dev_id; - - retval = setup_irq(irq, action); - - if (retval) - kfree(action); - return retval; -} - -EXPORT_SYMBOL(request_irq); - -/** - * free_irq - free an interrupt - * @irq: Interrupt line to free - * @dev_id: Device identity to free - * - * Remove an interrupt handler. The handler is removed and if the - * interrupt line is no longer in use by any driver it is disabled. - * On a shared IRQ the caller must ensure the interrupt is disabled - * on the card it drives before calling this function. 
- * - * This function must not be called from interrupt context. - */ -void free_irq(unsigned int irq, void *dev_id) -{ - struct irqaction * action, **p; - unsigned long flags; - - if (irq >= NR_IRQS || !irq_desc[irq].valid) { - printk(KERN_ERR "Trying to free IRQ%d\n",irq); - dump_stack(); - return; - } - - spin_lock_irqsave(&irq_controller_lock, flags); - for (p = &irq_desc[irq].action; (action = *p) != NULL; p = &action->next) { - if (action->dev_id != dev_id) - continue; - - /* Found it - now free it */ - *p = action->next; - break; - } - spin_unlock_irqrestore(&irq_controller_lock, flags); - - if (!action) { - printk(KERN_ERR "Trying to free free IRQ%d\n",irq); - dump_stack(); - } else { - synchronize_irq(irq); - kfree(action); - } -} - -EXPORT_SYMBOL(free_irq); - -static DECLARE_MUTEX(probe_sem); - -/* Start the interrupt probing. Unlike other architectures, - * we don't return a mask of interrupts from probe_irq_on, - * but return the number of interrupts enabled for the probe. - * The interrupts which have been enabled for probing is - * instead recorded in the irq_desc structure. - */ -unsigned long probe_irq_on(void) -{ - unsigned int i, irqs = 0; - unsigned long delay; - - down(&probe_sem); - - /* - * first snaffle up any unassigned but - * probe-able interrupts - */ - spin_lock_irq(&irq_controller_lock); - for (i = 0; i < NR_IRQS; i++) { - if (!irq_desc[i].probe_ok || irq_desc[i].action) - continue; - - irq_desc[i].probing = 1; - irq_desc[i].triggered = 0; - if (irq_desc[i].chip->set_type) - irq_desc[i].chip->set_type(i, IRQT_PROBE); - irq_desc[i].chip->unmask(i); - irqs += 1; - } - spin_unlock_irq(&irq_controller_lock); - - /* - * wait for spurious interrupts to mask themselves out again - */ - for (delay = jiffies + HZ/10; time_before(jiffies, delay); ) - /* min 100ms delay */; - - /* - * now filter out any obviously spurious interrupts - */ - spin_lock_irq(&irq_controller_lock); - for (i = 0; i < NR_IRQS; i++) { - if (irq_desc[i].probing && irq_desc[i].triggered) { - irq_desc[i].probing = 0; - irqs -= 1; - } - } - spin_unlock_irq(&irq_controller_lock); - - return irqs; -} - -EXPORT_SYMBOL(probe_irq_on); - -unsigned int probe_irq_mask(unsigned long irqs) -{ - unsigned int mask = 0, i; - - spin_lock_irq(&irq_controller_lock); - for (i = 0; i < 16 && i < NR_IRQS; i++) - if (irq_desc[i].probing && irq_desc[i].triggered) - mask |= 1 << i; - spin_unlock_irq(&irq_controller_lock); - - up(&probe_sem); - - return mask; -} -EXPORT_SYMBOL(probe_irq_mask); - -/* - * Possible return values: - * >= 0 - interrupt number - * -1 - no interrupt/many interrupts - */ -int probe_irq_off(unsigned long irqs) -{ - unsigned int i; - int irq_found = NO_IRQ; - - /* - * look at the interrupts, and find exactly one - * that we were probing has been triggered - */ - spin_lock_irq(&irq_controller_lock); - for (i = 0; i < NR_IRQS; i++) { - if (irq_desc[i].probing && - irq_desc[i].triggered) { - if (irq_found != NO_IRQ) { - irq_found = NO_IRQ; - goto out; - } - irq_found = i; - } - } - - if (irq_found == -1) - irq_found = NO_IRQ; -out: - spin_unlock_irq(&irq_controller_lock); - - up(&probe_sem); - - return irq_found; -} - -EXPORT_SYMBOL(probe_irq_off); - -#ifdef CONFIG_SMP -static void route_irq(struct irqdesc *desc, unsigned int irq, unsigned int cpu) -{ - pr_debug("IRQ%u: moving from cpu%u to cpu%u\n", irq, desc->cpu, cpu); - - spin_lock_irq(&irq_controller_lock); - desc->cpu = cpu; - desc->chip->set_cpu(desc, irq, cpu); - spin_unlock_irq(&irq_controller_lock); -} - -#ifdef CONFIG_PROC_FS -static int 
-irq_affinity_read_proc(char *page, char **start, off_t off, int count, - int *eof, void *data) -{ - struct irqdesc *desc = irq_desc + ((int)data); - int len = cpumask_scnprintf(page, count, desc->affinity); - - if (count - len < 2) - return -EINVAL; - page[len++] = '\n'; - page[len] = '\0'; - - return len; -} - -static int -irq_affinity_write_proc(struct file *file, const char __user *buffer, - unsigned long count, void *data) -{ - unsigned int irq = (unsigned int)data; - struct irqdesc *desc = irq_desc + irq; - cpumask_t affinity, tmp; - int ret = -EIO; - - if (!desc->chip->set_cpu) - goto out; - - ret = cpumask_parse(buffer, count, affinity); - if (ret) - goto out; - - cpus_and(tmp, affinity, cpu_online_map); - if (cpus_empty(tmp)) { - ret = -EINVAL; - goto out; - } - - desc->affinity = affinity; - route_irq(desc, irq, first_cpu(tmp)); - ret = count; - - out: - return ret; -} -#endif -#endif - -void __init init_irq_proc(void) -{ -#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) - struct proc_dir_entry *dir; - int irq; - - dir = proc_mkdir("irq", NULL); - if (!dir) - return; - - for (irq = 0; irq < NR_IRQS; irq++) { - struct proc_dir_entry *entry; - struct irqdesc *desc; - char name[16]; - - desc = irq_desc + irq; - memset(name, 0, sizeof(name)); - snprintf(name, sizeof(name) - 1, "%u", irq); - - desc->procdir = proc_mkdir(name, dir); - if (!desc->procdir) - continue; - - entry = create_proc_entry("smp_affinity", 0600, desc->procdir); - if (entry) { - entry->nlink = 1; - entry->data = (void *)irq; - entry->read_proc = irq_affinity_read_proc; - entry->write_proc = irq_affinity_write_proc; - } - } -#endif + spin_lock_irqsave(&desc->lock, flags); + desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; + if (iflags & IRQF_VALID) + desc->status &= ~IRQ_NOREQUEST; + if (iflags & IRQF_PROBE) + desc->status &= ~IRQ_NOPROBE; + spin_unlock_irqrestore(&desc->lock, flags); } void __init init_IRQ(void) { - struct irqdesc *desc; + extern void init_dma(void); int irq; + for (irq = 0; irq < NR_IRQS; irq++) + irq_desc[irq].status |= IRQ_NOREQUEST; + #ifdef CONFIG_SMP bad_irq_desc.affinity = CPU_MASK_ALL; bad_irq_desc.cpu = smp_processor_id(); #endif - - for (irq = 0, desc = irq_desc; irq < NR_IRQS; irq++, desc++) { - *desc = bad_irq_desc; - INIT_LIST_HEAD(&desc->pend); - } - init_arch_irq(); } -static int __init noirqdebug_setup(char *str) -{ - noirqdebug = 1; - return 1; -} - -__setup("noirqdebug", noirqdebug_setup); - #ifdef CONFIG_HOTPLUG_CPU /* * The CPU has been marked offline. Migrate IRQs off this CPU. If Index: linux-2.6.16/arch/arm/kernel/process.c =================================================================== --- linux-2.6.16.orig/arch/arm/kernel/process.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/kernel/process.c 2006-10-19 16:52:29.000000000 +0200 @@ -124,8 +124,8 @@ void cpu_idle(void) while (!need_resched()) idle(); leds_event(led_idle_end); - preempt_enable_no_resched(); - schedule(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); } } Index: linux-2.6.16/arch/arm/kernel/semaphore.c =================================================================== --- linux-2.6.16.orig/arch/arm/kernel/semaphore.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/kernel/semaphore.c 2006-10-19 16:52:29.000000000 +0200 @@ -49,14 +49,16 @@ * we cannot lose wakeup events. 
*/ -void __up(struct semaphore *sem) +fastcall void __attribute_used__ __compat_up(struct compat_semaphore *sem) { wake_up(&sem->wait); } +EXPORT_SYMBOL(__compat_up); + static DEFINE_SPINLOCK(semaphore_lock); -void __sched __down(struct semaphore * sem) +fastcall void __attribute_used__ __sched __compat_down(struct compat_semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -89,7 +91,9 @@ void __sched __down(struct semaphore * s wake_up(&sem->wait); } -int __sched __down_interruptible(struct semaphore * sem) +EXPORT_SYMBOL(__compat_down); + +fastcall int __attribute_used__ __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -140,6 +144,8 @@ int __sched __down_interruptible(struct return retval; } +EXPORT_SYMBOL(__compat_down_interruptible); + /* * Trylock failed - make sure we correct for * having decremented the count. @@ -148,7 +154,7 @@ int __sched __down_interruptible(struct * single "cmpxchg" without failure cases, * but then it wouldn't work on a 386. */ -int __down_trylock(struct semaphore * sem) +fastcall int __attribute_used__ __compat_down_trylock(struct compat_semaphore * sem) { int sleepers; unsigned long flags; @@ -168,6 +174,15 @@ int __down_trylock(struct semaphore * se return 1; } +EXPORT_SYMBOL(__compat_down_trylock); + +fastcall int compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} + +EXPORT_SYMBOL(compat_sem_is_locked); + /* * The semaphore operations have a special calling sequence that * allow us to do a simpler in-line version of them. These routines @@ -185,7 +200,7 @@ asm(" .section .sched.text,\"ax\",%progb __down_failed: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __down \n\ + bl __compat_down \n\ ldmfd sp!, {r0 - r4, pc} \n\ \n\ .align 5 \n\ @@ -193,7 +208,7 @@ __down_failed: \n\ __down_interruptible_failed: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __down_interruptible \n\ + bl __compat_down_interruptible \n\ mov ip, r0 \n\ ldmfd sp!, {r0 - r4, pc} \n\ \n\ @@ -202,7 +217,7 @@ __down_interruptible_failed: \n\ __down_trylock_failed: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __down_trylock \n\ + bl __compat_down_trylock \n\ mov ip, r0 \n\ ldmfd sp!, {r0 - r4, pc} \n\ \n\ @@ -211,7 +226,7 @@ __down_trylock_failed: \n\ __up_wakeup: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __up \n\ + bl __compat_up \n\ ldmfd sp!, {r0 - r4, pc} \n\ "); Index: linux-2.6.16/arch/arm/kernel/signal.c =================================================================== --- linux-2.6.16.orig/arch/arm/kernel/signal.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/kernel/signal.c 2006-10-19 16:52:29.000000000 +0200 @@ -635,6 +635,14 @@ static int do_signal(sigset_t *oldset, s siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif + /* * We want the common case to go fast, which * is why we may in certain cases get here from Index: linux-2.6.16/arch/arm/kernel/smp.c =================================================================== --- linux-2.6.16.orig/arch/arm/kernel/smp.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/kernel/smp.c 2006-10-19 16:52:29.000000000 +0200 @@ -519,7 +519,7 @@ static void ipi_call_function(unsigned i cpu_clear(cpu, data->unfinished); } -static DEFINE_SPINLOCK(stop_lock); +static DEFINE_RAW_SPINLOCK(stop_lock); /* 
* ipi_cpu_stop - handle IPI from smp_send_stop() Index: linux-2.6.16/arch/arm/kernel/traps.c =================================================================== --- linux-2.6.16.orig/arch/arm/kernel/traps.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/kernel/traps.c 2006-10-19 16:52:29.000000000 +0200 @@ -177,6 +177,10 @@ void dump_stack(void) { #ifdef CONFIG_DEBUG_ERRORS __backtrace(); + print_traces(current); +#ifdef CONFIG_DEBUG_MUTEXES + show_held_locks(current); +#endif #endif } @@ -217,7 +221,7 @@ static void __die(const char *str, int e } } -DEFINE_SPINLOCK(die_lock); +DEFINE_RAW_SPINLOCK(die_lock); /* * This function is protected against re-entrancy. @@ -256,7 +260,7 @@ void notify_die(const char *str, struct } static LIST_HEAD(undef_hook); -static DEFINE_SPINLOCK(undef_lock); +static DEFINE_RAW_SPINLOCK(undef_lock); void register_undef_hook(struct undef_hook *hook) { Index: linux-2.6.16/arch/arm/mach-at91rm9200/gpio.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-at91rm9200/gpio.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-at91rm9200/gpio.c 2006-10-19 16:52:29.000000000 +0200 @@ -261,7 +261,7 @@ static void gpio_irq_handler(unsigned ir void __iomem *pio; u32 isr; - pio = (void __force __iomem *) desc->chipdata; + pio = (void __force __iomem *) desc->chip->chip_data; /* temporarily mask (level sensitive) parent IRQ */ desc->chip->ack(irq); @@ -270,12 +270,12 @@ static void gpio_irq_handler(unsigned ir if (!isr) break; - pin = (unsigned) desc->data; + pin = (unsigned) desc->handler_data; gpio = &irq_desc[pin]; while (isr) { if (isr & 1) { - if (unlikely(gpio->disable_depth)) { + if (unlikely(gpio->depth)) { /* * The core ARM interrupt handler lazily disables IRQs so * another IRQ must be generated before it actually gets @@ -284,7 +284,7 @@ static void gpio_irq_handler(unsigned ir gpio_irq_mask(pin); } else - gpio->handle(pin, gpio, regs); + desc_handle_irq(pin, gpio, regs); } pin++; gpio++; @@ -295,6 +295,8 @@ static void gpio_irq_handler(unsigned ir /* now it may re-trigger */ } +static DEFINE_IRQ_CHAINED_TYPE(gpio_irq_handler); + /* call this from board-specific init_irq */ void __init at91_gpio_irq_setup(unsigned banks) { Index: linux-2.6.16/arch/arm/mach-at91rm9200/time.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-at91rm9200/time.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-at91rm9200/time.c 2006-10-19 16:52:29.000000000 +0200 @@ -22,13 +22,13 @@ #include #include #include +#include #include #include #include #include #include -#include #include /* Index: linux-2.6.16/arch/arm/mach-clps711x/time.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-clps711x/time.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-clps711x/time.c 2006-10-19 16:52:29.000000000 +0200 @@ -19,6 +19,7 @@ #include #include #include +#include #include #include Index: linux-2.6.16/arch/arm/mach-clps7500/core.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-clps7500/core.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-clps7500/core.c 2006-10-19 16:52:29.000000000 +0200 @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include Index: linux-2.6.16/arch/arm/mach-footbridge/dc21285-timer.c =================================================================== --- 
linux-2.6.16.orig/arch/arm/mach-footbridge/dc21285-timer.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-footbridge/dc21285-timer.c 2006-10-19 16:52:29.000000000 +0200 @@ -6,6 +6,7 @@ */ #include #include +#include #include Index: linux-2.6.16/arch/arm/mach-footbridge/isa-irq.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-footbridge/isa-irq.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-footbridge/isa-irq.c 2006-10-19 16:52:29.000000000 +0200 @@ -102,6 +102,17 @@ static struct irqaction irq_cascade = { static struct resource pic1_resource = { "pic1", 0x20, 0x3f }; static struct resource pic2_resource = { "pic2", 0xa0, 0xbf }; +static DEFINE_IRQ_CHAINED_TYPE(isa_irq_handler); + +static unsigned int startup_irq_disabled(unsigned int irq) +{ + return 0; +} + +/* Interrupt type for irqs which must not be + * automatically enabled in request_irq */ +static struct irq_type level_type_nostart; + void __init isa_init_irq(unsigned int host_irq) { unsigned int irq; @@ -159,9 +170,11 @@ void __init isa_init_irq(unsigned int ho * There appears to be a missing pull-up * resistor on this line. */ - if (machine_is_netwinder()) - set_irq_flags(_ISA_IRQ(11), IRQF_VALID | - IRQF_PROBE | IRQF_NOAUTOEN); + if (machine_is_netwinder()) { + level_type_nostart = default_level_type; + level_type_nostart.startup = startup_irq_disabled; + set_irq_handler(_ISA_IRQ(11), &level_type_nostart); + } } } Index: linux-2.6.16/arch/arm/mach-footbridge/isa-timer.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-footbridge/isa-timer.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-footbridge/isa-timer.c 2006-10-19 16:52:29.000000000 +0200 @@ -6,6 +6,7 @@ */ #include #include +#include #include #include Index: linux-2.6.16/arch/arm/mach-footbridge/netwinder-hw.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-footbridge/netwinder-hw.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-footbridge/netwinder-hw.c 2006-10-19 16:52:29.000000000 +0200 @@ -68,7 +68,7 @@ static inline void wb977_ww(int reg, int /* * This is a lock for accessing ports GP1_IO_BASE and GP2_IO_BASE */ -DEFINE_SPINLOCK(gpio_lock); +DEFINE_RAW_SPINLOCK(gpio_lock); static unsigned int current_gpio_op; static unsigned int current_gpio_io; Index: linux-2.6.16/arch/arm/mach-footbridge/netwinder-leds.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-footbridge/netwinder-leds.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-footbridge/netwinder-leds.c 2006-10-19 16:52:29.000000000 +0200 @@ -33,7 +33,7 @@ static char led_state; static char hw_led_state; static DEFINE_SPINLOCK(leds_lock); -extern spinlock_t gpio_lock; +extern raw_spinlock_t gpio_lock; static void netwinder_leds_event(led_event_t evt) { Index: linux-2.6.16/arch/arm/mach-h720x/common.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-h720x/common.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-h720x/common.c 2006-10-19 16:52:29.000000000 +0200 @@ -163,6 +163,11 @@ h720x_gpiod_demux_handler(unsigned int i h720x_gpio_handler(mask, irq, desc, regs); } +static DEFINE_IRQ_CHAINED_TYPE(h720x_gpioa_demux_handler); +static DEFINE_IRQ_CHAINED_TYPE(h720x_gpiob_demux_handler); +static 
DEFINE_IRQ_CHAINED_TYPE(h720x_gpioc_demux_handler); +static DEFINE_IRQ_CHAINED_TYPE(h720x_gpiod_demux_handler); + #ifdef CONFIG_CPU_H7202 static void h720x_gpioe_demux_handler(unsigned int irq_unused, struct irqdesc *desc, @@ -175,6 +180,7 @@ h720x_gpioe_demux_handler(unsigned int i IRQDBG("%s mask: 0x%08x irq: %d\n",__FUNCTION__,mask,irq); h720x_gpio_handler(mask, irq, desc, regs); } +static DEFINE_IRQ_CHAINED_TYPE(h720x_gpioe_demux_handler); #endif static struct irqchip h720x_global_chip = { Index: linux-2.6.16/arch/arm/mach-h720x/cpu-h7202.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-h720x/cpu-h7202.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-h720x/cpu-h7202.c 2006-10-19 16:52:29.000000000 +0200 @@ -175,6 +175,8 @@ static struct irqaction h7202_timer_irq .handler = h7202_timer_interrupt, }; +static DEFINE_IRQ_CHAINED_TYPE(h7202_timerx_demux_handler); + /* * Setup TIMER0 as system timer */ Index: linux-2.6.16/arch/arm/mach-imx/irq.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-imx/irq.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-imx/irq.c 2006-10-19 16:52:29.000000000 +0200 @@ -217,6 +217,11 @@ static struct irqchip imx_gpio_chip = { .set_type = imx_gpio_irq_type, }; +static DEFINE_IRQ_CHAINED_TYPE(imx_gpioa_demux_handler); +static DEFINE_IRQ_CHAINED_TYPE(imx_gpiob_demux_handler); +static DEFINE_IRQ_CHAINED_TYPE(imx_gpioc_demux_handler); +static DEFINE_IRQ_CHAINED_TYPE(imx_gpiod_demux_handler); + void __init imx_init_irq(void) { Index: linux-2.6.16/arch/arm/mach-imx/time.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-imx/time.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-imx/time.c 2006-10-19 16:52:29.000000000 +0200 @@ -13,6 +13,7 @@ #include #include #include +#include #include #include Index: linux-2.6.16/arch/arm/mach-integrator/core.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-integrator/core.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-integrator/core.c 2006-10-19 16:52:29.000000000 +0200 @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -117,7 +118,7 @@ arch_initcall(integrator_init); #define CM_CTRL IO_ADDRESS(INTEGRATOR_HDR_BASE) + INTEGRATOR_HDR_CTRL_OFFSET -static DEFINE_SPINLOCK(cm_lock); +static DEFINE_RAW_SPINLOCK(cm_lock); /** * cm_control - update the CM_CTRL register. 
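A note on the DEFINE_RAW_SPINLOCK conversions in the hunks above and below: under PREEMPT_RT a plain spinlock_t may be substituted by a sleeping rtmutex, so locks taken with interrupts hard-disabled or from the lowest-level IPI/die/IRQ-flow paths are declared raw_spinlock_t to keep true spin-and-disable-IRQs semantics. A minimal sketch of the resulting usage pattern, assuming this tree's type-dispatching spin_lock_*() API (the same pattern the new i8253_lock further down relies on); the lock and function names here are illustrative only, not part of the patch:

	#include <linux/spinlock.h>

	/* stays a spinning lock even on PREEMPT_RT (hypothetical name): */
	static DEFINE_RAW_SPINLOCK(my_hw_lock);
	static unsigned int my_shadow_reg;	/* hypothetical shared state */

	static void my_hw_write(unsigned int val)
	{
		unsigned long flags;

		/* on a raw lock this really disables interrupts and never sleeps: */
		spin_lock_irqsave(&my_hw_lock, flags);
		my_shadow_reg = val;
		/* ... program the memory-mapped register here ... */
		spin_unlock_irqrestore(&my_hw_lock, flags);
	}

Code that can tolerate preemption keeps using spinlock_t unchanged, which is why only a handful of locks per file are converted.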
Index: linux-2.6.16/arch/arm/mach-integrator/pci_v3.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-integrator/pci_v3.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-integrator/pci_v3.c 2006-10-19 16:52:29.000000000 +0200 @@ -163,7 +163,7 @@ * 7:2 register number * */ -static DEFINE_SPINLOCK(v3_lock); +static DEFINE_RAW_SPINLOCK(v3_lock); #define PCI_BUS_NONMEM_START 0x00000000 #define PCI_BUS_NONMEM_SIZE SZ_256M Index: linux-2.6.16/arch/arm/mach-integrator/platsmp.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-integrator/platsmp.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-integrator/platsmp.c 2006-10-19 16:52:29.000000000 +0200 @@ -31,7 +31,7 @@ extern void integrator_secondary_startup volatile int __cpuinitdata pen_release = -1; unsigned long __cpuinitdata phys_pen_release = 0; -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); void __cpuinit platform_secondary_init(unsigned int cpu) { Index: linux-2.6.16/arch/arm/mach-ixp2000/core.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-ixp2000/core.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-ixp2000/core.c 2006-10-19 16:52:29.000000000 +0200 @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -288,7 +289,7 @@ void gpio_line_config(int line, int dire local_irq_save(flags); if (direction == GPIO_OUT) { - irq_desc[line + IRQ_IXP2000_GPIO0].valid = 0; + set_irq_flags(line + IRQ_IXP2000_GPIO0, 0); /* if it's an output, it ain't an interrupt anymore */ GPIO_IRQ_falling_edge &= ~(1 << line); @@ -354,8 +355,7 @@ static int ixp2000_GPIO_irq_type(unsigne /* * Finally, mark the corresponding IRQ as valid. 
*/ - irq_desc[irq].valid = 1; - + set_irq_flags(irq, IRQF_VALID); return 0; } @@ -414,7 +414,7 @@ static void ixp2000_err_irq_handler(unsi for(i = 31; i >= 0; i--) { if(status & (1 << i)) { desc = irq_desc + IRQ_IXP2000_DRAM0_MIN_ERR + i; - desc->handle(IRQ_IXP2000_DRAM0_MIN_ERR + i, desc, regs); + desc_handle_irq(IRQ_IXP2000_DRAM0_MIN_ERR + i, desc, regs); } } } @@ -459,6 +459,9 @@ static struct irqchip ixp2000_irq_chip = .unmask = ixp2000_irq_unmask }; +static DEFINE_IRQ_CHAINED_TYPE(ixp2000_GPIO_irq_handler); +static DEFINE_IRQ_CHAINED_TYPE(ixp2000_err_irq_handler); + void __init ixp2000_init_irq(void) { int irq; Index: linux-2.6.16/arch/arm/mach-ixp2000/ixdp2x00.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-ixp2000/ixdp2x00.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-ixp2000/ixdp2x00.c 2006-10-19 16:52:29.000000000 +0200 @@ -146,6 +146,8 @@ static struct irqchip ixdp2x00_cpld_irq_ .unmask = ixdp2x00_irq_unmask }; +static DEFINE_IRQ_CHAINED_TYPE(ixdp2x00_irq_handler); + void ixdp2x00_init_irq(volatile unsigned long *stat_reg, volatile unsigned long *mask_reg, unsigned long nr_irqs) { unsigned int irq; @@ -168,7 +170,7 @@ void ixdp2x00_init_irq(volatile unsigned } /* Hook into PCI interrupt */ - set_irq_chained_handler(IRQ_IXP2000_PCIB, &ixdp2x00_irq_handler); + set_irq_chained_handler(IRQ_IXP2000_PCIB, ixdp2x00_irq_handler); } /************************************************************************* Index: linux-2.6.16/arch/arm/mach-ixp2000/ixdp2x01.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-ixp2000/ixdp2x01.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-ixp2000/ixdp2x01.c 2006-10-19 16:52:29.000000000 +0200 @@ -95,6 +95,8 @@ static struct irqchip ixdp2x01_irq_chip .unmask = ixdp2x01_irq_unmask }; +static DEFINE_IRQ_CHAINED_TYPE(ixdp2x01_irq_handler); + /* * We only do anything if we are the master NPU on the board. * The slave NPU only has the ethernet chip going directly to @@ -127,7 +129,7 @@ void __init ixdp2x01_init_irq(void) } /* Hook into PCI interrupts */ - set_irq_chained_handler(IRQ_IXP2000_PCIB, &ixdp2x01_irq_handler); + set_irq_chained_handler(IRQ_IXP2000_PCIB, ixdp2x01_irq_handler); } Index: linux-2.6.16/arch/arm/mach-ixp4xx/common-pci.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-ixp4xx/common-pci.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-ixp4xx/common-pci.c 2006-10-19 16:52:29.000000000 +0200 @@ -53,7 +53,7 @@ unsigned long ixp4xx_pci_reg_base = 0; * these transactions are atomic or we will end up * with corrupt data on the bus or in a driver. 
*/ -static DEFINE_SPINLOCK(ixp4xx_pci_lock); +static DEFINE_RAW_SPINLOCK(ixp4xx_pci_lock); /* * Read from PCI config space Index: linux-2.6.16/arch/arm/mach-ixp4xx/coyote-pci.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-ixp4xx/coyote-pci.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-ixp4xx/coyote-pci.c 2006-10-19 16:52:29.000000000 +0200 @@ -17,6 +17,7 @@ #include #include #include +#include #include #include Index: linux-2.6.16/arch/arm/mach-ixp4xx/ixdp425-pci.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-ixp4xx/ixdp425-pci.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-ixp4xx/ixdp425-pci.c 2006-10-19 16:52:29.000000000 +0200 @@ -16,6 +16,7 @@ #include #include +#include #include #include #include Index: linux-2.6.16/arch/arm/mach-ixp4xx/ixdpg425-pci.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-ixp4xx/ixdpg425-pci.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-ixp4xx/ixdpg425-pci.c 2006-10-19 16:52:29.000000000 +0200 @@ -16,10 +16,10 @@ #include #include #include +#include #include #include -#include #include Index: linux-2.6.16/arch/arm/mach-ixp4xx/nas100d-pci.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-ixp4xx/nas100d-pci.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-ixp4xx/nas100d-pci.c 2006-10-19 16:52:29.000000000 +0200 @@ -18,6 +18,7 @@ #include #include #include +#include #include #include Index: linux-2.6.16/arch/arm/mach-ixp4xx/nas100d-power.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-ixp4xx/nas100d-power.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-ixp4xx/nas100d-power.c 2006-10-19 16:52:29.000000000 +0200 @@ -20,6 +20,7 @@ #include #include #include +#include #include Index: linux-2.6.16/arch/arm/mach-l7200/core.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-l7200/core.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-l7200/core.c 2006-10-19 16:52:29.000000000 +0200 @@ -7,6 +7,7 @@ */ #include #include +#include #include #include Index: linux-2.6.16/arch/arm/mach-lh7a40x/arch-kev7a400.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-lh7a40x/arch-kev7a400.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-lh7a40x/arch-kev7a400.c 2006-10-19 16:52:29.000000000 +0200 @@ -81,6 +81,8 @@ static void kev7a400_cpld_handler (unsig } } +static DEFINE_IRQ_CHAINED_TYPE(kev7a400_cpld_handler); + void __init lh7a40x_init_board_irq (void) { int irq; Index: linux-2.6.16/arch/arm/mach-lh7a40x/arch-lpd7a40x.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-lh7a40x/arch-lpd7a40x.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-lh7a40x/arch-lpd7a40x.c 2006-10-19 16:52:29.000000000 +0200 @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -173,6 +174,7 @@ static void lpd7a40x_cpld_handler (unsig desc->chip->unmask (irq); /* Level-triggered need this */ } +static DEFINE_IRQ_CHAINED_TYPE(lpd7a40x_cpld_handler); void __init lh7a40x_init_board_irq (void) { Index: linux-2.6.16/arch/arm/mach-lh7a40x/irq-kev7a400.c 
=================================================================== --- linux-2.6.16.orig/arch/arm/mach-lh7a40x/irq-kev7a400.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-lh7a40x/irq-kev7a400.c 2006-10-19 16:52:29.000000000 +0200 @@ -60,6 +60,8 @@ lh7a400_cpld_handler (unsigned int irq, } } +static DEFINE_IRQ_CHAINED_TYPE(kev7a400_cpld_handler); + /* IRQ initialization */ void __init Index: linux-2.6.16/arch/arm/mach-lh7a40x/irq-lpd7a40x.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-lh7a40x/irq-lpd7a40x.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-lh7a40x/irq-lpd7a40x.c 2006-10-19 16:52:29.000000000 +0200 @@ -71,6 +71,7 @@ static void lh7a40x_cpld_handler (unsign desc->chip->unmask (irq); /* Level-triggered need this */ } +static DEFINE_IRQ_CHAINED_TYPE(lh7a40x_cpld_handler); /* IRQ initialization */ Index: linux-2.6.16/arch/arm/mach-lh7a40x/time.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-lh7a40x/time.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-lh7a40x/time.c 2006-10-19 16:52:29.000000000 +0200 @@ -12,6 +12,7 @@ #include #include #include +#include #include #include Index: linux-2.6.16/arch/arm/mach-omap1/board-osk.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-omap1/board-osk.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-omap1/board-osk.c 2006-10-19 16:52:29.000000000 +0200 @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include Index: linux-2.6.16/arch/arm/mach-omap1/fpga.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-omap1/fpga.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-omap1/fpga.c 2006-10-19 16:52:29.000000000 +0200 @@ -120,6 +120,8 @@ static struct irqchip omap_fpga_irq = { .unmask = fpga_unmask_irq, }; +static DEFINE_IRQ_CHAINED_TYPE(innovator_fpga_IRQ_demux); + /* * All of the FPGA interrupt request inputs except for the touchscreen are * edge-sensitive; the touchscreen is level-sensitive. 
The edge-sensitive Index: linux-2.6.16/arch/arm/mach-omap1/serial.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-omap1/serial.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-omap1/serial.c 2006-10-19 16:52:29.000000000 +0200 @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include Index: linux-2.6.16/arch/arm/mach-pxa/idp.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-pxa/idp.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-pxa/idp.c 2006-10-19 16:52:29.000000000 +0200 @@ -18,6 +18,7 @@ #include #include +#include #include #include Index: linux-2.6.16/arch/arm/mach-pxa/irq.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-pxa/irq.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-pxa/irq.c 2006-10-19 16:52:29.000000000 +0200 @@ -244,6 +244,7 @@ static struct irqchip pxa_muxed_gpio_chi .set_type = pxa_gpio_irq_type, }; +static DEFINE_IRQ_CHAINED_TYPE(pxa_gpio_demux_handler); void __init pxa_init_irq(void) { Index: linux-2.6.16/arch/arm/mach-pxa/lubbock.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-pxa/lubbock.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-pxa/lubbock.c 2006-10-19 16:52:29.000000000 +0200 @@ -95,6 +95,8 @@ static void lubbock_irq_handler(unsigned } while (pending); } +static DEFINE_IRQ_CHAINED_TYPE(lubbock_irq_handler); + static void __init lubbock_init_irq(void) { int irq; Index: linux-2.6.16/arch/arm/mach-pxa/mainstone.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-pxa/mainstone.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-pxa/mainstone.c 2006-10-19 16:52:29.000000000 +0200 @@ -85,6 +85,8 @@ static void mainstone_irq_handler(unsign } while (pending); } +static DEFINE_IRQ_CHAINED_TYPE(mainstone_irq_handler); + static void __init mainstone_init_irq(void) { int irq; Index: linux-2.6.16/arch/arm/mach-pxa/sharpsl_pm.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-pxa/sharpsl_pm.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-pxa/sharpsl_pm.c 2006-10-19 16:52:29.000000000 +0200 @@ -18,11 +18,11 @@ #include #include #include +#include #include #include #include -#include #include #include #include Index: linux-2.6.16/arch/arm/mach-rpc/irq.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-rpc/irq.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-rpc/irq.c 2006-10-19 16:52:29.000000000 +0200 @@ -112,6 +112,15 @@ static struct irqchip iomd_fiq_chip = { .unmask = iomd_unmask_irq_fiq, }; +static unsigned int startup_irq_disabled(unsigned int irq) +{ + return 0; +} + +/* Interrupt type for irqs which must not be + * automatically enabled in request_irq */ +static struct irq_type level_type_nostart; + void __init rpc_init_irq(void) { unsigned int irq, flags; @@ -121,16 +130,15 @@ void __init rpc_init_irq(void) iomd_writeb(0, IOMD_FIQMASK); iomd_writeb(0, IOMD_DMAMASK); + level_type_nostart = default_level_type; + level_type_nostart.startup = startup_irq_disabled; + for (irq = 0; irq < NR_IRQS; irq++) { flags = IRQF_VALID; if (irq <= 6 || (irq >= 9 && irq <= 15)) flags |= IRQF_PROBE; - if (irq == 21 || (irq >= 16 && irq 
<= 19) || - irq == IRQ_KEYBOARDTX) - flags |= IRQF_NOAUTOEN; - switch (irq) { case 0 ... 7: set_irq_chip(irq, &iomd_a_chip); @@ -155,6 +163,10 @@ void __init rpc_init_irq(void) set_irq_flags(irq, IRQF_VALID); break; } + + if (irq == 21 || (irq >= 16 && irq <= 19) || + irq == IRQ_KEYBOARDTX) + set_irq_handler(irq, &level_type_nostart); } init_FIQ(); Index: linux-2.6.16/arch/arm/mach-s3c2410/bast-irq.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-s3c2410/bast-irq.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-s3c2410/bast-irq.c 2006-10-19 16:52:29.000000000 +0200 @@ -136,13 +136,15 @@ bast_irq_pc104_demux(unsigned int irq, for (i = 0; stat != 0; i++, stat >>= 1) { if (stat & 1) { irqno = bast_pc104_irqs[i]; - - desc_handle_irq(irqno, irq_desc + irqno, regs); + desc = irq_desc + irqno; + desc_handle_irq(irqno, desc, regs); } } } } +DEFINE_IRQ_CHAINED_TYPE(bast_irq_pc104_demux); + static __init int bast_irq_init(void) { unsigned int i; @@ -156,7 +158,7 @@ static __init int bast_irq_init(void) set_irq_chained_handler(IRQ_ISA, bast_irq_pc104_demux); - /* reigster our IRQs */ + /* register our IRQs */ for (i = 0; i < 4; i++) { unsigned int irqno = bast_pc104_irqs[i]; Index: linux-2.6.16/arch/arm/mach-s3c2410/irq.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-s3c2410/irq.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-s3c2410/irq.c 2006-10-19 16:52:29.000000000 +0200 @@ -573,6 +573,11 @@ s3c_irq_demux_uart2(unsigned int irq, } +static DEFINE_IRQ_CHAINED_TYPE(s3c_irq_demux_uart0); +static DEFINE_IRQ_CHAINED_TYPE(s3c_irq_demux_uart1); +static DEFINE_IRQ_CHAINED_TYPE(s3c_irq_demux_uart2); +static DEFINE_IRQ_CHAINED_TYPE(s3c_irq_demux_adc); + /* s3c24xx_init_irq * * Initialise S3C2410 IRQ system Index: linux-2.6.16/arch/arm/mach-s3c2410/s3c2440-irq.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-s3c2410/s3c2440-irq.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-s3c2410/s3c2440-irq.c 2006-10-19 16:52:29.000000000 +0200 @@ -157,6 +157,9 @@ static struct irqchip s3c_irq_cam = { .ack = s3c_irq_cam_ack, }; +static DEFINE_IRQ_CHAINED_TYPE(s3c_irq_demux_wdtac97); +static DEFINE_IRQ_CHAINED_TYPE(s3c_irq_demux_cam); + static int s3c2440_irq_add(struct sys_device *sysdev) { unsigned int irqno; Index: linux-2.6.16/arch/arm/mach-s3c2410/time.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-s3c2410/time.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-s3c2410/time.c 2006-10-19 16:52:29.000000000 +0200 @@ -23,6 +23,7 @@ #include #include #include +#include #include #include Index: linux-2.6.16/arch/arm/mach-sa1100/badge4.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-sa1100/badge4.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-sa1100/badge4.c 2006-10-19 16:52:29.000000000 +0200 @@ -240,15 +240,22 @@ void badge4_set_5V(unsigned subsystem, i /* detect on->off and off->on transitions */ if ((!old_5V_bitmap) && (badge4_5V_bitmap)) { /* was off, now on */ - printk(KERN_INFO "%s: enabling 5V supply rail\n", __FUNCTION__); GPSR = BADGE4_GPIO_PCMEN5V; } else if ((old_5V_bitmap) && (!badge4_5V_bitmap)) { /* was on, now off */ - printk(KERN_INFO "%s: disabling 5V supply rail\n", __FUNCTION__); GPCR = BADGE4_GPIO_PCMEN5V; } 
local_irq_restore(flags); + + /* detect on->off and off->on transitions */ + if ((!old_5V_bitmap) && (badge4_5V_bitmap)) { + /* was off, now on */ + printk(KERN_INFO "%s: enabling 5V supply rail\n", __FUNCTION__); + } else if ((old_5V_bitmap) && (!badge4_5V_bitmap)) { + /* was on, now off */ + printk(KERN_INFO "%s: disabling 5V supply rail\n", __FUNCTION__); + } } EXPORT_SYMBOL(badge4_set_5V); Index: linux-2.6.16/arch/arm/mach-sa1100/cerf.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-sa1100/cerf.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-sa1100/cerf.c 2006-10-19 16:52:29.000000000 +0200 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include Index: linux-2.6.16/arch/arm/mach-sa1100/h3600.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-sa1100/h3600.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-sa1100/h3600.c 2006-10-19 16:52:29.000000000 +0200 @@ -798,6 +798,8 @@ static void h3800_unmask_gpio_irq(unsign H3800_ASIC2_GPIINTSTAT |= mask; } +static DEFINE_IRQ_CHAINED_TYPE(h3800_IRQ_demux); + static void __init h3800_init_irq(void) { int i; @@ -836,7 +838,7 @@ static void __init h3800_init_irq(void) } #endif set_irq_type(IRQ_GPIO_H3800_ASIC, IRQT_RISING); - set_irq_chained_handler(IRQ_GPIO_H3800_ASIC, &h3800_IRQ_demux); + set_irq_chained_handler(IRQ_GPIO_H3800_ASIC, h3800_IRQ_demux); } Index: linux-2.6.16/arch/arm/mach-sa1100/irq.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-sa1100/irq.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-sa1100/irq.c 2006-10-19 16:52:29.000000000 +0200 @@ -11,12 +11,13 @@ */ #include #include +#include +#include #include #include #include #include -#include #include #include "generic.h" @@ -281,6 +282,8 @@ static int __init sa1100irq_init_devicef return sysdev_register(&sa1100irq_device); } +static DEFINE_IRQ_CHAINED_TYPE(sa1100_high_gpio_handler); + device_initcall(sa1100irq_init_devicefs); void __init sa1100_init_irq(void) Index: linux-2.6.16/arch/arm/mach-sa1100/neponset.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-sa1100/neponset.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-sa1100/neponset.c 2006-10-19 16:52:29.000000000 +0200 @@ -137,6 +137,8 @@ static struct sa1100_port_fns neponset_p .get_mctrl = neponset_get_mctrl, }; +static DEFINE_IRQ_CHAINED_TYPE(neponset_irq_handler); + static int neponset_probe(struct platform_device *dev) { sa1100_register_uart_fns(&neponset_port_fns); Index: linux-2.6.16/arch/arm/mach-sa1100/pleb.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-sa1100/pleb.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-sa1100/pleb.c 2006-10-19 16:52:29.000000000 +0200 @@ -7,6 +7,7 @@ #include #include #include +#include #include Index: linux-2.6.16/arch/arm/mach-sa1100/time.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-sa1100/time.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-sa1100/time.c 2006-10-19 16:52:29.000000000 +0200 @@ -11,6 +11,7 @@ #include #include #include +#include #include #include Index: linux-2.6.16/arch/arm/mach-shark/core.c =================================================================== --- 
linux-2.6.16.orig/arch/arm/mach-shark/core.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-shark/core.c 2006-10-19 16:52:29.000000000 +0200 @@ -6,6 +6,7 @@ #include #include #include +#include #include #include Index: linux-2.6.16/arch/arm/mach-shark/leds.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-shark/leds.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-shark/leds.c 2006-10-19 16:52:29.000000000 +0200 @@ -33,7 +33,7 @@ static char led_state; static short hw_led_state; static short saved_state; -static DEFINE_SPINLOCK(leds_lock); +static DEFINE_RAW_SPINLOCK(leds_lock); short sequoia_read(int addr) { outw(addr,0x24); Index: linux-2.6.16/arch/arm/mach-versatile/core.c =================================================================== --- linux-2.6.16.orig/arch/arm/mach-versatile/core.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mach-versatile/core.c 2006-10-19 16:52:29.000000000 +0200 @@ -96,6 +96,8 @@ sic_handle_irq(unsigned int irq, struct } while (status); } +static DEFINE_IRQ_CHAINED_TYPE(sic_handle_irq); + #if 1 #define IRQ_MMCI0A IRQ_VICSOURCE22 #define IRQ_AACI IRQ_VICSOURCE24 @@ -114,7 +116,7 @@ void __init versatile_init_irq(void) vic_init(VA_VIC_BASE, ~(1 << 31)); - set_irq_handler(IRQ_VICSOURCE31, sic_handle_irq); + set_irq_chained_handler(IRQ_VICSOURCE31, sic_handle_irq); enable_irq(IRQ_VICSOURCE31); /* Do second interrupt controller */ Index: linux-2.6.16/arch/arm/mm/consistent.c =================================================================== --- linux-2.6.16.orig/arch/arm/mm/consistent.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mm/consistent.c 2006-10-19 16:52:29.000000000 +0200 @@ -39,7 +39,7 @@ * These are the page tables (2MB each) covering uncached, DMA consistent allocations */ static pte_t *consistent_pte[NUM_CONSISTENT_PTES]; -static DEFINE_SPINLOCK(consistent_lock); +static DEFINE_RAW_SPINLOCK(consistent_lock); /* * VM region handling support. Index: linux-2.6.16/arch/arm/mm/copypage-v4mc.c =================================================================== --- linux-2.6.16.orig/arch/arm/mm/copypage-v4mc.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mm/copypage-v4mc.c 2006-10-19 16:52:29.000000000 +0200 @@ -29,7 +29,7 @@ #define TOP_PTE(x) pte_offset_kernel(top_pmd, x) -static DEFINE_SPINLOCK(minicache_lock); +static DEFINE_RAW_SPINLOCK(minicache_lock); /* * ARMv4 mini-dcache optimised copy_user_page @@ -43,7 +43,7 @@ static DEFINE_SPINLOCK(minicache_lock); * instruction. If your processor does not supply this, you have to write your * own copy_user_page that does the right thing. */ -static void __attribute__((naked)) +static void notrace __attribute__((naked)) mc_copy_user_page(void *from, void *to) { asm volatile( @@ -82,7 +82,7 @@ void v4_mc_copy_user_page(void *kto, con /* * ARMv4 optimised clear_user_page */ -void __attribute__((naked)) +void notrace __attribute__((naked)) v4_mc_clear_user_page(void *kaddr, unsigned long vaddr) { asm volatile( Index: linux-2.6.16/arch/arm/mm/copypage-v6.c =================================================================== --- linux-2.6.16.orig/arch/arm/mm/copypage-v6.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mm/copypage-v6.c 2006-10-19 16:52:29.000000000 +0200 @@ -26,7 +26,7 @@ #define TOP_PTE(x) pte_offset_kernel(top_pmd, x) -static DEFINE_SPINLOCK(v6_lock); +static DEFINE_RAW_SPINLOCK(v6_lock); /* * Copy the user page. 
No aliasing to deal with so we can just Index: linux-2.6.16/arch/arm/mm/copypage-xscale.c =================================================================== --- linux-2.6.16.orig/arch/arm/mm/copypage-xscale.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mm/copypage-xscale.c 2006-10-19 16:52:29.000000000 +0200 @@ -31,7 +31,7 @@ #define TOP_PTE(x) pte_offset_kernel(top_pmd, x) -static DEFINE_SPINLOCK(minicache_lock); +static DEFINE_RAW_SPINLOCK(minicache_lock); /* * XScale mini-dcache optimised copy_user_page @@ -41,7 +41,7 @@ static DEFINE_SPINLOCK(minicache_lock); * Dcache aliasing issue. The writes will be forwarded to the write buffer, * and merged as appropriate. */ -static void __attribute__((naked)) +static void notrace __attribute__((naked)) mc_copy_user_page(void *from, void *to) { /* @@ -104,7 +104,7 @@ void xscale_mc_copy_user_page(void *kto, /* * XScale optimised clear_user_page */ -void __attribute__((naked)) +void notrace __attribute__((naked)) xscale_mc_clear_user_page(void *kaddr, unsigned long vaddr) { asm volatile( Index: linux-2.6.16/arch/arm/mm/fault.c =================================================================== --- linux-2.6.16.orig/arch/arm/mm/fault.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mm/fault.c 2006-10-19 16:52:29.000000000 +0200 @@ -216,7 +216,7 @@ out: return fault; } -static int +static notrace int do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { struct task_struct *tsk; @@ -316,7 +316,7 @@ no_context: * interrupt or a critical region, and should only copy the information * from the master page table, nothing more. */ -static int +static notrace int do_translation_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { @@ -362,7 +362,7 @@ bad_area: * Some section permission faults need to be handled gracefully. * They can happen due to a __{get,put}_user during an oops. */ -static int +static notrace int do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { struct task_struct *tsk = current; @@ -373,7 +373,7 @@ do_sect_fault(unsigned long addr, unsign /* * This abort handler always returns "fault". */ -static int +static notrace int do_bad(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { return 1; @@ -428,7 +428,7 @@ static struct fsr_info { { do_bad, SIGBUS, 0, "unknown 31" } }; -void __init +void __init notrace hook_fault_code(int nr, int (*fn)(unsigned long, unsigned int, struct pt_regs *), int sig, const char *name) { @@ -442,7 +442,7 @@ hook_fault_code(int nr, int (*fn)(unsign /* * Dispatch a data abort to the relevant handler. 
*/ -asmlinkage void +asmlinkage notrace void do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { const struct fsr_info *inf = fsr_info + (fsr & 15) + ((fsr & (1 << 10)) >> 6); @@ -461,7 +461,7 @@ do_DataAbort(unsigned long addr, unsigne notify_die("", regs, &info, fsr, 0); } -asmlinkage void +asmlinkage notrace void do_PrefetchAbort(unsigned long addr, struct pt_regs *regs) { do_translation_fault(addr, 0, regs); Index: linux-2.6.16/arch/arm/mm/init.c =================================================================== --- linux-2.6.16.orig/arch/arm/mm/init.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/mm/init.c 2006-10-19 16:52:29.000000000 +0200 @@ -28,7 +28,7 @@ #define TABLE_SIZE (2 * PTRS_PER_PTE * sizeof(pte_t)) -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern void _stext, _text, _etext, __data_start, _end, __init_begin, __init_end; Index: linux-2.6.16/arch/arm/plat-omap/clock.c =================================================================== --- linux-2.6.16.orig/arch/arm/plat-omap/clock.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/plat-omap/clock.c 2006-10-19 16:52:29.000000000 +0200 @@ -29,7 +29,7 @@ LIST_HEAD(clocks); static DEFINE_MUTEX(clocks_mutex); -DEFINE_SPINLOCK(clockfw_lock); +DEFINE_RAW_SPINLOCK(clockfw_lock); static struct clk_functions *arch_clock; Index: linux-2.6.16/arch/arm/plat-omap/dma.c =================================================================== --- linux-2.6.16.orig/arch/arm/plat-omap/dma.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/plat-omap/dma.c 2006-10-19 16:52:29.000000000 +0200 @@ -903,7 +903,7 @@ static struct irqaction omap24xx_dma_irq /*----------------------------------------------------------------------------*/ static struct lcd_dma_info { - spinlock_t lock; + raw_spinlock_t lock; int reserved; void (* callback)(u16 status, void *data); void *cb_data; Index: linux-2.6.16/arch/arm/plat-omap/gpio.c =================================================================== --- linux-2.6.16.orig/arch/arm/plat-omap/gpio.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/plat-omap/gpio.c 2006-10-19 16:52:29.000000000 +0200 @@ -121,7 +121,7 @@ struct gpio_bank { u32 reserved_map; u32 suspend_wakeup; u32 saved_wakeup; - spinlock_t lock; + raw_spinlock_t lock; }; #define METHOD_MPUIO 0 @@ -736,7 +736,7 @@ static void gpio_irq_handler(unsigned in desc->chip->ack(irq); - bank = (struct gpio_bank *) desc->data; + bank = (struct gpio_bank *) desc->handler_data; if (bank->method == METHOD_MPUIO) isr_reg = bank->base + OMAP_MPUIO_GPIO_INT; #ifdef CONFIG_ARCH_OMAP15XX @@ -837,6 +837,8 @@ static struct irqchip mpuio_irq_chip = { .unmask = mpuio_unmask_irq }; +static DEFINE_IRQ_CHAINED_TYPE(gpio_irq_handler); + static int initialized; static struct clk * gpio_ick; static struct clk * gpio_fck; Index: linux-2.6.16/arch/arm/plat-omap/mux.c =================================================================== --- linux-2.6.16.orig/arch/arm/plat-omap/mux.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm/plat-omap/mux.c 2006-10-19 16:52:29.000000000 +0200 @@ -57,7 +57,7 @@ int __init omap_mux_register(struct pin_ */ int __init_or_module omap_cfg_reg(const unsigned long index) { - static DEFINE_SPINLOCK(mux_spin_lock); + static DEFINE_RAW_SPINLOCK(mux_spin_lock); unsigned long flags; struct pin_config *cfg; Index: linux-2.6.16/arch/arm26/boot/compressed/misc.c 
=================================================================== --- linux-2.6.16.orig/arch/arm26/boot/compressed/misc.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/arm26/boot/compressed/misc.c 2006-10-19 16:52:29.000000000 +0200 @@ -184,6 +184,7 @@ static ulg free_mem_ptr_end; #define HEAP_SIZE 0x2000 +#define ZLIB_INFLATE_NO_INFLATE_LOCK #include "../../../../lib/inflate.c" #ifndef STANDALONE_DEBUG Index: linux-2.6.16/arch/i386/Kconfig =================================================================== --- linux-2.6.16.orig/arch/i386/Kconfig 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/Kconfig 2006-10-19 16:52:29.000000000 +0200 @@ -14,6 +14,10 @@ config X86_32 486, 586, Pentiums, and various instruction-set-compatible chips by AMD, Cyrix, and others. +config GENERIC_TIME + bool + default y + config SEMAPHORE_SLEEPERS bool default y @@ -173,6 +177,8 @@ config HPET_EMULATE_RTC depends on HPET_TIMER && RTC=y default y +source "kernel/time/Kconfig" + config SMP bool "Symmetric multi-processing support" ---help--- @@ -228,6 +234,19 @@ config SCHED_SMT source "kernel/Kconfig.preempt" +config RWSEM_GENERIC_SPINLOCK + bool + depends on M386 || PREEMPT_RT + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + default y if !RWSEM_GENERIC_SPINLOCK + config X86_UP_APIC bool "Local APIC support on uniprocessors" depends on !SMP && !(X86_VISWS || X86_VOYAGER) @@ -661,7 +680,7 @@ config BOOT_IOREMAP config REGPARM bool "Use register arguments (EXPERIMENTAL)" - depends on EXPERIMENTAL + depends on EXPERIMENTAL && !MCOUNT default n help Compile the kernel with -mregparm=3. This uses a different ABI Index: linux-2.6.16/arch/i386/Kconfig.cpu =================================================================== --- linux-2.6.16.orig/arch/i386/Kconfig.cpu 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/Kconfig.cpu 2006-10-19 16:52:29.000000000 +0200 @@ -235,11 +235,6 @@ config RWSEM_GENERIC_SPINLOCK depends on M386 default y -config RWSEM_XCHGADD_ALGORITHM - bool - depends on !M386 - default y - config GENERIC_CALIBRATE_DELAY bool default y Index: linux-2.6.16/arch/i386/Kconfig.debug =================================================================== --- linux-2.6.16.orig/arch/i386/Kconfig.debug 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/Kconfig.debug 2006-10-19 16:52:29.000000000 +0200 @@ -18,6 +18,7 @@ config EARLY_PRINTK config DEBUG_STACKOVERFLOW bool "Check for stack overflows" depends on DEBUG_KERNEL + default y help This option will cause messages to be printed if free stack space drops below a certain limit. @@ -25,6 +26,7 @@ config DEBUG_STACKOVERFLOW config DEBUG_STACK_USAGE bool "Stack utilization instrumentation" depends on DEBUG_KERNEL + default y help Enables the display of the minimum amount of free stack which each task has ever had available in the sysrq-T and sysrq-P debug output. @@ -45,6 +47,7 @@ config DEBUG_PAGEALLOC config DEBUG_RODATA bool "Write protect kernel read-only data structures" depends on DEBUG_KERNEL + default y help Mark the kernel read-only data as write-protected in the pagetables, in order to catch accidental (and incorrect) writes to such const @@ -55,6 +58,7 @@ config DEBUG_RODATA config 4KSTACKS bool "Use 4Kb for kernel stacks instead of 8Kb" depends on DEBUG_KERNEL + default y help If you say Y here the kernel will use a 4Kb stacksize for the kernel stack attached to each process/thread. 
This facilitates Index: linux-2.6.16/arch/i386/boot/compressed/misc.c =================================================================== --- linux-2.6.16.orig/arch/i386/boot/compressed/misc.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/boot/compressed/misc.c 2006-10-19 16:52:29.000000000 +0200 @@ -15,6 +15,12 @@ #include #include +#ifdef CONFIG_MCOUNT +void notrace mcount(void) +{ +} +#endif + /* * gzip declarations */ @@ -112,7 +118,7 @@ static long free_mem_end_ptr; #define INPLACE_MOVE_ROUTINE 0x1000 #define LOW_BUFFER_START 0x2000 #define LOW_BUFFER_MAX 0x90000 -#define HEAP_SIZE 0x3000 +#define HEAP_SIZE 0x4000 static unsigned int low_buffer_end, low_buffer_size; static int high_loaded =0; static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/; @@ -125,6 +131,7 @@ static int lines, cols; static void * xquad_portio = NULL; #endif +#define ZLIB_INFLATE_NO_INFLATE_LOCK #include "../../../../lib/inflate.c" static void *malloc(int size) Index: linux-2.6.16/arch/i386/kernel/Makefile =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/Makefile 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/kernel/Makefile 2006-10-19 16:52:29.000000000 +0200 @@ -4,13 +4,13 @@ extra-y := head.o init_task.o vmlinux.lds -obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ +obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \ - quirks.o i8237.o topology.o + quirks.o i8237.o i8253.o tsc.o topology.o +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o obj-y += cpu/ -obj-y += timers/ obj-y += acpi/ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o obj-$(CONFIG_MCA) += mca.o @@ -20,6 +20,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_X86_SMP) += smp.o smpboot.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o +obj-$(CONFIG_MCOUNT) += mcount-wrapper.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o @@ -37,6 +38,7 @@ obj-$(CONFIG_EFI) += efi.o efi_stub.o obj-$(CONFIG_DOUBLEFAULT) += doublefault.o obj-$(CONFIG_VM86) += vm86.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +obj-$(CONFIG_HPET_TIMER) += hpet.o EXTRA_AFLAGS := -traditional Index: linux-2.6.16/arch/i386/kernel/acpi/boot.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/acpi/boot.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/kernel/acpi/boot.c 2006-10-19 16:52:29.000000000 +0200 @@ -574,7 +574,7 @@ static int __init acpi_parse_sbf(unsigne } #ifdef CONFIG_HPET_TIMER - +#include static int __init acpi_parse_hpet(unsigned long phys, unsigned long size) { struct acpi_table_hpet *hpet_tbl; @@ -596,6 +596,7 @@ static int __init acpi_parse_hpet(unsign #ifdef CONFIG_X86_64 vxtime.hpet_address = hpet_tbl->addr.addrl | ((long)hpet_tbl->addr.addrh << 32); + hpet_address = vxtime.hpet_address; printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", hpet_tbl->id, vxtime.hpet_address); @@ -604,10 +605,10 @@ static int __init acpi_parse_hpet(unsign extern unsigned long hpet_address; hpet_address = hpet_tbl->addr.addrl; - printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", - hpet_tbl->id, hpet_address); } -#endif /* X86 */ +#endif /* X86 */ + printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", + hpet_tbl->id, hpet_address); return 0; } Index: linux-2.6.16/arch/i386/kernel/apic.c 
=================================================================== --- linux-2.6.16.orig/arch/i386/kernel/apic.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/kernel/apic.c 2006-10-19 16:52:29.000000000 +0200 @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -58,6 +59,23 @@ int enable_local_apic __initdata = 0; /* */ int apic_verbosity; +static unsigned int calibration_result; + +static void lapic_next_event(unsigned long evt); +static void lapic_timer_setup(int mode); + +static struct clock_event lapic_clockevent = { + .name = "lapic", + .capabilities = CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE | + CLOCK_HAS_IRQHANDLER +#ifdef CONFIG_SMP + | CLOCK_CAP_UPDATE +#endif + , + .shift = 32, + .set_mode = lapic_timer_setup, + .set_next_event = lapic_next_event, +}; static void apic_pm_activate(void); @@ -883,6 +901,11 @@ fake_ioapic_page: */ /* + * FIXME: Move this to i8253.h. There is no need to keep the access to + * the PIT scattered all around the place -tglx + */ + +/* * The timer chip is already set up at HZ interrupts per second here, * but we do not accept timer interrupts yet. We only allow the BP * to calibrate. @@ -940,13 +963,15 @@ void (*wait_timer_tick)(void) __devinitd #define APIC_DIVISOR 16 -static void __setup_APIC_LVTT(unsigned int clocks) +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot) { unsigned int lvtt_value, tmp_value, ver; int cpu = smp_processor_id(); ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; if (!APIC_INTEGRATED(ver)) lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); @@ -963,31 +988,37 @@ static void __setup_APIC_LVTT(unsigned i & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | APIC_TDR_DIV_16); - apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); + if (!oneshot) + apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); +} + +static void lapic_next_event(unsigned long evt) +{ + apic_write_around(APIC_TMICT, evt); } -static void __devinit setup_APIC_timer(unsigned int clocks) +static void lapic_timer_setup(int mode) { unsigned long flags; local_irq_save(flags); - - /* - * Wait for IRQ0's slice: - */ - wait_timer_tick(); - - __setup_APIC_LVTT(clocks); - + __setup_APIC_LVTT(calibration_result, mode == CLOCK_EVT_ONESHOT); local_irq_restore(flags); } +static void __devinit setup_APIC_timer(void) +{ + setup_local_clockevent(&lapic_clockevent, CPU_MASK_NONE); +} + /* * In this function we calibrate APIC bus clocks to the external * timer. Unfortunately we cannot use jiffies and the timer irq * to calibrate, since some later bootup code depends on getting * the first irq? Ugh. * + * TODO: Fix this rather than saying "Ugh" -tglx + * * We want to do the calibration only once since we * want to have local timer irqs syncron. CPUs connected * by the same APIC bus have the very same bus frequency. @@ -1010,7 +1041,7 @@ static int __init calibrate_APIC_clock(v * value into the APIC clock, we just want to get the * counter running for calibration. */ - __setup_APIC_LVTT(1000000000); + __setup_APIC_LVTT(1000000000, 0); /* * The timer chip counts down to zero. 
Let's wait @@ -1047,6 +1078,13 @@ static int __init calibrate_APIC_clock(v result = (tt1-tt2)*APIC_DIVISOR/LOOPS; + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc32(tt1-tt2, TICK_NSEC * LOOPS); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + if (cpu_has_tsc) apic_printk(APIC_VERBOSE, "..... CPU clock speed is " "%ld.%04ld MHz.\n", @@ -1061,8 +1099,6 @@ static int __init calibrate_APIC_clock(v return result; } -static unsigned int calibration_result; - void __init setup_boot_APIC_clock(void) { unsigned long flags; @@ -1075,14 +1111,14 @@ void __init setup_boot_APIC_clock(void) /* * Now set up the timer for real. */ - setup_APIC_timer(calibration_result); + setup_APIC_timer(); local_irq_restore(flags); } void __devinit setup_secondary_APIC_clock(void) { - setup_APIC_timer(calibration_result); + setup_APIC_timer(); } void disable_APIC_timer(void) @@ -1153,6 +1189,8 @@ inline void smp_local_timer_interrupt(st update_process_times(user_mode_vm(regs)); #endif + trace_special(regs->eip, 0, 0); + /* * We take the 'long' return path, and there every subsystem * grabs the apropriate locks (kernel lock/ irq lock). @@ -1174,7 +1212,7 @@ inline void smp_local_timer_interrupt(st * interrupt as well. Thus we cannot inline the local irq ... ] */ -fastcall void smp_apic_timer_interrupt(struct pt_regs *regs) +fastcall notrace void smp_apic_timer_interrupt(struct pt_regs *regs) { int cpu = smp_processor_id(); @@ -1183,6 +1221,8 @@ fastcall void smp_apic_timer_interrupt(s */ per_cpu(irq_stat, cpu).apic_timer_irqs++; + trace_special(regs->eip, 0, 0); + /* * NOTE! We'd better ACK the irq immediately, * because timer handling can be slow. @@ -1194,7 +1234,17 @@ fastcall void smp_apic_timer_interrupt(s * interrupt lock, which is the WrongThing (tm) to do. */ irq_enter(); - smp_local_timer_interrupt(regs); + /* + * If the task is currently running in user mode, don't + * detect soft lockups. If CONFIG_DETECT_SOFTLOCKUP is not + * configured, this should be optimized out. 
+ */ + if (user_mode(regs)) + touch_softlockup_watchdog(); + + if (lapic_clockevent.event_handler) + lapic_clockevent.event_handler(regs); + irq_exit(); } @@ -1286,6 +1336,7 @@ fastcall void smp_error_interrupt(struct */ printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", smp_processor_id(), v , v1); + dump_stack(); irq_exit(); } Index: linux-2.6.16/arch/i386/kernel/cpu/mtrr/generic.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/cpu/mtrr/generic.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/kernel/cpu/mtrr/generic.c 2006-10-19 16:52:29.000000000 +0200 @@ -234,7 +234,7 @@ static unsigned long set_mtrr_state(u32 static unsigned long cr4 = 0; static u32 deftype_lo, deftype_hi; -static DEFINE_SPINLOCK(set_atomicity_lock); +static DEFINE_RAW_SPINLOCK(set_atomicity_lock); /* * Since we are disabling the cache don't allow any interrupts - they Index: linux-2.6.16/arch/i386/kernel/entry.S =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/entry.S 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/kernel/entry.S 2006-10-19 16:52:29.000000000 +0200 @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -76,7 +77,7 @@ NT_MASK = 0x00004000 VM_MASK = 0x00020000 #ifdef CONFIG_PREEMPT -#define preempt_stop cli +#define preempt_stop cli; TRACE_IRQS_OFF #else #define preempt_stop #define resume_kernel restore_nocheck @@ -160,14 +161,17 @@ ENTRY(resume_userspace) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) cli + cmpl $0, kernel_preemption + jz restore_nocheck cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_nocheck need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl - jz restore_all + jz restore_nocheck testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? 
- jz restore_all + jz restore_nocheck + cli call preempt_schedule_irq jmp need_resched #endif @@ -179,6 +183,10 @@ need_resched: ENTRY(sysenter_entry) movl TSS_sysenter_esp0(%esp),%esp sysenter_past_esp: + /* + * No need to trace this one: sysenter disabled irqs and + * we quickly enable it without doing anything else: + */ sti pushl $(__USER_DS) pushl %ebp @@ -200,6 +208,11 @@ sysenter_past_esp: pushl %eax SAVE_ALL +#ifdef CONFIG_LATENCY_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ @@ -210,13 +223,20 @@ sysenter_past_esp: call *sys_call_table(,%eax,4) movl %eax,EAX(%esp) cli + TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work +#ifdef CONFIG_LATENCY_TRACE + pushl %eax + call sys_ret + popl %eax +#endif /* if something modifies registers it must also disable sysexit */ movl EIP(%esp), %edx movl OLDESP(%esp), %ecx xorl %ebp,%ebp + TRACE_IRQS_ON sti sysexit @@ -225,6 +245,11 @@ sysenter_past_esp: ENTRY(system_call) pushl %eax # save orig_eax SAVE_ALL +#ifdef CONFIG_LATENCY_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) # system call tracing in operation / emulation /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ @@ -239,6 +264,7 @@ syscall_exit: cli # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret + TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx # current->work jne syscall_exit_work @@ -254,11 +280,14 @@ restore_all: cmpl $((4 << 8) | 3), %eax je ldt_ss # returning to user-space with LDT SS restore_nocheck: + TRACE_IRQS_ON +restore_nocheck2: RESTORE_REGS addl $4, %esp 1: iret .section .fixup,"ax" iret_exc: + TRACE_IRQS_ON sti pushl $0 # no error code pushl $do_iret_error @@ -282,10 +311,12 @@ ldt_ss: * dosemu and wine happy. */ subl $8, %esp # reserve space for switch16 pointer cli + TRACE_IRQS_OFF movl %esp, %eax /* Set up the 16bit stack frame with switch32 pointer on top, * and a switch16 pointer on top of the current frame. */ call setup_x86_bogus_stack + TRACE_IRQS_ON RESTORE_REGS lss 20+4(%esp), %esp # switch to 16bit stack 1: iret @@ -297,18 +328,20 @@ ldt_ss: # perform work that needs to be done immediately before resumption ALIGN work_pending: - testb $_TIF_NEED_RESCHED, %cl + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx jz work_notifysig work_resched: - call schedule - cli # make sure we don't miss an interrupt + cli + call __schedule + # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret + TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? 
jz restore_all - testb $_TIF_NEED_RESCHED, %cl + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx jnz work_resched work_notifysig: # deal with pending signals and @@ -353,6 +386,7 @@ syscall_trace_entry: syscall_exit_work: testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl jz work_pending + TRACE_IRQS_ON sti # could let do_syscall_trace() call # schedule() instead movl %esp, %eax @@ -414,9 +448,14 @@ ENTRY(irq_entries_start) vector=vector+1 .endr +/* + * the CPU automatically disables interrupts when executing an IRQ vector, + * so IRQ-flags tracing has to follow that: + */ ALIGN common_interrupt: SAVE_ALL + TRACE_IRQS_OFF movl %esp,%eax call do_IRQ jmp ret_from_intr @@ -425,6 +464,7 @@ common_interrupt: ENTRY(name) \ pushl $nr-256; \ SAVE_ALL \ + TRACE_IRQS_OFF \ movl %esp,%eax; \ call smp_/**/name; \ jmp ret_from_intr; @@ -554,7 +594,8 @@ nmi_stack_correct: xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_nmi - jmp restore_all + jmp restore_nocheck2 +# jmp restore_all nmi_stack_fixup: FIX_STACK(12,nmi_stack_correct, 1) Index: linux-2.6.16/arch/i386/kernel/head.S =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/head.S 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/kernel/head.S 2006-10-19 16:52:29.000000000 +0200 @@ -404,6 +404,7 @@ ignore_int: call printk #endif addl $(5*4),%esp + call dump_stack popl %ds popl %es popl %edx Index: linux-2.6.16/arch/i386/kernel/hpet.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.16/arch/i386/kernel/hpet.c 2006-10-19 16:52:29.000000000 +0200 @@ -0,0 +1,67 @@ +#include +#include +#include +#include + +#include +#include + +#define HPET_MASK 0xFFFFFFFF +#define HPET_SHIFT 22 + +/* FSEC = 10^-15 NSEC = 10^-9 */ +#define FSEC_PER_NSEC 1000000 + +static void *hpet_ptr; + +static cycle_t read_hpet(void) +{ + return (cycle_t)readl(hpet_ptr); +} + +static struct clocksource clocksource_hpet = { + .name = "hpet", + .rating = 250, + .read = read_hpet, + .mask = (cycle_t)HPET_MASK, + .mult = 0, /* set below */ + .shift = HPET_SHIFT, + .is_continuous = 1, +}; + +static int __init init_hpet_clocksource(void) +{ + unsigned long hpet_period; + void __iomem* hpet_base; + u64 tmp; + + if (!hpet_address) + return -ENODEV; + + /* calculate the hpet address: */ + hpet_base = + (void __iomem*)ioremap_nocache(hpet_address, HPET_MMAP_SIZE); + hpet_ptr = hpet_base + HPET_COUNTER; + + /* calculate the frequency: */ + hpet_period = readl(hpet_base + HPET_PERIOD); + + /* + * hpet period is in femtoseconds per cycle + * so we need to convert this to ns/cyc units + * approximated by mult/2^shift + * + * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift + * fsec/cyc * 1ns/1000000fsec * 2^shift = mult + * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult + * (fsec/cyc << shift)/1000000 = mult + * (hpet_period << shift)/FSEC_PER_NSEC = mult + */ + tmp = (u64)hpet_period << HPET_SHIFT; + do_div(tmp, FSEC_PER_NSEC); + clocksource_hpet.mult = (u32)tmp; + + return register_clocksource(&clocksource_hpet); +} + +module_init(init_hpet_clocksource); Index: linux-2.6.16/arch/i386/kernel/i386_ksyms.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/i386_ksyms.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/kernel/i386_ksyms.c 2006-10-19 16:52:29.000000000 +0200 @@ -3,10 +3,12 @@ #include #include 
-EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); +#ifdef CONFIG_ASM_SEMAPHORES +EXPORT_SYMBOL(__compat_down_failed); +EXPORT_SYMBOL(__compat_down_failed_interruptible); +EXPORT_SYMBOL(__compat_down_failed_trylock); +EXPORT_SYMBOL(__compat_up_wakeup); +#endif /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_generic); @@ -22,7 +24,7 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(strstr); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_ASM_SEMAPHORES) extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); EXPORT_SYMBOL(__write_lock_failed); Index: linux-2.6.16/arch/i386/kernel/i8253.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.16/arch/i386/kernel/i8253.c 2006-10-19 16:52:29.000000000 +0200 @@ -0,0 +1,135 @@ +/* + * i8253.c 8253/PIT functions + * + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "io_ports.h" + +DEFINE_RAW_SPINLOCK(i8253_lock); +EXPORT_SYMBOL(i8253_lock); + +static void init_pit_timer(int mode) +{ + unsigned long flags; + + spin_lock_irqsave(&i8253_lock, flags); + + if (mode != CLOCK_EVT_ONESHOT) { + /* binary, mode 2, LSB/MSB, ch 0 */ + outb_p(0x34, PIT_MODE); + udelay(10); + outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ + outb(LATCH >> 8 , PIT_CH0); /* MSB */ + } else { + /* One shot setup */ + outb_p(0x38, PIT_MODE); + udelay(10); + } + + spin_unlock_irqrestore(&i8253_lock, flags); } + +static void pit_next_event(unsigned long evt) +{ + unsigned long flags; + + spin_lock_irqsave(&i8253_lock, flags); + outb_p(evt & 0xff , PIT_CH0); /* LSB */ + outb(evt >> 8 , PIT_CH0); /* MSB */ + spin_unlock_irqrestore(&i8253_lock, flags); +} + +static struct clock_event pit_clockevent = { + .name = "pit", + .capabilities = CLOCK_CAP_TICK +#ifndef CONFIG_SMP + | CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE | + CLOCK_CAP_UPDATE +#endif + , + .set_mode = init_pit_timer, + .set_next_event = pit_next_event, + .start_event = io_apic_timer_ack, + .end_event = mca_timer_ack, + .shift = 32, + .irq = 0, +}; + +void setup_pit_timer(void) +{ + pit_clockevent.mult = div_sc32(CLOCK_TICK_RATE, NSEC_PER_SEC); + pit_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFF, &pit_clockevent); + pit_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &pit_clockevent); + setup_global_clockevent(&pit_clockevent, CPU_MASK_NONE); +} + +/* + * Since the PIT overflows every tick, it's not very useful + * to just read by itself. So use jiffies to emulate a free + * running counter: + */ +static cycle_t pit_read(void) +{ + unsigned long flags, seq; + int count; + u64 jifs; + + do { + seq = read_seqbegin(&xtime_lock); + + spin_lock_irqsave(&i8253_lock, flags); + outb_p(0x00, PIT_MODE); /* latch the count ASAP */ + count = inb_p(PIT_CH0); /* read the latched count */ + count |= inb_p(PIT_CH0) << 8; + + /* VIA686a test code...
reset the latch if count > max + 1 */ + if (count > LATCH) { + outb_p(0x34, PIT_MODE); + outb_p(LATCH & 0xff, PIT_CH0); + outb(LATCH >> 8, PIT_CH0); + count = LATCH - 1; + } + spin_unlock_irqrestore(&i8253_lock, flags); + + jifs = jiffies_64; + } while (read_seqretry(&xtime_lock, seq)); + + jifs -= INITIAL_JIFFIES; + count = (LATCH-1) - count; + + return (cycle_t)(jifs * LATCH) + count; +} + +static struct clocksource clocksource_pit = { + .name = "pit", + .rating = 110, + .read = pit_read, + .mask = (cycle_t)-1, + .mult = 0, + .shift = 20, +}; + +static int __init init_pit_clocksource(void) +{ + if (num_possible_cpus() > 4) /* PIT does not scale! */ + return 0; + + clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); + return register_clocksource(&clocksource_pit); +} +module_init(init_pit_clocksource); Index: linux-2.6.16/arch/i386/kernel/i8259.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/i8259.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/kernel/i8259.c 2006-10-19 16:52:29.000000000 +0200 @@ -35,7 +35,7 @@ * moves to arch independent land */ -DEFINE_SPINLOCK(i8259A_lock); +DEFINE_RAW_SPINLOCK(i8259A_lock); static void end_8259A_irq (unsigned int irq) { @@ -366,7 +366,7 @@ static irqreturn_t math_error_irq(int cp * New motherboards sometimes make IRQ 13 be a PCI interrupt, * so allow interrupt sharing. */ -static struct irqaction fpu_irq = { math_error_irq, 0, CPU_MASK_NONE, "fpu", NULL, NULL }; +static struct irqaction fpu_irq = { math_error_irq, SA_NODELAY, CPU_MASK_NONE, "fpu", NULL, NULL }; void __init init_ISA_irqs (void) { @@ -422,12 +422,6 @@ void __init init_IRQ(void) intr_init_hook(); /* - * Set the clock to HZ Hz, we already have a valid - * vector now: - */ - setup_pit_timer(); - - /* * External FPU? Set up irq13 if so, for * original braindamaged IBM FERR coupling. */ Index: linux-2.6.16/arch/i386/kernel/io_apic.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/io_apic.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/kernel/io_apic.c 2006-10-19 16:52:29.000000000 +0200 @@ -49,7 +49,7 @@ atomic_t irq_mis_count; /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; -static DEFINE_SPINLOCK(ioapic_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); int timer_over_8254 __initdata = 1; @@ -92,6 +92,27 @@ int vector_irq[NR_VECTORS] __read_mostly #define vector_to_irq(vector) (vector) #endif +static int timer_ack; + +void io_apic_timer_ack(void *priv) +{ + unsigned long flags; + + if (timer_ack) { + /* + * Subtle, when I/O APICs are used we have to ack timer IRQ + * manually to reset the IRR bit for do_slow_gettimeoffset(). + * This will also deassert NMI lines for the watchdog if run + * on an 82489DX-based system. + */ + spin_lock_irqsave(&i8259A_lock, flags); + outb(0x0c, PIC_MASTER_OCW3); + /* Ack the IRQ; AEOI will end it automatically. */ + inb(PIC_MASTER_POLL); + spin_unlock_irqrestore(&i8259A_lock, flags); + } +} + /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are * shared ISA-space IRQs, so we have to support them. 
We are super @@ -135,6 +156,105 @@ static void __init replace_pin_at_irq(un } } +//#define IOAPIC_CACHE + +#ifdef IOAPIC_CACHE +# define MAX_IOAPIC_CACHE 512 + +/* + * Cache register values: + */ +static unsigned int io_apic_cache[MAX_IO_APICS][MAX_IOAPIC_CACHE] + ____cacheline_aligned_in_smp; +#endif + +inline unsigned int __raw_io_apic_read(unsigned int apic, unsigned int reg) +{ + *IO_APIC_BASE(apic) = reg; + return *(IO_APIC_BASE(apic)+4); +} + +unsigned int raw_io_apic_read(unsigned int apic, unsigned int reg) +{ + unsigned int val = __raw_io_apic_read(apic, reg); + +#ifdef IOAPIC_CACHE + io_apic_cache[apic][reg] = val; +#endif + return val; +} + +unsigned int io_apic_read(unsigned int apic, unsigned int reg) +{ +#ifdef IOAPIC_CACHE + if (unlikely(reg >= MAX_IOAPIC_CACHE)) { + static int once = 1; + + if (once) { + once = 0; + printk("WARNING: ioapic register cache overflow: %d.\n", + reg); + dump_stack(); + } + return __raw_io_apic_read(apic, reg); + } + if (io_apic_cache[apic][reg] && !sis_apic_bug) + return io_apic_cache[apic][reg]; +#endif + return raw_io_apic_read(apic, reg); +} + +void io_apic_write(unsigned int apic, unsigned int reg, unsigned int val) +{ +#ifdef IOAPIC_CACHE + if (unlikely(reg >= MAX_IOAPIC_CACHE)) { + static int once = 1; + + if (once) { + once = 0; + printk("WARNING: ioapic register cache overflow: %d.\n", + reg); + dump_stack(); + } + } else + io_apic_cache[apic][reg] = val; +#endif + *IO_APIC_BASE(apic) = reg; + *(IO_APIC_BASE(apic)+4) = val; +} + +/* + * Some systems need a POST flush or else level-triggered interrupts + * generate lots of spurious interrupts due to the POST-ed write not + * reaching the IOAPIC before the IRQ is ACK-ed in the local APIC. + */ +#ifdef CONFIG_SMP +# define IOAPIC_POSTFLUSH +#endif + +/* + * Re-write a value: to be used for read-modify-write + * cycles where the read already set up the index register. 
+ * + * Older SiS APIC requires we rewrite the index register + */ +void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val) +{ +#ifdef IOAPIC_CACHE + io_apic_cache[apic][reg] = val; +#endif + if (unlikely(sis_apic_bug)) + *IO_APIC_BASE(apic) = reg; + *(IO_APIC_BASE(apic)+4) = val; +#ifndef IOAPIC_POSTFLUSH + if (unlikely(sis_apic_bug)) +#endif + /* + * Force POST flush by reading: + */ + val = *(IO_APIC_BASE(apic)+4); +} + static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable) { struct irq_pin_list *entry = irq_2_pin + irq; @@ -166,18 +286,6 @@ static void __unmask_IO_APIC_irq (unsign __modify_IO_APIC_irq(irq, 0, 0x00010000); } -/* mask = 1, trigger = 0 */ -static void __mask_and_edge_IO_APIC_irq (unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); -} - -/* mask = 0, trigger = 1 */ -static void __unmask_and_level_IO_APIC_irq (unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); -} - static void mask_IO_APIC_irq (unsigned int irq) { unsigned long flags; @@ -1432,8 +1540,8 @@ void __init print_IO_APIC(void) struct IO_APIC_route_entry entry; spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); + *(((int *)&entry)+0) = raw_io_apic_read(apic, 0x10+i*2); + *(((int *)&entry)+1) = raw_io_apic_read(apic, 0x11+i*2); spin_unlock_irqrestore(&ioapic_lock, flags); printk(KERN_DEBUG " %02x %03X %02X ", @@ -1479,7 +1587,7 @@ void __init print_IO_APIC(void) return; } -#if 0 +#if 1 static void print_APIC_bitfield (int base) { @@ -1879,7 +1987,7 @@ static int __init timer_irq_works(void) * might have cached one ExtINT interrupt. Finally, at * least one tick may be lost due to delays. */ - if (jiffies - t1 > 4) + if (jiffies - t1 > 4 && jiffies - t1 < 16) return 1; return 0; @@ -1932,9 +2040,11 @@ static unsigned int startup_edge_ioapic_ static void ack_edge_ioapic_irq(unsigned int irq) { move_irq(irq); +#if 0 if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) == (IRQ_PENDING | IRQ_DISABLED)) mask_IO_APIC_irq(irq); +#endif ack_APIC_irq(); } @@ -1959,6 +2069,30 @@ static unsigned int startup_level_ioapic return 0; /* don't check for pending */ } +#ifdef CONFIG_PREEMPT_HARDIRQS + +/* + * In the PREEMPT_HARDIRQS case we don't want to keep the local + * APIC unacked, because that prevents further interrupts from + * being handled - and with IRQ threads being delayed arbitrarily, + * that's unacceptable. So we first mask the IRQ, then ack it. + * The hardirq thread will then unmask it.
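A sketch of the mask-then-ack flow this comment describes, with every helper a hypothetical stub (the real patch wires this into the IO-APIC handler and the hardirq threads):

static void sketch_mask_ioapic(unsigned int irq)	{ (void)irq; /* stub */ }
static void sketch_unmask_ioapic(unsigned int irq)	{ (void)irq; /* stub */ }
static void sketch_ack_local_apic(void)		{ /* stub */ }
static void sketch_run_handlers(unsigned int irq)	{ (void)irq; /* stub */ }
static void sketch_wake_irq_thread(unsigned int irq)	{ (void)irq; /* stub */ }

/* Hard-IRQ context: keep it short, defer the real work. */
static void sketch_level_irq_arrives(unsigned int irq)
{
	sketch_mask_ioapic(irq);	/* level line cannot re-raise now */
	sketch_ack_local_apic();	/* other vectors can flow again */
	sketch_wake_irq_thread(irq);	/* handler runs later, preemptibly */
}

/* IRQ-thread context, possibly much later: */
static void sketch_irq_thread(unsigned int irq)
{
	sketch_run_handlers(irq);
	sketch_unmask_ioapic(irq);	/* only now may the line fire again */
}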
+ */ +static void mask_and_ack_level_ioapic_irq(unsigned int irq) +{ + move_irq(irq); + mask_IO_APIC_irq(irq); + ack_APIC_irq(); +} + +#else + +static void mask_and_ack_level_ioapic_irq(unsigned int irq) +{ +} + +#endif + static void end_level_ioapic_irq (unsigned int irq) { unsigned long v; @@ -1993,8 +2127,10 @@ static void end_level_ioapic_irq (unsign if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); + /* mask = 1, trigger = 0 */ + __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); + /* mask = 0, trigger = 1 */ + __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); spin_unlock(&ioapic_lock); } } @@ -2022,6 +2158,13 @@ static unsigned int startup_level_ioapic return startup_level_ioapic_irq (irq); } +static void mask_and_ack_level_ioapic_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + mask_and_ack_level_ioapic_irq(irq); +} + static void end_level_ioapic_vector (unsigned int vector) { int irq = vector_to_irq(vector); Index: linux-2.6.16/arch/i386/kernel/irq.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/irq.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/kernel/irq.c 2006-10-19 16:52:29.000000000 +0200 @@ -51,7 +51,7 @@ static union irq_ctx *softirq_ctx[NR_CPU * SMP cross-CPU interrupts have their own specific * handlers). */ -fastcall unsigned int do_IRQ(struct pt_regs *regs) +fastcall notrace unsigned int do_IRQ(struct pt_regs *regs) { /* high bits used in ret_from_ code */ int irq = regs->orig_eax & 0xff; @@ -59,8 +59,12 @@ fastcall unsigned int do_IRQ(struct pt_r union irq_ctx *curctx, *irqctx; u32 *isp; #endif - irq_enter(); +#ifdef CONFIG_LATENCY_TRACE + if (irq == trace_user_trigger_irq) + user_trace_start(); +#endif + trace_special(regs->eip, irq, 0); #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? */ { @@ -69,7 +73,7 @@ fastcall unsigned int do_IRQ(struct pt_r __asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (THREAD_SIZE - 1)); if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { - printk("do_IRQ: stack overflow: %ld\n", + printk("BUG: do_IRQ: stack overflow: %ld\n", esp - sizeof(struct thread_info)); dump_stack(); } @@ -224,8 +228,10 @@ int show_interrupts(struct seq_file *p, } if (i < NR_IRQS) { - spin_lock_irqsave(&irq_desc[i].lock, flags); - action = irq_desc[i].action; + irq_desc_t *desc = irq_desc + i; + + spin_lock_irqsave(&desc->lock, flags); + action = desc->action; if (!action) goto skip; seq_printf(p, "%3d: ",i); @@ -235,15 +241,27 @@ int show_interrupts(struct seq_file *p, for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); #endif - seq_printf(p, " %14s", irq_desc[i].handler->typename); + seq_printf(p, " %-14s", desc->handler->typename); +#define F(x,c) ((desc->status & x) ? 
c : '.') + seq_printf(p, " [%c%c%c%c%c%c%c%c%c/", + F(IRQ_INPROGRESS, 'I'), + F(IRQ_DISABLED, 'D'), + F(IRQ_PENDING, 'P'), + F(IRQ_REPLAY, 'R'), + F(IRQ_AUTODETECT, 'A'), + F(IRQ_WAITING, 'W'), + F(IRQ_LEVEL, 'L'), + F(IRQ_MASKED, 'M'), + F(IRQ_NODELAY, 'N')); +#undef F + seq_printf(p, "%3d]", desc->irqs_unhandled); seq_printf(p, " %s", action->name); - for (action=action->next; action; action = action->next) seq_printf(p, ", %s", action->name); seq_putc(p, '\n'); skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + spin_unlock_irqrestore(&desc->lock, flags); } else if (i == NR_IRQS) { seq_printf(p, "NMI: "); for_each_online_cpu(j) Index: linux-2.6.16/arch/i386/kernel/mca.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/mca.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/kernel/mca.c 2006-10-19 16:52:29.000000000 +0200 @@ -472,3 +472,22 @@ void mca_handle_nmi(void) mca_nmi_hook(); } /* mca_handle_nmi */ + +void mca_timer_ack(void *priv) +{ + int irq; + + if (MCA_bus) { + /* The PS/2 uses level-triggered interrupts. You can't + turn them off, nor would you want to (any attempt to + enable edge-triggered interrupts usually gets intercepted by a + special hardware circuit). Hence we have to acknowledge + the timer interrupt. Through some incredibly stupid + design idea, the reset for IRQ 0 is done by setting the + high bit of the PPI port B (0x61). Note that some PS/2s, + notably the 55SX, work fine if this is removed. */ + + irq = inb_p( 0x61 ); /* read the current state */ + outb_p( irq|0x80, 0x61 ); /* reset the IRQ */ + } +} Index: linux-2.6.16/arch/i386/kernel/mcount-wrapper.S =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.16/arch/i386/kernel/mcount-wrapper.S 2006-10-19 16:52:29.000000000 +0200 @@ -0,0 +1,27 @@ +/* + * linux/arch/i386/mcount-wrapper.S + * + * Copyright (C) 2004 Ingo Molnar + */ + +.globl mcount +mcount: + + cmpl $0, mcount_enabled + jz out + + push %ebp + mov %esp, %ebp + pushl %eax + pushl %ecx + pushl %edx + + call __mcount + + popl %edx + popl %ecx + popl %eax + popl %ebp +out: + ret + Index: linux-2.6.16/arch/i386/kernel/microcode.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/microcode.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/kernel/microcode.c 2006-10-19 16:52:29.000000000 +0200 @@ -111,7 +111,7 @@ MODULE_LICENSE("GPL"); #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) /* serialize access to the physical write to MSR 0x79 */ -static DEFINE_SPINLOCK(microcode_update_lock); +static DEFINE_RAW_SPINLOCK(microcode_update_lock); /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ static DECLARE_MUTEX(microcode_sem); Index: linux-2.6.16/arch/i386/kernel/nmi.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/nmi.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/kernel/nmi.c 2006-10-19 16:52:29.000000000 +0200 @@ -34,7 +34,7 @@ unsigned int nmi_watchdog = NMI_NONE; extern int unknown_nmi_panic; -static unsigned int nmi_hz = HZ; +static unsigned int nmi_hz = 1000; static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ static unsigned int nmi_p4_cccr_val; extern void show_registers(struct pt_regs *regs); @@ -141,7 +141,7 @@ static int __init check_nmi_watchdog(voi for_each_cpu(cpu) 
prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count; local_irq_enable(); - mdelay((10*1000)/nmi_hz); // wait 10 ticks + mdelay((100*1000)/nmi_hz); // wait 100 ticks for (cpu = 0; cpu < NR_CPUS; cpu++) { #ifdef CONFIG_SMP @@ -168,7 +168,7 @@ static int __init check_nmi_watchdog(voi /* now that we know it works we can reduce NMI frequency to something more reasonable; makes a difference in some configs */ if (nmi_watchdog == NMI_LOCAL_APIC) - nmi_hz = 1; + nmi_hz = 10000; kfree(prev_nmi_count); return 0; @@ -521,9 +521,34 @@ void touch_nmi_watchdog (void) extern void die_nmi(struct pt_regs *, const char *msg); -void nmi_watchdog_tick (struct pt_regs * regs) +int nmi_show_regs[NR_CPUS]; + +void nmi_show_all_regs(void) { + int i; + + if (nmi_watchdog == NMI_NONE) + return; + if (system_state != SYSTEM_RUNNING) { + printk("nmi_show_all_regs(): system state %d, not doing.\n", + system_state); + return; + } + printk("nmi_show_all_regs(): start on CPU#%d.\n", + raw_smp_processor_id()); + dump_stack(); + + for_each_online_cpu(i) + nmi_show_regs[i] = 1; + for_each_online_cpu(i) + while (nmi_show_regs[i] == 1) + barrier(); +} + +static DEFINE_RAW_SPINLOCK(nmi_print_lock); +void notrace nmi_watchdog_tick (struct pt_regs * regs) +{ /* * Since current_thread_info()-> is always on the stack, and we * always switch the stack NMI-atomically, it's safe to use @@ -531,7 +556,16 @@ void nmi_watchdog_tick (struct pt_regs * */ int sum, cpu = smp_processor_id(); - sum = per_cpu(irq_stat, cpu).apic_timer_irqs; + sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_irqs(0); + + profile_tick(CPU_PROFILING, regs); + if (nmi_show_regs[cpu]) { + nmi_show_regs[cpu] = 0; + spin_lock(&nmi_print_lock); + printk("NMI show regs on CPU#%d:\n", cpu); + show_regs(regs); + spin_unlock(&nmi_print_lock); + } if (last_irq_sums[cpu] == sum) { /* @@ -539,15 +573,29 @@ void nmi_watchdog_tick (struct pt_regs * * wait a few IRQs (5 seconds) before doing the oops ... */ alert_counter[cpu]++; - if (alert_counter[cpu] == 5*nmi_hz) - /* - * die_nmi will return ONLY if NOTIFY_STOP happens.. 
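The reworked nmi_watchdog_tick() in this hunk reduces to a simple rule: if the per-CPU count of serviced timer interrupts has not advanced across roughly five seconds' worth of NMI ticks, the CPU is assumed to be stuck with interrupts disabled. A minimal single-CPU model of that rule (the 10000 Hz rate mirrors the nmi_hz the patch settles on; this is a sketch, not kernel code):

#include <stdbool.h>

#define NMI_HZ	10000	/* the nmi_hz this patch settles on */

static unsigned int last_irq_sum, alert_counter;

/* One call per NMI tick, with the CPU's current timer-IRQ count. */
static bool sketch_watchdog_tick(unsigned int irq_sum)
{
	if (irq_sum == last_irq_sum) {
		/* no timer IRQ was serviced since the last NMI */
		alert_counter++;
		/* ~5 seconds without progress: report a lockup */
		return alert_counter % (5 * NMI_HZ) == 0;
	}
	last_irq_sum = irq_sum;
	alert_counter = 0;
	return false;
}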
- */ + if (alert_counter[cpu] && !(alert_counter[cpu] % (5*nmi_hz))) { + int i; + + bust_spinlocks(1); + spin_lock(&nmi_print_lock); + printk("NMI watchdog detected lockup on CPU#%d (%d/%d)\n", cpu, alert_counter[cpu], 5*nmi_hz); + show_regs(regs); + spin_unlock(&nmi_print_lock); + + for_each_online_cpu(i) + if (i != cpu) + nmi_show_regs[i] = 1; + for_each_online_cpu(i) + while (nmi_show_regs[i] == 1) + barrier(); + die_nmi(regs, "NMI Watchdog detected LOCKUP"); + } } else { last_irq_sums[cpu] = sum; alert_counter[cpu] = 0; } + if (nmi_perfctr_msr) { if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) { /* Index: linux-2.6.16/arch/i386/kernel/numaq.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/numaq.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/kernel/numaq.c 2006-10-19 16:52:29.000000000 +0200 @@ -79,10 +79,12 @@ int __init get_memcfg_numaq(void) return 1; } -static int __init numaq_dsc_disable(void) +static int __init numaq_tsc_disable(void) { - printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); - tsc_disable = 1; + if (num_online_nodes() > 1) { + printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); + tsc_disable = 1; + } return 0; } -core_initcall(numaq_dsc_disable); +arch_initcall(numaq_tsc_disable); Index: linux-2.6.16/arch/i386/kernel/process.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/process.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/kernel/process.c 2006-10-19 16:52:29.000000000 +0200 @@ -105,16 +105,16 @@ void default_idle(void) if (!hlt_counter && boot_cpu_data.hlt_works_ok) { clear_thread_flag(TIF_POLLING_NRFLAG); smp_mb__after_clear_bit(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { local_irq_disable(); - if (!need_resched()) + if (!need_resched() && !need_resched_delayed()) safe_halt(); else local_irq_enable(); } set_thread_flag(TIF_POLLING_NRFLAG); } else { - while (!need_resched()) + while (!need_resched() && !need_resched_delayed()) cpu_relax(); } } @@ -179,7 +179,9 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - while (!need_resched()) { + BUG_ON(irqs_disabled()); + + while (!need_resched() && !need_resched_delayed()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) @@ -197,9 +199,11 @@ void cpu_idle(void) __get_cpu_var(irq_stat).idle_timestamp = jiffies; idle(); } - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } @@ -242,10 +246,10 @@ static void mwait_idle(void) { local_irq_enable(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); - if (need_resched()) + if (need_resched() || need_resched_delayed()) break; __mwait(0, 0); } @@ -373,11 +377,16 @@ void exit_thread(void) /* The process may have allocated an io port bitmap... nuke it. 
*/ if (unlikely(NULL != t->io_bitmap_ptr)) { - int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + int cpu; + struct tss_struct *tss; + void *io_bitmap_ptr = t->io_bitmap_ptr; - kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; + mb(); + kfree(io_bitmap_ptr); + + cpu = get_cpu(); + tss = &per_cpu(init_tss, cpu); /* * Careful, clear this in the TSS too: */ Index: linux-2.6.16/arch/i386/kernel/semaphore.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/semaphore.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/kernel/semaphore.c 2006-10-19 16:52:29.000000000 +0200 @@ -13,6 +13,7 @@ * rw semaphores implemented November 1999 by Benjamin LaHaise */ #include +#include #include /* @@ -28,15 +29,15 @@ asm( ".section .sched.text\n" ".align 4\n" -".globl __down_failed\n" -"__down_failed:\n\t" +".globl __compat_down_failed\n" +"__compat_down_failed:\n\t" #if defined(CONFIG_FRAME_POINTER) "pushl %ebp\n\t" "movl %esp,%ebp\n\t" #endif "pushl %edx\n\t" "pushl %ecx\n\t" - "call __down\n\t" + "call __compat_down\n\t" "popl %ecx\n\t" "popl %edx\n\t" #if defined(CONFIG_FRAME_POINTER) @@ -49,15 +50,15 @@ asm( asm( ".section .sched.text\n" ".align 4\n" -".globl __down_failed_interruptible\n" -"__down_failed_interruptible:\n\t" +".globl __compat_down_failed_interruptible\n" +"__compat_down_failed_interruptible:\n\t" #if defined(CONFIG_FRAME_POINTER) "pushl %ebp\n\t" "movl %esp,%ebp\n\t" #endif "pushl %edx\n\t" "pushl %ecx\n\t" - "call __down_interruptible\n\t" + "call __compat_down_interruptible\n\t" "popl %ecx\n\t" "popl %edx\n\t" #if defined(CONFIG_FRAME_POINTER) @@ -70,15 +71,15 @@ asm( asm( ".section .sched.text\n" ".align 4\n" -".globl __down_failed_trylock\n" -"__down_failed_trylock:\n\t" +".globl __compat_down_failed_trylock\n" +"__compat_down_failed_trylock:\n\t" #if defined(CONFIG_FRAME_POINTER) "pushl %ebp\n\t" "movl %esp,%ebp\n\t" #endif "pushl %edx\n\t" "pushl %ecx\n\t" - "call __down_trylock\n\t" + "call __compat_down_trylock\n\t" "popl %ecx\n\t" "popl %edx\n\t" #if defined(CONFIG_FRAME_POINTER) @@ -91,45 +92,13 @@ asm( asm( ".section .sched.text\n" ".align 4\n" -".globl __up_wakeup\n" -"__up_wakeup:\n\t" +".globl __compat_up_wakeup\n" +"__compat_up_wakeup:\n\t" "pushl %edx\n\t" "pushl %ecx\n\t" - "call __up\n\t" + "call __compat_up\n\t" "popl %ecx\n\t" "popl %edx\n\t" "ret" ); -/* - * rw spinlock fallbacks - */ -#if defined(CONFIG_SMP) -asm( -".section .sched.text\n" -".align 4\n" -".globl __write_lock_failed\n" -"__write_lock_failed:\n\t" - LOCK "addl $" RW_LOCK_BIAS_STR ",(%eax)\n" -"1: rep; nop\n\t" - "cmpl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" - "jne 1b\n\t" - LOCK "subl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" - "jnz __write_lock_failed\n\t" - "ret" -); - -asm( -".section .sched.text\n" -".align 4\n" -".globl __read_lock_failed\n" -"__read_lock_failed:\n\t" - LOCK "incl (%eax)\n" -"1: rep; nop\n\t" - "cmpl $1,(%eax)\n\t" - "js 1b\n\t" - LOCK "decl (%eax)\n\t" - "js __read_lock_failed\n\t" - "ret" -); -#endif Index: linux-2.6.16/arch/i386/kernel/setup.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/setup.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/kernel/setup.c 2006-10-19 16:52:29.000000000 +0200 @@ -1632,6 +1632,7 @@ void __init setup_arch(char **cmdline_p) conswitchp = &dummy_con; #endif #endif + tsc_init(); } #include "setup_arch_post.h" Index: linux-2.6.16/arch/i386/kernel/signal.c 
=================================================================== --- linux-2.6.16.orig/arch/i386/kernel/signal.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/kernel/signal.c 2006-10-19 16:52:29.000000000 +0200 @@ -531,6 +531,13 @@ handle_signal(unsigned long sig, siginfo } } +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so * that register information in the sigcontext is correct. @@ -571,6 +578,13 @@ static void fastcall do_signal(struct pt struct k_sigaction ka; sigset_t *oldset; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which * is why we may in certain cases get here from Index: linux-2.6.16/arch/i386/kernel/smp.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/smp.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/kernel/smp.c 2006-10-19 16:52:29.000000000 +0200 @@ -245,7 +245,7 @@ void send_IPI_mask_sequence(cpumask_t ma static cpumask_t flush_cpumask; static struct mm_struct * flush_mm; static unsigned long flush_va; -static DEFINE_SPINLOCK(tlbstate_lock); +static DEFINE_RAW_SPINLOCK(tlbstate_lock); #define FLUSH_ALL 0xffffffff /* @@ -390,7 +390,7 @@ static void flush_tlb_others(cpumask_t c while (!cpus_empty(flush_cpumask)) /* nothing. lockup detection does not belong here */ - mb(); + cpu_relax(); flush_mm = NULL; flush_va = 0; @@ -481,10 +481,20 @@ void smp_send_reschedule(int cpu) } /* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + send_IPI_allbutself(RESCHEDULE_VECTOR); +} + +/* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. */ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); struct call_data_struct { void (*func) (void *info); @@ -593,13 +603,14 @@ void smp_send_stop(void) } /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. Trigger a reschedule pass so that + * RT-overload balancing can pass tasks around. */ -fastcall void smp_reschedule_interrupt(struct pt_regs *regs) +fastcall notrace void smp_reschedule_interrupt(struct pt_regs *regs) { + trace_special(regs->eip, 0, 0); ack_APIC_irq(); + set_tsk_need_resched(current); } fastcall void smp_call_function_interrupt(struct pt_regs *regs) Index: linux-2.6.16/arch/i386/kernel/smpboot.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/smpboot.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/kernel/smpboot.c 2006-10-19 16:52:29.000000000 +0200 @@ -208,142 +208,299 @@ valid_k7: ; } -/* - * TSC synchronization. - * - * We first check whether all CPUs have their TSC's synchronized, - * then we print a warning if not, and always resync. 
- */ +static atomic_t tsc_start_flag, tsc_check_start, tsc_check_stop; -static atomic_t tsc_start_flag = ATOMIC_INIT(0); -static atomic_t tsc_count_start = ATOMIC_INIT(0); -static atomic_t tsc_count_stop = ATOMIC_INIT(0); -static unsigned long long tsc_values[NR_CPUS]; - -#define NR_LOOPS 5 - -static void __init synchronize_tsc_bp (void) +static int __init check_tsc_warp(void) { - int i; - unsigned long long t0; - unsigned long long sum, avg; - long long delta; - unsigned int one_usec; - int buggy = 0; - - printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus()); - - /* convert from kcyc/sec to cyc/usec */ - one_usec = cpu_khz / 1000; + static DEFINE_RAW_SPINLOCK(warp_lock); + static long long prev; + static unsigned int error; - atomic_set(&tsc_start_flag, 1); - wmb(); + int cpus = num_booting_cpus(), nr = 0; + long long start, now, end, delta; + atomic_inc(&tsc_check_start); + while (atomic_read(&tsc_check_start) != cpus) + cpu_relax(); /* - * We loop a few times to get a primed instruction cache, - * then the last pass is more or less synchronized and - * the BP and APs set their cycle counters to zero all at - * once. This reduces the chance of having random offsets - * between the processors, and guarantees that the maximum - * delay between the cycle counters is never bigger than - * the latency of information-passing (cachelines) between - * two CPUs. + * Run the check for 500 msecs: */ - for (i = 0; i < NR_LOOPS; i++) { - /* - * all APs synchronize but they loop on '== num_cpus' - */ - while (atomic_read(&tsc_count_start) != num_booting_cpus()-1) - mb(); - atomic_set(&tsc_count_stop, 0); - wmb(); - /* - * this lets the APs save their current TSC: - */ - atomic_inc(&tsc_count_start); + rdtscll(start); + end = start + cpu_khz*500; - rdtscll(tsc_values[smp_processor_id()]); + for (;;) { /* - * We clear the TSC in the last loop: + * Check for the TSC going backwards (between CPUs): */ - if (i == NR_LOOPS-1) - write_tsc(0, 0); + spin_lock(&warp_lock); + rdtscll(now); + delta = now - prev; + prev = now; + spin_unlock(&warp_lock); + if (unlikely(delta < 0)) + error = 1; + if (now > end) + break; /* - * Wait for all APs to leave the synchronization point: + * Take it easy every couple of iterations, + * to not starve other CPUs: */ - while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1) - mb(); - atomic_set(&tsc_count_start, 0); - wmb(); - atomic_inc(&tsc_count_stop); + nr++; + if (!(nr % 31)) + cpu_relax(); } - sum = 0; - for (i = 0; i < NR_CPUS; i++) { - if (cpu_isset(i, cpu_callout_map)) { - t0 = tsc_values[i]; - sum += t0; - } - } - avg = sum; - do_div(avg, num_booting_cpus()); + atomic_inc(&tsc_check_stop); + while (atomic_read(&tsc_check_stop) != cpus) + cpu_relax(); - sum = 0; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_isset(i, cpu_callout_map)) - continue; - delta = tsc_values[i] - avg; - if (delta < 0) - delta = -delta; - /* - * We report bigger than 2 microseconds clock differences. - */ - if (delta > 2*one_usec) { - long realdelta; - if (!buggy) { - buggy = 1; - printk("\n"); - } - realdelta = delta; - do_div(realdelta, one_usec); - if (tsc_values[i] < avg) - realdelta = -realdelta; + return error; +} - printk(KERN_INFO "CPU#%d had %ld usecs TSC skew, fixed it up.\n", i, realdelta); - } +/* + * TSC synchronization based on ia64 itc synchronization code. Synchronize + * pairs of processors rather than trying to synchronize all of the processors + * with a single event.
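check_tsc_warp() above brackets its 500 ms sampling window with two counting rendezvous, so that every CPU enters and leaves the check together. The barrier pattern in isolation, with C11 atomics standing in for the kernel's atomic_t (a sketch, not the patch's code):

#include <stdatomic.h>

static atomic_int entered, exited;

/* All CPUs spin here until the last one arrives, then proceed together. */
static void sketch_rendezvous(atomic_int *gate, int cpus)
{
	atomic_fetch_add(gate, 1);
	while (atomic_load(gate) != cpus)
		;	/* cpu_relax() in the kernel version */
}

static void sketch_warp_check(int cpus)
{
	sketch_rendezvous(&entered, cpus);	/* start together */
	/* ... sample the TSC under a shared lock for 500 ms ... */
	sketch_rendezvous(&exited, cpus);	/* leave together */
}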
When several processors are all waiting for an + * event they don't all see it at the same time. The write will cause + * an invalidate on each processor's cache and then they all scramble to + * re-read that cache line. + * + * Writing the TSC resets the upper 32-bits, so we need to be careful + * that all of the cpus can be synchronized before we overflow the + * 32-bit count. + */ - sum += delta; +#define MASTER 0 +#define SLAVE (SMP_CACHE_BYTES/sizeof(long)) + +#define NUM_ROUNDS 64 /* magic value */ +#define NUM_ITERS 5 /* likewise */ + +static volatile unsigned long go[2*SLAVE] __cacheline_aligned; +static volatile int current_slave = -1; +static volatile int tsc_sync_complete = 0; +static volatile int tsc_adj_latency = 0; +static unsigned int max_rt = 0; +static unsigned int max_delta = 0; + +#define DEBUG_TSC_SYNC 0 +#if DEBUG_TSC_SYNC +struct tsc_sync_debug { + long rt; /* roundtrip time */ + long master; /* master's timestamp */ + long diff; /* difference between midpoint and master's timestamp */ + long lat; /* estimate of tsc adjustment latency */ +} tsc_sync_debug[NUM_ROUNDS*NR_CPUS]; +#endif + +void +sync_master(void) +{ + unsigned long n, tsc, last_go_master; + + last_go_master = 0; + while (1) { + while ((n = go[MASTER]) == last_go_master) + rep_nop(); + if (n == ~0) + break; + rdtscl(tsc); + if (unlikely(!tsc)) + tsc = 1; + go[SLAVE] = tsc; + last_go_master = n; } - if (!buggy) - printk("passed.\n"); } -static void __init synchronize_tsc_ap (void) +/* + * Return the number of cycles by which our TSC differs from the TSC on + * the master (time-keeper) CPU. A positive number indicates our TSC is + * ahead of the master, negative that it is behind. + */ +static inline long +get_delta (long *rt, long *master) { - int i; + unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0; + unsigned long tcenter, t0, t1, tm, last_go_slave; + long i; + + last_go_slave = go[SLAVE]; + for (i = 0; i < NUM_ITERS; ++i) { + rdtscl(t0); + go[MASTER] = i+1; + while ((tm = go[SLAVE]) == last_go_slave) + rep_nop(); + rdtscl(t1); + + if (t1 - t0 < best_t1 - best_t0) + best_t0 = t0, best_t1 = t1, best_tm = tm; + last_go_slave = tm; + } + + *rt = best_t1 - best_t0; + *master = best_tm - best_t0; + + /* average best_t0 and best_t1 without overflow: */ + tcenter = (best_t0/2 + best_t1/2); + if (best_t0 % 2 + best_t1 % 2 == 2) + ++tcenter; + return tcenter - best_tm; +} + +/* + * Synchronize TSC of the current (slave) CPU with the TSC of the MASTER CPU + * (normally the time-keeper CPU). We use a closed loop to eliminate the + * possibility of unaccounted-for errors (such as getting a machine check in + * the middle of a calibration step). The basic idea is for the slave to ask + * the master what TSC value it has and to read its own TSC before and after + * the master responds. Each iteration gives us three + * timestamps: + * + * slave master + * + * t0 ---\ + * ---\ + * ---> + * tm + * /--- + * /--- + * t1 <--- + * + * + * The goal is to adjust the slave's TSC such that tm falls exactly half-way + * between t0 and t1. If we achieve this, the clocks are synchronized provided + * the interconnect between the slave and the master is symmetric. Even if the + * interconnect were asymmetric, we would still know that the synchronization + * error is smaller than the roundtrip latency (t1 - t0). + * + * When the interconnect is quiet and symmetric, this lets us synchronize the + * TSC to within one or two cycles.
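The midpoint arithmetic in get_delta() above is easiest to see with concrete numbers; a self-contained check (all values invented for illustration):

#include <assert.h>

int main(void)
{
	/* Invented sample: slave reads t0 and t1, master replies tm. */
	unsigned long t0 = 1000, t1 = 1600, tm = 1290;

	unsigned long rt = t1 - t0;		/* round-trip: 600 cycles */
	/* overflow-safe average of t0 and t1, as in get_delta(): */
	unsigned long tcenter = t0/2 + t1/2 + (t0 % 2 + t1 % 2 == 2);

	long delta = (long)(tcenter - tm);	/* +10: slave is ahead */

	assert(rt == 600 && tcenter == 1300 && delta == 10);
	return 0;
}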
However, we can only *guarantee* that the + * synchronization is accurate to within a round-trip time, which is typically + * in the range of several hundred cycles (e.g., ~500 cycles). In practice, + * this means that the TSCs are usually almost perfectly synchronized, but we + * shouldn't assume that the accuracy is much better than half a microsecond + * or so. + */ + +static void __init +synchronize_tsc_ap (void) +{ + long i, delta, adj, adjust_latency, n_rounds; + unsigned long rt, master_time_stamp, tsc; +#if DEBUG_TSC_SYNC + struct tsc_sync_debug *t = + &tsc_sync_debug[smp_processor_id() * NUM_ROUNDS]; +#endif + + while (!atomic_read(&tsc_start_flag)) + mb(); + + if (!check_tsc_warp()) + return; /* - * Not every cpu is online at the time - * this gets called, so we first wait for the BP to - * finish SMP initialization: + * Wait for our turn to synchronize with the boot processor. */ - while (!atomic_read(&tsc_start_flag)) mb(); + while (current_slave != smp_processor_id()) + rep_nop(); + adjust_latency = tsc_adj_latency; + + go[SLAVE] = 0; + go[MASTER] = 0; + write_tsc(0,0); + for (i = 0; i < NUM_ROUNDS; ++i) { + delta = get_delta(&rt, &master_time_stamp); + if (delta == 0) + break; + + if (i > 0) + adjust_latency += -delta; + adj = -delta + adjust_latency/8; + rdtscl(tsc); + write_tsc(tsc + adj, 0); +#if DEBUG_TSC_SYNC + t[i].rt = rt; + t[i].master = master_time_stamp; + t[i].diff = delta; + t[i].lat = adjust_latency/8; +#endif + } + n_rounds = i; + go[MASTER] = ~0; + +#if (DEBUG_TSC_SYNC == 2) + for (i = 0; i < n_rounds; ++i) + printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n", + t[i].rt, t[i].master, t[i].diff, t[i].lat); + + printk("CPU %d: synchronized TSC (last diff %ld cycles, maxerr %lu cycles)\n", + smp_processor_id(), delta, rt); + + printk("It took %ld rounds\n", n_rounds); +#endif + if (rt > max_rt) + max_rt = rt; + if (delta < 0) + delta = -delta; + if (delta > max_delta) + max_delta = delta; + tsc_adj_latency = adjust_latency; + current_slave = -1; + while (!tsc_sync_complete) + rep_nop(); +} + +/* + * The boot processor sets its own TSC to zero and then gives each + * slave processor the chance to synchronize itself.
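The adjustment loop in synchronize_tsc_ap() above is a damped feedback controller: each round applies the measured offset plus one eighth of the accumulated latency estimate. Stripped to its core, with rdtsc/write_tsc abstracted behind callbacks (a model of the loop, not the kernel code):

/* Model of the slave's correction loop; 'measure' stands in for get_delta(). */
static long adjust_latency;

static void sketch_sync_rounds(long (*measure)(void),
			       void (*apply)(long adj), int rounds)
{
	for (int i = 0; i < rounds; i++) {
		long delta = measure();	/* tcenter - tm, as above */
		if (delta == 0)
			break;		/* converged */
		if (i > 0)
			adjust_latency += -delta;
		/* damped correction: full offset plus 1/8 of the history */
		apply(-delta + adjust_latency / 8);
	}
}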
+ */ - for (i = 0; i < NR_LOOPS; i++) { - atomic_inc(&tsc_count_start); - while (atomic_read(&tsc_count_start) != num_booting_cpus()) - mb(); +static void __init synchronize_tsc_bp (void) +{ + unsigned int tsc_low, tsc_high, error; + int cpu; + + atomic_set(&tsc_start_flag, 1); - rdtscll(tsc_values[smp_processor_id()]); - if (i == NR_LOOPS-1) - write_tsc(0, 0); + printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", + num_booting_cpus()); - atomic_inc(&tsc_count_stop); - while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb(); + if (!check_tsc_warp()) { + printk("passed.\n"); + return; + } + printk("failed.\n"); + + printk(KERN_INFO "starting TSC synchronization\n"); + write_tsc(0, 0); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + if (!cpu_isset(cpu, cpu_callout_map)) + continue; + if (cpu == smp_processor_id()) + continue; + go[MASTER] = 0; + current_slave = cpu; + sync_master(); + while (current_slave != -1) + rep_nop(); + } + rdtsc(tsc_low, tsc_high); + if (tsc_high) + printk("TSC overflowed during synchronization\n"); + else + printk("TSC synchronization complete max_delta=%d cycles\n", + max_delta); + if (max_rt < 4293) { + error = (max_rt * 1000000)/cpu_khz; + printk("TSC sync round-trip time %d.%03d microseconds\n", + error/1000, error%1000); + } else { + printk("TSC sync round-trip time %d cycles\n", max_rt); } + tsc_sync_complete = 1; } -#undef NR_LOOPS extern void calibrate_delay(void); Index: linux-2.6.16/arch/i386/kernel/syscall_table.S =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/syscall_table.S 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/kernel/syscall_table.S 2006-10-19 16:52:29.000000000 +0200 @@ -310,3 +310,5 @@ ENTRY(sys_call_table) .long sys_pselect6 .long sys_ppoll .long sys_unshare /* 310 */ + .long sys_set_robust_list + .long sys_get_robust_list Index: linux-2.6.16/arch/i386/kernel/time.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/time.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/kernel/time.c 2006-10-19 16:52:29.000000000 +0200 @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -56,6 +57,7 @@ #include #include #include +#include #include "mach_time.h" @@ -79,16 +81,9 @@ EXPORT_SYMBOL(cpu_khz); extern unsigned long wall_jiffies; -DEFINE_SPINLOCK(rtc_lock); +DEFINE_RAW_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); -#include - -DEFINE_SPINLOCK(i8253_lock); -EXPORT_SYMBOL(i8253_lock); - -struct timer_opts *cur_timer __read_mostly = &timer_none; - /* * This is a special lock that is owned by the CPU and holds the index * register we are working with. It is required for NMI access to the @@ -118,118 +113,25 @@ void rtc_cmos_write(unsigned char val, u } EXPORT_SYMBOL(rtc_cmos_write); -/* - * This version of gettimeofday has microsecond resolution - * and better than microsecond precision on fast x86 machines with TSC. - */ -void do_gettimeofday(struct timeval *tv) -{ - unsigned long seq; - unsigned long usec, sec; - unsigned long max_ntp_tick; - - do { - unsigned long lost; - - seq = read_seqbegin(&xtime_lock); - - usec = cur_timer->get_offset(); - lost = jiffies - wall_jiffies; - - /* - * If time_adjust is negative then NTP is slowing the clock - * so make sure not to go into next possible interval. - * Better to lose some accuracy than have time go backwards.. 
- */ - if (unlikely(time_adjust < 0)) { - max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj; - usec = min(usec, max_ntp_tick); - - if (lost) - usec += lost * max_ntp_tick; - } - else if (unlikely(lost)) - usec += lost * (USEC_PER_SEC / HZ); - - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); - } while (read_seqretry(&xtime_lock, seq)); - - while (usec >= 1000000) { - usec -= 1000000; - sec++; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; -} - -EXPORT_SYMBOL(do_gettimeofday); - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - /* - * This is revolting. We need to set "xtime" correctly. However, the - * value in this location is the value at the most recent update of - * wall time. Discover what correction gettimeofday() would have - * made, and then undo it! - */ - nsec -= cur_timer->get_offset() * NSEC_PER_USEC; - nsec -= (jiffies - wall_jiffies) * TICK_NSEC; - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - static int set_rtc_mmss(unsigned long nowtime) { int retval; - - WARN_ON(irqs_disabled()); + unsigned long flags; /* gets recalled with irq locally disabled */ - spin_lock_irq(&rtc_lock); + /* XXX - does irqsave resolve this? -johnstul */ + spin_lock_irqsave(&rtc_lock, flags); if (efi_enabled) retval = efi_set_rtc_mmss(nowtime); else retval = mach_set_rtc_mmss(nowtime); - spin_unlock_irq(&rtc_lock); + spin_unlock_irqrestore(&rtc_lock, flags); return retval; } - -int timer_ack; - -/* monotonic_clock(): returns # of nanoseconds passed since time_init() - * Note: This function is required to return accurate - * time even in the absence of multiple timer ticks. - */ -unsigned long long monotonic_clock(void) -{ - return cur_timer->monotonic_clock(); -} -EXPORT_SYMBOL(monotonic_clock); - #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) -unsigned long profile_pc(struct pt_regs *regs) +unsigned long notrace profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); @@ -241,76 +143,6 @@ unsigned long profile_pc(struct pt_regs EXPORT_SYMBOL(profile_pc); #endif -/* - * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick - */ -static inline void do_timer_interrupt(int irq, struct pt_regs *regs) -{ -#ifdef CONFIG_X86_IO_APIC - if (timer_ack) { - /* - * Subtle, when I/O APICs are used we have to ack timer IRQ - * manually to reset the IRR bit for do_slow_gettimeoffset(). - * This will also deassert NMI lines for the watchdog if run - * on an 82489DX-based system. - */ - spin_lock(&i8259A_lock); - outb(0x0c, PIC_MASTER_OCW3); - /* Ack the IRQ; AEOI will end it automatically. */ - inb(PIC_MASTER_POLL); - spin_unlock(&i8259A_lock); - } -#endif - - do_timer_interrupt_hook(regs); - - - if (MCA_bus) { - /* The PS/2 uses level-triggered interrupts. You can't - turn them off, nor would you want to (any attempt to - enable edge-triggered interrupts usually gets intercepted by a - special hardware circuit). Hence we have to acknowledge - the timer interrupt. 
Through some incredibly stupid - design idea, the reset for IRQ 0 is done by setting the - high bit of the PPI port B (0x61). Note that some PS/2s, - notably the 55SX, work fine if this is removed. */ - - irq = inb_p( 0x61 ); /* read the current state */ - outb_p( irq|0x80, 0x61 ); /* reset the IRQ */ - } -} - -/* - * This is the same as the above, except we _also_ save the current - * Time Stamp Counter value at the time of the timer interrupt, so that - * we later on can estimate the time of day more exactly. - */ -irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) -{ - /* - * Here we are in the timer irq handler. We just have irqs locally - * disabled but we don't know if the timer_bh is running on the other - * CPU. We need to avoid to SMP race with it. NOTE: we don' t need - * the irq version of write_lock because as just said we have irq - * locally disabled. -arca - */ - write_seqlock(&xtime_lock); - - cur_timer->mark_offset(); - - do_timer_interrupt(irq, regs); - - write_sequnlock(&xtime_lock); - -#ifdef CONFIG_X86_LOCAL_APIC - if (using_apic_timer) - smp_send_timer_broadcast_ipi(regs); -#endif - - return IRQ_HANDLED; -} - /* not static: needed by APM */ unsigned long get_cmos_time(void) { @@ -329,139 +161,42 @@ unsigned long get_cmos_time(void) } EXPORT_SYMBOL(get_cmos_time); -static void sync_cmos_clock(unsigned long dummy); - -static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); - -static void sync_cmos_clock(unsigned long dummy) +/* arch specific timeofday hooks */ +s64 read_persistent_clock(void) { - struct timeval now, next; - int fail = 1; + return (s64)get_cmos_time() * NSEC_PER_SEC; +} +void sync_persistent_clock(struct timespec ts) +{ + static unsigned long last_rtc_update; /* * If we have an externally synchronized Linux clock, then update * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be * called as close as possible to 500 ms before the new second starts. - * This code is run on a timer. If the clock is set, that timer - * may not expire at the correct time. Thus, we adjust... */ - if (!ntp_synced()) - /* - * Not synced, exit, do not restart a timer (if one is - * running, let it run out). 
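The sync_persistent_clock() logic that continues below keeps the old sync_cmos_clock() policy: write the RTC at most once every ~11 minutes (660 s), and only when wall time sits in the window around 500 ms before a second boundary that set_rtc_mmss() expects. The window test on its own (the 500000/10000 microsecond constants are assumptions mirroring USEC_AFTER/USEC_BEFORE and a 100 Hz tick):

#include <stdbool.h>

#define USEC_AFTER	500000	/* assumed, mirrors the i386 time code */
#define USEC_BEFORE	500000
#define TICK_USEC	10000	/* 100 Hz tick, an assumption */

/* True when 'usec' into the current second falls in the update window. */
static bool sketch_in_rtc_window(unsigned long usec)
{
	return usec >= USEC_AFTER - TICK_USEC / 2 &&
	       usec <= USEC_BEFORE + TICK_USEC / 2;
}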
- */ + if (ts.tv_sec <= last_rtc_update + 660) return; - do_gettimeofday(&now); - if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && - now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) - fail = set_rtc_mmss(now.tv_sec); - - next.tv_usec = USEC_AFTER - now.tv_usec; - if (next.tv_usec <= 0) - next.tv_usec += USEC_PER_SEC; - - if (!fail) - next.tv_sec = 659; - else - next.tv_sec = 0; - - if (next.tv_usec >= USEC_PER_SEC) { - next.tv_sec++; - next.tv_usec -= USEC_PER_SEC; + if((ts.tv_nsec / 1000) >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && + (ts.tv_nsec / 1000) <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) { + /* horrible...FIXME */ + if (set_rtc_mmss(ts.tv_sec) == 0) + last_rtc_update = ts.tv_sec; + else + last_rtc_update = ts.tv_sec - 600; /* do it again in 60 s */ } - mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next)); -} - -void notify_arch_cmos_timer(void) -{ - mod_timer(&sync_cmos_timer, jiffies + 1); -} - -static long clock_cmos_diff, sleep_start; - -static struct timer_opts *last_timer; -static int timer_suspend(struct sys_device *dev, pm_message_t state) -{ - /* - * Estimate time zone so that set_time can update the clock - */ - clock_cmos_diff = -get_cmos_time(); - clock_cmos_diff += get_seconds(); - sleep_start = get_cmos_time(); - last_timer = cur_timer; - cur_timer = &timer_none; - if (last_timer->suspend) - last_timer->suspend(state); - return 0; } -static int timer_resume(struct sys_device *dev) -{ - unsigned long flags; - unsigned long sec; - unsigned long sleep_length; - -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled()) - hpet_reenable(); -#endif - setup_pit_timer(); - sec = get_cmos_time() + clock_cmos_diff; - sleep_length = (get_cmos_time() - sleep_start) * HZ; - write_seqlock_irqsave(&xtime_lock, flags); - xtime.tv_sec = sec; - xtime.tv_nsec = 0; - jiffies_64 += sleep_length; - wall_jiffies += sleep_length; - write_sequnlock_irqrestore(&xtime_lock, flags); - if (last_timer->resume) - last_timer->resume(); - cur_timer = last_timer; - last_timer = NULL; - touch_softlockup_watchdog(); - return 0; -} - -static struct sysdev_class timer_sysclass = { - .resume = timer_resume, - .suspend = timer_suspend, - set_kset_name("timer"), -}; - - -/* XXX this driverfs stuff should probably go elsewhere later -john */ -static struct sys_device device_timer = { - .id = 0, - .cls = &timer_sysclass, -}; - -static int time_init_device(void) -{ - int error = sysdev_class_register(&timer_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(time_init_device); - #ifdef CONFIG_HPET_TIMER extern void (*late_time_init)(void); /* Duplicate of time_init() below, with hpet_enable part added */ static void __init hpet_time_init(void) { - xtime.tv_sec = get_cmos_time(); - xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); - if ((hpet_enable() >= 0) && hpet_use_timer) { printk("Using HPET for base-timer\n"); } - cur_timer = select_timer(); - printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name); time_init_hook(); } @@ -469,6 +204,9 @@ static void __init hpet_time_init(void) void __init time_init(void) { + /* Set the clock to HZ Hz: */ + setup_pit_timer(); + #ifdef CONFIG_HPET_TIMER if (is_hpet_capable()) { /* @@ -479,13 +217,5 @@ void __init time_init(void) return; } #endif - xtime.tv_sec = get_cmos_time(); - xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - set_normalized_timespec(&wall_to_monotonic, - 
-xtime.tv_sec, -xtime.tv_nsec); - - cur_timer = select_timer(); - printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name); - time_init_hook(); } Index: linux-2.6.16/arch/i386/kernel/timers/Makefile =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/timers/Makefile 2006-09-20 10:56:46.000000000 +0200 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,9 +0,0 @@ -# -# Makefile for x86 timers -# - -obj-y := timer.o timer_none.o timer_tsc.o timer_pit.o common.o - -obj-$(CONFIG_X86_CYCLONE_TIMER) += timer_cyclone.o -obj-$(CONFIG_HPET_TIMER) += timer_hpet.o -obj-$(CONFIG_X86_PM_TIMER) += timer_pm.o Index: linux-2.6.16/arch/i386/kernel/timers/common.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/timers/common.c 2006-09-20 10:56:46.000000000 +0200 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,172 +0,0 @@ -/* - * Common functions used across the timers go here - */ - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "mach_timer.h" - -/* ------ Calibrate the TSC ------- - * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset(). - * Too much 64-bit arithmetic here to do this cleanly in C, and for - * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2) - * output busy loop as low as possible. We avoid reading the CTC registers - * directly because of the awkward 8-bit access mechanism of the 82C54 - * device. - */ - -#define CALIBRATE_TIME (5 * 1000020/HZ) - -unsigned long calibrate_tsc(void) -{ - mach_prepare_counter(); - - { - unsigned long startlow, starthigh; - unsigned long endlow, endhigh; - unsigned long count; - - rdtsc(startlow,starthigh); - mach_countup(&count); - rdtsc(endlow,endhigh); - - - /* Error: ECTCNEVERSET */ - if (count <= 1) - goto bad_ctc; - - /* 64-bit subtract - gcc just messes up with long longs */ - __asm__("subl %2,%0\n\t" - "sbbl %3,%1" - :"=a" (endlow), "=d" (endhigh) - :"g" (startlow), "g" (starthigh), - "0" (endlow), "1" (endhigh)); - - /* Error: ECPUTOOFAST */ - if (endhigh) - goto bad_ctc; - - /* Error: ECPUTOOSLOW */ - if (endlow <= CALIBRATE_TIME) - goto bad_ctc; - - __asm__("divl %2" - :"=a" (endlow), "=d" (endhigh) - :"r" (endlow), "0" (0), "1" (CALIBRATE_TIME)); - - return endlow; - } - - /* - * The CTC wasn't reliable: we got a hit on the very first read, - * or the CPU was so fast/slow that the quotient wouldn't fit in - * 32 bits.. - */ -bad_ctc: - return 0; -} - -#ifdef CONFIG_HPET_TIMER -/* ------ Calibrate the TSC using HPET ------- - * Return 2^32 * (1 / (TSC clocks per usec)) for getting the CPU freq. - * Second output is parameter 1 (when non NULL) - * Set 2^32 * (1 / (tsc per HPET clk)) for delay_hpet(). - * calibrate_tsc() calibrates the processor TSC by comparing - * it to the HPET timer of known frequency. 
- * Too much 64-bit arithmetic here to do this cleanly in C - */ -#define CALIBRATE_CNT_HPET (5 * hpet_tick) -#define CALIBRATE_TIME_HPET (5 * KERNEL_TICK_USEC) - -unsigned long __devinit calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr) -{ - unsigned long tsc_startlow, tsc_starthigh; - unsigned long tsc_endlow, tsc_endhigh; - unsigned long hpet_start, hpet_end; - unsigned long result, remain; - - hpet_start = hpet_readl(HPET_COUNTER); - rdtsc(tsc_startlow, tsc_starthigh); - do { - hpet_end = hpet_readl(HPET_COUNTER); - } while ((hpet_end - hpet_start) < CALIBRATE_CNT_HPET); - rdtsc(tsc_endlow, tsc_endhigh); - - /* 64-bit subtract - gcc just messes up with long longs */ - __asm__("subl %2,%0\n\t" - "sbbl %3,%1" - :"=a" (tsc_endlow), "=d" (tsc_endhigh) - :"g" (tsc_startlow), "g" (tsc_starthigh), - "0" (tsc_endlow), "1" (tsc_endhigh)); - - /* Error: ECPUTOOFAST */ - if (tsc_endhigh) - goto bad_calibration; - - /* Error: ECPUTOOSLOW */ - if (tsc_endlow <= CALIBRATE_TIME_HPET) - goto bad_calibration; - - ASM_DIV64_REG(result, remain, tsc_endlow, 0, CALIBRATE_TIME_HPET); - if (remain > (tsc_endlow >> 1)) - result++; /* rounding the result */ - - if (tsc_hpet_quotient_ptr) { - unsigned long tsc_hpet_quotient; - - ASM_DIV64_REG(tsc_hpet_quotient, remain, tsc_endlow, 0, - CALIBRATE_CNT_HPET); - if (remain > (tsc_endlow >> 1)) - tsc_hpet_quotient++; /* rounding the result */ - *tsc_hpet_quotient_ptr = tsc_hpet_quotient; - } - - return result; -bad_calibration: - /* - * the CPU was so fast/slow that the quotient wouldn't fit in - * 32 bits.. - */ - return 0; -} -#endif - - -unsigned long read_timer_tsc(void) -{ - unsigned long retval; - rdtscl(retval); - return retval; -} - - -/* calculate cpu_khz */ -void init_cpu_khz(void) -{ - if (cpu_has_tsc) { - unsigned long tsc_quotient = calibrate_tsc(); - if (tsc_quotient) { - /* report CPU clock rate in Hz. - * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = - * clock/second. Our precision is about 100 ppm. - */ - { unsigned long eax=0, edx=1000; - __asm__("divl %2" - :"=a" (cpu_khz), "=d" (edx) - :"r" (tsc_quotient), - "0" (eax), "1" (edx)); - printk("Detected %u.%03u MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - } - } - } -} - Index: linux-2.6.16/arch/i386/kernel/timers/timer.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/timers/timer.c 2006-09-20 10:56:46.000000000 +0200 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,75 +0,0 @@ -#include -#include -#include -#include - -#ifdef CONFIG_HPET_TIMER -/* - * HPET memory read is slower than tsc reads, but is more dependable as it - * always runs at constant frequency and reduces complexity due to - * cpufreq. So, we prefer HPET timer to tsc based one. Also, we cannot use - * timer_pit when HPET is active. So, we default to timer_tsc. - */ -#endif -/* list of timers, ordered by preference, NULL terminated */ -static struct init_timer_opts* __initdata timers[] = { -#ifdef CONFIG_X86_CYCLONE_TIMER - &timer_cyclone_init, -#endif -#ifdef CONFIG_HPET_TIMER - &timer_hpet_init, -#endif -#ifdef CONFIG_X86_PM_TIMER - &timer_pmtmr_init, -#endif - &timer_tsc_init, - &timer_pit_init, - NULL, -}; - -static char clock_override[10] __initdata; - -static int __init clock_setup(char* str) -{ - if (str) - strlcpy(clock_override, str, sizeof(clock_override)); - return 1; -} -__setup("clock=", clock_setup); - - -/* The chosen timesource has been found to be bad. 
- * Fall back to a known good timesource (the PIT) - */ -void clock_fallback(void) -{ - cur_timer = &timer_pit; -} - -/* iterates through the list of timers, returning the first - * one that initializes successfully. - */ -struct timer_opts* __init select_timer(void) -{ - int i = 0; - - /* find most preferred working timer */ - while (timers[i]) { - if (timers[i]->init) - if (timers[i]->init(clock_override) == 0) - return timers[i]->opts; - ++i; - } - - panic("select_timer: Cannot find a suitable timer\n"); - return NULL; -} - -int read_current_timer(unsigned long *timer_val) -{ - if (cur_timer->read_timer) { - *timer_val = cur_timer->read_timer(); - return 0; - } - return -1; -} Index: linux-2.6.16/arch/i386/kernel/timers/timer_cyclone.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/timers/timer_cyclone.c 2006-09-20 10:56:46.000000000 +0200 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,259 +0,0 @@ -/* Cyclone-timer: - * This code implements timer_ops for the cyclone counter found - * on IBM x440, x360, and other Summit based systems. - * - * Copyright (C) 2002 IBM, John Stultz (johnstul@us.ibm.com) - */ - - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "io_ports.h" - -/* Number of usecs that the last interrupt was delayed */ -static int delay_at_last_interrupt; - -#define CYCLONE_CBAR_ADDR 0xFEB00CD0 -#define CYCLONE_PMCC_OFFSET 0x51A0 -#define CYCLONE_MPMC_OFFSET 0x51D0 -#define CYCLONE_MPCS_OFFSET 0x51A8 -#define CYCLONE_TIMER_FREQ 100000000 -#define CYCLONE_TIMER_MASK (((u64)1<<40)-1) /* 40 bit mask */ -int use_cyclone = 0; - -static u32* volatile cyclone_timer; /* Cyclone MPMC0 register */ -static u32 last_cyclone_low; -static u32 last_cyclone_high; -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* helper macro to atomically read both cyclone counter registers */ -#define read_cyclone_counter(low,high) \ - do{ \ - high = cyclone_timer[1]; low = cyclone_timer[0]; \ - } while (high != cyclone_timer[1]); - - -static void mark_offset_cyclone(void) -{ - unsigned long lost, delay; - unsigned long delta = last_cyclone_low; - int count; - unsigned long long this_offset, last_offset; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; - - spin_lock(&i8253_lock); - read_cyclone_counter(last_cyclone_low,last_cyclone_high); - - /* read values for delay_at_last_interrupt */ - outb_p(0x00, 0x43); /* latch the count ASAP */ - - count = inb_p(0x40); /* read the latched count */ - count |= inb(0x40) << 8; - - /* - * VIA686a test code... 
reset the latch if count > max + 1 - * from timer_pit.c - cjb - */ - if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - spin_unlock(&i8253_lock); - - /* lost tick compensation */ - delta = last_cyclone_low - delta; - delta /= (CYCLONE_TIMER_FREQ/1000000); - delta += delay_at_last_interrupt; - lost = delta/(1000000/HZ); - delay = delta%(1000000/HZ); - if (lost >= 2) - jiffies_64 += lost-1; - - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; - monotonic_base += (this_offset - last_offset) & CYCLONE_TIMER_MASK; - write_sequnlock(&monotonic_lock); - - /* calculate delay_at_last_interrupt */ - count = ((LATCH-1) - count) * TICK_SIZE; - delay_at_last_interrupt = (count + LATCH/2) / LATCH; - - - /* catch corner case where tick rollover occured - * between cyclone and pit reads (as noted when - * usec delta is > 90% # of usecs/tick) - */ - if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ)) - jiffies_64++; -} - -static unsigned long get_offset_cyclone(void) -{ - u32 offset; - - if(!cyclone_timer) - return delay_at_last_interrupt; - - /* Read the cyclone timer */ - offset = cyclone_timer[0]; - - /* .. relative to previous jiffy */ - offset = offset - last_cyclone_low; - - /* convert cyclone ticks to microseconds */ - /* XXX slow, can we speed this up? */ - offset = offset/(CYCLONE_TIMER_FREQ/1000000); - - /* our adjusted time offset in microseconds */ - return delay_at_last_interrupt + offset; -} - -static unsigned long long monotonic_clock_cyclone(void) -{ - u32 now_low, now_high; - unsigned long long last_offset, this_offset, base; - unsigned long long ret; - unsigned seq; - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - - /* Read the cyclone counter */ - read_cyclone_counter(now_low,now_high); - this_offset = ((unsigned long long)now_high<<32)|now_low; - - /* convert to nanoseconds */ - ret = base + ((this_offset - last_offset)&CYCLONE_TIMER_MASK); - return ret * (1000000000 / CYCLONE_TIMER_FREQ); -} - -static int __init init_cyclone(char* override) -{ - u32* reg; - u32 base; /* saved cyclone base address */ - u32 pageaddr; /* page that contains cyclone_timer register */ - u32 offset; /* offset from pageaddr to cyclone_timer register */ - int i; - - /* check clock override */ - if (override[0] && strncmp(override,"cyclone",7)) - return -ENODEV; - - /*make sure we're on a summit box*/ - if(!use_cyclone) return -ENODEV; - - printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n"); - - /* find base address */ - pageaddr = (CYCLONE_CBAR_ADDR)&PAGE_MASK; - offset = (CYCLONE_CBAR_ADDR)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!reg){ - printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n"); - return -ENODEV; - } - base = *reg; - if(!base){ - printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n"); - return -ENODEV; - } - - /* setup PMCC */ - pageaddr = (base + CYCLONE_PMCC_OFFSET)&PAGE_MASK; - offset = (base + CYCLONE_PMCC_OFFSET)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!reg){ - printk(KERN_ERR "Summit chipset: Could not find 
valid PMCC register.\n"); - return -ENODEV; - } - reg[0] = 0x00000001; - - /* setup MPCS */ - pageaddr = (base + CYCLONE_MPCS_OFFSET)&PAGE_MASK; - offset = (base + CYCLONE_MPCS_OFFSET)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!reg){ - printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n"); - return -ENODEV; - } - reg[0] = 0x00000001; - - /* map in cyclone_timer */ - pageaddr = (base + CYCLONE_MPMC_OFFSET)&PAGE_MASK; - offset = (base + CYCLONE_MPMC_OFFSET)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - cyclone_timer = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!cyclone_timer){ - printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n"); - return -ENODEV; - } - - /*quick test to make sure its ticking*/ - for(i=0; i<3; i++){ - u32 old = cyclone_timer[0]; - int stall = 100; - while(stall--) barrier(); - if(cyclone_timer[0] == old){ - printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n"); - cyclone_timer = 0; - return -ENODEV; - } - } - - init_cpu_khz(); - - /* Everything looks good! */ - return 0; -} - - -static void delay_cyclone(unsigned long loops) -{ - unsigned long bclock, now; - if(!cyclone_timer) - return; - bclock = cyclone_timer[0]; - do { - rep_nop(); - now = cyclone_timer[0]; - } while ((now-bclock) < loops); -} -/************************************************************/ - -/* cyclone timer_opts struct */ -static struct timer_opts timer_cyclone = { - .name = "cyclone", - .mark_offset = mark_offset_cyclone, - .get_offset = get_offset_cyclone, - .monotonic_clock = monotonic_clock_cyclone, - .delay = delay_cyclone, -}; - -struct init_timer_opts __initdata timer_cyclone_init = { - .init = init_cyclone, - .opts = &timer_cyclone, -}; Index: linux-2.6.16/arch/i386/kernel/timers/timer_hpet.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/timers/timer_hpet.c 2006-09-20 10:56:46.000000000 +0200 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,217 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "io_ports.h" -#include "mach_timer.h" -#include - -static unsigned long hpet_usec_quotient __read_mostly; /* convert hpet clks to usec */ -static unsigned long tsc_hpet_quotient __read_mostly; /* convert tsc to hpet clks */ -static unsigned long hpet_last; /* hpet counter value at last tick*/ -static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ -static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better percision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. 
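(Worked example, assuming a hypothetical cpu_khz of 2000000, i.e. a 2 GHz CPU: cyc2ns_scale = (1000000 << 10) / 2000000 = 512, and cycles_2_ns(4000) = (4000 * 512) >> 10 = 2000, so 4000 cycles at 2 GHz do come out as 2000 ns.)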
- * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ -static unsigned long cyc2ns_scale; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline void set_cyc2ns_scale(unsigned long cpu_khz) -{ - cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} - -static unsigned long long monotonic_clock_hpet(void) -{ - unsigned long long last_offset, this_offset, base; - unsigned seq; - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return base + cycles_2_ns(this_offset - last_offset); -} - -static unsigned long get_offset_hpet(void) -{ - register unsigned long eax, edx; - - eax = hpet_readl(HPET_COUNTER); - eax -= hpet_last; /* hpet delta */ - eax = min(hpet_tick, eax); - /* - * Time offset = (hpet delta) * ( usecs per HPET clock ) - * = (hpet delta) * ( usecs per tick / HPET clocks per tick) - * = (hpet delta) * ( hpet_usec_quotient ) / (2^32) - * - * Where, - * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick - * - * Using a mull instead of a divl saves some cycles in critical path. - */ - ASM_MUL64_REG(eax, edx, hpet_usec_quotient, eax); - - /* our adjusted time offset in microseconds */ - return edx; -} - -static void mark_offset_hpet(void) -{ - unsigned long long this_offset, last_offset; - unsigned long offset; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - rdtsc(last_tsc_low, last_tsc_high); - - if (hpet_use_timer) - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - offset = hpet_readl(HPET_COUNTER); - if (unlikely(((offset - hpet_last) >= (2*hpet_tick)) && (hpet_last != 0))) { - int lost_ticks = ((offset - hpet_last) / hpet_tick) - 1; - jiffies_64 += lost_ticks; - } - hpet_last = offset; - - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); -} - -static void delay_hpet(unsigned long loops) -{ - unsigned long hpet_start, hpet_end; - unsigned long eax; - - /* loops is the number of cpu cycles. Convert it to hpet clocks */ - ASM_MUL64_REG(eax, loops, tsc_hpet_quotient, loops); - - hpet_start = hpet_readl(HPET_COUNTER); - do { - rep_nop(); - hpet_end = hpet_readl(HPET_COUNTER); - } while ((hpet_end - hpet_start) < (loops)); -} - -static struct timer_opts timer_hpet; - -static int __init init_hpet(char* override) -{ - unsigned long result, remain; - - /* check clock override */ - if (override[0] && strncmp(override,"hpet",4)) - return -ENODEV; - - if (!is_hpet_enabled()) - return -ENODEV; - - printk("Using HPET for gettimeofday\n"); - if (cpu_has_tsc) { - unsigned long tsc_quotient = calibrate_tsc_hpet(&tsc_hpet_quotient); - if (tsc_quotient) { - /* report CPU clock rate in Hz. - * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = - * clock/second. Our precision is about 100 ppm. 
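(With the hypothetical 2 GHz CPU from above, tsc_quotient = 2^32 / 2000 = 2147483 and the division below yields (1000 * 2^32) / 2147483 = 2000000 kHz, reported as "Detected 2000.000 MHz processor.".)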
- */ - { unsigned long eax=0, edx=1000; - ASM_DIV64_REG(cpu_khz, edx, tsc_quotient, - eax, edx); - printk("Detected %u.%03u MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - } - set_cyc2ns_scale(cpu_khz); - } - /* set this only when cpu_has_tsc */ - timer_hpet.read_timer = read_timer_tsc; - } - - /* - * Math to calculate hpet to usec multiplier - * Look for the comments at get_offset_hpet() - */ - ASM_DIV64_REG(result, remain, hpet_tick, 0, KERNEL_TICK_USEC); - if (remain > (hpet_tick >> 1)) - result++; /* rounding the result */ - hpet_usec_quotient = result; - - return 0; -} - -static int hpet_resume(void) -{ - write_seqlock(&monotonic_lock); - /* Assume this is the last mark offset time */ - rdtsc(last_tsc_low, last_tsc_high); - - if (hpet_use_timer) - hpet_last = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - hpet_last = hpet_readl(HPET_COUNTER); - write_sequnlock(&monotonic_lock); - return 0; -} -/************************************************************/ - -/* tsc timer_opts struct */ -static struct timer_opts timer_hpet __read_mostly = { - .name = "hpet", - .mark_offset = mark_offset_hpet, - .get_offset = get_offset_hpet, - .monotonic_clock = monotonic_clock_hpet, - .delay = delay_hpet, - .resume = hpet_resume, -}; - -struct init_timer_opts __initdata timer_hpet_init = { - .init = init_hpet, - .opts = &timer_hpet, -}; Index: linux-2.6.16/arch/i386/kernel/timers/timer_none.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/timers/timer_none.c 2006-09-20 10:56:46.000000000 +0200 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,39 +0,0 @@ -#include -#include - -static void mark_offset_none(void) -{ - /* nothing needed */ -} - -static unsigned long get_offset_none(void) -{ - return 0; -} - -static unsigned long long monotonic_clock_none(void) -{ - return 0; -} - -static void delay_none(unsigned long loops) -{ - int d0; - __asm__ __volatile__( - "\tjmp 1f\n" - ".align 16\n" - "1:\tjmp 2f\n" - ".align 16\n" - "2:\tdecl %0\n\tjns 2b" - :"=&a" (d0) - :"0" (loops)); -} - -/* none timer_opts struct */ -struct timer_opts timer_none = { - .name = "none", - .mark_offset = mark_offset_none, - .get_offset = get_offset_none, - .monotonic_clock = monotonic_clock_none, - .delay = delay_none, -}; Index: linux-2.6.16/arch/i386/kernel/timers/timer_pit.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/timers/timer_pit.c 2006-09-20 10:56:46.000000000 +0200 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,177 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "do_timer.h" -#include "io_ports.h" - -static int count_p; /* counter in get_offset_pit() */ - -static int __init init_pit(char* override) -{ - /* check clock override */ - if (override[0] && strncmp(override,"pit",3)) - printk(KERN_ERR "Warning: clock= override failed. 
Defaulting " - "to PIT\n"); - init_cpu_khz(); - count_p = LATCH; - return 0; -} - -static void mark_offset_pit(void) -{ - /* nothing needed */ -} - -static unsigned long long monotonic_clock_pit(void) -{ - return 0; -} - -static void delay_pit(unsigned long loops) -{ - int d0; - __asm__ __volatile__( - "\tjmp 1f\n" - ".align 16\n" - "1:\tjmp 2f\n" - ".align 16\n" - "2:\tdecl %0\n\tjns 2b" - :"=&a" (d0) - :"0" (loops)); -} - - -/* This function must be called with xtime_lock held. - * It was inspired by Steve McCanne's microtime-i386 for BSD. -- jrs - * - * However, the pc-audio speaker driver changes the divisor so that - * it gets interrupted rather more often - it loads 64 into the - * counter rather than 11932! This has an adverse impact on - * do_gettimeoffset() -- it stops working! What is also not - * good is that the interval that our timer function gets called - * is no longer 10.0002 ms, but 9.9767 ms. To get around this - * would require using a different timing source. Maybe someone - * could use the RTC - I know that this can interrupt at frequencies - * ranging from 8192Hz to 2Hz. If I had the energy, I'd somehow fix - * it so that at startup, the timer code in sched.c would select - * using either the RTC or the 8253 timer. The decision would be - * based on whether there was any other device around that needed - * to trample on the 8253. I'd set up the RTC to interrupt at 1024 Hz, - * and then do some jiggery to have a version of do_timer that - * advanced the clock by 1/1024 s. Every time that reached over 1/100 - * of a second, then do all the old code. If the time was kept correct - * then do_gettimeoffset could just return 0 - there is no low order - * divider that can be accessed. - * - * Ideally, you would be able to use the RTC for the speaker driver, - * but it appears that the speaker driver really needs interrupt more - * often than every 120 us or so. - * - * Anyway, this needs more thought.... pjsg (1993-08-28) - * - * If you are really that interested, you should be reading - * comp.protocols.time.ntp! - */ - -static unsigned long get_offset_pit(void) -{ - int count; - unsigned long flags; - static unsigned long jiffies_p = 0; - - /* - * cache volatile jiffies temporarily; we have xtime_lock. - */ - unsigned long jiffies_t; - - spin_lock_irqsave(&i8253_lock, flags); - /* timer count may underflow right here */ - outb_p(0x00, PIT_MODE); /* latch the count ASAP */ - - count = inb_p(PIT_CH0); /* read the latched count */ - - /* - * We do this guaranteed double memory access instead of a _p - * postfix in the previous port access. Wheee, hackady hack - */ - jiffies_t = jiffies; - - count |= inb_p(PIT_CH0) << 8; - - /* VIA686a test code... reset the latch if count > max + 1 */ - if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - - /* - * avoiding timer inconsistencies (they are rare, but they happen)... - * there are two kinds of problems that must be avoided here: - * 1. the timer counter underflows - * 2. hardware problem with the timer, not giving us continuous time, - * the counter does small "jumps" upwards on some Pentium systems, - * (see c't 95/10 page 335 for Neptun bug.) 
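(For scale, assuming HZ=100: LATCH is about 11932 PIT clocks per 10 ms tick and TICK_SIZE is 10000 usec, so the conversion at the end of this function turns a latched count of, say, 5966, i.e. half the tick consumed, into ((11931 - 5966) * 10000 + 11932/2) / 11932, roughly 5000 usec since the last tick.)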
- */ - - if( jiffies_t == jiffies_p ) { - if( count > count_p ) { - /* the nutcase */ - count = do_timer_overflow(count); - } - } else - jiffies_p = jiffies_t; - - count_p = count; - - spin_unlock_irqrestore(&i8253_lock, flags); - - count = ((LATCH-1) - count) * TICK_SIZE; - count = (count + LATCH/2) / LATCH; - - return count; -} - - -/* tsc timer_opts struct */ -struct timer_opts timer_pit = { - .name = "pit", - .mark_offset = mark_offset_pit, - .get_offset = get_offset_pit, - .monotonic_clock = monotonic_clock_pit, - .delay = delay_pit, -}; - -struct init_timer_opts __initdata timer_pit_init = { - .init = init_pit, - .opts = &timer_pit, -}; - -void setup_pit_timer(void) -{ - unsigned long flags; - - spin_lock_irqsave(&i8253_lock, flags); - outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ - udelay(10); - outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ - udelay(10); - outb(LATCH >> 8 , PIT_CH0); /* MSB */ - spin_unlock_irqrestore(&i8253_lock, flags); -} Index: linux-2.6.16/arch/i386/kernel/timers/timer_pm.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/timers/timer_pm.c 2006-09-20 10:56:46.000000000 +0200 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,268 +0,0 @@ -/* - * (C) Dominik Brodowski 2003 - * - * Driver to use the Power Management Timer (PMTMR) available in some - * southbridges as primary timing source for the Linux kernel. - * - * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, - * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. - * - * This file is licensed under the GPL v2. - */ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "mach_timer.h" - -/* Number of PMTMR ticks expected during calibration run */ -#define PMTMR_TICKS_PER_SEC 3579545 -#define PMTMR_EXPECTED_RATE \ - ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (CLOCK_TICK_RATE>>10)) - - -/* The I/O port the PMTMR resides at. - * The location is detected during setup_arch(), - * in arch/i386/acpi/boot.c */ -u32 pmtmr_ioport = 0; - - -/* value of the Power timer at last timer interrupt */ -static u32 offset_tick; -static u32 offset_delay; - -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ - -/*helper function to safely read acpi pm timesource*/ -static inline u32 read_pmtmr(void) -{ - u32 v1=0,v2=0,v3=0; - /* It has been reported that because of various broken - * chipsets (ICH4, PIIX4 and PIIX4E) where the ACPI PM time - * source is not latched, so you must read it multiple - * times to insure a safe value is read. - */ - do { - v1 = inl(pmtmr_ioport); - v2 = inl(pmtmr_ioport); - v3 = inl(pmtmr_ioport); - } while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1) - || (v3 > v1 && v3 < v2)); - - /* mask the output to 24 bits */ - return v2 & ACPI_PM_MASK; -} - - -/* - * Some boards have the PMTMR running way too fast. We check - * the PMTMR rate against PIT channel 2 to catch these cases. 
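(Concretely, the ACPI PM timer is specified to tick at 3.579545 MHz; PMTMR_EXPECTED_RATE above is how many of those ticks should fit into one mach_countup() calibration window, and the (rate * 19) / 20 and (rate * 21) / 20 bounds in the function below reject boards whose timer runs more than 5% fast or slow.)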
- */ -static int verify_pmtmr_rate(void) -{ - u32 value1, value2; - unsigned long count, delta; - - mach_prepare_counter(); - value1 = read_pmtmr(); - mach_countup(&count); - value2 = read_pmtmr(); - delta = (value2 - value1) & ACPI_PM_MASK; - - /* Check that the PMTMR delta is within 5% of what we expect */ - if (delta < (PMTMR_EXPECTED_RATE * 19) / 20 || - delta > (PMTMR_EXPECTED_RATE * 21) / 20) { - printk(KERN_INFO "PM-Timer running at invalid rate: %lu%% of normal - aborting.\n", 100UL * delta / PMTMR_EXPECTED_RATE); - return -1; - } - - return 0; -} - - -static int init_pmtmr(char* override) -{ - u32 value1, value2; - unsigned int i; - - if (override[0] && strncmp(override,"pmtmr",5)) - return -ENODEV; - - if (!pmtmr_ioport) - return -ENODEV; - - /* we use the TSC for delay_pmtmr, so make sure it exists */ - if (!cpu_has_tsc) - return -ENODEV; - - /* "verify" this timing source */ - value1 = read_pmtmr(); - for (i = 0; i < 10000; i++) { - value2 = read_pmtmr(); - if (value2 == value1) - continue; - if (value2 > value1) - goto pm_good; - if ((value2 < value1) && ((value2) < 0xFFF)) - goto pm_good; - printk(KERN_INFO "PM-Timer had inconsistent results: 0x%#x, 0x%#x - aborting.\n", value1, value2); - return -EINVAL; - } - printk(KERN_INFO "PM-Timer had no reasonable result: 0x%#x - aborting.\n", value1); - return -ENODEV; - -pm_good: - if (verify_pmtmr_rate() != 0) - return -ENODEV; - - init_cpu_khz(); - return 0; -} - -static inline u32 cyc2us(u32 cycles) -{ - /* The Power Management Timer ticks at 3.579545 ticks per microsecond. - * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] - * - * Even with HZ = 100, delta is at maximum 35796 ticks, so it can - * easily be multiplied with 286 (=0x11E) without having to fear - * u32 overflows. 
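(Worked example: at HZ=100 one 10 ms tick is 3579545 / 100, about 35796 PMTMR ticks, and cyc2us(35796) = (35796 * 286) >> 10 = 9997 usec, within roughly 0.03% of the exact 10000, i.e. the quoted 0.024% approximation error plus shift truncation.)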
- */ - cycles *= 286; - return (cycles >> 10); -} - -/* - * this gets called during each timer interrupt - * - Called while holding the writer xtime_lock - */ -static void mark_offset_pmtmr(void) -{ - u32 lost, delta, last_offset; - static int first_run = 1; - last_offset = offset_tick; - - write_seqlock(&monotonic_lock); - - offset_tick = read_pmtmr(); - - /* calculate tick interval */ - delta = (offset_tick - last_offset) & ACPI_PM_MASK; - - /* convert to usecs */ - delta = cyc2us(delta); - - /* update the monotonic base value */ - monotonic_base += delta * NSEC_PER_USEC; - write_sequnlock(&monotonic_lock); - - /* convert to ticks */ - delta += offset_delay; - lost = delta / (USEC_PER_SEC / HZ); - offset_delay = delta % (USEC_PER_SEC / HZ); - - - /* compensate for lost ticks */ - if (lost >= 2) - jiffies_64 += lost - 1; - - /* don't calculate delay for first run, - or if we've got less then a tick */ - if (first_run || (lost < 1)) { - first_run = 0; - offset_delay = 0; - } -} - -static int pmtmr_resume(void) -{ - write_seqlock(&monotonic_lock); - /* Assume this is the last mark offset time */ - offset_tick = read_pmtmr(); - write_sequnlock(&monotonic_lock); - return 0; -} - -static unsigned long long monotonic_clock_pmtmr(void) -{ - u32 last_offset, this_offset; - unsigned long long base, ret; - unsigned seq; - - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = offset_tick; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - /* Read the pmtmr */ - this_offset = read_pmtmr(); - - /* convert to nanoseconds */ - ret = (this_offset - last_offset) & ACPI_PM_MASK; - ret = base + (cyc2us(ret) * NSEC_PER_USEC); - return ret; -} - -static void delay_pmtmr(unsigned long loops) -{ - unsigned long bclock, now; - - rdtscl(bclock); - do - { - rep_nop(); - rdtscl(now); - } while ((now-bclock) < loops); -} - - -/* - * get the offset (in microseconds) from the last call to mark_offset() - * - Called holding a reader xtime_lock - */ -static unsigned long get_offset_pmtmr(void) -{ - u32 now, offset, delta = 0; - - offset = offset_tick; - now = read_pmtmr(); - delta = (now - offset)&ACPI_PM_MASK; - - return (unsigned long) offset_delay + cyc2us(delta); -} - - -/* acpi timer_opts struct */ -static struct timer_opts timer_pmtmr = { - .name = "pmtmr", - .mark_offset = mark_offset_pmtmr, - .get_offset = get_offset_pmtmr, - .monotonic_clock = monotonic_clock_pmtmr, - .delay = delay_pmtmr, - .read_timer = read_timer_tsc, - .resume = pmtmr_resume, -}; - -struct init_timer_opts __initdata timer_pmtmr_init = { - .init = init_pmtmr, - .opts = &timer_pmtmr, -}; - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Dominik Brodowski "); -MODULE_DESCRIPTION("Power Management Timer (PMTMR) as primary timing source for x86"); Index: linux-2.6.16/arch/i386/kernel/timers/timer_tsc.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/timers/timer_tsc.c 2006-09-20 10:56:46.000000000 +0200 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,617 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. - * - * 2004-06-25 Jesper Juhl - * moved mark_offset_tsc below cpufreq_delayed_get to avoid gcc 3.4 - * failing to inline. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -/* processor.h for distable_tsc flag */ -#include - -#include "io_ports.h" -#include "mach_timer.h" - -#include -#include - -#ifdef CONFIG_HPET_TIMER -static unsigned long hpet_usec_quotient; -static unsigned long hpet_last; -static struct timer_opts timer_tsc; -#endif - -static inline void cpufreq_delayed_get(void); - -int tsc_disable __devinitdata = 0; - -static int use_tsc; -/* Number of usecs that the last interrupt was delayed */ -static int delay_at_last_interrupt; - -static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ -static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* Avoid compensating for lost ticks before TSCs are synched */ -static int detect_lost_ticks; -static int __init start_lost_tick_compensation(void) -{ - detect_lost_ticks = 1; - return 0; -} -late_initcall(start_lost_tick_compensation); - -/* convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better percision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ -static unsigned long cyc2ns_scale; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline void set_cyc2ns_scale(unsigned long cpu_khz) -{ - cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} - -static int count2; /* counter for mark_offset_tsc() */ - -/* Cached *multiplier* to convert TSC counts to microseconds. - * (see the equation below). - * Equal to 2^32 * (1 / (clocks per usec) ). - * Initialized in time_init. - */ -static unsigned long fast_gettimeoffset_quotient; - -static unsigned long get_offset_tsc(void) -{ - register unsigned long eax, edx; - - /* Read the Time Stamp Counter */ - - rdtsc(eax,edx); - - /* .. relative to previous jiffy (32 bits is enough) */ - eax -= last_tsc_low; /* tsc_low delta */ - - /* - * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient - * = (tsc_low delta) * (usecs_per_clock) - * = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy) - * - * Using a mull instead of a divl saves up to 31 clock cycles - * in the critical path. 
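The same conversion can be sketched in portable user-space C. A minimal sketch follows, assuming the hypothetical 2 GHz CPU used in the examples above; it only illustrates why taking the high half of the 32x32 bit product is equivalent to the division, and is not part of the patch:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* 2^32 / (TSC clocks per usec), assuming 2000 clocks/usec (2 GHz) */
		uint32_t quotient = (uint32_t)(((uint64_t)1 << 32) / 2000);
		uint32_t tsc_delta = 2000000;	/* one millisecond worth of cycles */

		/* what the mull computes: the upper 32 bits of the product */
		uint32_t usecs = (uint32_t)(((uint64_t)tsc_delta * quotient) >> 32);

		printf("%u usec\n", usecs);	/* prints "999 usec" (truncates down) */
		return 0;
	}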
- */ - - __asm__("mull %2" - :"=a" (eax), "=d" (edx) - :"rm" (fast_gettimeoffset_quotient), - "0" (eax)); - - /* our adjusted time offset in microseconds */ - return delay_at_last_interrupt + edx; -} - -static unsigned long long monotonic_clock_tsc(void) -{ - unsigned long long last_offset, this_offset, base; - unsigned seq; - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return base + cycles_2_ns(this_offset - last_offset); -} - -/* - * Scheduler clock - returns current time in nanosec units. - */ -unsigned long long sched_clock(void) -{ - unsigned long long this_offset; - - /* - * In the NUMA case we dont use the TSC as they are not - * synchronized across all CPUs. - */ -#ifndef CONFIG_NUMA - if (!use_tsc) -#endif - /* no locking but a rare wrong value is not a big deal */ - return jiffies_64 * (1000000000 / HZ); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return cycles_2_ns(this_offset); -} - -static void delay_tsc(unsigned long loops) -{ - unsigned long bclock, now; - - rdtscl(bclock); - do - { - rep_nop(); - rdtscl(now); - } while ((now-bclock) < loops); -} - -#ifdef CONFIG_HPET_TIMER -static void mark_offset_tsc_hpet(void) -{ - unsigned long long this_offset, last_offset; - unsigned long offset, temp, hpet_current; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - /* - * It is important that these two operations happen almost at - * the same time. We do the RDTSC stuff first, since it's - * faster. To avoid any inconsistencies, we need interrupts - * disabled locally. - */ - /* - * Interrupts are just disabled locally since the timer irq - * has the SA_INTERRUPT flag set. 
-arca - */ - /* read Pentium cycle counter */ - - hpet_current = hpet_readl(HPET_COUNTER); - rdtsc(last_tsc_low, last_tsc_high); - - /* lost tick compensation */ - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - if (unlikely(((offset - hpet_last) > hpet_tick) && (hpet_last != 0)) - && detect_lost_ticks) { - int lost_ticks = (offset - hpet_last) / hpet_tick; - jiffies_64 += lost_ticks; - } - hpet_last = hpet_current; - - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); - - /* calculate delay_at_last_interrupt */ - /* - * Time offset = (hpet delta) * ( usecs per HPET clock ) - * = (hpet delta) * ( usecs per tick / HPET clocks per tick) - * = (hpet delta) * ( hpet_usec_quotient ) / (2^32) - * Where, - * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick - */ - delay_at_last_interrupt = hpet_current - offset; - ASM_MUL64_REG(temp, delay_at_last_interrupt, - hpet_usec_quotient, delay_at_last_interrupt); -} -#endif - - -#ifdef CONFIG_CPU_FREQ -#include - -static unsigned int cpufreq_delayed_issched = 0; -static unsigned int cpufreq_init = 0; -static struct work_struct cpufreq_delayed_get_work; - -static void handle_cpufreq_delayed_get(void *v) -{ - unsigned int cpu; - for_each_online_cpu(cpu) { - cpufreq_get(cpu); - } - cpufreq_delayed_issched = 0; -} - -/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries - * to verify the CPU frequency the timing core thinks the CPU is running - * at is still correct. - */ -static inline void cpufreq_delayed_get(void) -{ - if (cpufreq_init && !cpufreq_delayed_issched) { - cpufreq_delayed_issched = 1; - printk(KERN_DEBUG "Losing some ticks... checking if CPU frequency changed.\n"); - schedule_work(&cpufreq_delayed_get_work); - } -} - -/* If the CPU frequency is scaled, TSC-based delays will need a different - * loops_per_jiffy value to function properly. 
- */ - -static unsigned int ref_freq = 0; -static unsigned long loops_per_jiffy_ref = 0; - -#ifndef CONFIG_SMP -static unsigned long fast_gettimeoffset_ref = 0; -static unsigned int cpu_khz_ref = 0; -#endif - -static int -time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct cpufreq_freqs *freq = data; - - if (val != CPUFREQ_RESUMECHANGE) - write_seqlock_irq(&xtime_lock); - if (!ref_freq) { - if (!freq->old){ - ref_freq = freq->new; - goto end; - } - ref_freq = freq->old; - loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; -#ifndef CONFIG_SMP - fast_gettimeoffset_ref = fast_gettimeoffset_quotient; - cpu_khz_ref = cpu_khz; -#endif - } - - if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - cpu_data[freq->cpu].loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); -#ifndef CONFIG_SMP - if (cpu_khz) - cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); - if (use_tsc) { - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { - fast_gettimeoffset_quotient = cpufreq_scale(fast_gettimeoffset_ref, freq->new, ref_freq); - set_cyc2ns_scale(cpu_khz); - } - } -#endif - } - -end: - if (val != CPUFREQ_RESUMECHANGE) - write_sequnlock_irq(&xtime_lock); - - return 0; -} - -static struct notifier_block time_cpufreq_notifier_block = { - .notifier_call = time_cpufreq_notifier -}; - - -static int __init cpufreq_tsc(void) -{ - int ret; - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); - ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - if (!ret) - cpufreq_init = 1; - return ret; -} -core_initcall(cpufreq_tsc); - -#else /* CONFIG_CPU_FREQ */ -static inline void cpufreq_delayed_get(void) { return; } -#endif - -int recalibrate_cpu_khz(void) -{ -#ifndef CONFIG_SMP - unsigned int cpu_khz_old = cpu_khz; - - if (cpu_has_tsc) { - local_irq_disable(); - init_cpu_khz(); - local_irq_enable(); - cpu_data[0].loops_per_jiffy = - cpufreq_scale(cpu_data[0].loops_per_jiffy, - cpu_khz_old, - cpu_khz); - return 0; - } else - return -ENODEV; -#else - return -ENODEV; -#endif -} -EXPORT_SYMBOL(recalibrate_cpu_khz); - -static void mark_offset_tsc(void) -{ - unsigned long lost,delay; - unsigned long delta = last_tsc_low; - int count; - int countmp; - static int count1 = 0; - unsigned long long this_offset, last_offset; - static int lost_count = 0; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - /* - * It is important that these two operations happen almost at - * the same time. We do the RDTSC stuff first, since it's - * faster. To avoid any inconsistencies, we need interrupts - * disabled locally. - */ - - /* - * Interrupts are just disabled locally since the timer irq - * has the SA_INTERRUPT flag set. -arca - */ - - /* read Pentium cycle counter */ - - rdtsc(last_tsc_low, last_tsc_high); - - spin_lock(&i8253_lock); - outb_p(0x00, PIT_MODE); /* latch the count ASAP */ - - count = inb_p(PIT_CH0); /* read the latched count */ - count |= inb(PIT_CH0) << 8; - - /* - * VIA686a test code... 
reset the latch if count > max + 1 - * from timer_pit.c - cjb - */ - if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - - spin_unlock(&i8253_lock); - - if (pit_latch_buggy) { - /* get center value of last 3 time lutch */ - if ((count2 >= count && count >= count1) - || (count1 >= count && count >= count2)) { - count2 = count1; count1 = count; - } else if ((count1 >= count2 && count2 >= count) - || (count >= count2 && count2 >= count1)) { - countmp = count;count = count2; - count2 = count1;count1 = countmp; - } else { - count2 = count1; count1 = count; count = count1; - } - } - - /* lost tick compensation */ - delta = last_tsc_low - delta; - { - register unsigned long eax, edx; - eax = delta; - __asm__("mull %2" - :"=a" (eax), "=d" (edx) - :"rm" (fast_gettimeoffset_quotient), - "0" (eax)); - delta = edx; - } - delta += delay_at_last_interrupt; - lost = delta/(1000000/HZ); - delay = delta%(1000000/HZ); - if (lost >= 2 && detect_lost_ticks) { - jiffies_64 += lost-1; - - /* sanity check to ensure we're not always losing ticks */ - if (lost_count++ > 100) { - printk(KERN_WARNING "Losing too many ticks!\n"); - printk(KERN_WARNING "TSC cannot be used as a timesource. \n"); - printk(KERN_WARNING "Possible reasons for this are:\n"); - printk(KERN_WARNING " You're running with Speedstep,\n"); - printk(KERN_WARNING " You don't have DMA enabled for your hard disk (see hdparm),\n"); - printk(KERN_WARNING " Incorrect TSC synchronization on an SMP system (see dmesg).\n"); - printk(KERN_WARNING "Falling back to a sane timesource now.\n"); - - clock_fallback(); - } - /* ... but give the TSC a fair chance */ - if (lost_count > 25) - cpufreq_delayed_get(); - } else - lost_count = 0; - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); - - /* calculate delay_at_last_interrupt */ - count = ((LATCH-1) - count) * TICK_SIZE; - delay_at_last_interrupt = (count + LATCH/2) / LATCH; - - /* catch corner case where tick rollover occured - * between tsc and pit reads (as noted when - * usec delta is > 90% # of usecs/tick) - */ - if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ)) - jiffies_64++; -} - -static int __init init_tsc(char* override) -{ - - /* check clock override */ - if (override[0] && strncmp(override,"tsc",3)) { -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled()) { - printk(KERN_ERR "Warning: clock= override failed. Defaulting to tsc\n"); - } else -#endif - { - return -ENODEV; - } - } - - /* - * If we have APM enabled or the CPU clock speed is variable - * (CPU stops clock on HLT or slows clock to save power) - * then the TSC timestamps may diverge by up to 1 jiffy from - * 'real time' but nothing will break. - * The most frequent case is that the CPU is "woken" from a halt - * state by the timer interrupt itself, so we get 0 error. In the - * rare cases where a driver would "wake" the CPU and request a - * timestamp, the maximum error is < 1 jiffy. But timestamps are - * still perfectly ordered. - * Note that the TSC counter will be reset if APM suspends - * to disk; this won't break the kernel, though, 'cuz we're - * smart. See arch/i386/kernel/apm.c. - */ - /* - * Firstly we have to do a CPU check for chips with - * a potentially buggy TSC. At this point we haven't run - * the ident/bugs checks so we must run this hook as it - * may turn off the TSC flag. 
- * - * NOTE: this doesn't yet handle SMP 486 machines where only - * some CPU's have a TSC. Thats never worked and nobody has - * moaned if you have the only one in the world - you fix it! - */ - - count2 = LATCH; /* initialize counter for mark_offset_tsc() */ - - if (cpu_has_tsc) { - unsigned long tsc_quotient; -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled() && hpet_use_timer) { - unsigned long result, remain; - printk("Using TSC for gettimeofday\n"); - tsc_quotient = calibrate_tsc_hpet(NULL); - timer_tsc.mark_offset = &mark_offset_tsc_hpet; - /* - * Math to calculate hpet to usec multiplier - * Look for the comments at get_offset_tsc_hpet() - */ - ASM_DIV64_REG(result, remain, hpet_tick, - 0, KERNEL_TICK_USEC); - if (remain > (hpet_tick >> 1)) - result++; /* rounding the result */ - - hpet_usec_quotient = result; - } else -#endif - { - tsc_quotient = calibrate_tsc(); - } - - if (tsc_quotient) { - fast_gettimeoffset_quotient = tsc_quotient; - use_tsc = 1; - /* - * We could be more selective here I suspect - * and just enable this for the next intel chips ? - */ - /* report CPU clock rate in Hz. - * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = - * clock/second. Our precision is about 100 ppm. - */ - { unsigned long eax=0, edx=1000; - __asm__("divl %2" - :"=a" (cpu_khz), "=d" (edx) - :"r" (tsc_quotient), - "0" (eax), "1" (edx)); - printk("Detected %u.%03u MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - } - set_cyc2ns_scale(cpu_khz); - return 0; - } - } - return -ENODEV; -} - -static int tsc_resume(void) -{ - write_seqlock(&monotonic_lock); - /* Assume this is the last mark offset time */ - rdtsc(last_tsc_low, last_tsc_high); -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled() && hpet_use_timer) - hpet_last = hpet_readl(HPET_COUNTER); -#endif - write_sequnlock(&monotonic_lock); - return 0; -} - -#ifndef CONFIG_X86_TSC -/* disable flag for tsc. 
Takes effect by clearing the TSC cpu flag - * in cpu/common.c */ -static int __init tsc_setup(char *str) -{ - tsc_disable = 1; - return 1; -} -#else -static int __init tsc_setup(char *str) -{ - printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " - "cannot disable TSC.\n"); - return 1; -} -#endif -__setup("notsc", tsc_setup); - - - -/************************************************************/ - -/* tsc timer_opts struct */ -static struct timer_opts timer_tsc = { - .name = "tsc", - .mark_offset = mark_offset_tsc, - .get_offset = get_offset_tsc, - .monotonic_clock = monotonic_clock_tsc, - .delay = delay_tsc, - .read_timer = read_timer_tsc, - .resume = tsc_resume, -}; - -struct init_timer_opts __initdata timer_tsc_init = { - .init = init_tsc, - .opts = &timer_tsc, -}; Index: linux-2.6.16/arch/i386/kernel/traps.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/traps.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/kernel/traps.c 2006-10-19 16:52:29.000000000 +0200 @@ -93,7 +93,7 @@ asmlinkage void machine_check(void); static int kstack_depth_to_print = 24; struct notifier_block *i386die_chain; -static DEFINE_SPINLOCK(die_notifier_lock); +static DEFINE_RAW_SPINLOCK(die_notifier_lock); int register_die_notifier(struct notifier_block *nb) { @@ -117,7 +117,6 @@ static void print_addr_and_symbol(unsign printk(log_lvl); printk(" [<%08lx>] ", addr); print_symbol("%s", addr); - printk("\n"); } static inline unsigned long print_context_stack(struct thread_info *tinfo, @@ -126,17 +125,23 @@ static inline unsigned long print_contex { unsigned long addr; -#ifdef CONFIG_FRAME_POINTER +#ifdef CONFIG_FRAME_POINTER while (valid_stack_ptr(tinfo, (void *)ebp)) { addr = *(unsigned long *)(ebp + 4); print_addr_and_symbol(addr, log_lvl); + printk(" (%ld)\n", *(unsigned long *)ebp - ebp); ebp = *(unsigned long *)ebp; } #else + unsigned long prev_frame = (unsigned long)stack; + while (valid_stack_ptr(tinfo, stack)) { addr = *stack++; - if (__kernel_text_address(addr)) + if (__kernel_text_address(addr)) { print_addr_and_symbol(addr, log_lvl); + printk(" (%ld)\n", (unsigned long)stack - prev_frame); + prev_frame = (unsigned long)stack; + } } #endif return ebp; @@ -169,6 +174,8 @@ static void show_trace_log_lvl(struct ta printk(log_lvl); printk(" =======================\n"); } + print_traces(task); + rt_mutex_show_held_locks(task, 1); } void show_trace(struct task_struct *task, unsigned long * stack) @@ -224,6 +231,12 @@ void dump_stack(void) EXPORT_SYMBOL(dump_stack); +#if defined(CONFIG_DEBUG_STACKOVERFLOW) && defined(CONFIG_LATENCY_TRACE) +extern unsigned long worst_stack_left; +#else +# define worst_stack_left -1L +#endif + void show_registers(struct pt_regs *regs) { int i; @@ -250,10 +263,18 @@ void show_registers(struct pt_regs *regs regs->eax, regs->ebx, regs->ecx, regs->edx); printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", regs->esi, regs->edi, regs->ebp, esp); - printk(KERN_EMERG "ds: %04x es: %04x ss: %04x\n", - regs->xds & 0xffff, regs->xes & 0xffff, ss); - printk(KERN_EMERG "Process %s (pid: %d, threadinfo=%p task=%p)", + printk(KERN_EMERG "ds: %04x es: %04x ss: %04x preempt: %08x\n", + regs->xds & 0xffff, regs->xes & 0xffff, ss, preempt_count()); + printk(KERN_EMERG "Process %s (pid: %d, threadinfo=%p task=%p", current->comm, current->pid, current_thread_info(), current); + + if (in_kernel) + printk(" stack_left=%ld worst_left=%ld)", + (esp & (THREAD_SIZE-1))-sizeof(struct thread_info), + 
worst_stack_left); + else + printk(")"); + /* * When in-kernel, we also print out the stack and code at the * time of the fault.. @@ -322,11 +343,11 @@ bug: void die(const char * str, struct pt_regs * regs, long err) { static struct { - spinlock_t lock; + raw_spinlock_t lock; u32 lock_owner; int lock_owner_depth; } die = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED, .lock_owner = -1, .lock_owner_depth = 0 }; @@ -411,6 +432,11 @@ static void __kprobes do_trap(int trapnr if (!user_mode(regs)) goto kernel_trap; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif + trap_signal: { if (info) force_sig_info(signr, info, tsk); @@ -430,6 +456,8 @@ static void __kprobes do_trap(int trapnr if (ret) goto trap_signal; return; } + print_traces(tsk); + rt_mutex_show_held_locks(tsk, 1); } #define DO_ERROR(trapnr, signr, str, name) \ @@ -597,10 +625,12 @@ static void unknown_nmi_error(unsigned c printk("Do you have a strange power saving mode enabled?\n"); } -static DEFINE_SPINLOCK(nmi_print_lock); +static DEFINE_RAW_SPINLOCK(nmi_print_lock); void die_nmi (struct pt_regs *regs, const char *msg) { + deadlock_trace_off(); + if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 0, SIGINT) == NOTIFY_STOP) return; @@ -628,10 +658,11 @@ void die_nmi (struct pt_regs *regs, cons crash_kexec(regs); } + nmi_exit(); do_exit(SIGSEGV); } -static void default_do_nmi(struct pt_regs * regs) +static void notrace default_do_nmi(struct pt_regs * regs) { unsigned char reason = 0; @@ -650,6 +681,7 @@ static void default_do_nmi(struct pt_reg */ if (nmi_watchdog) { nmi_watchdog_tick(regs); +// trace_special(6, 1, 0); return; } #endif @@ -669,18 +701,19 @@ static void default_do_nmi(struct pt_reg reassert_nmi(); } -static int dummy_nmi_callback(struct pt_regs * regs, int cpu) +static notrace int dummy_nmi_callback(struct pt_regs * regs, int cpu) { return 0; } static nmi_callback_t nmi_callback = dummy_nmi_callback; -fastcall void do_nmi(struct pt_regs * regs, long error_code) +fastcall notrace void do_nmi(struct pt_regs * regs, long error_code) { int cpu; nmi_enter(); + nmi_trace((unsigned long)do_nmi, regs->eip, regs->eflags); cpu = smp_processor_id(); Index: linux-2.6.16/arch/i386/kernel/tsc.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.16/arch/i386/kernel/tsc.c 2006-10-19 16:52:29.000000000 +0200 @@ -0,0 +1,492 @@ +/* + * This code largely moved from arch/i386/kernel/timer/timer_tsc.c + * which was originally moved from arch/i386/kernel/time.c. + * See comments there for proper credits. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "mach_timer.h" + +/* + * On some systems the TSC frequency does not + * change with the cpu frequency. So we need + * an extra value to store the TSC freq + */ +unsigned int tsc_khz; + +int tsc_disable __cpuinitdata = 0; + +#ifdef CONFIG_X86_TSC +static int __init tsc_setup(char *str) +{ + printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " + "cannot disable TSC.\n"); + return 1; +} +#else +/* + * disable flag for tsc. 
Takes effect by clearing the TSC cpu flag + * in cpu/common.c + */ +static int __init tsc_setup(char *str) +{ + tsc_disable = 1; + + return 1; +} +#endif + +__setup("notsc", tsc_setup); + +/* + * code to mark and check if the TSC is unstable + * due to cpufreq or due to unsynced TSCs + */ +static int tsc_unstable; + +static inline int check_tsc_unstable(void) +{ + return tsc_unstable; +} + +void mark_tsc_unstable(void) +{ +#ifdef CONFIG_PARANOID_GENERIC_TIME + /* give a clue as to why the TSC is unstable */ + static int warncount=0; + if(warncount < 10) { + warncount++; + printk("TSC: Marked unstable\n"); + dump_stack(); + } +#endif + tsc_unstable = 1; +} +EXPORT_SYMBOL_GPL(mark_tsc_unstable); + +/* Accelerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_khz * 10^3)) + * ns = cycles * (10^6 / cpu_khz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^6 * SC / cpu_khz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * + * We can use khz divisor instead of mhz to keep a better precision, since + * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + * (mathieu.desnoyers@polymtl.ca) + * + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ +static unsigned long cyc2ns_scale; + +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ + +static inline void set_cyc2ns_scale(unsigned long cpu_khz) +{ + cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; +} + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; +} + +/* + * Scheduler clock - returns current time in nanosec units. + */ +unsigned long long sched_clock(void) +{ + unsigned long long this_offset; + + /* + * in the NUMA case we don't use the TSC as they are not + * synchronized across all CPUs. + */ +#ifndef CONFIG_NUMA + if (!cpu_khz || check_tsc_unstable()) +#endif + /* no locking but a rare wrong value is not a big deal */ + return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); + + /* read the Time Stamp Counter: */ + rdtscll(this_offset); + + /* return the value in ns */ + return cycles_2_ns(this_offset); +} + +static unsigned long calculate_cpu_khz(void) +{ + unsigned long long start, end; + unsigned long count; + u64 delta64; + int i; + unsigned long flags; + + local_irq_save(flags); + + /* run 3 times to ensure the cache is warm */ + for (i = 0; i < 3; i++) { + mach_prepare_counter(); + rdtscll(start); + mach_countup(&count); + rdtscll(end); + } + /* + * Error: ECTCNEVERSET + * The CTC wasn't reliable: we got a hit on the very first read, + * or the CPU was so fast/slow that the quotient wouldn't fit in + * 32 bits..
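(In the success path below, delta64 is the number of TSC cycles counted during the mach_countup() window and do_div() by CALIBRATE_TIME_MSEC turns cycles-per-window into cycles-per-millisecond, i.e. kHz. Assuming a 30 ms calibration window, the actual CALIBRATE_TIME_MSEC value is not shown here, the hypothetical 2 GHz CPU accumulates a delta64 of about 60000000 and yields cpu_khz = 60000000 / 30 = 2000000.)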
+ */ + if (count <= 1) + goto err; + + delta64 = end - start; + + /* cpu freq too fast: */ + if (delta64 > (1ULL<<32)) + goto err; + + /* cpu freq too slow: */ + if (delta64 <= CALIBRATE_TIME_MSEC) + goto err; + + delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */ + do_div(delta64,CALIBRATE_TIME_MSEC); + + local_irq_restore(flags); + return (unsigned long)delta64; +err: + local_irq_restore(flags); + return 0; +} + +int recalibrate_cpu_khz(void) +{ +#ifndef CONFIG_SMP + unsigned long cpu_khz_old = cpu_khz; + + if (cpu_has_tsc) { + cpu_khz = calculate_cpu_khz(); + tsc_khz = cpu_khz; + cpu_data[0].loops_per_jiffy = + cpufreq_scale(cpu_data[0].loops_per_jiffy, + cpu_khz_old, cpu_khz); + return 0; + } else + return -ENODEV; +#else + return -ENODEV; +#endif +} + +EXPORT_SYMBOL(recalibrate_cpu_khz); + +void tsc_init(void) +{ + if (!cpu_has_tsc || tsc_disable) + return; + + cpu_khz = calculate_cpu_khz(); + tsc_khz = cpu_khz; + + if (!cpu_khz) + return; + + printk("Detected %lu.%03lu MHz processor.\n", + (unsigned long)cpu_khz / 1000, + (unsigned long)cpu_khz % 1000); + + set_cyc2ns_scale(cpu_khz); + use_tsc_delay(); +} + +#ifdef CONFIG_CPU_FREQ + +static unsigned int cpufreq_delayed_issched = 0; +static unsigned int cpufreq_init = 0; +static struct work_struct cpufreq_delayed_get_work; + +static void handle_cpufreq_delayed_get(void *v) +{ + unsigned int cpu; + + for_each_online_cpu(cpu) + cpufreq_get(cpu); + + cpufreq_delayed_issched = 0; +} + +/* + * if we notice cpufreq oddness, schedule a call to cpufreq_get() as it tries + * to verify the CPU frequency the timing core thinks the CPU is running + * at is still correct. + */ +static inline void cpufreq_delayed_get(void) +{ + if (cpufreq_init && !cpufreq_delayed_issched) { + cpufreq_delayed_issched = 1; + printk(KERN_DEBUG "Checking if CPU frequency changed.\n"); + schedule_work(&cpufreq_delayed_get_work); + } +} + +/* + * if the CPU frequency is scaled, TSC-based delays will need a different + * loops_per_jiffy value to function properly. 
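A minimal sketch of that proportional rescale in plain user-space C, assuming cpufreq_scale() behaves as new = old * freq_new / freq_old; this is an illustration of the arithmetic, not the kernel helper itself:

	#include <stdio.h>
	#include <stdint.h>

	/* proportional rescale with a 64-bit intermediate to avoid overflow */
	static unsigned long scale(unsigned long old, unsigned int freq_old,
				   unsigned int freq_new)
	{
		return (unsigned long)(((uint64_t)old * freq_new) / freq_old);
	}

	int main(void)
	{
		/* halving the clock from 1 GHz to 500 MHz halves loops_per_jiffy */
		printf("%lu\n", scale(4000000UL, 1000000, 500000));	/* 2000000 */
		return 0;
	}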
+ */ +static unsigned int ref_freq = 0; +static unsigned long loops_per_jiffy_ref = 0; +static unsigned long cpu_khz_ref = 0; + +static int +time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = data; + + if (val != CPUFREQ_RESUMECHANGE) + write_seqlock_irq(&xtime_lock); + + if (!ref_freq) { + if (!freq->old){ + ref_freq = freq->new; + goto end; + } + ref_freq = freq->old; + loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; + cpu_khz_ref = cpu_khz; + } + + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || + (val == CPUFREQ_RESUMECHANGE)) { + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + cpu_data[freq->cpu].loops_per_jiffy = + cpufreq_scale(loops_per_jiffy_ref, + ref_freq, freq->new); + + if (cpu_khz) { + + if (num_online_cpus() == 1) + cpu_khz = cpufreq_scale(cpu_khz_ref, + ref_freq, freq->new); + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { + tsc_khz = cpu_khz; + set_cyc2ns_scale(cpu_khz); + /* + * TSC based sched_clock turns + * to junk w/ cpufreq + */ + mark_tsc_unstable(); + } + } + } +end: + if (val != CPUFREQ_RESUMECHANGE) + write_sequnlock_irq(&xtime_lock); + + return 0; +} + +static struct notifier_block time_cpufreq_notifier_block = { + .notifier_call = time_cpufreq_notifier +}; + +static int __init cpufreq_tsc(void) +{ + int ret; + + INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); + ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + if (!ret) + cpufreq_init = 1; + + return ret; +} + +core_initcall(cpufreq_tsc); + +#endif + +/* clock source code */ + +static unsigned long current_tsc_khz = 0; +static int tsc_update_callback(void); + +#ifdef CONFIG_PARANOID_GENERIC_TIME +/* This will hurt performance! */ +static DEFINE_RAW_SPINLOCK(checktsc_lock); +static cycle_t last_tsc; + +static cycle_t read_tsc(void) +{ + static int once = 1; + + unsigned long flags; + cycle_t ret; + + spin_lock_irqsave(&checktsc_lock, flags); + + rdtscll(ret); + + if (once && ret < last_tsc) { + once = 0; + spin_unlock_irqrestore(&checktsc_lock, flags); + printk("BUG in read_tsc(): TSC went backward!\n"); + if (num_online_cpus() > 1) + printk("... Unsynced TSCs?\n"); + printk("... 
[ from %016Lx to %016Lx ]\n", last_tsc, ret); + + } else { + last_tsc = ret; + spin_unlock_irqrestore(&checktsc_lock, flags); + } + + return ret; +} + +static struct clocksource clocksource_tsc; + +void tsc_print_debuginfo(void) +{ + u64 cyc; + + printk("TSC clocksource dbg: %lu khz, rating=%lu mult=%lu shift=%lu\n", + current_tsc_khz, clocksource_tsc.rating, + clocksource_tsc.mult, clocksource_tsc.shift); + rdtscll(cyc); + printk(" current value: %llu current jiffies: %lu\n", + cyc, jiffies); +} + +#else /* CONFIG_PARANOID_GENERIC_TIME */ + +static cycle_t read_tsc(void) +{ + cycle_t ret; + + rdtscll(ret); + + return ret; +} + +#define tsc_print_debuginfo() +#endif /* CONFIG_PARANOID_GENERIC_TIME */ + +static struct clocksource clocksource_tsc = { + .name = "tsc", + .rating = 300, + .read = read_tsc, + .mask = (cycle_t)-1, + .mult = 0, /* to be set */ + .shift = 22, + .update_callback = tsc_update_callback, + .is_continuous = 1, +}; + +static int tsc_update_callback(void) +{ + int change = 0; + + static int dbug_count; + if (!(dbug_count++ % 1000)) + tsc_print_debuginfo(); + + /* check to see if we should switch to the safe clocksource: */ + if (clocksource_tsc.rating != 50 && check_tsc_unstable()) { + clocksource_tsc.rating = 50; + reselect_clocksource(); + change = 1; + } + + /* only update if tsc_khz has changed: */ + if (current_tsc_khz != tsc_khz) { + current_tsc_khz = tsc_khz; + clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, + clocksource_tsc.shift); + change = 1; + } + + return change; +} + +static int __init dmi_mark_tsc_unstable(struct dmi_system_id *d) +{ + printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", + d->ident); + mark_tsc_unstable(); + return 0; +} + +/* List of systems that have known TSC problems */ +static struct dmi_system_id __initdata bad_tsc_dmi_table[] = { + { + .callback = dmi_mark_tsc_unstable, + .ident = "IBM Thinkpad 380XD", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "2635FA0"), + }, + }, + {} +}; + +/* + * Make an educated guess if the TSC is trustworthy and synchronized + * over all CPUs. + */ +static __init int unsynchronized_tsc(void) +{ + /* + * Intel systems are normally all synchronized. 
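The update callback above demotes instead of unregistering: once the TSC is known unstable its rating drops to 50 and the core is asked to reselect, so any stable source outranks it from then on. Assuming the early generic-time core simply picks by rating, selection is essentially this (a sketch of the assumed shape, not the in-tree function):

	static struct clocksource *select_best_sketch(struct list_head *sources)
	{
		struct clocksource *cs, *best = NULL;

		list_for_each_entry(cs, sources, list)	/* 'list' member assumed */
			if (!best || cs->rating > best->rating)
				best = cs;

		return best;	/* a TSC demoted to 50 loses to pit/hpet/acpi_pm */
	}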
+ * Exceptions must mark TSC as unstable: + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + return 0; + + /* assume multi socket systems are not synchronized: */ + return num_possible_cpus() > 1; +} + +static int __init init_tsc_clocksource(void) +{ + + if (cpu_has_tsc && tsc_khz && !tsc_disable) { + /* check blacklist */ + dmi_check_system(bad_tsc_dmi_table); + + if (unsynchronized_tsc()) /* mark unstable if unsynced */ + mark_tsc_unstable(); + current_tsc_khz = tsc_khz; + clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, + clocksource_tsc.shift); + /* lower the rating if we already know its unstable: */ + if (check_tsc_unstable()) + clocksource_tsc.rating = 50; + return register_clocksource(&clocksource_tsc); + } + + return 0; +} + +module_init(init_tsc_clocksource); Index: linux-2.6.16/arch/i386/kernel/vm86.c =================================================================== --- linux-2.6.16.orig/arch/i386/kernel/vm86.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/kernel/vm86.c 2006-10-19 16:52:29.000000000 +0200 @@ -109,6 +109,7 @@ struct pt_regs * fastcall save_v86_state local_irq_enable(); if (!current->thread.vm86_info) { + local_irq_disable(); printk("no vm86_info: BAD\n"); do_exit(SIGSEGV); } Index: linux-2.6.16/arch/i386/lib/bitops.c =================================================================== --- linux-2.6.16.orig/arch/i386/lib/bitops.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/lib/bitops.c 2006-10-19 16:52:29.000000000 +0200 @@ -68,3 +68,39 @@ int find_next_zero_bit(const unsigned lo return (offset + set + res); } EXPORT_SYMBOL(find_next_zero_bit); + + +/* + * rw spinlock fallbacks + */ +#ifdef CONFIG_SMP +asm( +".section .sched.text,\"ax\"\n" +".align 4\n" +".globl __write_lock_failed\n" +"__write_lock_failed:\n\t" + LOCK "addl $" RW_LOCK_BIAS_STR ",(%eax)\n" +"1: rep; nop\n\t" + "cmpl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" + "jne 1b\n\t" + LOCK "subl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" + "jnz __write_lock_failed\n\t" + "ret\n" +".previous\n" +); + +asm( +".section .sched.text,\"ax\"\n" +".align 4\n" +".globl __read_lock_failed\n" +"__read_lock_failed:\n\t" + LOCK "incl (%eax)\n" +"1: rep; nop\n\t" + "cmpl $1,(%eax)\n\t" + "js 1b\n\t" + LOCK "decl (%eax)\n\t" + "js __read_lock_failed\n\t" + "ret\n" +".previous\n" +); +#endif Index: linux-2.6.16/arch/i386/lib/delay.c =================================================================== --- linux-2.6.16.orig/arch/i386/lib/delay.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/lib/delay.c 2006-10-19 16:52:29.000000000 +0200 @@ -10,43 +10,93 @@ * we have to worry about. 
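For readers who do not want to decode the AT&T asm in the bitops.c hunk above: the two fallbacks are the slow paths taken when the inline read/write lock fast path fails. A C transliteration (sketch; the real versions must stay asm because the lock pointer arrives in %eax rather than on the stack):

	static void write_lock_failed_sketch(atomic_t *count)
	{
		do {
			atomic_add(RW_LOCK_BIAS, count);	/* undo our claim */
			while (atomic_read(count) != RW_LOCK_BIAS)
				cpu_relax();			/* the rep; nop */
		} while (atomic_sub_return(RW_LOCK_BIAS, count) != 0);
	}

	static void read_lock_failed_sketch(atomic_t *count)
	{
		do {
			atomic_inc(count);			/* undo our decrement */
			while (atomic_read(count) < 1)
				cpu_relax();			/* writer active */
		} while (atomic_dec_return(count) < 0);
	}

They are moved into .sched.text, presumably so that time spent spinning here is treated like the other scheduling primitives by wchan and the latency tracing logic.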
*/ +#include +#include #include #include #include -#include + #include #include #include #ifdef CONFIG_SMP -#include +# include #endif -extern struct timer_opts* timer; +/* simple loop based delay: */ +static void delay_loop(unsigned long loops) +{ + int d0; + + __asm__ __volatile__( + "\tjmp 1f\n" + ".align 16\n" + "1:\tjmp 2f\n" + ".align 16\n" + "2:\tdecl %0\n\tjns 2b" + :"=&a" (d0) + :"0" (loops)); +} + +/* TSC based delay: */ +static void delay_tsc(unsigned long loops) +{ + unsigned long bclock, now; + + rdtscl(bclock); + do { + rep_nop(); + rdtscl(now); + } while ((now-bclock) < loops); +} + +/* + * Since we calibrate only once at boot, this + * function should be set once at boot and not changed + */ +static void (*delay_fn)(unsigned long) = delay_loop; + +void use_tsc_delay(void) +{ + delay_fn = delay_tsc; +} + +int read_current_timer(unsigned long *timer_val) +{ + if (delay_fn == delay_tsc) { + rdtscl(*timer_val); + return 0; + } + return -1; +} void __delay(unsigned long loops) { - cur_timer->delay(loops); + delay_fn(loops); } inline void __const_udelay(unsigned long xloops) { int d0; + xloops *= 4; __asm__("mull %0" :"=d" (xloops), "=&a" (d0) - :"1" (xloops),"0" (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4))); - __delay(++xloops); + :"1" (xloops), "0" + (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4))); + + __delay(++xloops); } void __udelay(unsigned long usecs) { - __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ + __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ } void __ndelay(unsigned long nsecs) { - __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ + __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ } EXPORT_SYMBOL(__delay); Index: linux-2.6.16/arch/i386/mach-default/setup.c =================================================================== --- linux-2.6.16.orig/arch/i386/mach-default/setup.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/mach-default/setup.c 2006-10-19 16:52:29.000000000 +0200 @@ -34,7 +34,7 @@ void __init pre_intr_init_hook(void) /* * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { no_action, SA_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL}; /** * intr_init_hook - post gate setup interrupt initialisation @@ -78,8 +78,6 @@ void __init trap_init_hook(void) { } -static struct irqaction irq0 = { timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL}; - /** * time_init_hook - do any specific initialisations for the system timer. 
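Back in the delay.c hunk: the magic constants make __const_udelay() a divide-free usecs to loops conversion. Unrolled into one expression (sketch; the in-tree code splits the multiply into *4 and HZ/4 to keep every intermediate inside 32 bits before the widening mull):

	/* 0x000010c7 is 2^32/1000000, rounded up */
	static unsigned long usecs_to_loops_sketch(unsigned long usecs,
						   unsigned long lpj)
	{
		unsigned long long x;

		x = (unsigned long long)(usecs * 0x000010c7UL) * (lpj * HZ);
		return (unsigned long)(x >> 32) + 1;	/* the '++xloops' */
	}

That is, loops = usecs * loops_per_jiffy * HZ / 10^6, with the >>32 taking the place of the divide.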
* @@ -89,7 +87,6 @@ static struct irqaction irq0 = { timer_ **/ void __init time_init_hook(void) { - setup_irq(0, &irq0); } #ifdef CONFIG_MCA Index: linux-2.6.16/arch/i386/mach-visws/setup.c =================================================================== --- linux-2.6.16.orig/arch/i386/mach-visws/setup.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/mach-visws/setup.c 2006-10-19 16:52:29.000000000 +0200 @@ -113,7 +113,7 @@ void __init pre_setup_arch_hook() static struct irqaction irq0 = { .handler = timer_interrupt, - .flags = SA_INTERRUPT, + .flags = SA_INTERRUPT | SA_NODELAY, .name = "timer", }; Index: linux-2.6.16/arch/i386/mach-visws/visws_apic.c =================================================================== --- linux-2.6.16.orig/arch/i386/mach-visws/visws_apic.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/mach-visws/visws_apic.c 2006-10-19 16:52:29.000000000 +0200 @@ -260,11 +260,13 @@ out_unlock: static struct irqaction master_action = { .handler = piix4_master_intr, .name = "PIIX4-8259", + .flags = SA_NODELAY, }; static struct irqaction cascade_action = { .handler = no_action, .name = "cascade", + .flags = SA_NODELAY, }; Index: linux-2.6.16/arch/i386/mach-voyager/setup.c =================================================================== --- linux-2.6.16.orig/arch/i386/mach-voyager/setup.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/mach-voyager/setup.c 2006-10-19 16:52:29.000000000 +0200 @@ -16,7 +16,7 @@ void __init pre_intr_init_hook(void) /* * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { no_action, SA_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL}; void __init intr_init_hook(void) { @@ -39,7 +39,7 @@ void __init trap_init_hook(void) { } -static struct irqaction irq0 = { timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL}; +static struct irqaction irq0 = { timer_interrupt, SA_INTERRUPT | SA_NODELAY, CPU_MASK_NONE, "timer", NULL, NULL}; void __init time_init_hook(void) { Index: linux-2.6.16/arch/i386/mm/fault.c =================================================================== --- linux-2.6.16.orig/arch/i386/mm/fault.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/mm/fault.c 2006-10-19 16:52:29.000000000 +0200 @@ -39,6 +39,8 @@ void bust_spinlocks(int yes) int loglevel_save = console_loglevel; if (yes) { + stop_trace(); + zap_rt_locks(); oops_in_progress = 1; return; } @@ -224,8 +226,8 @@ fastcall void do_invalid_op(struct pt_re * bit 1 == 0 means read, 1 means write * bit 2 == 0 means kernel, 1 means user-mode */ -fastcall void __kprobes do_page_fault(struct pt_regs *regs, - unsigned long error_code) +fastcall notrace void __kprobes do_page_fault(struct pt_regs *regs, + unsigned long error_code) { struct task_struct *tsk; struct mm_struct *mm; @@ -236,6 +238,7 @@ fastcall void __kprobes do_page_fault(st /* get the address */ address = read_cr2(); + trace_special(regs->eip, error_code, address); if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, SIGSEGV) == NOTIFY_STOP) @@ -449,9 +452,9 @@ no_context: } #endif if (address < PAGE_SIZE) - printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); + printk(KERN_ALERT "BUG: Unable to handle kernel NULL pointer dereference"); else - printk(KERN_ALERT "Unable to handle kernel paging request"); + printk(KERN_ALERT "BUG: Unable to handle kernel paging request"); printk(" at virtual 
address %08lx\n",address); printk(KERN_ALERT " printing eip:\n"); printk("%08lx\n", regs->eip); Index: linux-2.6.16/arch/i386/mm/highmem.c =================================================================== --- linux-2.6.16.orig/arch/i386/mm/highmem.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/mm/highmem.c 2006-10-19 16:52:29.000000000 +0200 @@ -18,6 +18,26 @@ void kunmap(struct page *page) kunmap_high(page); } +void kunmap_virt(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return; + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + kunmap(page); +} + +struct page *kmap_to_page(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return virt_to_page(ptr); + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + return page; +} + /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -26,7 +46,7 @@ void kunmap(struct page *page) * However when holding an atomic kmap is is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page, enum km_type type) { enum fixed_addresses idx; unsigned long vaddr; @@ -48,7 +68,7 @@ void *kmap_atomic(struct page *page, enu return (void*) vaddr; } -void kunmap_atomic(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr, enum km_type type) { #ifdef CONFIG_DEBUG_HIGHMEM unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; @@ -78,7 +98,7 @@ void kunmap_atomic(void *kvaddr, enum km /* This is the same as kmap_atomic() but can map memory that doesn't * have a struct page associated with it. 
*/ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +void *__kmap_atomic_pfn(unsigned long pfn, enum km_type type) { enum fixed_addresses idx; unsigned long vaddr; @@ -93,7 +113,7 @@ void *kmap_atomic_pfn(unsigned long pfn, return (void*) vaddr; } -struct page *kmap_atomic_to_page(void *ptr) +struct page *__kmap_atomic_to_page(void *ptr) { unsigned long idx, vaddr = (unsigned long)ptr; pte_t *pte; @@ -108,6 +128,7 @@ struct page *kmap_atomic_to_page(void *p EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic); -EXPORT_SYMBOL(kmap_atomic_to_page); +EXPORT_SYMBOL(kunmap_virt); +EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(__kmap_atomic_to_page); Index: linux-2.6.16/arch/i386/mm/init.c =================================================================== --- linux-2.6.16.orig/arch/i386/mm/init.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/mm/init.c 2006-10-19 16:52:29.000000000 +0200 @@ -44,7 +44,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; static int noinline do_test_wp_bit(void); Index: linux-2.6.16/arch/i386/mm/pageattr.c =================================================================== --- linux-2.6.16.orig/arch/i386/mm/pageattr.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/mm/pageattr.c 2006-10-19 16:52:29.000000000 +0200 @@ -223,8 +223,8 @@ void kernel_map_pages(struct page *page, if (PageHighMem(page)) return; if (!enable) - mutex_debug_check_no_locks_freed(page_address(page), - numpages * PAGE_SIZE); + debug_check_no_locks_freed(page_address(page), + numpages * PAGE_SIZE); /* the return value is ignored - the calls cannot fail, * large pages are disabled at boot time. Index: linux-2.6.16/arch/i386/mm/pgtable.c =================================================================== --- linux-2.6.16.orig/arch/i386/mm/pgtable.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/mm/pgtable.c 2006-10-19 16:52:29.000000000 +0200 @@ -183,7 +183,7 @@ void pmd_ctor(void *pmd, kmem_cache_t *c * recommendations and having no core impact whatsoever. * -- wli */ -DEFINE_SPINLOCK(pgd_lock); +DEFINE_RAW_SPINLOCK(pgd_lock); struct page *pgd_list; static inline void pgd_list_add(pgd_t *pgd) Index: linux-2.6.16/arch/i386/oprofile/Kconfig =================================================================== --- linux-2.6.16.orig/arch/i386/oprofile/Kconfig 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/oprofile/Kconfig 2006-10-19 16:52:29.000000000 +0200 @@ -15,3 +15,6 @@ config OPROFILE If unsure, say N. 
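On the highmem.c renames above: the leading underscores exist so -rt can re-point the public kmap_atomic() API at a sleeping implementation while keeping the raw fixmap versions for the few callers that really are atomic. The wrapper shape below is an assumption about the -rt headers, not a quote:

	static inline void *kmap_atomic_sketch(struct page *page,
					       enum km_type type)
	{
	#ifdef CONFIG_PREEMPT_RT
		return kmap(page);		/* may sleep; pins via pkmap */
	#else
		return __kmap_atomic(page, type);
	#endif
	}

kunmap_virt() is the matching teardown for pointers that may or may not come from the pkmap area, which is why it first range-checks against PKMAP_ADDR(0).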
+config PROFILE_NMI + bool + default y Index: linux-2.6.16/arch/i386/pci/Makefile =================================================================== --- linux-2.6.16.orig/arch/i386/pci/Makefile 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/pci/Makefile 2006-10-19 16:52:29.000000000 +0200 @@ -4,8 +4,9 @@ obj-$(CONFIG_PCI_BIOS) += pcbios.o obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o obj-$(CONFIG_PCI_DIRECT) += direct.o +obj-$(CONFIG_ACPI) += acpi.o + pci-y := fixup.o -pci-$(CONFIG_ACPI) += acpi.o pci-y += legacy.o irq.o pci-$(CONFIG_X86_VISWS) := visws.o fixup.o Index: linux-2.6.16/arch/i386/pci/direct.c =================================================================== --- linux-2.6.16.orig/arch/i386/pci/direct.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/i386/pci/direct.c 2006-10-19 16:52:29.000000000 +0200 @@ -211,16 +211,23 @@ static int __init pci_check_type1(void) unsigned int tmp; int works = 0; - local_irq_save(flags); + spin_lock_irqsave(&pci_config_lock, flags); outb(0x01, 0xCFB); tmp = inl(0xCF8); outl(0x80000000, 0xCF8); - if (inl(0xCF8) == 0x80000000 && pci_sanity_check(&pci_direct_conf1)) { - works = 1; + + if (inl(0xCF8) == 0x80000000) { + spin_unlock_irqrestore(&pci_config_lock, flags); + + if (pci_sanity_check(&pci_direct_conf1)) + works = 1; + + spin_lock_irqsave(&pci_config_lock, flags); } outl(tmp, 0xCF8); - local_irq_restore(flags); + + spin_unlock_irqrestore(&pci_config_lock, flags); return works; } @@ -230,17 +237,19 @@ static int __init pci_check_type2(void) unsigned long flags; int works = 0; - local_irq_save(flags); + spin_lock_irqsave(&pci_config_lock, flags); outb(0x00, 0xCFB); outb(0x00, 0xCF8); outb(0x00, 0xCFA); - if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00 && - pci_sanity_check(&pci_direct_conf2)) { - works = 1; - } - local_irq_restore(flags); + if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00) { + spin_unlock_irqrestore(&pci_config_lock, flags); + + if (pci_sanity_check(&pci_direct_conf2)) + works = 1; + } else + spin_unlock_irqrestore(&pci_config_lock, flags); return works; } Index: linux-2.6.16/arch/ia64/Kconfig =================================================================== --- linux-2.6.16.orig/arch/ia64/Kconfig 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ia64/Kconfig 2006-10-19 16:52:29.000000000 +0200 @@ -32,16 +32,13 @@ config SWIOTLB config RWSEM_XCHGADD_ALGORITHM bool + depends on !PREEMPT_RT default y config GENERIC_CALIBRATE_DELAY bool default y -config TIME_INTERPOLATION - bool - default y - config EFI bool default y @@ -240,6 +237,75 @@ config SMP If you don't know what to do here, say N. + +config GENERIC_TIME + bool + default y + +config TIME_INTERPOLATION + bool + default y + depends on !GENERIC_TIME + + +config HIGH_RES_TIMERS + bool "High-Resolution Timers" + help + + POSIX timers are available by default. This option enables + high-resolution POSIX timers. With this option the resolution + is at least 1 microsecond. High resolution is not free. If + enabled this option will add a small overhead each time a + timer expires that is not on a 1/HZ tick boundary. If no such + timers are used the overhead is nil. + + This option enables two additional POSIX CLOCKS, + CLOCK_REALTIME_HR and CLOCK_MONOTONIC_HR. Note that this + option does not change the resolution of CLOCK_REALTIME or + CLOCK_MONOTONIC which remain at 1/HZ resolution. 
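The practical effect of HIGH_RES_TIMERS is observable from userland as clock resolution. A minimal probe (plain POSIX C):

	#include <stdio.h>
	#include <time.h>

	int main(void)
	{
		struct timespec res;

		/* this one stays at 1/HZ, e.g. 10000000 ns, per the help text */
		clock_getres(CLOCK_MONOTONIC, &res);
		printf("resolution: %ld ns\n", res.tv_nsec);
		return 0;
	}

Note the help text above is explicit that CLOCK_REALTIME and CLOCK_MONOTONIC keep 1/HZ resolution; only the new _HR clock ids show the finer value, and their numeric ids are patch-specific, so substitute them to see the difference.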
+ +config HIGH_RES_RESOLUTION + int "High-Resolution-Timer resolution (nanoseconds)" + depends on HIGH_RES_TIMERS + default 1000 + help + + This sets the resolution of timers accessed with + CLOCK_REALTIME_HR and CLOCK_MONOTONIC_HR. Too + fine a resolution (small a number) will usually not + be observable due to normal system latencies. For an + 800 MHZ processor about 10,000 is the recommended maximum + (smallest number). If you don't need that sort of resolution, + higher numbers may generate less overhead. + +choice + prompt "Clock source" + depends on HIGH_RES_TIMERS + default HIGH_RES_TIMER_ITC + help + This option allows you to choose the hardware source in charge + of generating high precision interruptions on your system. + On IA-64 these are: + + + ITC Interval Time Counter 1/CPU clock + HPET High Precision Event Timer ~ (XXX:have to check the spec) + + The ITC timer is available on all the ia64 computers because + it is integrated directly into the processor. However it may not + give correct results on MP machines with processors running + at different clock rates. In this case you may want to use + the HPET if available on your machine. + + +config HIGH_RES_TIMER_ITC + bool "Interval Time Counter/ITC" + +config HIGH_RES_TIMER_HPET + bool "High Precision Event Timer/HPET" + +endchoice + config NR_CPUS int "Maximum number of CPUs (2-1024)" range 2 1024 @@ -271,17 +337,15 @@ config SCHED_SMT Intel IA64 chips with MultiThreading at a cost of slightly increased overhead in some places. If unsure say N here. -config PREEMPT - bool "Preemptible Kernel" - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. +source "kernel/Kconfig.preempt" + +config RWSEM_GENERIC_SPINLOCK + bool + depends on PREEMPT_RT + default y - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. +config PREEMPT + def_bool y if (PREEMPT_RT || PREEMPT_SOFTIRQS || PREEMPT_HARDIRQS || PREEMPT_VOLUNTARY || PREEMPT_DESKTOP) source "mm/Kconfig" Index: linux-2.6.16/arch/ia64/ia32/sys_ia32.c =================================================================== --- linux-2.6.16.orig/arch/ia64/ia32/sys_ia32.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ia64/ia32/sys_ia32.c 2006-10-19 16:52:29.000000000 +0200 @@ -1166,19 +1166,7 @@ put_tv32 (struct compat_timeval __user * asmlinkage unsigned long sys32_alarm (unsigned int seconds) { - struct itimerval it_new, it_old; - unsigned int oldalarm; - - it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; - it_new.it_value.tv_sec = seconds; - it_new.it_value.tv_usec = 0; - do_setitimer(ITIMER_REAL, &it_new, &it_old); - oldalarm = it_old.it_value.tv_sec; - /* ehhh.. We can't return 0 if we have an alarm pending.. */ - /* And we'd better return too much than too little anyway */ - if (it_old.it_value.tv_usec) - oldalarm++; - return oldalarm; + return alarm_setitimer(seconds); } /* Translations due to time_t size differences. 
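On the sys32_alarm() conversion above: the deleted open-coded sequence is exactly what the shared helper centralizes, so it can be reconstructed from the removed lines (sketch of kernel/itimer.c's alarm_setitimer()):

	static unsigned int alarm_setitimer_sketch(unsigned int seconds)
	{
		struct itimerval it_new, it_old;

		it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
		it_new.it_value.tv_sec = seconds;
		it_new.it_value.tv_usec = 0;
		do_setitimer(ITIMER_REAL, &it_new, &it_old);

		/*
		 * alarm() can't return 0 while an alarm is pending, and
		 * over-reporting beats under-reporting: round up.
		 */
		return it_old.it_value.tv_sec + !!it_old.it_value.tv_usec;
	}

The same helper replaces the near-identical copy in irix_alarm() further down.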
Which affects all Index: linux-2.6.16/arch/ia64/kernel/asm-offsets.c =================================================================== --- linux-2.6.16.orig/arch/ia64/kernel/asm-offsets.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ia64/kernel/asm-offsets.c 2006-10-19 16:52:29.000000000 +0200 @@ -247,6 +247,7 @@ void foo(void) offsetof (struct pal_min_state_area_s, pmsa_xip)); BLANK(); +#ifdef CONFIG_TIME_INTERPOLATION /* used by fsys_gettimeofday in arch/ia64/kernel/fsys.S */ DEFINE(IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET, offsetof (struct time_interpolator, addr)); DEFINE(IA64_TIME_INTERPOLATOR_SOURCE_OFFSET, offsetof (struct time_interpolator, source)); @@ -260,5 +261,6 @@ void foo(void) DEFINE(IA64_TIME_SOURCE_CPU, TIME_SOURCE_CPU); DEFINE(IA64_TIME_SOURCE_MMIO64, TIME_SOURCE_MMIO64); DEFINE(IA64_TIME_SOURCE_MMIO32, TIME_SOURCE_MMIO32); +#endif DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET, offsetof (struct timespec, tv_nsec)); } Index: linux-2.6.16/arch/ia64/kernel/entry.S =================================================================== --- linux-2.6.16.orig/arch/ia64/kernel/entry.S 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ia64/kernel/entry.S 2006-10-19 16:52:29.000000000 +0200 @@ -1105,23 +1105,25 @@ skip_rbs_switch: tbit.nz p6,p0=r31,TIF_SIGDELAYED // signal delayed from MCA/INIT/NMI/PMI context? (p6) br.cond.sptk.few .sigdelayed ;; - tbit.z p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched==0? + tbit.nz p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched==0? +(p6) br.cond.sptk.few .needresched + ;; + tbit.z p6,p0=r31,TIF_NEED_RESCHED_DELAYED // current_thread_info()->need_resched_delayed==0? (p6) br.cond.sptk.few .notify -#ifdef CONFIG_PREEMPT -(pKStk) dep r21=-1,r0,PREEMPT_ACTIVE_BIT,1 +.needresched: + +(pKStk) br.cond.sptk.many .fromkernel ;; -(pKStk) st4 [r20]=r21 ssm psr.i // enable interrupts -#endif br.call.spnt.many rp=schedule -.ret9: cmp.eq p6,p0=r0,r0 // p6 <- 1 - rsm psr.i // disable interrupts - ;; -#ifdef CONFIG_PREEMPT -(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13 +.ret9a: rsm psr.i // disable interrupts ;; -(pKStk) st4 [r20]=r0 // preempt_count() <- 0 -#endif + br.cond.sptk.many .endpreemptdep +.fromkernel: + br.call.spnt.many rp=preempt_schedule_irq +.ret9b: rsm psr.i // disable interrupts +.endpreemptdep: + cmp.eq p6,p0=r0,r0 // p6 <- 1 (pLvSys)br.cond.sptk.few .work_pending_syscall_end br.cond.sptk.many .work_processed_kernel // re-check @@ -1138,7 +1140,7 @@ skip_rbs_switch: .sigdelayed: br.call.sptk.many rp=do_sigdelayed - cmp.eq p6,p0=r0,r0 // p6 <- 1, always re-check +.ret10b: cmp.eq p6,p0=r0,r0 // p6 <- 1, always re-check (pLvSys)br.cond.sptk.few .work_pending_syscall_end br.cond.sptk.many .work_processed_kernel // re-check Index: linux-2.6.16/arch/ia64/kernel/fsys.S =================================================================== --- linux-2.6.16.orig/arch/ia64/kernel/fsys.S 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ia64/kernel/fsys.S 2006-10-19 16:52:29.000000000 +0200 @@ -145,6 +145,7 @@ ENTRY(fsys_set_tid_address) FSYS_RETURN END(fsys_set_tid_address) +#ifdef CONFIG_TIME_INTERPOLATION /* * Ensure that the time interpolator structure is compatible with the asm code */ @@ -348,6 +349,26 @@ ENTRY(fsys_clock_gettime) br.many .gettime END(fsys_clock_gettime) + +#else // !CONFIG_TIME_INTERPOLATION + +# define fsys_gettimeofday 0 +# define fsys_clock_gettime 0 + +.fail_einval: + mov r8 = EINVAL + mov r10 = -1 + FSYS_RETURN + +.fail_efault: + mov r8 = EFAULT + mov r10 = -1 + FSYS_RETURN 
+ +#endif + + + /* * long fsys_rt_sigprocmask (int how, sigset_t *set, sigset_t *oset, size_t sigsetsize). */ Index: linux-2.6.16/arch/ia64/kernel/iosapic.c =================================================================== --- linux-2.6.16.orig/arch/ia64/kernel/iosapic.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ia64/kernel/iosapic.c 2006-10-19 16:52:29.000000000 +0200 @@ -102,7 +102,7 @@ #define NR_PREALLOCATE_RTE_ENTRIES (PAGE_SIZE / sizeof(struct iosapic_rte_info)) #define RTE_PREALLOCATED (1) -static DEFINE_SPINLOCK(iosapic_lock); +static DEFINE_RAW_SPINLOCK(iosapic_lock); /* These tables map IA-64 vectors to the IOSAPIC pin that generates this vector. */ @@ -379,6 +379,34 @@ iosapic_startup_level_irq (unsigned int return 0; } +/* + * In the preemptible case mask the IRQ first then handle it and ack it. + */ +#ifdef CONFIG_PREEMPT_HARDIRQS + +static void +iosapic_ack_level_irq (unsigned int irq) +{ + ia64_vector vec = irq_to_vector(irq); + struct iosapic_rte_info *rte; + + move_irq(irq); + mask_irq(irq); + list_for_each_entry(rte, &iosapic_intr_info[vec].rtes, rte_list) + iosapic_eoi(rte->addr, vec); +} + +static void +iosapic_end_level_irq (unsigned int irq) +{ + if (!(irq_desc[irq].status & IRQ_INPROGRESS)) + unmask_irq(irq); +} + +#else /* !CONFIG_PREEMPT_HARDIRQS */ + +#define iosapic_ack_level_irq nop + static void iosapic_end_level_irq (unsigned int irq) { @@ -390,10 +418,12 @@ iosapic_end_level_irq (unsigned int irq) iosapic_eoi(rte->addr, vec); } + +#endif + #define iosapic_shutdown_level_irq mask_irq #define iosapic_enable_level_irq unmask_irq #define iosapic_disable_level_irq mask_irq -#define iosapic_ack_level_irq nop struct hw_interrupt_type irq_type_iosapic_level = { .typename = "IO-SAPIC-level", Index: linux-2.6.16/arch/ia64/kernel/mca.c =================================================================== --- linux-2.6.16.orig/arch/ia64/kernel/mca.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ia64/kernel/mca.c 2006-10-19 16:52:29.000000000 +0200 @@ -151,7 +151,7 @@ ia64_mca_spin(const char *func) typedef struct ia64_state_log_s { - spinlock_t isl_lock; + raw_spinlock_t isl_lock; int isl_index; unsigned long isl_count; ia64_err_rec_t *isl_log[IA64_MAX_LOGS]; /* need space to store header + error log */ Index: linux-2.6.16/arch/ia64/kernel/perfmon.c =================================================================== --- linux-2.6.16.orig/arch/ia64/kernel/perfmon.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ia64/kernel/perfmon.c 2006-10-19 16:52:30.000000000 +0200 @@ -278,7 +278,7 @@ typedef struct { */ typedef struct pfm_context { - spinlock_t ctx_lock; /* context protection */ + raw_spinlock_t ctx_lock; /* context protection */ pfm_context_flags_t ctx_flags; /* bitmask of flags (block reason incl.) 
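A note on the iosapic.c hunk above: with PREEMPT_HARDIRQS the handler runs in a thread, so a level-triggered line must be masked before the EOI, otherwise it re-raises immediately and storms. The required ordering, with hypothetical helper names (the real code splits this between ->ack and ->end):

	static void threaded_level_irq_sketch(unsigned int irq)
	{
		mask_irq(irq);			/* gate the still-asserted line */
		iosapic_eoi_rtes(irq);		/* hypothetical: EOI each RTE */
		run_irq_thread(irq);		/* hypothetical: preemptible handlers */
		unmask_irq(irq);		/* ->end, once IRQ_INPROGRESS clears */
	}

In the non-threaded build the old order survives: ->ack stays a nop and the EOI happens in ->end, with the line never masked.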
*/ unsigned int ctx_state; /* state: active/inactive (no bitfield) */ @@ -364,7 +364,7 @@ typedef struct pfm_context { * mostly used to synchronize between system wide and per-process */ typedef struct { - spinlock_t pfs_lock; /* lock the structure */ + raw_spinlock_t pfs_lock; /* lock the structure */ unsigned int pfs_task_sessions; /* number of per task sessions */ unsigned int pfs_sys_sessions; /* number of per system wide sessions */ @@ -505,7 +505,7 @@ static pfm_intr_handler_desc_t *pfm_alt static struct proc_dir_entry *perfmon_dir; static pfm_uuid_t pfm_null_uuid = {0,}; -static spinlock_t pfm_buffer_fmt_lock; +static raw_spinlock_t pfm_buffer_fmt_lock; static LIST_HEAD(pfm_buffer_fmt_list); static pmu_config_t *pmu_conf; Index: linux-2.6.16/arch/ia64/kernel/process.c =================================================================== --- linux-2.6.16.orig/arch/ia64/kernel/process.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ia64/kernel/process.c 2006-10-19 16:52:30.000000000 +0200 @@ -98,6 +98,9 @@ show_stack (struct task_struct *task, un void dump_stack (void) { + if (irqs_disabled()) { + printk("Uh oh.. entering dump_stack() with irqs disabled.\n"); + } show_stack(NULL, NULL); } @@ -201,7 +204,7 @@ void default_idle (void) { local_irq_enable(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { if (can_do_pal_halt) safe_halt(); else @@ -277,7 +280,7 @@ cpu_idle (void) else set_thread_flag(TIF_POLLING_NRFLAG); - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { void (*idle)(void); #ifdef CONFIG_SMP min_xtp(); @@ -299,10 +302,11 @@ cpu_idle (void) normal_xtp(); #endif } - preempt_enable_no_resched(); - schedule(); + __preempt_enable_no_resched(); + __schedule(); + preempt_disable(); - check_pgt_cache(); + if (cpu_is_offline(cpu)) play_dead(); } Index: linux-2.6.16/arch/ia64/kernel/sal.c =================================================================== --- linux-2.6.16.orig/arch/ia64/kernel/sal.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ia64/kernel/sal.c 2006-10-19 16:52:30.000000000 +0200 @@ -19,7 +19,7 @@ #include #include - __cacheline_aligned DEFINE_SPINLOCK(sal_lock); + __cacheline_aligned DEFINE_RAW_SPINLOCK(sal_lock); unsigned long sal_platform_features; unsigned short sal_revision; Index: linux-2.6.16/arch/ia64/kernel/salinfo.c =================================================================== --- linux-2.6.16.orig/arch/ia64/kernel/salinfo.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ia64/kernel/salinfo.c 2006-10-19 16:52:30.000000000 +0200 @@ -141,7 +141,7 @@ enum salinfo_state { struct salinfo_data { cpumask_t cpu_event; /* which cpus have outstanding events */ - struct semaphore mutex; + struct compat_semaphore mutex; u8 *log_buffer; u64 log_size; u8 *oemdata; /* decoded oem data */ @@ -157,8 +157,8 @@ struct salinfo_data { static struct salinfo_data salinfo_data[ARRAY_SIZE(salinfo_log_name)]; -static DEFINE_SPINLOCK(data_lock); -static DEFINE_SPINLOCK(data_saved_lock); +static DEFINE_RAW_SPINLOCK(data_lock); +static DEFINE_RAW_SPINLOCK(data_saved_lock); /** salinfo_platform_oemdata - optional callback to decode oemdata from an error * record. 
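The compat_semaphore switches here and in the next file follow one rule: under PREEMPT_RT a struct semaphore becomes a sleeping PI mutex, so any user that depends on the classic counting semantics keeps the original implementation under the compat_ name. The preserved type is just the stock 2.6.16 layout (sketch):

	struct compat_semaphore_sketch {
		atomic_t count;			/* > 0 means free slots */
		int sleepers;
		wait_queue_head_t wait;		/* classic 2.6.16 semaphore */
	};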
Index: linux-2.6.16/arch/ia64/kernel/semaphore.c =================================================================== --- linux-2.6.16.orig/arch/ia64/kernel/semaphore.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ia64/kernel/semaphore.c 2006-10-19 16:52:30.000000000 +0200 @@ -40,12 +40,12 @@ */ void -__up (struct semaphore *sem) +__up (struct compat_semaphore *sem) { wake_up(&sem->wait); } -void __sched __down (struct semaphore *sem) +void __sched __down (struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -82,7 +82,7 @@ void __sched __down (struct semaphore *s tsk->state = TASK_RUNNING; } -int __sched __down_interruptible (struct semaphore * sem) +int __sched __down_interruptible (struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -142,7 +142,7 @@ int __sched __down_interruptible (struct * count. */ int -__down_trylock (struct semaphore *sem) +__down_trylock (struct compat_semaphore *sem) { unsigned long flags; int sleepers; Index: linux-2.6.16/arch/ia64/kernel/signal.c =================================================================== --- linux-2.6.16.orig/arch/ia64/kernel/signal.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ia64/kernel/signal.c 2006-10-19 16:52:30.000000000 +0200 @@ -488,6 +488,14 @@ ia64_do_signal (sigset_t *oldset, struct long errno = scr->pt.r8; # define ERR_CODE(c) (IS_IA32_PROCESS(&scr->pt) ? -(c) : (c)) +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif + /* * In the ia64_leave_kernel code path, we want the common case to go fast, which * is why we may in certain cases get here from kernel mode. Just return without Index: linux-2.6.16/arch/ia64/kernel/smp.c =================================================================== --- linux-2.6.16.orig/arch/ia64/kernel/smp.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ia64/kernel/smp.c 2006-10-19 16:52:30.000000000 +0200 @@ -222,6 +222,22 @@ smp_send_reschedule (int cpu) platform_send_ipi(cpu, IA64_IPI_RESCHEDULE, IA64_IPI_DM_INT, 0); } +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + unsigned int cpu; + + for_each_online_cpu(cpu) { + if (cpu != smp_processor_id()) + platform_send_ipi(cpu, IA64_IPI_RESCHEDULE, IA64_IPI_DM_INT, 0); + } +} + + void smp_flush_tlb_all (void) { Index: linux-2.6.16/arch/ia64/kernel/smpboot.c =================================================================== --- linux-2.6.16.orig/arch/ia64/kernel/smpboot.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ia64/kernel/smpboot.c 2006-10-19 16:52:30.000000000 +0200 @@ -324,6 +324,8 @@ smp_setup_percpu_timer (void) { } +extern void register_itc_clockevent(void); + static void __devinit smp_callin (void) { @@ -379,6 +381,7 @@ smp_callin (void) #ifdef CONFIG_IA32_SUPPORT ia32_gdt_init(); #endif + register_itc_clockevent(); /* * Allow the master to continue. 
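register_itc_clockevent() must run on each CPU because the ITC is a per-CPU counter: the boot CPU registers from time_init() and every secondary from smp_callin() above. The time.c rework that follows makes one ITM register serve both the periodic tick and one-shot events, which reduces to "always arm the earlier deadline" (a sketch of the rule, grounded in itc_set_next_event() below):

	static void arm_itm_sketch(unsigned long tick_next,
				   unsigned long timer_next)
	{
		unsigned long next = time_before(timer_next, tick_next) ?
					timer_next : tick_next;

		ia64_set_itm(next);	/* earlier of the two deadlines */
	}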
Index: linux-2.6.16/arch/ia64/kernel/time.c =================================================================== --- linux-2.6.16.orig/arch/ia64/kernel/time.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ia64/kernel/time.c 2006-10-19 16:52:30.000000000 +0200 @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -41,12 +42,102 @@ EXPORT_SYMBOL(last_cli_ip); #endif +#ifdef CONFIG_GENERIC_TIME +static void itc_set_next_event(unsigned long evt) +{ + unsigned long flags; + unsigned long new_itm; + + raw_local_irq_save(flags); + + new_itm = ia64_get_itc() + evt; + local_cpu_data->itm_timer_next = new_itm; + + if (time_before(new_itm, local_cpu_data->itm_tick_next)) + ia64_set_itm(new_itm); + + raw_local_irq_restore(flags); +} + +static struct clock_event itc_clockevent = { + .name = "itc_clockevent", + .capabilities = CLOCK_CAP_NEXTEVT, + .max_delta_ns = 1000000000, // fixme... + .min_delta_ns = 1000, // fixme... + .shift = 10, + .set_next_event = itc_set_next_event, +}; + +/* arch specific timeofday hooks */ +s64 read_persistent_clock(void) +{ + struct timespec ts; + efi_gettimeofday(&ts); + return (s64) ts.tv_sec * NSEC_PER_SEC + (s64) ts.tv_nsec; +} + +void sync_persistent_clock(struct timespec ts) +{ + // todo +} + +static unsigned long last_itc = 0; + +static cycle_t itc_clocksource_read(void) +{ + unsigned long now = ia64_get_itc(); + + // I suppose this should be sufficient to have a monotonic clock + if (last_itc < now) + last_itc = now; + + return (cycle_t) last_itc; +} + +static struct clocksource itc_clocksource = { + .name = "itc_clocksource", + .rating = 200, + .shift = 10, + .mask = 0xffffffffffffffff, + .read = itc_clocksource_read, + .is_continuous = 1, +}; + + +static void register_itc_clocksource(void) +{ + printk("Registering itc clocksource\n"); + itc_clocksource.mult = clocksource_hz2mult(local_cpu_data->itc_freq, itc_clocksource.shift); + printk("itc_freq = %lu itc_clocksource.mult = %u\n", local_cpu_data->itc_freq, itc_clocksource.mult); + register_clocksource(&itc_clocksource); +} + + +void register_itc_clockevent(void) +{ + printk("Registering itc clockevent on cpu %d\n", smp_processor_id()); + if (!itc_clockevent.mult) + itc_clockevent.mult = clocksource_hz2mult(local_cpu_data->itc_freq, itc_clockevent.shift); + setup_local_clockevent(&itc_clockevent, CPU_MASK_NONE); +} + + + +#endif + + + + +#ifdef CONFIG_TIME_INTERPOLATION + static struct time_interpolator itc_interpolator = { .shift = 16, .mask = 0xffffffffffffffffLL, .source = TIME_SOURCE_CPU }; +#endif + static irqreturn_t timer_interrupt (int irq, void *dev_id, struct pt_regs *regs) { @@ -58,38 +149,57 @@ timer_interrupt (int irq, void *dev_id, platform_timer_interrupt(irq, dev_id, regs); +#if 0 new_itm = local_cpu_data->itm_next; if (!time_after(ia64_get_itc(), new_itm)) printk(KERN_ERR "Oops: timer tick before it's due (itc=%lx,itm=%lx)\n", ia64_get_itc(), new_itm); +#endif - profile_tick(CPU_PROFILING, regs); + if (time_after(ia64_get_itc(), local_cpu_data->itm_tick_next)) { - while (1) { - update_process_times(user_mode(regs)); + unsigned long new_tick_itm; + new_tick_itm = local_cpu_data->itm_tick_next; - new_itm += local_cpu_data->itm_delta; + profile_tick(CPU_PROFILING, regs); - if (smp_processor_id() == TIME_KEEPER_ID) { - /* - * Here we are in the timer irq handler. We have irqs locally - * disabled, but we don't know if the timer_bh is running on - * another CPU. We need to avoid to SMP race by acquiring the - * xtime_lock. 
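On the itc_clocksource_read() clamp above: a plain static last_itc is not SMP-safe, since two CPUs can race the load/store, which is presumably what the "I suppose" comment is nodding at. A lock-free alternative would be a cmpxchg max-update (sketch only; ia64 does have a 64-bit cmpxchg):

	static cycle_t itc_read_monotonic_sketch(void)
	{
		unsigned long now = ia64_get_itc(), last;

		do {
			last = last_itc;
			if (now <= last)
				return (cycle_t)last;	/* another CPU is ahead */
		} while (cmpxchg(&last_itc, last, now) != last);

		return (cycle_t)now;
	}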
- */ - write_seqlock(&xtime_lock); - do_timer(regs); - local_cpu_data->itm_next = new_itm; - write_sequnlock(&xtime_lock); - } else - local_cpu_data->itm_next = new_itm; + while (1) { + update_process_times(user_mode(regs)); + + new_tick_itm += local_cpu_data->itm_tick_delta; + + if (smp_processor_id() == TIME_KEEPER_ID) { + /* + * Here we are in the timer irq handler. We have irqs locally + * disabled, but we don't know if the timer_bh is running on + * another CPU. We need to avoid to SMP race by acquiring the + * xtime_lock. + */ + write_seqlock(&xtime_lock); + do_timer(regs); + local_cpu_data->itm_tick_next = new_tick_itm; + write_sequnlock(&xtime_lock); + } else + local_cpu_data->itm_tick_next = new_tick_itm; + + if (time_after(new_tick_itm, ia64_get_itc())) + break; + } + } - if (time_after(new_itm, ia64_get_itc())) - break; + if (time_after(ia64_get_itc(), local_cpu_data->itm_timer_next)) { + if (itc_clockevent.event_handler) + itc_clockevent.event_handler(regs); } do { + // FIXME, really, please + new_itm = local_cpu_data->itm_tick_next; + + if (time_after(new_itm, local_cpu_data->itm_timer_next)) + new_itm = local_cpu_data->itm_timer_next; + /* * If we're too close to the next clock tick for * comfort, we increase the safety margin by @@ -99,8 +209,8 @@ timer_interrupt (int irq, void *dev_id, * too fast (with the potentially devastating effect * of losing monotony of time). */ - while (!time_after(new_itm, ia64_get_itc() + local_cpu_data->itm_delta/2)) - new_itm += local_cpu_data->itm_delta; + while (!time_after(new_itm, ia64_get_itc() + local_cpu_data->itm_tick_delta/2)) + new_itm += local_cpu_data->itm_tick_delta; ia64_set_itm(new_itm); /* double check, in case we got hit by a (slow) PMI: */ } while (time_after_eq(ia64_get_itc(), new_itm)); @@ -119,7 +229,7 @@ ia64_cpu_local_tick (void) /* arrange for the cycle counter to generate a timer interrupt: */ ia64_set_itv(IA64_TIMER_VECTOR); - delta = local_cpu_data->itm_delta; + delta = local_cpu_data->itm_tick_delta; /* * Stagger the timer tick for each CPU so they don't occur all at (almost) the * same time: @@ -128,8 +238,8 @@ ia64_cpu_local_tick (void) unsigned long hi = 1UL << ia64_fls(cpu); shift = (2*(cpu - hi) + 1) * delta/hi/2; } - local_cpu_data->itm_next = ia64_get_itc() + delta + shift; - ia64_set_itm(local_cpu_data->itm_next); + local_cpu_data->itm_tick_next = ia64_get_itc() + delta + shift; + ia64_set_itm(local_cpu_data->itm_tick_next); } static int nojitter; @@ -187,7 +297,7 @@ ia64_init_itm (void) itc_freq = (platform_base_freq*itc_ratio.num)/itc_ratio.den; - local_cpu_data->itm_delta = (itc_freq + HZ/2) / HZ; + local_cpu_data->itm_tick_delta = (itc_freq + HZ/2) / HZ; printk(KERN_DEBUG "CPU %d: base freq=%lu.%03luMHz, ITC ratio=%lu/%lu, " "ITC freq=%lu.%03luMHz", smp_processor_id(), platform_base_freq / 1000000, (platform_base_freq / 1000) % 1000, @@ -207,6 +317,7 @@ ia64_init_itm (void) local_cpu_data->nsec_per_cyc = ((NSEC_PER_SEC<itc_freq; itc_interpolator.drift = itc_drift; @@ -225,6 +336,7 @@ ia64_init_itm (void) #endif register_time_interpolator(&itc_interpolator); } +#endif /* Setup the CPU local timer tick */ ia64_cpu_local_tick(); @@ -232,7 +344,7 @@ ia64_init_itm (void) static struct irqaction timer_irqaction = { .handler = timer_interrupt, - .flags = SA_INTERRUPT, + .flags = SA_INTERRUPT | SA_NODELAY, .name = "timer" }; @@ -248,6 +360,8 @@ time_init (void) * tv_nsec field must be normalized (i.e., 0 <= nsec < NSEC_PER_SEC). 
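A note on ia64_cpu_local_tick() above: the stagger places each CPU's tick at an odd multiple of delta/(2*hi) within its power-of-two band of CPU numbers. Worked through (sketch of the same expression):

	static unsigned long tick_stagger_sketch(int cpu, unsigned long delta)
	{
		unsigned long hi = 1UL << ia64_fls(cpu);	/* cpu 5 -> hi 4 */

		return (2*(cpu - hi) + 1) * delta/hi/2;		/* cpu 5 -> 3*delta/8 */
	}

So CPUs 4 through 7 fire at delta/8, 3*delta/8, 5*delta/8 and 7*delta/8 past the base deadline, never all at once.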
*/ set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); + register_itc_clocksource(); + register_itc_clockevent(); } /* Index: linux-2.6.16/arch/ia64/kernel/traps.c =================================================================== --- linux-2.6.16.orig/arch/ia64/kernel/traps.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ia64/kernel/traps.c 2006-10-19 16:52:30.000000000 +0200 @@ -25,7 +25,7 @@ #include #include -extern spinlock_t timerlist_lock; +extern raw_spinlock_t timerlist_lock; fpswa_interface_t *fpswa_interface; EXPORT_SYMBOL(fpswa_interface); @@ -86,11 +86,11 @@ void die (const char *str, struct pt_regs *regs, long err) { static struct { - spinlock_t lock; + raw_spinlock_t lock; u32 lock_owner; int lock_owner_depth; } die = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED, .lock_owner = -1, .lock_owner_depth = 0 }; @@ -230,7 +230,7 @@ __kprobes ia64_bad_break (unsigned long * access to fph by the time we get here, as the IVT's "Disabled FP-Register" handler takes * care of clearing psr.dfh. */ -static inline void +void disabled_fph_fault (struct pt_regs *regs) { struct ia64_psr *psr = ia64_psr(regs); @@ -249,7 +249,7 @@ disabled_fph_fault (struct pt_regs *regs = (struct task_struct *)ia64_get_kr(IA64_KR_FPU_OWNER); if (ia64_is_local_fpu_owner(current)) { - preempt_enable_no_resched(); + __preempt_enable_no_resched(); return; } @@ -269,7 +269,7 @@ disabled_fph_fault (struct pt_regs *regs */ psr->mfh = 1; } - preempt_enable_no_resched(); + __preempt_enable_no_resched(); } static inline int Index: linux-2.6.16/arch/ia64/kernel/unwind.c =================================================================== --- linux-2.6.16.orig/arch/ia64/kernel/unwind.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ia64/kernel/unwind.c 2006-10-19 16:52:30.000000000 +0200 @@ -81,7 +81,7 @@ typedef unsigned long unw_word; typedef unsigned char unw_hash_index_t; static struct { - spinlock_t lock; /* spinlock for unwind data */ + raw_spinlock_t lock; /* spinlock for unwind data */ /* list of unwind tables (one per load-module) */ struct unw_table *tables; @@ -145,7 +145,7 @@ static struct { # endif } unw = { .tables = &unw.kernel_table, - .lock = SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED, .save_order = { UNW_REG_RP, UNW_REG_PFS, UNW_REG_PSP, UNW_REG_PR, UNW_REG_UNAT, UNW_REG_LC, UNW_REG_FPSR, UNW_REG_PRI_UNAT_GR Index: linux-2.6.16/arch/ia64/kernel/unwind_i.h =================================================================== --- linux-2.6.16.orig/arch/ia64/kernel/unwind_i.h 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ia64/kernel/unwind_i.h 2006-10-19 16:52:30.000000000 +0200 @@ -154,7 +154,7 @@ struct unw_script { unsigned long ip; /* ip this script is for */ unsigned long pr_mask; /* mask of predicates script depends on */ unsigned long pr_val; /* predicate values this script is for */ - rwlock_t lock; + raw_rwlock_t lock; unsigned int flags; /* see UNW_FLAG_* in unwind.h */ unsigned short lru_chain; /* used for least-recently-used chain */ unsigned short coll_chain; /* used for hash collisions */ Index: linux-2.6.16/arch/ia64/mm/init.c =================================================================== --- linux-2.6.16.orig/arch/ia64/mm/init.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ia64/mm/init.c 2006-10-19 16:52:30.000000000 +0200 @@ -37,7 +37,7 @@ #include #include -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); DEFINE_PER_CPU(unsigned long 
*, __pgtable_quicklist); DEFINE_PER_CPU(long, __pgtable_quicklist_size); @@ -93,15 +93,11 @@ check_pgt_cache(void) if (unlikely(pgtable_quicklist_size <= MIN_PGT_PAGES)) return; - preempt_disable(); while (unlikely((pages_to_free = min_pages_to_free()) > 0)) { while (pages_to_free--) { free_page((unsigned long)pgtable_quicklist_alloc()); } - preempt_enable(); - preempt_disable(); } - preempt_enable(); } void Index: linux-2.6.16/arch/ia64/mm/tlb.c =================================================================== --- linux-2.6.16.orig/arch/ia64/mm/tlb.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ia64/mm/tlb.c 2006-10-19 16:52:30.000000000 +0200 @@ -33,7 +33,7 @@ static struct { } purge; struct ia64_ctx ia64_ctx = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED, .next = 1, .max_ctx = ~0U }; Index: linux-2.6.16/arch/mips/Kconfig =================================================================== --- linux-2.6.16.orig/arch/mips/Kconfig 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/mips/Kconfig 2006-10-19 16:52:30.000000000 +0200 @@ -362,6 +362,7 @@ config MOMENCO_JAGUAR_ATX config MOMENCO_OCELOT bool "Support for Momentum Ocelot board" select DMA_NONCOHERENT + select NO_SPINLOCK select HW_HAS_PCI select IRQ_CPU select IRQ_CPU_RM7K @@ -793,12 +794,21 @@ source "arch/mips/philips/pnx8550/common endmenu +source "kernel/Kconfig.preempt" + config RWSEM_GENERIC_SPINLOCK bool + depends on !PREEMPT_RT default y config RWSEM_XCHGADD_ALGORITHM bool + depends on !PREEMPT_RT + +config ASM_SEMAPHORES + bool +# depends on !PREEMPT_RT + default y config GENERIC_CALIBRATE_DELAY bool @@ -833,6 +843,9 @@ config DMA_NEED_PCI_MAP_STATE config OWN_DMA bool +config NO_SPINLOCK + bool + config EARLY_PRINTK bool @@ -1634,10 +1647,6 @@ config MIPS_INSANE_LARGE endmenu -config RWSEM_GENERIC_SPINLOCK - bool - default y - source "init/Kconfig" menu "Bus options (PCI, PCMCIA, EISA, ISA, TC)" Index: linux-2.6.16/arch/mips/kernel/Makefile =================================================================== --- linux-2.6.16.orig/arch/mips/kernel/Makefile 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/mips/kernel/Makefile 2006-10-19 16:52:30.000000000 +0200 @@ -5,7 +5,7 @@ extra-y := head.o init_task.o vmlinux.lds obj-y += cpu-probe.o branch.o entry.o genex.o irq.o process.o \ - ptrace.o reset.o semaphore.o setup.o signal.o syscall.o \ + ptrace.o reset.o setup.o signal.o syscall.o \ time.o traps.o unaligned.o binfmt_irix-objs := irixelf.o irixinv.o irixioctl.o irixsig.o \ @@ -13,6 +13,8 @@ binfmt_irix-objs := irixelf.o irixinv.o obj-$(CONFIG_MODULES) += mips_ksyms.o module.o +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o + obj-$(CONFIG_CPU_R3000) += r2300_fpu.o r2300_switch.o obj-$(CONFIG_CPU_TX39XX) += r2300_fpu.o r2300_switch.o obj-$(CONFIG_CPU_TX49XX) += r4k_fpu.o r4k_switch.o Index: linux-2.6.16/arch/mips/kernel/asm-offsets.c =================================================================== --- linux-2.6.16.orig/arch/mips/kernel/asm-offsets.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/mips/kernel/asm-offsets.c 2006-10-19 16:52:30.000000000 +0200 @@ -11,6 +11,9 @@ #include #include #include +#include +#include +#include #include #include #include Index: linux-2.6.16/arch/mips/kernel/entry.S =================================================================== --- linux-2.6.16.orig/arch/mips/kernel/entry.S 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/mips/kernel/entry.S 2006-10-19 16:52:30.000000000 +0200 @@ -23,7 +23,7 @@ .endm #else .macro 
preempt_stop - local_irq_disable + mips_raw_local_irq_disable .endm #define resume_kernel restore_all #endif @@ -38,7 +38,7 @@ FEXPORT(ret_from_irq) beqz t0, resume_kernel resume_userspace: - local_irq_disable # make sure we dont miss an + mips_raw_local_irq_disable # make sure we dont miss an # interrupt setting need_resched # between sampling and return LONG_L a2, TI_FLAGS($28) # current->work @@ -48,7 +48,9 @@ resume_userspace: #ifdef CONFIG_PREEMPT resume_kernel: - local_irq_disable + mips_local_irq_disable + lw t0, kernel_preemption + beqz t0, restore_all lw t0, TI_PRE_COUNT($28) bnez t0, restore_all need_resched: @@ -66,7 +68,7 @@ FEXPORT(ret_from_fork) jal schedule_tail # a0 = task_t *prev FEXPORT(syscall_exit) - local_irq_disable # make sure need_resched and + mips_raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) # current->work @@ -85,19 +87,21 @@ FEXPORT(restore_partial) # restore part .set at work_pending: - andi t0, a2, _TIF_NEED_RESCHED # a2 is preloaded with TI_FLAGS + # a2 is preloaded with TI_FLAGS + andi t0, a2, (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beqz t0, work_notifysig work_resched: + mips_raw_local_irq_enable t0 jal schedule - local_irq_disable # make sure need_resched and + mips_raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) andi t0, a2, _TIF_WORK_MASK # is there any work to be done # other than syscall tracing? beqz t0, restore_all - andi t0, a2, _TIF_NEED_RESCHED + andi t0, a2, (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bnez t0, work_resched work_notifysig: # deal with pending signals and @@ -113,7 +117,7 @@ syscall_exit_work: li t0, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT and t0, a2 # a2 is preloaded with TI_FLAGS beqz t0, work_pending # trace bit set? 
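On the _TIF_NEED_RESCHED_DELAYED tests above: as I read the -rt semantics, the DELAYED bit marks a reschedule that was deliberately postponed (the wakeup should not preempt on the spot), so every return path has to honour both bits or such a task would stall until the next tick. The loop the asm implements, rendered in C (sketch):

	#define WORK_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED)

	static void work_resched_sketch(struct thread_info *ti)
	{
		while (ti->flags & WORK_RESCHED_MASK) {
			raw_local_irq_enable();
			schedule();
			/* re-sample with irqs off so no wakeup is missed */
			raw_local_irq_disable();
		}
	}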
- local_irq_enable # could let do_syscall_trace() + mips_raw_local_irq_enable # could let do_syscall_trace() # call schedule() instead move a0, sp li a1, 1 Index: linux-2.6.16/arch/mips/kernel/i8259.c =================================================================== --- linux-2.6.16.orig/arch/mips/kernel/i8259.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/mips/kernel/i8259.c 2006-10-19 16:52:30.000000000 +0200 @@ -31,7 +31,7 @@ void disable_8259A_irq(unsigned int irq) * moves to arch independent land */ -DEFINE_SPINLOCK(i8259A_lock); +DEFINE_RAW_SPINLOCK(i8259A_lock); static void end_8259A_irq (unsigned int irq) { Index: linux-2.6.16/arch/mips/kernel/init_task.c =================================================================== --- linux-2.6.16.orig/arch/mips/kernel/init_task.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/mips/kernel/init_task.c 2006-10-19 16:52:30.000000000 +0200 @@ -9,8 +9,8 @@ #include #include -static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; +static struct fs_struct init_fs = INIT_FS(init_fs); +static struct files_struct init_files = INIT_FILES(init_files); static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); Index: linux-2.6.16/arch/mips/kernel/irq.c =================================================================== --- linux-2.6.16.orig/arch/mips/kernel/irq.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/mips/kernel/irq.c 2006-10-19 16:52:30.000000000 +0200 @@ -125,7 +125,10 @@ void __init init_IRQ(void) irq_desc[i].action = NULL; irq_desc[i].depth = 1; irq_desc[i].handler = &no_irq_type; - spin_lock_init(&irq_desc[i].lock); + __raw_spin_lock_init(&irq_desc[i].lock); +#ifdef CONFIG_PREEMPT_HARDIRQS + irq_desc[i].thread = NULL; +#endif } arch_init_irq(); Index: linux-2.6.16/arch/mips/kernel/module.c =================================================================== --- linux-2.6.16.orig/arch/mips/kernel/module.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/mips/kernel/module.c 2006-10-19 16:52:30.000000000 +0200 @@ -39,7 +39,7 @@ struct mips_hi16 { static struct mips_hi16 *mips_hi16_list; static LIST_HEAD(dbe_list); -static DEFINE_SPINLOCK(dbe_lock); +static DEFINE_RAW_SPINLOCK(dbe_lock); void *module_alloc(unsigned long size) { Index: linux-2.6.16/arch/mips/kernel/process.c =================================================================== --- linux-2.6.16.orig/arch/mips/kernel/process.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/mips/kernel/process.c 2006-10-19 16:52:30.000000000 +0200 @@ -49,13 +49,15 @@ */ ATTRIB_NORET void cpu_idle(void) { + local_irq_enable(); + /* endless idle loop with no priority at all */ while (1) { - while (!need_resched()) + while (!need_resched() && !need_resched_delayed()) if (cpu_wait) (*cpu_wait)(); - preempt_enable_no_resched(); - schedule(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); } } Index: linux-2.6.16/arch/mips/kernel/scall32-o32.S =================================================================== --- linux-2.6.16.orig/arch/mips/kernel/scall32-o32.S 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/mips/kernel/scall32-o32.S 2006-10-19 16:52:30.000000000 +0200 @@ -72,7 +72,7 @@ stack_done: 1: sw v0, PT_R2(sp) # result o32_syscall_exit: - local_irq_disable # make sure need_resched and + mips_raw_local_irq_disable # make sure need_resched and # 
signals dont change between # sampling and return lw a2, TI_FLAGS($28) # current->work Index: linux-2.6.16/arch/mips/kernel/semaphore.c =================================================================== --- linux-2.6.16.orig/arch/mips/kernel/semaphore.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/mips/kernel/semaphore.c 2006-10-19 16:52:30.000000000 +0200 @@ -36,7 +36,7 @@ * sem->count and sem->waking atomic. Scalability isn't an issue because * this lock is used on UP only so it's just an empty variable. */ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -67,7 +67,7 @@ static inline int __sem_update_count(str : "=&r" (old_count), "=&r" (tmp), "=m" (sem->count) : "r" (incr), "m" (sem->count)); } else { - static DEFINE_SPINLOCK(semaphore_lock); + static DEFINE_RAW_SPINLOCK(semaphore_lock); unsigned long flags; spin_lock_irqsave(&semaphore_lock, flags); @@ -80,7 +80,7 @@ static inline int __sem_update_count(str return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -94,7 +94,7 @@ void __up(struct semaphore *sem) wake_up(&sem->wait); } -EXPORT_SYMBOL(__up); +EXPORT_SYMBOL(__compat_up); /* * Note that when we come in to __down or __down_interruptible, @@ -104,7 +104,7 @@ EXPORT_SYMBOL(__up); * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. */ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -133,9 +133,9 @@ void __sched __down(struct semaphore *se wake_up(&sem->wait); } -EXPORT_SYMBOL(__down); +EXPORT_SYMBOL(__compat_down); -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -165,4 +165,4 @@ int __sched __down_interruptible(struct return retval; } -EXPORT_SYMBOL(__down_interruptible); +EXPORT_SYMBOL(__compat_down_interruptible); Index: linux-2.6.16/arch/mips/kernel/signal.c =================================================================== --- linux-2.6.16.orig/arch/mips/kernel/signal.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/mips/kernel/signal.c 2006-10-19 16:52:30.000000000 +0200 @@ -417,6 +417,10 @@ void do_signal(struct pt_regs *regs) siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which is why we may in certain * cases get here from kernel mode. Just return without doing anything Index: linux-2.6.16/arch/mips/kernel/signal32.c =================================================================== --- linux-2.6.16.orig/arch/mips/kernel/signal32.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/mips/kernel/signal32.c 2006-10-19 16:52:30.000000000 +0200 @@ -807,6 +807,10 @@ void do_signal32(struct pt_regs *regs) siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which is why we may in certain * cases get here from kernel mode. 
Just return without doing anything Index: linux-2.6.16/arch/mips/kernel/smp.c =================================================================== --- linux-2.6.16.orig/arch/mips/kernel/smp.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/mips/kernel/smp.c 2006-10-19 16:52:30.000000000 +0200 @@ -107,7 +107,22 @@ asmlinkage void start_secondary(void) cpu_idle(); } -DEFINE_SPINLOCK(smp_call_lock); +DEFINE_RAW_SPINLOCK(smp_call_lock); + +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them. + */ +void smp_send_reschedule_allbutself(void) +{ + int cpu = smp_processor_id(); + int i; + + for (i = 0; i < NR_CPUS; i++) + if (cpu_online(i) && i != cpu) + core_send_ipi(i, SMP_RESCHEDULE_YOURSELF); +} struct call_data_struct *call_data; @@ -290,6 +305,8 @@ int setup_profiling_timer(unsigned int m return 0; } +static DEFINE_RAW_SPINLOCK(tlbstate_lock); + static void flush_tlb_all_ipi(void *info) { local_flush_tlb_all(); @@ -321,6 +338,7 @@ static void flush_tlb_mm_ipi(void *mm) void flush_tlb_mm(struct mm_struct *mm) { preempt_disable(); + spin_lock(&tlbstate_lock); if ((atomic_read(&mm->mm_users) != 1) || (current->mm != mm)) { smp_call_function(flush_tlb_mm_ipi, (void *)mm, 1, 1); @@ -330,6 +348,7 @@ void flush_tlb_mm(struct mm_struct *mm) if (smp_processor_id() != i) cpu_context(i, mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_mm(mm); preempt_enable(); @@ -353,6 +372,8 @@ void flush_tlb_range(struct vm_area_stru struct mm_struct *mm = vma->vm_mm; preempt_disable(); + spin_lock(&tlbstate_lock); + if ((atomic_read(&mm->mm_users) != 1) || (current->mm != mm)) { struct flush_tlb_data fd; @@ -366,6 +387,7 @@ void flush_tlb_range(struct vm_area_stru if (smp_processor_id() != i) cpu_context(i, mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_range(vma, start, end); preempt_enable(); } @@ -396,6 +418,8 @@ static void flush_tlb_page_ipi(void *inf void flush_tlb_page(struct vm_area_struct *vma, unsigned long page) { preempt_disable(); + spin_lock(&tlbstate_lock); + if ((atomic_read(&vma->vm_mm->mm_users) != 1) || (current->mm != vma->vm_mm)) { struct flush_tlb_data fd; @@ -408,6 +432,7 @@ void flush_tlb_page(struct vm_area_struc if (smp_processor_id() != i) cpu_context(i, vma->vm_mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_page(vma, page); preempt_enable(); } Index: linux-2.6.16/arch/mips/kernel/sysirix.c =================================================================== --- linux-2.6.16.orig/arch/mips/kernel/sysirix.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/mips/kernel/sysirix.c 2006-10-19 16:52:30.000000000 +0200 @@ -645,27 +645,7 @@ static inline void getitimer_real(struct asmlinkage unsigned int irix_alarm(unsigned int seconds) { - struct itimerval it_new, it_old; - unsigned int oldalarm; - - if (!seconds) { - getitimer_real(&it_old); - del_timer(¤t->real_timer); - } else { - it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; - it_new.it_value.tv_sec = seconds; - it_new.it_value.tv_usec = 0; - do_setitimer(ITIMER_REAL, &it_new, &it_old); - } - oldalarm = it_old.it_value.tv_sec; - /* - * ehhh.. We can't return 0 if we have an alarm pending ... 
- * And we'd better return too much than too little anyway - */ - if (it_old.it_value.tv_usec) - oldalarm++; - - return oldalarm; + return alarm_setitimer(seconds); } asmlinkage int irix_pause(void) Index: linux-2.6.16/arch/mips/kernel/time.c =================================================================== --- linux-2.6.16.orig/arch/mips/kernel/time.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/mips/kernel/time.c 2006-10-19 16:52:30.000000000 +0200 @@ -50,7 +50,7 @@ */ extern volatile unsigned long wall_jiffies; -DEFINE_SPINLOCK(rtc_lock); +DEFINE_RAW_SPINLOCK(rtc_lock); /* * By default we provide the null RTC ops @@ -581,7 +581,7 @@ unsigned int mips_hpt_frequency; static struct irqaction timer_irqaction = { .handler = timer_interrupt, - .flags = SA_INTERRUPT, + .flags = SA_NODELAY | SA_INTERRUPT, .name = "timer", }; Index: linux-2.6.16/arch/mips/kernel/traps.c =================================================================== --- linux-2.6.16.orig/arch/mips/kernel/traps.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/mips/kernel/traps.c 2006-10-19 16:52:30.000000000 +0200 @@ -274,7 +274,7 @@ void show_registers(struct pt_regs *regs printk("\n"); } -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); NORET_TYPE void ATTRIB_NORET die(const char * str, struct pt_regs * regs) { Index: linux-2.6.16/arch/mips/math-emu/cp1emu.c =================================================================== --- linux-2.6.16.orig/arch/mips/math-emu/cp1emu.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/mips/math-emu/cp1emu.c 2006-10-19 16:52:30.000000000 +0200 @@ -1269,7 +1269,9 @@ int fpu_emulator_cop1Handler(struct pt_r if (sig) break; + preempt_enable(); cond_resched(); + preempt_disable(); } while (xcp->cp0_epc > prevepc); /* SIGILL indicates a non-fpu instruction */ Index: linux-2.6.16/arch/mips/mm/init.c =================================================================== --- linux-2.6.16.orig/arch/mips/mm/init.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/mips/mm/init.c 2006-10-19 16:52:30.000000000 +0200 @@ -36,7 +36,7 @@ #include #include -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; Index: linux-2.6.16/arch/mips/sibyte/sb1250/irq.c =================================================================== --- linux-2.6.16.orig/arch/mips/sibyte/sb1250/irq.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/mips/sibyte/sb1250/irq.c 2006-10-19 16:52:30.000000000 +0200 @@ -86,7 +86,7 @@ static struct hw_interrupt_type sb1250_i /* Store the CPU id (not the logical number) */ int sb1250_irq_owner[SB1250_NR_IRQS]; -DEFINE_SPINLOCK(sb1250_imr_lock); +DEFINE_RAW_SPINLOCK(sb1250_imr_lock); void sb1250_mask_irq(int cpu, int irq) { @@ -267,7 +267,7 @@ static irqreturn_t sb1250_dummy_handler static struct irqaction sb1250_dummy_action = { .handler = sb1250_dummy_handler, - .flags = 0, + .flags = SA_NODELAY, .mask = CPU_MASK_NONE, .name = "sb1250-private", .next = NULL, Index: linux-2.6.16/arch/powerpc/Kconfig =================================================================== --- linux-2.6.16.orig/arch/powerpc/Kconfig 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/powerpc/Kconfig 2006-10-19 16:52:30.000000000 +0200 @@ -26,14 +26,11 @@ config MMU bool default y -config GENERIC_HARDIRQS +config GENERIC_TIME bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM +config GENERIC_HARDIRQS bool 
default y @@ -519,6 +516,18 @@ config HIGHMEM source kernel/Kconfig.hz source kernel/Kconfig.preempt + +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + source "fs/Kconfig.binfmt" # We optimistically allocate largepages from the VM, so make the limit Index: linux-2.6.16/arch/powerpc/boot/Makefile =================================================================== --- linux-2.6.16.orig/arch/powerpc/boot/Makefile 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/powerpc/boot/Makefile 2006-10-19 16:52:30.000000000 +0200 @@ -29,6 +29,14 @@ OBJCOPYFLAGS := contents,alloc,load,r OBJCOPY_COFF_ARGS := -O aixcoff-rs6000 --set-start 0x500000 OBJCOPY_MIB_ARGS := -O aixcoff-rs6000 -R .stab -R .stabstr -R .comment +ifdef CONFIG_MCOUNT +# do not trace the boot loader +nullstring := +space := $(nullstring) # end of the line +pg_flag = $(nullstring) -pg # end of the line +CFLAGS := $(subst ${pg_flag},${space},${CFLAGS}) +endif + zlib := infblock.c infcodes.c inffast.c inflate.c inftrees.c infutil.c zlibheader := infblock.h infcodes.h inffast.h inftrees.h infutil.h zliblinuxheader := zlib.h zconf.h zutil.h @@ -44,7 +52,7 @@ obj-boot := $(addsuffix .o, $(basename $ BOOTCFLAGS += -I$(obj) -I$(srctree)/$(obj) quiet_cmd_copy_zlib = COPY $@ - cmd_copy_zlib = sed "s@__attribute_used__@@;s@]\+\).*@\"\1\"@" $< > $@ + cmd_copy_zlib = sed "s@__attribute_used__@@;s@.include.@@;s@.include.@@;s@.*spin.*lock.*@@;s@.*SPINLOCK.*@@;s@]\+\).*@\"\1\"@" $< > $@ quiet_cmd_copy_zlibheader = COPY $@ cmd_copy_zlibheader = sed "s@]\+\).*@\"\1\"@" $< > $@ Index: linux-2.6.16/arch/powerpc/kernel/Makefile =================================================================== --- linux-2.6.16.orig/arch/powerpc/kernel/Makefile 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/powerpc/kernel/Makefile 2006-10-19 16:52:30.000000000 +0200 @@ -10,10 +10,11 @@ CFLAGS_prom_init.o += -fPIC CFLAGS_btext.o += -fPIC endif -obj-y := semaphore.o cputable.o ptrace.o syscalls.o \ +obj-y := cputable.o ptrace.o syscalls.o \ irq.o align.o signal_32.o pmc.o vdso.o \ init_task.o process.o systbl.o obj-y += vdso32/ +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o obj-$(CONFIG_PPC64) += setup_64.o binfmt_elf32.o sys_ppc32.o \ signal_64.o ptrace32.o \ paca.o cpu_setup_power4.o \ Index: linux-2.6.16/arch/powerpc/kernel/entry_32.S =================================================================== --- linux-2.6.16.orig/arch/powerpc/kernel/entry_32.S 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/powerpc/kernel/entry_32.S 2006-10-19 16:52:30.000000000 +0200 @@ -636,7 +636,7 @@ user_exc_return: /* r10 contains MSR_KE /* Check current_thread_info()->flags */ rlwinm r9,r1,0,0,(31-THREAD_SHIFT) lwz r9,TI_FLAGS(r9) - andi. r0,r9,(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NEED_RESCHED) + andi. r0,r9,(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne do_work restore_user: @@ -854,7 +854,7 @@ load_dbcr0: #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ do_work: /* r10 contains MSR_KERNEL here */ - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ @@ -868,7 +868,7 @@ recheck: MTMSRD(r10) /* disable interrupts */ rlwinm r9,r1,0,0,(31-THREAD_SHIFT) lwz r9,TI_FLAGS(r9) - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne- do_resched andi. 
r0,r9,_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK beq restore_user @@ -976,3 +976,85 @@ machine_check_in_rtas: /* XXX load up BATs and panic */ #endif /* CONFIG_PPC_RTAS */ + +#ifdef CONFIG_MCOUNT +/* + * mcount() is not the same as _mcount(). The callers of mcount() have a + * normal context. The callers of _mcount() do not have a stack frame and + * have not saved the "caller saves" registers. + */ +_GLOBAL(mcount) + stwu r1,-16(r1) + mflr r3 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + stw r3,20(r1) + cmpwi r5,0 + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,16(r1) + lwz r4,4(r4) + bl __trace +1: + lwz r0,20(r1) + mtlr r0 + addi r1,r1,16 + blr + +/* + * The -pg flag, which is specified in the case of CONFIG_MCOUNT, causes the + * C compiler to add a call to _mcount() at the start of each function + * preamble, before the stack frame is created. An example of this preamble + * code is: + * + * mflr r0 + * lis r12,-16354 + * stw r0,4(r1) + * addi r0,r12,-19652 + * bl 0xc00034c8 <_mcount> + * mflr r0 + * stwu r1,-16(r1) + */ +_GLOBAL(_mcount) +#define M_STK_SIZE 48 + /* Would not expect to need to save cr, but glibc version of */ + /* _mcount() does, so cautiously saving it here too. */ + stwu r1,-M_STK_SIZE(r1) + stw r3, 12(r1) + stw r4, 16(r1) + stw r5, 20(r1) + stw r6, 24(r1) + mflr r3 /* will use as first arg to __trace() */ + mfcr r4 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + cmpwi r5,0 + stw r3, 44(r1) /* lr */ + stw r4, 8(r1) /* cr */ + stw r7, 28(r1) + stw r8, 32(r1) + stw r9, 36(r1) + stw r10,40(r1) + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,M_STK_SIZE+4(r1) + bl __trace +1: + lwz r8, 8(r1) /* cr */ + lwz r9, 44(r1) /* lr */ + lwz r3, 12(r1) + lwz r4, 16(r1) + lwz r5, 20(r1) + mtcrf 0xff,r8 + mtctr r9 + lwz r0, 52(r1) + lwz r6, 24(r1) + lwz r7, 28(r1) + lwz r8, 32(r1) + lwz r9, 36(r1) + lwz r10,40(r1) + addi r1,r1,M_STK_SIZE + mtlr r0 + bctr + +#endif /* CONFIG_MCOUNT */ Index: linux-2.6.16/arch/powerpc/kernel/idle_64.c =================================================================== --- linux-2.6.16.orig/arch/powerpc/kernel/idle_64.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/powerpc/kernel/idle_64.c 2006-10-19 16:52:30.000000000 +0200 @@ -37,7 +37,7 @@ void default_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while (1) { - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { while (!need_resched() && !cpu_is_offline(cpu)) { ppc64_runlatch_off(); @@ -53,9 +53,11 @@ void default_idle(void) } ppc64_runlatch_on(); - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING) cpu_die(); } @@ -71,9 +73,11 @@ void native_idle(void) if (need_resched()) { ppc64_runlatch_on(); - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } if (cpu_is_offline(smp_processor_id()) && Index: linux-2.6.16/arch/powerpc/kernel/irq.c =================================================================== --- linux-2.6.16.orig/arch/powerpc/kernel/irq.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/powerpc/kernel/irq.c 2006-10-19 16:52:30.000000000 +0200 @@ -88,8 +88,6 @@ extern atomic_t ipi_sent; #endif /* CONFIG_PPC32 */ #ifdef CONFIG_PPC64 -EXPORT_SYMBOL(irq_desc); - int distribute_irqs = 1; u64 
ppc64_interrupt_controller; #endif /* CONFIG_PPC64 */ Index: linux-2.6.16/arch/powerpc/kernel/ppc_ksyms.c =================================================================== --- linux-2.6.16.orig/arch/powerpc/kernel/ppc_ksyms.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/powerpc/kernel/ppc_ksyms.c 2006-10-19 16:52:30.000000000 +0200 @@ -17,7 +17,6 @@ #include #include -#include #include #include #include @@ -194,7 +193,6 @@ EXPORT_SYMBOL(screen_info); #ifdef CONFIG_PPC32 EXPORT_SYMBOL(timer_interrupt); -EXPORT_SYMBOL(irq_desc); EXPORT_SYMBOL(tb_ticks_per_jiffy); EXPORT_SYMBOL(console_drivers); EXPORT_SYMBOL(cacheable_memcpy); Index: linux-2.6.16/arch/powerpc/kernel/semaphore.c =================================================================== --- linux-2.6.16.orig/arch/powerpc/kernel/semaphore.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/powerpc/kernel/semaphore.c 2006-10-19 16:52:30.000000000 +0200 @@ -31,7 +31,7 @@ * sem->count = tmp; * return old_count; */ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -50,7 +50,7 @@ static inline int __sem_update_count(str return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -63,7 +63,7 @@ void __up(struct semaphore *sem) __sem_update_count(sem, 1); wake_up(&sem->wait); } -EXPORT_SYMBOL(__up); +EXPORT_SYMBOL(__compat_up); /* * Note that when we come in to __down or __down_interruptible, @@ -73,7 +73,7 @@ EXPORT_SYMBOL(__up); * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. */ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -101,9 +101,9 @@ void __sched __down(struct semaphore *se */ wake_up(&sem->wait); } -EXPORT_SYMBOL(__down); +EXPORT_SYMBOL(__compat_down); -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore *sem) { int retval = 0; struct task_struct *tsk = current; @@ -132,4 +132,10 @@ int __sched __down_interruptible(struct wake_up(&sem->wait); return retval; } -EXPORT_SYMBOL(__down_interruptible); +EXPORT_SYMBOL(__compat_down_interruptible); + +int compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} +EXPORT_SYMBOL(compat_sem_is_locked); Index: linux-2.6.16/arch/powerpc/kernel/smp.c =================================================================== --- linux-2.6.16.orig/arch/powerpc/kernel/smp.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/powerpc/kernel/smp.c 2006-10-19 16:52:30.000000000 +0200 @@ -148,6 +148,16 @@ void smp_send_reschedule(int cpu) smp_ops->message_pass(cpu, PPC_MSG_RESCHEDULE); } +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + smp_ops->message_pass(MSG_ALL_BUT_SELF, PPC_MSG_RESCHEDULE); +} + #ifdef CONFIG_DEBUGGER void smp_send_debugger_break(int cpu) { @@ -183,7 +193,7 @@ void smp_send_stop(void) * static memory requirements. It also looks cleaner. * Stolen from the i386 version. 
 */
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_lock);
+static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(call_lock);
 
 static struct call_data_struct {
 	void (*func) (void *info);
Index: linux-2.6.16/arch/powerpc/kernel/time.c
===================================================================
--- linux-2.6.16.orig/arch/powerpc/kernel/time.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/arch/powerpc/kernel/time.c	2006-10-19 16:52:30.000000000 +0200
@@ -50,6 +50,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -73,6 +74,9 @@
 #endif
 #include
 
+unsigned long cpu_khz;	/* Detected as we calibrate the timebase */
+EXPORT_SYMBOL(cpu_khz);
+
 /* keep track of when we need to update the rtc */
 time_t last_rtc_update;
 extern int piranha_simulator;
@@ -87,13 +91,6 @@ static unsigned long first_settimeofday
 
 #define XSEC_PER_SEC (1024*1024)
 
-#ifdef CONFIG_PPC64
-#define SCALE_XSEC(xsec, max)	(((xsec) * max) / XSEC_PER_SEC)
-#else
-/* compute ((xsec << 12) * max) >> 32 */
-#define SCALE_XSEC(xsec, max)	mulhwu((xsec) << 12, max)
-#endif
-
 unsigned long tb_ticks_per_jiffy;
 unsigned long tb_ticks_per_usec = 100;	/* sane default */
 EXPORT_SYMBOL(tb_ticks_per_usec);
@@ -109,7 +106,7 @@ u64 ticklen_to_xs;	/* 0.64 fraction */
    last_tick_len << TICKLEN_SHIFT will be about 2^63. */
 #define TICKLEN_SHIFT	(63 - 30 - TICKLEN_SCALE + SHIFT_HZ)
 
-DEFINE_SPINLOCK(rtc_lock);
+DEFINE_RAW_SPINLOCK(rtc_lock);
 EXPORT_SYMBOL_GPL(rtc_lock);
 
 u64 tb_to_ns_scale;
@@ -135,6 +132,46 @@ unsigned long tb_last_stamp;
  */
 DEFINE_PER_CPU(unsigned long, last_jiffy);
 
+unsigned long get_boot_time(void);
+
+s64 read_persistent_clock(void)
+{
+	unsigned long sec = get_boot_time();
+	return (s64)sec * NSEC_PER_SEC;
+}
+void sync_persistent_clock(struct timespec ts)
+{
+	/*
+	 * update the rtc when needed, this should be performed on the
+	 * right fraction of a second. Half or full second ?
+	 * Full second works on mk48t59 clocks, others need testing.
+	 * Note that this update is basically only used through
+	 * the adjtimex system calls. Setting the HW clock in
+	 * any other way is a /dev/rtc and userland business.
+	 * This is still wrong by -0.5/+1.5 jiffies because of the
+	 * timer interrupt resolution and possible delay, but here we
+	 * hit a quantization limit which can only be solved by higher
+	 * resolution timers and decoupling time management from timer
+	 * interrupts. This is also wrong on the clocks
+	 * which require being written at the half second boundary.
+	 * We should have an rtc call that only sets the minutes and
+	 * seconds like on Intel to avoid problems with non UTC clocks.
+	 */
+	if ( ts.tv_sec - last_rtc_update >= 659 &&
+	     abs((ts.tv_nsec/1000) - (1000000-1000000/HZ)) < 500000/HZ) {
+		struct rtc_time tm;
+		to_tm(ts.tv_sec+1, &tm);
+		tm.tm_year -= 1900;
+		tm.tm_mon -= 1;
+		if (ppc_md.set_rtc_time(&tm) == 0)
+			last_rtc_update = ts.tv_sec+1;
+		else
+			/* Try again one minute later */
+			last_rtc_update += 60;
+	}
+}
+
+
 void __delay(unsigned long loops)
 {
 	unsigned long start;
@@ -163,39 +200,6 @@ void udelay(unsigned long usecs)
 }
 EXPORT_SYMBOL(udelay);
 
-static __inline__ void timer_check_rtc(void)
-{
-	/*
-	 * update the rtc when needed, this should be performed on the
-	 * right fraction of a second. Half or full second ?
-	 * Full second works on mk48t59 clocks, others need testing.
-	 * Note that this update is basically only used through
-	 * the adjtimex system calls. Setting the HW clock in
-	 * any other way is a /dev/rtc and userland business.
- * This is still wrong by -0.5/+1.5 jiffies because of the - * timer interrupt resolution and possible delay, but here we - * hit a quantization limit which can only be solved by higher - * resolution timers and decoupling time management from timer - * interrupts. This is also wrong on the clocks - * which require being written at the half second boundary. - * We should have an rtc call that only sets the minutes and - * seconds like on Intel to avoid problems with non UTC clocks. - */ - if (ppc_md.set_rtc_time && ntp_synced() && - xtime.tv_sec - last_rtc_update >= 659 && - abs((xtime.tv_nsec/1000) - (1000000-1000000/HZ)) < 500000/HZ) { - struct rtc_time tm; - to_tm(xtime.tv_sec + 1 + timezone_offset, &tm); - tm.tm_year -= 1900; - tm.tm_mon -= 1; - if (ppc_md.set_rtc_time(&tm) == 0) - last_rtc_update = xtime.tv_sec + 1; - else - /* Try again one minute later */ - last_rtc_update += 60; - } -} - /* * This version of gettimeofday has microsecond resolution. */ @@ -346,7 +350,7 @@ static __inline__ void timer_recalc_offs } #ifdef CONFIG_SMP -unsigned long profile_pc(struct pt_regs *regs) +unsigned long notrace profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); @@ -395,6 +399,8 @@ static void iSeries_tb_recal(void) div128_by_32( XSEC_PER_SEC, 0, tb_ticks_per_sec, &divres ); do_gtod.tb_ticks_per_sec = tb_ticks_per_sec; tb_to_xs = divres.result_low; + cpu_khz = ppc_tb_freq / 1000; + do_gtod.varp->tb_to_xs = tb_to_xs; vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; vdso_data->tb_to_xs = tb_to_xs; @@ -474,8 +480,6 @@ void timer_interrupt(struct pt_regs * re tb_last_jiffy += tb_ticks_per_jiffy; tb_last_stamp = per_cpu(last_jiffy, cpu); do_timer(regs); - timer_recalc_offset(tb_last_jiffy); - timer_check_rtc(); write_sequnlock(&xtime_lock); } @@ -546,82 +550,6 @@ unsigned long long sched_clock(void) return mulhdu(get_tb(), tb_to_ns_scale) << tb_to_ns_shift; } -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, new_sec = tv->tv_sec; - long wtm_nsec, new_nsec = tv->tv_nsec; - unsigned long flags; - u64 new_xsec; - unsigned long tb_delta; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irqsave(&xtime_lock, flags); - - /* - * Updating the RTC is not the job of this code. If the time is - * stepped under NTP, the RTC will be updated after STA_UNSYNC - * is cleared. Tools like clock/hwclock either copy the RTC - * to the system time, in which case there is no point in writing - * to the RTC again, or write to the RTC but then they don't call - * settimeofday to perform this operation. - */ -#ifdef CONFIG_PPC_ISERIES - if (first_settimeofday) { - iSeries_tb_recal(); - first_settimeofday = 0; - } -#endif - - /* Make userspace gettimeofday spin until we're done. */ - ++vdso_data->tb_update_count; - smp_mb(); - - /* - * Subtract off the number of nanoseconds since the - * beginning of the last tick. - * Note that since we don't increment jiffies_64 anywhere other - * than in do_timer (since we don't have a lost tick problem), - * wall_jiffies will always be the same as jiffies, - * and therefore the (jiffies - wall_jiffies) computation - * has been removed. 
- */ - tb_delta = tb_ticks_since(tb_last_stamp); - tb_delta = mulhdu(tb_delta, do_gtod.varp->tb_to_xs); /* in xsec */ - new_nsec -= SCALE_XSEC(tb_delta, 1000000000); - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - new_sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - new_nsec); - - set_normalized_timespec(&xtime, new_sec, new_nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - /* In case of a large backwards jump in time with NTP, we want the - * clock to be updated as soon as the PLL is again in lock. - */ - last_rtc_update = new_sec - 658; - - ntp_clear(); - - new_xsec = xtime.tv_nsec; - if (new_xsec != 0) { - new_xsec *= XSEC_PER_SEC; - do_div(new_xsec, NSEC_PER_SEC); - } - new_xsec += (u64)xtime.tv_sec * XSEC_PER_SEC; - update_gtod(tb_last_jiffy, new_xsec, do_gtod.varp->tb_to_xs); - - vdso_data->tz_minuteswest = sys_tz.tz_minuteswest; - vdso_data->tz_dsttime = sys_tz.tz_dsttime; - - write_sequnlock_irqrestore(&xtime_lock, flags); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - void __init generic_calibrate_decr(void) { struct device_node *cpu; @@ -815,7 +743,6 @@ void __init time_init(void) set_dec(tb_ticks_per_jiffy); } - #define FEBRUARY 2 #define STARTOFTIME 1970 #define SECDAY 86400L @@ -960,3 +887,36 @@ void div128_by_32(u64 dividend_high, u64 dr->result_low = ((u64)y << 32) + z; } + + +/* powerpc clocksource code */ + +#include +static cycle_t timebase_read(void) +{ + return (cycle_t)get_tb(); +} + +struct clocksource clocksource_timebase = { + .name = "timebase", + .rating = 200, + .read = timebase_read, + .mask = (cycle_t)-1, + .mult = 0, + .shift = 22, +}; + + +/* XXX - this should be calculated or properly externed! */ +static int __init init_timebase_clocksource(void) +{ + if (__USE_RTC()) + return -ENODEV; + + clocksource_timebase.mult = clocksource_hz2mult(tb_ticks_per_sec, + clocksource_timebase.shift); + return register_clocksource(&clocksource_timebase); +} + +module_init(init_timebase_clocksource); + Index: linux-2.6.16/arch/powerpc/kernel/traps.c =================================================================== --- linux-2.6.16.orig/arch/powerpc/kernel/traps.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/powerpc/kernel/traps.c 2006-10-19 16:52:30.000000000 +0200 @@ -92,7 +92,7 @@ int register_die_notifier(struct notifie * Trap & Exception support */ -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); int die(const char *str, struct pt_regs *regs, long err) { @@ -197,6 +197,11 @@ void _exception(int signr, struct pt_reg return; } +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif + memset(&info, 0, sizeof(info)); info.si_signo = signr; info.si_code = code; Index: linux-2.6.16/arch/powerpc/lib/locks.c =================================================================== --- linux-2.6.16.orig/arch/powerpc/lib/locks.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/powerpc/lib/locks.c 2006-10-19 16:52:30.000000000 +0200 @@ -25,7 +25,7 @@ #include #include -void __spin_yield(raw_spinlock_t *lock) +void __spin_yield(__raw_spinlock_t *lock) { unsigned int lock_value, holder_cpu, yield_count; @@ -80,7 +80,7 @@ void __rw_yield(raw_rwlock_t *rw) } #endif -void __raw_spin_unlock_wait(raw_spinlock_t *lock) +void __raw_spin_unlock_wait(__raw_spinlock_t *lock) { while (lock->slock) { HMT_low(); Index: linux-2.6.16/arch/powerpc/mm/fault.c =================================================================== --- 
linux-2.6.16.orig/arch/powerpc/mm/fault.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/powerpc/mm/fault.c 2006-10-19 16:52:30.000000000 +0200 @@ -118,8 +118,8 @@ static void do_dabr(struct pt_regs *regs * The return value is 0 if the fault was handled, or the signal * number if this is a kernel fault that can't be handled here. */ -int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code) +int __kprobes notrace do_page_fault(struct pt_regs *regs, + unsigned long address, unsigned long error_code) { struct vm_area_struct * vma; struct mm_struct *mm = current->mm; Index: linux-2.6.16/arch/powerpc/mm/init_32.c =================================================================== --- linux-2.6.16.orig/arch/powerpc/mm/init_32.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/powerpc/mm/init_32.c 2006-10-19 16:52:30.000000000 +0200 @@ -57,7 +57,7 @@ #endif #define MAX_LOW_MEM CONFIG_LOWMEM_SIZE -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long total_memory; unsigned long total_lowmem; Index: linux-2.6.16/arch/powerpc/mm/tlb_64.c =================================================================== --- linux-2.6.16.orig/arch/powerpc/mm/tlb_64.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/powerpc/mm/tlb_64.c 2006-10-19 16:52:30.000000000 +0200 @@ -38,7 +38,7 @@ DEFINE_PER_CPU(struct ppc64_tlb_batch, p /* This is declared as we are using the more or less generic * include/asm-ppc64/tlb.h file -- tgall */ -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); unsigned long pte_freelist_forced_free; Index: linux-2.6.16/arch/powerpc/platforms/cell/smp.c =================================================================== --- linux-2.6.16.orig/arch/powerpc/platforms/cell/smp.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/powerpc/platforms/cell/smp.c 2006-10-19 16:52:30.000000000 +0200 @@ -134,7 +134,7 @@ static void __devinit smp_iic_setup_cpu( iic_setup_cpu(); } -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned long timebase = 0; static void __devinit cell_give_timebase(void) Index: linux-2.6.16/arch/powerpc/platforms/chrp/smp.c =================================================================== --- linux-2.6.16.orig/arch/powerpc/platforms/chrp/smp.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/powerpc/platforms/chrp/smp.c 2006-10-19 16:52:30.000000000 +0200 @@ -47,7 +47,7 @@ static void __devinit smp_chrp_setup_cpu mpic_setup_this_cpu(); } -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned int timebase_upper = 0, timebase_lower = 0; void __devinit smp_chrp_give_timebase(void) Index: linux-2.6.16/arch/powerpc/platforms/chrp/time.c =================================================================== --- linux-2.6.16.orig/arch/powerpc/platforms/chrp/time.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/powerpc/platforms/chrp/time.c 2006-10-19 16:52:30.000000000 +0200 @@ -29,7 +29,7 @@ #include #include -extern spinlock_t rtc_lock; +extern raw_spinlock_t rtc_lock; static int nvram_as1 = NVRAM_AS1; static int nvram_as0 = NVRAM_AS0; Index: linux-2.6.16/arch/powerpc/platforms/iseries/setup.c =================================================================== --- linux-2.6.16.orig/arch/powerpc/platforms/iseries/setup.c 
2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/powerpc/platforms/iseries/setup.c 2006-10-19 16:52:30.000000000 +0200 @@ -655,12 +655,14 @@ static void yield_shared_processor(void) static void iseries_shared_idle(void) { while (1) { - while (!need_resched() && !hvlpevent_is_pending()) { + while (!need_resched() && !need_resched_delayed() + && !hvlpevent_is_pending()) { local_irq_disable(); ppc64_runlatch_off(); /* Recheck with irqs off */ - if (!need_resched() && !hvlpevent_is_pending()) + if (!need_resched() && !need_resched_delayed() + && !hvlpevent_is_pending()) yield_shared_processor(); HMT_medium(); Index: linux-2.6.16/arch/powerpc/platforms/powermac/feature.c =================================================================== --- linux-2.6.16.orig/arch/powerpc/platforms/powermac/feature.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/powerpc/platforms/powermac/feature.c 2006-10-19 16:52:30.000000000 +0200 @@ -62,7 +62,7 @@ extern struct device_node *k2_skiplist[2 * We use a single global lock to protect accesses. Each driver has * to take care of its own locking */ -DEFINE_SPINLOCK(feature_lock); +DEFINE_RAW_SPINLOCK(feature_lock); #define LOCK(flags) spin_lock_irqsave(&feature_lock, flags); #define UNLOCK(flags) spin_unlock_irqrestore(&feature_lock, flags); Index: linux-2.6.16/arch/powerpc/platforms/powermac/nvram.c =================================================================== --- linux-2.6.16.orig/arch/powerpc/platforms/powermac/nvram.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/powerpc/platforms/powermac/nvram.c 2006-10-19 16:52:30.000000000 +0200 @@ -81,7 +81,7 @@ static int is_core_99; static int core99_bank = 0; static int nvram_partitions[3]; // XXX Turn that into a sem -static DEFINE_SPINLOCK(nv_lock); +static DEFINE_RAW_SPINLOCK(nv_lock); extern int pmac_newworld; extern int system_running; Index: linux-2.6.16/arch/powerpc/platforms/powermac/pic.c =================================================================== --- linux-2.6.16.orig/arch/powerpc/platforms/powermac/pic.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/powerpc/platforms/powermac/pic.c 2006-10-19 16:52:30.000000000 +0200 @@ -64,7 +64,7 @@ static int max_irqs; static int max_real_irqs; static u32 level_mask[4]; -static DEFINE_SPINLOCK(pmac_pic_lock); +static DEFINE_RAW_SPINLOCK(pmac_pic_lock); #define GATWICK_IRQ_POOL_SIZE 10 static struct interrupt_info gatwick_int_pool[GATWICK_IRQ_POOL_SIZE]; Index: linux-2.6.16/arch/powerpc/platforms/pseries/setup.c =================================================================== --- linux-2.6.16.orig/arch/powerpc/platforms/pseries/setup.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/powerpc/platforms/pseries/setup.c 2006-10-19 16:52:30.000000000 +0200 @@ -464,7 +464,7 @@ static inline void dedicated_idle_sleep( * a prod occurs. Returning from the cede enables external * interrupts. 
*/ - if (!need_resched()) + if (!need_resched() && !need_resched_delayed()) cede_processor(); else local_irq_enable(); @@ -540,7 +540,8 @@ static void pseries_shared_idle(void) */ get_lppaca()->idle = 1; - while (!need_resched() && !cpu_is_offline(cpu)) { + while (!need_resched() && !need_resched_delayed() && + !cpu_is_offline(cpu)) { local_irq_disable(); ppc64_runlatch_off(); @@ -565,8 +566,8 @@ static void pseries_shared_idle(void) get_lppaca()->idle = 0; ppc64_runlatch_on(); - preempt_enable_no_resched(); - schedule(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING) Index: linux-2.6.16/arch/powerpc/platforms/pseries/smp.c =================================================================== --- linux-2.6.16.orig/arch/powerpc/platforms/pseries/smp.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/powerpc/platforms/pseries/smp.c 2006-10-19 16:52:30.000000000 +0200 @@ -345,7 +345,7 @@ static void __devinit smp_xics_setup_cpu } #endif /* CONFIG_XICS */ -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned long timebase = 0; static void __devinit pSeries_give_timebase(void) Index: linux-2.6.16/arch/ppc/8260_io/enet.c =================================================================== --- linux-2.6.16.orig/arch/ppc/8260_io/enet.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/8260_io/enet.c 2006-10-19 16:52:30.000000000 +0200 @@ -116,7 +116,7 @@ struct scc_enet_private { scc_t *sccp; struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; }; static int scc_enet_open(struct net_device *dev); Index: linux-2.6.16/arch/ppc/8260_io/fcc_enet.c =================================================================== --- linux-2.6.16.orig/arch/ppc/8260_io/fcc_enet.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/8260_io/fcc_enet.c 2006-10-19 16:52:30.000000000 +0200 @@ -377,7 +377,7 @@ struct fcc_enet_private { volatile fcc_enet_t *ep; struct net_device_stats stats; uint tx_free; - spinlock_t lock; + raw_spinlock_t lock; #ifdef CONFIG_USE_MDIO uint phy_id; Index: linux-2.6.16/arch/ppc/8xx_io/commproc.c =================================================================== --- linux-2.6.16.orig/arch/ppc/8xx_io/commproc.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/8xx_io/commproc.c 2006-10-19 16:52:30.000000000 +0200 @@ -356,7 +356,7 @@ cpm_setbrg(uint brg, uint rate) /* * dpalloc / dpfree bits. */ -static spinlock_t cpm_dpmem_lock; +static raw_spinlock_t cpm_dpmem_lock; /* * 16 blocks should be enough to satisfy all requests * until the memory subsystem goes up... 
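[ Editorial note, not part of the patch: the spinlock_t -> raw_spinlock_t
  conversions in the driver and platform hunks above and below all follow
  the same PREEMPT_RT rule. On RT a plain spinlock_t compiles into a
  sleeping rt-mutex, so any lock taken from hard interrupt context (or with
  interrupts hard-disabled) must be declared raw to keep true spinning
  semantics. A minimal sketch of the rule; the lock and handler names are
  invented, and the spin_lock_irqsave() call site is unchanged because the
  -rt lock API dispatches on the lock type, exactly as in the hunks above: ]

#include <linux/spinlock.h>
#include <linux/interrupt.h>

/* taken from the hard IRQ path: must remain a real spinning lock on RT */
static DEFINE_RAW_SPINLOCK(hypo_hw_lock);

static irqreturn_t hypo_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
	unsigned long flags;

	/* same call as for a spinlock_t; the type selects the raw variant */
	spin_lock_irqsave(&hypo_hw_lock, flags);
	/* ... access device registers ... */
	spin_unlock_irqrestore(&hypo_hw_lock, flags);

	return IRQ_HANDLED;
}

[ Locks taken only from process context or from threaded interrupt handlers
  stay spinlock_t, so they remain preemptible on RT. ]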
Index: linux-2.6.16/arch/ppc/8xx_io/enet.c =================================================================== --- linux-2.6.16.orig/arch/ppc/8xx_io/enet.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/8xx_io/enet.c 2006-10-19 16:52:30.000000000 +0200 @@ -144,7 +144,7 @@ struct scc_enet_private { unsigned char *rx_vaddr[RX_RING_SIZE]; struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; }; static int scc_enet_open(struct net_device *dev); Index: linux-2.6.16/arch/ppc/8xx_io/fec.c =================================================================== --- linux-2.6.16.orig/arch/ppc/8xx_io/fec.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/8xx_io/fec.c 2006-10-19 16:52:30.000000000 +0200 @@ -165,7 +165,7 @@ struct fec_enet_private { struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; #ifdef CONFIG_USE_MDIO uint phy_id; Index: linux-2.6.16/arch/ppc/Kconfig =================================================================== --- linux-2.6.16.orig/arch/ppc/Kconfig 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/Kconfig 2006-10-19 16:52:30.000000000 +0200 @@ -12,13 +12,6 @@ config GENERIC_HARDIRQS bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config GENERIC_CALIBRATE_DELAY bool default y @@ -922,6 +915,18 @@ config HIGHMEM source kernel/Kconfig.hz source kernel/Kconfig.preempt + +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + source "mm/Kconfig" source "fs/Kconfig.binfmt" Index: linux-2.6.16/arch/ppc/boot/Makefile =================================================================== --- linux-2.6.16.orig/arch/ppc/boot/Makefile 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/boot/Makefile 2006-10-19 16:52:30.000000000 +0200 @@ -11,6 +11,15 @@ # CFLAGS += -fno-builtin -D__BOOTER__ -Iarch/$(ARCH)/boot/include + +ifdef CONFIG_MCOUNT +# do not trace the boot loader +nullstring := +space := $(nullstring) # end of the line +pg_flag = $(nullstring) -pg # end of the line +CFLAGS := $(subst ${pg_flag},${space},${CFLAGS}) +endif + HOSTCFLAGS += -Iarch/$(ARCH)/boot/include BOOT_TARGETS = zImage zImage.initrd znetboot znetboot.initrd Index: linux-2.6.16/arch/ppc/boot/lib/Makefile =================================================================== --- linux-2.6.16.orig/arch/ppc/boot/lib/Makefile 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/boot/lib/Makefile 2006-10-19 16:52:30.000000000 +0200 @@ -5,19 +5,49 @@ CFLAGS_kbd.o := -Idrivers/char CFLAGS_vreset.o := -I$(srctree)/arch/ppc/boot/include -zlib := infblock.c infcodes.c inffast.c inflate.c inftrees.c infutil.c - -lib-y += $(zlib:.c=.o) div64.o -lib-$(CONFIG_VGA_CONSOLE) += vreset.o kbd.o - +zlib := infblock.c infcodes.c inffast.c inflate.c inftrees.c infutil.c +zlibheader := infblock.h infcodes.h inffast.h inftrees.h infutil.h +zliblinuxheader := zlib.h zconf.h zutil.h + +$(addprefix $(obj)/,$(zlib)): $(addprefix $(obj)/,$(zliblinuxheader)) $(addprefix $(obj)/,$(zlibheader)) + +src-boot := div64.S +src-boot += $(zlib) +#src-boot := $(addprefix $(obj)/, $(src-boot)) +obj-boot := $(addsuffix .o, $(basename $(src-boot))) -# zlib files needs header from their original place -EXTRA_CFLAGS += -Ilib/zlib_inflate +BOOTCFLAGS += -I$(obj) -I$(srctree)/$(obj) $(CFLAGS) quiet_cmd_copy_zlib = COPY $@ - cmd_copy_zlib = cat $< > $@ + cmd_copy_zlib = sed 
"s@__attribute_used__@@;s@.include.@@;s@.include.@@;s@.*spin.*lock.*@@;s@.*SPINLOCK.*@@;s@]\+\).*@\"\1\"@" $< > $@ + +quiet_cmd_copy_zlibheader = COPY $@ + cmd_copy_zlibheader = sed "s@]\+\).*@\"\1\"@" $< > $@ +# stddef.h for NULL +quiet_cmd_copy_zliblinuxheader = COPY $@ + cmd_copy_zliblinuxheader = sed "s@.include.@@;s@.include.@@;s@@@;s@]\+\).*@\"\1\"@" $< > $@ $(addprefix $(obj)/,$(zlib)): $(obj)/%: $(srctree)/lib/zlib_inflate/% $(call cmd,copy_zlib) -clean-files := $(zlib) +$(addprefix $(obj)/,$(zlibheader)): $(obj)/%: $(srctree)/lib/zlib_inflate/% + $(call cmd,copy_zlibheader) + +$(addprefix $(obj)/,$(zliblinuxheader)): $(obj)/%: $(srctree)/include/linux/% + $(call cmd,copy_zliblinuxheader) + +clean-files := $(zlib) $(zlibheader) $(zliblinuxheader) + +quiet_cmd_bootcc = BOOTCC $@ + cmd_bootcc = $(CC) -Wp,-MD,$(depfile) $(BOOTCFLAGS) -c -o $@ $< + +quiet_cmd_bootas = BOOTAS $@ + cmd_bootas = $(CC) -Wp,-MD,$(depfile) $(BOOTAFLAGS) -c -o $@ $< + +$(patsubst %.c,%.o, $(filter %.c, $(src-boot))): %.o: %.c + $(call if_changed_dep,bootcc) +$(patsubst %.S,%.o, $(filter %.S, $(src-boot))): %.o: %.S + $(call if_changed_dep,bootas) + +lib-y += $(obj-boot) +lib-$(CONFIG_VGA_CONSOLE) += vreset.o kbd.o Index: linux-2.6.16/arch/ppc/kernel/dma-mapping.c =================================================================== --- linux-2.6.16.orig/arch/ppc/kernel/dma-mapping.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/kernel/dma-mapping.c 2006-10-19 16:52:30.000000000 +0200 @@ -71,7 +71,7 @@ int map_page(unsigned long va, phys_addr * This is the page table (2MB) covering uncached, DMA consistent allocations */ static pte_t *consistent_pte; -static DEFINE_SPINLOCK(consistent_lock); +static DEFINE_RAW_SPINLOCK(consistent_lock); /* * VM region handling support. Index: linux-2.6.16/arch/ppc/kernel/entry.S =================================================================== --- linux-2.6.16.orig/arch/ppc/kernel/entry.S 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/kernel/entry.S 2006-10-19 16:52:30.000000000 +0200 @@ -854,7 +854,7 @@ load_dbcr0: #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ do_work: /* r10 contains MSR_KERNEL here */ - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ @@ -868,7 +868,7 @@ recheck: MTMSRD(r10) /* disable interrupts */ rlwinm r9,r1,0,0,18 lwz r9,TI_FLAGS(r9) - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne- do_resched andi. r0,r9,_TIF_SIGPENDING beq restore_user @@ -978,3 +978,85 @@ machine_check_in_rtas: /* XXX load up BATs and panic */ #endif /* CONFIG_PPC_OF */ + +#ifdef CONFIG_MCOUNT + +/* + * mcount() is not the same as _mcount(). The callers of mcount() have a + * normal context. The callers of _mcount() do not have a stack frame and + * have not saved the "caller saves" registers. + */ +_GLOBAL(mcount) + stwu r1,-16(r1) + mflr r3 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + stw r3,20(r1) + cmpwi r5,0 + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,16(r1) + lwz r4,4(r4) + bl __trace +1: + lwz r0,20(r1) + mtlr r0 + addi r1,r1,16 + blr + +/* + * The -pg flag, which is specified in the case of CONFIG_MCOUNT, causes the + * C compiler to add a call to _mcount() at the start of each function preamble, + * before the stack frame is created. 
An example of this preamble code is: + * + * mflr r0 + * lis r12,-16354 + * stw r0,4(r1) + * addi r0,r12,-19652 + * bl 0xc00034c8 <_mcount> + * mflr r0 + * stwu r1,-16(r1) + */ +_GLOBAL(_mcount) +#define M_STK_SIZE 48 + /* Would not expect to need to save cr, but glibc version of */ + /* _mcount() does, so cautiously saving it here too. */ + stwu r1,-M_STK_SIZE(r1) + stw r3, 12(r1) + stw r4, 16(r1) + stw r5, 20(r1) + stw r6, 24(r1) + mflr r3 /* will use as first arg to __trace() */ + mfcr r4 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + cmpwi r5,0 + stw r3, 44(r1) /* lr */ + stw r4, 8(r1) /* cr */ + stw r7, 28(r1) + stw r8, 32(r1) + stw r9, 36(r1) + stw r10,40(r1) + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,M_STK_SIZE+4(r1) + bl __trace +1: + lwz r8, 8(r1) /* cr */ + lwz r9, 44(r1) /* lr */ + lwz r3, 12(r1) + lwz r4, 16(r1) + lwz r5, 20(r1) + mtcrf 0xff,r8 + mtctr r9 + lwz r0, 52(r1) + lwz r6, 24(r1) + lwz r7, 28(r1) + lwz r8, 32(r1) + lwz r9, 36(r1) + lwz r10,40(r1) + addi r1,r1,M_STK_SIZE + mtlr r0 + bctr + +#endif /* CONFIG_MCOUNT */ Index: linux-2.6.16/arch/ppc/kernel/idle.c =================================================================== --- linux-2.6.16.orig/arch/ppc/kernel/idle.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/kernel/idle.c 2006-10-19 16:52:30.000000000 +0200 @@ -40,7 +40,7 @@ void default_idle(void) powersave = ppc_md.power_save; - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { if (powersave != NULL) powersave(); #ifdef CONFIG_SMP @@ -64,6 +64,9 @@ void cpu_idle(void) for (;;) { while (!need_resched()) { + BUG_ON(irqs_disabled()); + stop_critical_timing(); + if (ppc_md.idle != NULL) ppc_md.idle(); else @@ -72,9 +75,11 @@ void cpu_idle(void) if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING) cpu_die(); - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } Index: linux-2.6.16/arch/ppc/kernel/ppc_ksyms.c =================================================================== --- linux-2.6.16.orig/arch/ppc/kernel/ppc_ksyms.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/kernel/ppc_ksyms.c 2006-10-19 16:52:30.000000000 +0200 @@ -255,16 +255,12 @@ EXPORT_SYMBOL(screen_info); EXPORT_SYMBOL(__delay); EXPORT_SYMBOL(timer_interrupt); -EXPORT_SYMBOL(irq_desc); EXPORT_SYMBOL(tb_ticks_per_jiffy); EXPORT_SYMBOL(console_drivers); #ifdef CONFIG_XMON EXPORT_SYMBOL(xmon); EXPORT_SYMBOL(xmon_printf); #endif -EXPORT_SYMBOL(__up); -EXPORT_SYMBOL(__down); -EXPORT_SYMBOL(__down_interruptible); #if defined(CONFIG_KGDB) || defined(CONFIG_XMON) extern void (*debugger)(struct pt_regs *regs); Index: linux-2.6.16/arch/ppc/kernel/semaphore.c =================================================================== --- linux-2.6.16.orig/arch/ppc/kernel/semaphore.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/kernel/semaphore.c 2006-10-19 16:52:30.000000000 +0200 @@ -29,7 +29,7 @@ * sem->count = tmp; * return old_count; */ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -48,7 +48,7 @@ static inline int __sem_update_count(str return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -70,7 +70,7 @@ void __up(struct semaphore *sem) * 
Thus it is only when we decrement count from some value > 0
 * that we have actually got the semaphore.
 */
-void __sched __down(struct semaphore *sem)
+void __sched __compat_down(struct compat_semaphore *sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -100,7 +100,7 @@ void __sched __down(struct semaphore *se
 	wake_up(&sem->wait);
 }
 
-int __sched __down_interruptible(struct semaphore * sem)
+int __sched __compat_down_interruptible(struct compat_semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
@@ -129,3 +129,8 @@ int __sched __down_interruptible(struct
 	wake_up(&sem->wait);
 	return retval;
 }
+
+int compat_sem_is_locked(struct compat_semaphore *sem)
+{
+	return (int) atomic_read(&sem->count) < 0;
+}
Index: linux-2.6.16/arch/ppc/kernel/smp.c
===================================================================
--- linux-2.6.16.orig/arch/ppc/kernel/smp.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/arch/ppc/kernel/smp.c	2006-10-19 16:52:30.000000000 +0200
@@ -138,6 +138,16 @@ void smp_send_reschedule(int cpu)
 	smp_message_pass(cpu, PPC_MSG_RESCHEDULE);
 }
 
+/*
+ * this function sends a 'reschedule' IPI to all other CPUs.
+ * This is used when RT tasks are starving and other CPUs
+ * might be able to run them:
+ */
+void smp_send_reschedule_allbutself(void)
+{
+	smp_message_pass(MSG_ALL_BUT_SELF, PPC_MSG_RESCHEDULE, 0, 0);
+}
+
 #ifdef CONFIG_XMON
 void smp_send_xmon_break(int cpu)
 {
@@ -162,7 +172,7 @@ void smp_send_stop(void)
  * static memory requirements. It also looks cleaner.
  * Stolen from the i386 version.
  */
-static DEFINE_SPINLOCK(call_lock);
+static DEFINE_RAW_SPINLOCK(call_lock);
 
 static struct call_data_struct {
 	void (*func) (void *info);
Index: linux-2.6.16/arch/ppc/kernel/time.c
===================================================================
--- linux-2.6.16.orig/arch/ppc/kernel/time.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/arch/ppc/kernel/time.c	2006-10-19 16:52:30.000000000 +0200
@@ -66,6 +66,9 @@
 
 #include
 
+unsigned long cpu_khz;	/* Detected as we calibrate the timebase */
+EXPORT_SYMBOL(cpu_khz);
+
 unsigned long disarm_decr[NR_CPUS];
 
 extern struct timezone sys_tz;
@@ -86,7 +89,7 @@ extern unsigned long wall_jiffies;
 /* used for timezone offset */
 static long timezone_offset;
 
-DEFINE_SPINLOCK(rtc_lock);
+DEFINE_RAW_SPINLOCK(rtc_lock);
 EXPORT_SYMBOL(rtc_lock);
 
@@ -104,7 +107,7 @@ static inline int tb_delta(unsigned *jif
 }
 
 #ifdef CONFIG_SMP
-unsigned long profile_pc(struct pt_regs *regs)
+unsigned long notrace profile_pc(struct pt_regs *regs)
 {
 	unsigned long pc = instruction_pointer(regs);
Index: linux-2.6.16/arch/ppc/kernel/traps.c
===================================================================
--- linux-2.6.16.orig/arch/ppc/kernel/traps.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/arch/ppc/kernel/traps.c	2006-10-19 16:52:30.000000000 +0200
@@ -74,7 +74,7 @@ void (*debugger_fault_handler)(struct pt
  * Trap & Exception support
  */
 
-DEFINE_SPINLOCK(die_lock);
+DEFINE_RAW_SPINLOCK(die_lock);
 
 int die(const char * str, struct pt_regs * fp, long err)
 {
@@ -109,6 +109,10 @@ void _exception(int signr, struct pt_reg
 		debugger(regs);
 		die("Exception in kernel mode", regs, signr);
 	}
+#ifdef CONFIG_PREEMPT_RT
+	local_irq_enable();
+	preempt_check_resched();
+#endif
 	info.si_signo = signr;
 	info.si_errno = 0;
 	info.si_code = code;
Index: linux-2.6.16/arch/ppc/lib/locks.c
===================================================================
--- linux-2.6.16.orig/arch/ppc/lib/locks.c	2006-09-20 10:56:46.000000000 +0200
+++ 
linux-2.6.16/arch/ppc/lib/locks.c 2006-10-19 16:52:30.000000000 +0200 @@ -43,7 +43,7 @@ static inline unsigned long __spin_trylo return ret; } -void _raw_spin_lock(spinlock_t *lock) +void __raw_spin_lock(raw_spinlock_t *lock) { int cpu = smp_processor_id(); unsigned int stuck = INIT_STUCK; @@ -63,9 +63,9 @@ void _raw_spin_lock(spinlock_t *lock) lock->owner_pc = (unsigned long)__builtin_return_address(0); lock->owner_cpu = cpu; } -EXPORT_SYMBOL(_raw_spin_lock); +EXPORT_SYMBOL(__raw_spin_lock); -int _raw_spin_trylock(spinlock_t *lock) +int __raw_spin_trylock(raw_spinlock_t *lock) { if (__spin_trylock(&lock->lock)) return 0; @@ -73,9 +73,9 @@ int _raw_spin_trylock(spinlock_t *lock) lock->owner_pc = (unsigned long)__builtin_return_address(0); return 1; } -EXPORT_SYMBOL(_raw_spin_trylock); +EXPORT_SYMBOL(__raw_spin_trylock); -void _raw_spin_unlock(spinlock_t *lp) +void __raw_spin_unlock(raw_spinlock_t *lp) { if ( !lp->lock ) printk("_spin_unlock(%p): no lock cpu %d curr PC %p %s/%d\n", @@ -89,13 +89,13 @@ void _raw_spin_unlock(spinlock_t *lp) wmb(); lp->lock = 0; } -EXPORT_SYMBOL(_raw_spin_unlock); +EXPORT_SYMBOL(__raw_spin_unlock); /* * For rwlocks, zero is unlocked, -1 is write-locked, * positive is read-locked. */ -static __inline__ int __read_trylock(rwlock_t *rw) +static __inline__ int __read_trylock(raw_rwlock_t *rw) { signed int tmp; @@ -115,13 +115,13 @@ static __inline__ int __read_trylock(rwl return tmp; } -int _raw_read_trylock(rwlock_t *rw) +int __raw_read_trylock(raw_rwlock_t *rw) { return __read_trylock(rw) > 0; } -EXPORT_SYMBOL(_raw_read_trylock); +EXPORT_SYMBOL(__raw_read_trylock); -void _raw_read_lock(rwlock_t *rw) +void __raw_read_lock(rwlock_t *rw) { unsigned int stuck; @@ -136,9 +136,9 @@ void _raw_read_lock(rwlock_t *rw) } } } -EXPORT_SYMBOL(_raw_read_lock); +EXPORT_SYMBOL(__raw_read_lock); -void _raw_read_unlock(rwlock_t *rw) +void __raw_read_unlock(raw_rwlock_t *rw) { if ( rw->lock == 0 ) printk("_read_unlock(): %s/%d (nip %08lX) lock %d\n", @@ -147,9 +147,9 @@ void _raw_read_unlock(rwlock_t *rw) wmb(); atomic_dec((atomic_t *) &(rw)->lock); } -EXPORT_SYMBOL(_raw_read_unlock); +EXPORT_SYMBOL(__raw_read_unlock); -void _raw_write_lock(rwlock_t *rw) +void __raw_write_lock(raw_rwlock_t *rw) { unsigned int stuck; @@ -165,18 +165,18 @@ void _raw_write_lock(rwlock_t *rw) } wmb(); } -EXPORT_SYMBOL(_raw_write_lock); +EXPORT_SYMBOL(__raw_write_lock); -int _raw_write_trylock(rwlock_t *rw) +int __raw_write_trylock(raw_rwlock_t *rw) { if (cmpxchg(&rw->lock, 0, -1) != 0) return 0; wmb(); return 1; } -EXPORT_SYMBOL(_raw_write_trylock); +EXPORT_SYMBOL(__raw_write_trylock); -void _raw_write_unlock(rwlock_t *rw) +void __raw_write_unlock(raw_rwlock_t *rw) { if (rw->lock >= 0) printk("_write_lock(): %s/%d (nip %08lX) lock %d\n", @@ -185,6 +185,6 @@ void _raw_write_unlock(rwlock_t *rw) wmb(); rw->lock = 0; } -EXPORT_SYMBOL(_raw_write_unlock); +EXPORT_SYMBOL(__raw_write_unlock); #endif Index: linux-2.6.16/arch/ppc/mm/fault.c =================================================================== --- linux-2.6.16.orig/arch/ppc/mm/fault.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/mm/fault.c 2006-10-19 16:52:30.000000000 +0200 @@ -92,7 +92,7 @@ static int store_updates_sp(struct pt_re * the error_code parameter is ESR for a data fault, 0 for an instruction * fault. 
*/ -int do_page_fault(struct pt_regs *regs, unsigned long address, +int notrace do_page_fault(struct pt_regs *regs, unsigned long address, unsigned long error_code) { struct vm_area_struct * vma; Index: linux-2.6.16/arch/ppc/mm/init.c =================================================================== --- linux-2.6.16.orig/arch/ppc/mm/init.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/mm/init.c 2006-10-19 16:52:30.000000000 +0200 @@ -56,7 +56,7 @@ #endif #define MAX_LOW_MEM CONFIG_LOWMEM_SIZE -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long total_memory; unsigned long total_lowmem; Index: linux-2.6.16/arch/ppc/platforms/apus_setup.c =================================================================== --- linux-2.6.16.orig/arch/ppc/platforms/apus_setup.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/platforms/apus_setup.c 2006-10-19 16:52:30.000000000 +0200 @@ -278,6 +278,7 @@ void apus_calibrate_decr(void) freq/1000000, freq%1000000); tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; __bus_speed = bus_speed; __speed_test_failed = speed_test_failed; Index: linux-2.6.16/arch/ppc/platforms/chrp_smp.c =================================================================== --- linux-2.6.16.orig/arch/ppc/platforms/chrp_smp.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/platforms/chrp_smp.c 2006-10-19 16:52:30.000000000 +0200 @@ -58,7 +58,7 @@ smp_chrp_setup_cpu(int cpu_nr) do_openpic_setup_cpu(); } -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned int timebase_upper = 0, timebase_lower = 0; void __devinit Index: linux-2.6.16/arch/ppc/platforms/chrp_time.c =================================================================== --- linux-2.6.16.orig/arch/ppc/platforms/chrp_time.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/platforms/chrp_time.c 2006-10-19 16:52:30.000000000 +0200 @@ -28,7 +28,7 @@ #include #include -extern spinlock_t rtc_lock; +extern raw_spinlock_t rtc_lock; static int nvram_as1 = NVRAM_AS1; static int nvram_as0 = NVRAM_AS0; @@ -250,4 +250,5 @@ void __init chrp_calibrate_decr(void) freq/1000000, freq%1000000); tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; } Index: linux-2.6.16/arch/ppc/platforms/ev64260.c =================================================================== --- linux-2.6.16.orig/arch/ppc/platforms/ev64260.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/platforms/ev64260.c 2006-10-19 16:52:30.000000000 +0200 @@ -553,6 +553,7 @@ ev64260_calibrate_decr(void) tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; return; } Index: linux-2.6.16/arch/ppc/platforms/gemini_setup.c =================================================================== --- linux-2.6.16.orig/arch/ppc/platforms/gemini_setup.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/platforms/gemini_setup.c 2006-10-19 16:52:30.000000000 +0200 @@ -462,6 +462,7 @@ void __init gemini_calibrate_decr(void) divisor = 4; tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } unsigned long __init gemini_find_end_of_memory(void) Index: linux-2.6.16/arch/ppc/platforms/hdpu.c =================================================================== --- 
linux-2.6.16.orig/arch/ppc/platforms/hdpu.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/platforms/hdpu.c 2006-10-19 16:52:30.000000000 +0200 @@ -59,7 +59,7 @@ static void parse_bootinfo(unsigned long static void hdpu_set_l1pe(void); static void hdpu_cpustate_set(unsigned char new_state); #ifdef CONFIG_SMP -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned int timebase_upper = 0, timebase_lower = 0; extern int smp_tb_synchronized; Index: linux-2.6.16/arch/ppc/platforms/powerpmc250.c =================================================================== --- linux-2.6.16.orig/arch/ppc/platforms/powerpmc250.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/platforms/powerpmc250.c 2006-10-19 16:52:30.000000000 +0200 @@ -166,6 +166,7 @@ powerpmc250_calibrate_decr(void) tb_ticks_per_jiffy = freq / (HZ * divisor); tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } static void Index: linux-2.6.16/arch/ppc/platforms/prep_setup.c =================================================================== --- linux-2.6.16.orig/arch/ppc/platforms/prep_setup.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/platforms/prep_setup.c 2006-10-19 16:52:30.000000000 +0200 @@ -943,6 +943,7 @@ prep_calibrate_decr(void) (freq/divisor)/1000000, (freq/divisor)%1000000); tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; tb_ticks_per_jiffy = freq / HZ / divisor; } } Index: linux-2.6.16/arch/ppc/platforms/prpmc750.c =================================================================== --- linux-2.6.16.orig/arch/ppc/platforms/prpmc750.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/platforms/prpmc750.c 2006-10-19 16:52:30.000000000 +0200 @@ -271,6 +271,7 @@ static void __init prpmc750_calibrate_de tb_ticks_per_jiffy = freq / (HZ * divisor); tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } static void prpmc750_restart(char *cmd) Index: linux-2.6.16/arch/ppc/platforms/prpmc800.c =================================================================== --- linux-2.6.16.orig/arch/ppc/platforms/prpmc800.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/platforms/prpmc800.c 2006-10-19 16:52:30.000000000 +0200 @@ -330,6 +330,7 @@ static void __init prpmc800_calibrate_de tb_ticks_per_second = 100000000 / 4; tb_ticks_per_jiffy = tb_ticks_per_second / HZ; tb_to_us = mulhwu_scale_factor(tb_ticks_per_second, 1000000); + cpu_khz = tb_ticks_per_second / 1000; return; } @@ -370,6 +371,7 @@ static void __init prpmc800_calibrate_de tb_ticks_per_second = (tbl_end - tbl_start) * 2; tb_ticks_per_jiffy = tb_ticks_per_second / HZ; tb_to_us = mulhwu_scale_factor(tb_ticks_per_second, 1000000); + cpu_khz = tb_ticks_per_second / 1000; } static void prpmc800_restart(char *cmd) Index: linux-2.6.16/arch/ppc/platforms/sbc82xx.c =================================================================== --- linux-2.6.16.orig/arch/ppc/platforms/sbc82xx.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/platforms/sbc82xx.c 2006-10-19 16:52:30.000000000 +0200 @@ -68,7 +68,7 @@ static void sbc82xx_time_init(void) static volatile char *sbc82xx_i8259_map; static char sbc82xx_i8259_mask = 0xff; -static DEFINE_SPINLOCK(sbc82xx_i8259_lock); +static DEFINE_RAW_SPINLOCK(sbc82xx_i8259_lock); static void sbc82xx_i8259_mask_and_ack_irq(unsigned int irq_nr) { Index: linux-2.6.16/arch/ppc/platforms/spruce.c 
=================================================================== --- linux-2.6.16.orig/arch/ppc/platforms/spruce.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/platforms/spruce.c 2006-10-19 16:52:30.000000000 +0200 @@ -150,6 +150,7 @@ spruce_calibrate_decr(void) freq = SPRUCE_BUS_SPEED; tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } static int Index: linux-2.6.16/arch/ppc/syslib/cpm2_common.c =================================================================== --- linux-2.6.16.orig/arch/ppc/syslib/cpm2_common.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/syslib/cpm2_common.c 2006-10-19 16:52:30.000000000 +0200 @@ -114,7 +114,7 @@ cpm2_fastbrg(uint brg, uint rate, int di /* * dpalloc / dpfree bits. */ -static spinlock_t cpm_dpmem_lock; +static raw_spinlock_t cpm_dpmem_lock; /* 16 blocks should be enough to satisfy all requests * until the memory subsystem goes up... */ static rh_block_t cpm_boot_dpmem_rh_block[16]; Index: linux-2.6.16/arch/ppc/syslib/ibm44x_common.c =================================================================== --- linux-2.6.16.orig/arch/ppc/syslib/ibm44x_common.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/syslib/ibm44x_common.c 2006-10-19 16:52:30.000000000 +0200 @@ -66,6 +66,7 @@ void __init ibm44x_calibrate_decr(unsign { tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; /* Set the time base to zero */ mtspr(SPRN_TBWL, 0); Index: linux-2.6.16/arch/ppc/syslib/m8260_setup.c =================================================================== --- linux-2.6.16.orig/arch/ppc/syslib/m8260_setup.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/syslib/m8260_setup.c 2006-10-19 16:52:30.000000000 +0200 @@ -82,6 +82,7 @@ m8260_calibrate_decr(void) divisor = 4; tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } /* The 8260 has an internal 1-second timer update register that Index: linux-2.6.16/arch/ppc/syslib/m8xx_setup.c =================================================================== --- linux-2.6.16.orig/arch/ppc/syslib/m8xx_setup.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/syslib/m8xx_setup.c 2006-10-19 16:52:30.000000000 +0200 @@ -170,6 +170,7 @@ void __init m8xx_calibrate_decr(void) printk("Decrementer Frequency = %d/%d\n", freq, divisor); tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; /* Perform some more timer/timebase initialization. 
This used * to be done elsewhere, but other changes caused it to get Index: linux-2.6.16/arch/ppc/syslib/mpc52xx_setup.c =================================================================== --- linux-2.6.16.orig/arch/ppc/syslib/mpc52xx_setup.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/syslib/mpc52xx_setup.c 2006-10-19 16:52:30.000000000 +0200 @@ -216,6 +216,7 @@ mpc52xx_calibrate_decr(void) tb_ticks_per_jiffy = xlbfreq / HZ / divisor; tb_to_us = mulhwu_scale_factor(xlbfreq / divisor, 1000000); + cpu_khz = (xlbfreq / divisor) / 1000; } int mpc52xx_match_psc_function(int psc_idx, const char *func) Index: linux-2.6.16/arch/ppc/syslib/ocp.c =================================================================== --- linux-2.6.16.orig/arch/ppc/syslib/ocp.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/syslib/ocp.c 2006-10-19 16:52:30.000000000 +0200 @@ -45,11 +45,11 @@ #include #include #include +#include #include #include #include -#include #include //#define DBG(x) printk x Index: linux-2.6.16/arch/ppc/syslib/open_pic.c =================================================================== --- linux-2.6.16.orig/arch/ppc/syslib/open_pic.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/syslib/open_pic.c 2006-10-19 16:52:30.000000000 +0200 @@ -529,7 +529,7 @@ void openpic_reset_processor_phys(u_int } #if defined(CONFIG_SMP) || defined(CONFIG_PM) -static DEFINE_SPINLOCK(openpic_setup_lock); +static DEFINE_RAW_SPINLOCK(openpic_setup_lock); #endif #ifdef CONFIG_SMP Index: linux-2.6.16/arch/ppc/syslib/open_pic2.c =================================================================== --- linux-2.6.16.orig/arch/ppc/syslib/open_pic2.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/syslib/open_pic2.c 2006-10-19 16:52:30.000000000 +0200 @@ -383,7 +383,7 @@ static void openpic2_set_spurious(u_int vec); } -static DEFINE_SPINLOCK(openpic2_setup_lock); +static DEFINE_RAW_SPINLOCK(openpic2_setup_lock); /* * Initialize a timer interrupt (and disable it) Index: linux-2.6.16/arch/ppc/syslib/ppc4xx_setup.c =================================================================== --- linux-2.6.16.orig/arch/ppc/syslib/ppc4xx_setup.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/syslib/ppc4xx_setup.c 2006-10-19 16:52:30.000000000 +0200 @@ -173,6 +173,7 @@ ppc4xx_calibrate_decr(void) freq = bip->bi_tbfreq; tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; /* Set the time base to zero. ** At 200 Mhz, time base will rollover in ~2925 years. 
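
Editorial sketch (not patch content): every calibrate_decr() hunk in this series repeats the same arithmetic, shown standalone below. HZ, the 66 MHz timebase and the /4 prescaler are illustrative values, not taken from any board above.

#include <stdio.h>

#define HZ 250	/* illustrative */

static unsigned int tb_ticks_per_jiffy, cpu_khz;

static void calibrate_sketch(unsigned int freq, unsigned int divisor)
{
	tb_ticks_per_jiffy = freq / HZ / divisor;	/* timebase ticks per timer tick */
	cpu_khz = (freq / divisor) / 1000;		/* effective clock rate in kHz */
}

int main(void)
{
	calibrate_sketch(66000000, 4);
	printf("tb_ticks_per_jiffy=%u cpu_khz=%u\n",
	       tb_ticks_per_jiffy, cpu_khz);
	return 0;
}
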
Index: linux-2.6.16/arch/ppc/syslib/ppc85xx_setup.c =================================================================== --- linux-2.6.16.orig/arch/ppc/syslib/ppc85xx_setup.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/syslib/ppc85xx_setup.c 2006-10-19 16:52:30.000000000 +0200 @@ -60,6 +60,7 @@ mpc85xx_calibrate_decr(void) divisor = 8; tb_ticks_per_jiffy = freq / divisor / HZ; tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; /* Set the time base to zero */ mtspr(SPRN_TBWL, 0); Index: linux-2.6.16/arch/ppc/syslib/prom.c =================================================================== --- linux-2.6.16.orig/arch/ppc/syslib/prom.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/syslib/prom.c 2006-10-19 16:52:30.000000000 +0200 @@ -1379,7 +1379,7 @@ print_properties(struct device_node *np) } #endif -static DEFINE_SPINLOCK(rtas_lock); +static DEFINE_RAW_SPINLOCK(rtas_lock); /* this can be called after setup -- Cort */ int Index: linux-2.6.16/arch/ppc/syslib/todc_time.c =================================================================== --- linux-2.6.16.orig/arch/ppc/syslib/todc_time.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/ppc/syslib/todc_time.c 2006-10-19 16:52:30.000000000 +0200 @@ -508,6 +508,7 @@ todc_calibrate_decr(void) tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; return; } Index: linux-2.6.16/arch/x86_64/Kconfig =================================================================== --- linux-2.6.16.orig/arch/x86_64/Kconfig 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/x86_64/Kconfig 2006-10-19 16:52:30.000000000 +0200 @@ -24,6 +24,14 @@ config X86 bool default y +config GENERIC_TIME + bool + default y + +config GENERIC_TIME_VSYSCALL + bool + default y + config SEMAPHORE_SLEEPERS bool default y @@ -38,13 +46,6 @@ config ISA config SBUS bool -config RWSEM_GENERIC_SPINLOCK - bool - default y - -config RWSEM_XCHGADD_ALGORITHM - bool - config GENERIC_CALIBRATE_DELAY bool default y @@ -221,6 +222,8 @@ config MTRR See for more information. +source "kernel/time/Kconfig" + config SMP bool "Symmetric multi-processing support" ---help--- @@ -259,6 +262,14 @@ config NUMA If the system is EM64T, you should say N unless your system is EM64T NUMA. 
+config RWSEM_GENERIC_SPINLOCK + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + depends on !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT + bool + config K8_NUMA bool "Old style AMD Opteron NUMA detection" depends on NUMA Index: linux-2.6.16/arch/x86_64/boot/compressed/misc.c =================================================================== --- linux-2.6.16.orig/arch/x86_64/boot/compressed/misc.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/x86_64/boot/compressed/misc.c 2006-10-19 16:52:30.000000000 +0200 @@ -114,6 +114,7 @@ static char *vidmem = (char *)0xb8000; static int vidport; static int lines, cols; +#define ZLIB_INFLATE_NO_INFLATE_LOCK #include "../../../../lib/inflate.c" static void *malloc(int size) Index: linux-2.6.16/arch/x86_64/ia32/ia32entry.S =================================================================== --- linux-2.6.16.orig/arch/x86_64/ia32/ia32entry.S 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/x86_64/ia32/ia32entry.S 2006-10-19 16:52:30.000000000 +0200 @@ -688,6 +688,8 @@ ia32_sys_call_table: .quad sys_ni_syscall /* pselect6 for now */ .quad sys_ni_syscall /* ppoll for now */ .quad sys_unshare /* 310 */ + .quad compat_sys_set_robust_list + .quad compat_sys_get_robust_list ia32_syscall_end: .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8 .quad ni_syscall Index: linux-2.6.16/arch/x86_64/ia32/sys_ia32.c =================================================================== --- linux-2.6.16.orig/arch/x86_64/ia32/sys_ia32.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/x86_64/ia32/sys_ia32.c 2006-10-19 16:52:30.000000000 +0200 @@ -430,24 +430,12 @@ put_tv32(struct compat_timeval __user *o return err; } -extern int do_setitimer(int which, struct itimerval *, struct itimerval *); +extern unsigned int alarm_setitimer(unsigned int seconds); asmlinkage long sys32_alarm(unsigned int seconds) { - struct itimerval it_new, it_old; - unsigned int oldalarm; - - it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; - it_new.it_value.tv_sec = seconds; - it_new.it_value.tv_usec = 0; - do_setitimer(ITIMER_REAL, &it_new, &it_old); - oldalarm = it_old.it_value.tv_sec; - /* ehhh.. We can't return 0 if we have an alarm pending.. */ - /* And we'd better return too much than too little anyway */ - if (it_old.it_value.tv_usec) - oldalarm++; - return oldalarm; + return alarm_setitimer(seconds); } /* Translations due to time_t size differences. 
Which affects all @@ -478,6 +466,10 @@ sys32_settimeofday(struct compat_timeval struct timespec kts; struct timezone ktz; + int ret = timeofday_API_hacks(tv, tz); + if (ret != 1) + return ret; + if (tv) { if (get_tv32(&ktv, tv)) return -EFAULT; Index: linux-2.6.16/arch/x86_64/kernel/Makefile =================================================================== --- linux-2.6.16.orig/arch/x86_64/kernel/Makefile 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/x86_64/kernel/Makefile 2006-10-19 16:52:30.000000000 +0200 @@ -31,7 +31,6 @@ obj-$(CONFIG_EARLY_PRINTK) += early_prin obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o obj-$(CONFIG_X86_VSMP) += vsmp.o obj-$(CONFIG_MODULES) += module.o Index: linux-2.6.16/arch/x86_64/kernel/apic.c =================================================================== --- linux-2.6.16.orig/arch/x86_64/kernel/apic.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/x86_64/kernel/apic.c 2006-10-19 16:52:30.000000000 +0200 @@ -508,8 +508,7 @@ static int lapic_suspend(struct sys_devi apic_pm_state.apic_tmict = apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); - local_save_flags(flags); - local_irq_disable(); + local_irq_save(flags); disable_local_APIC(); local_irq_restore(flags); return 0; @@ -767,7 +766,7 @@ static int __init calibrate_APIC_clock(v __setup_APIC_LVTT(1000000000); apic_start = apic_read(APIC_TMCCT); -#ifdef CONFIG_X86_PM_TIMER +#ifdef CONFIG_X86_PM_TIMER_XXXTHISNEEDSFIXING if (apic_calibrate_pmtmr && pmtmr_ioport) { pmtimer_wait(5000); /* 5ms wait */ apic = apic_read(APIC_TMCCT); Index: linux-2.6.16/arch/x86_64/kernel/early_printk.c =================================================================== --- linux-2.6.16.orig/arch/x86_64/kernel/early_printk.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/x86_64/kernel/early_printk.c 2006-10-19 16:52:30.000000000 +0200 @@ -206,7 +206,7 @@ static int early_console_initialized = 0 void early_printk(const char *fmt, ...) { - char buf[512]; + static char buf[512]; int n; va_list ap; Index: linux-2.6.16/arch/x86_64/kernel/entry.S =================================================================== --- linux-2.6.16.orig/arch/x86_64/kernel/entry.S 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/x86_64/kernel/entry.S 2006-10-19 16:52:30.000000000 +0200 @@ -49,6 +49,15 @@ #define retint_kernel retint_restore_args #endif +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING +# define CALL_TRACE_IRQS_ON \ + push %rbp; \ + mov %rsp, %rbp; \ + call trace_hardirqs_on_lowlevel; \ + leaveq +#else +# define CALL_TRACE_IRQS_ON +#endif /* * C code is not supposed to know about undefined top of stack. Every time * a C function with an pt_regs argument is called from the SYSCALL based @@ -231,8 +240,8 @@ sysret_check: /* edx: work, edi: workmask */ sysret_careful: CFI_RESTORE_STATE - bt $TIF_NEED_RESCHED,%edx - jnc sysret_signal + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz sysret_signal sti pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -253,7 +262,7 @@ sysret_signal: leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 call ptregscall_common -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi jmp sysret_check badsys: @@ -321,8 +330,8 @@ int_with_check: /* First do a reschedule test. 
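
Editorial sketch (not patch content): the entry.S hunks in this area replace the single-bit "bt $TIF_NEED_RESCHED" test with a testl against a two-flag mask, so that either reschedule flag sends the exit path through the slow case. Equivalent C logic, with made-up bit positions:

#include <stdbool.h>
#include <stdio.h>

#define _TIF_NEED_RESCHED		(1u << 3)	/* example bits, not the */
#define _TIF_NEED_RESCHED_DELAYED	(1u << 10)	/* kernel's real layout  */

static bool reschedule_pending(unsigned int work_flags)
{
	return (work_flags &
		(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED)) != 0;
}

int main(void)
{
	printf("%d %d\n",
	       reschedule_pending(_TIF_NEED_RESCHED_DELAYED),
	       reschedule_pending(0));
	return 0;
}
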
*/ /* edx: work, edi: workmask */ int_careful: - bt $TIF_NEED_RESCHED,%edx - jnc int_very_careful + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz int_very_careful sti pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -355,7 +364,7 @@ int_signal: movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 call do_notify_resume -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi int_restore_rest: RESTORE_REST cli @@ -561,8 +570,8 @@ bad_iret: /* edi: workmask, edx: work */ retint_careful: CFI_RESTORE_STATE - bt $TIF_NEED_RESCHED,%edx - jnc retint_signal + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz retint_signal sti pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -584,7 +593,7 @@ retint_signal: call do_notify_resume RESTORE_REST cli - movl $_TIF_NEED_RESCHED,%edi + movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi GET_THREAD_INFO(%rcx) jmp retint_check @@ -600,6 +609,7 @@ retint_kernel: bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ jnc retint_restore_args call preempt_schedule_irq + CALL_TRACE_IRQS_ON jmp exit_intr #endif CFI_ENDPROC @@ -1059,3 +1069,41 @@ ENTRY(call_softirq) decl %gs:pda_irqcount ret CFI_ENDPROC + +#ifdef CONFIG_LATENCY_TRACE + +ENTRY(mcount) + cmpq $0, trace_enabled + jz out + + push %rbp + mov %rsp,%rbp + + push %r9 + push %r8 + push %rdi + push %rsi + push %rdx + push %rcx + push %rax + + mov 0x0(%rbp),%rax + mov 0x8(%rbp),%rdi + mov 0x10(%rax),%rsi + + call __trace + + pop %rax + pop %rcx + pop %rdx + pop %rsi + pop %rdi + pop %r8 + pop %r9 + + leaveq +out: + ret + +#endif + Index: linux-2.6.16/arch/x86_64/kernel/i8259.c =================================================================== --- linux-2.6.16.orig/arch/x86_64/kernel/i8259.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/x86_64/kernel/i8259.c 2006-10-19 16:52:30.000000000 +0200 @@ -127,7 +127,7 @@ void (*interrupt[NR_IRQS])(void) = { * moves to arch independent land */ -DEFINE_SPINLOCK(i8259A_lock); +DEFINE_RAW_SPINLOCK(i8259A_lock); static void end_8259A_irq (unsigned int irq) { @@ -448,7 +448,7 @@ device_initcall(i8259A_init_sysfs); * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { no_action, SA_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL}; void __init init_ISA_irqs (void) { Index: linux-2.6.16/arch/x86_64/kernel/io_apic.c =================================================================== --- linux-2.6.16.orig/arch/x86_64/kernel/io_apic.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/x86_64/kernel/io_apic.c 2006-10-19 16:52:30.000000000 +0200 @@ -55,7 +55,7 @@ int timer_over_8254 __initdata = 1; /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; -static DEFINE_SPINLOCK(ioapic_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); /* * # of IRQ routing registers @@ -103,6 +103,9 @@ int vector_irq[NR_VECTORS] __read_mostly reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ reg ACTION; \ io_apic_modify(entry->apic, reg); \ + /* Force POST flush by reading: */ \ + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ + \ if (!entry->next) \ break; \ entry = irq_2_pin + entry->next; \ @@ -169,10 +172,8 @@ static void add_pin_to_irq(unsigned int static void name##_IO_APIC_irq (unsigned int irq) \ __DO_ACTION(R, ACTION, FINAL) -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) - /* mask = 1 */ 
-DO_ACTION( __unmask, 0, &= 0xfffeffff, ) - /* mask = 0 */ +DO_ACTION( __mask, 0, |= 0x00010000, ) /* mask = 1 */ +DO_ACTION( __unmask, 0, &= 0xfffeffff, ) /* mask = 0 */ static void mask_IO_APIC_irq (unsigned int irq) { @@ -1528,12 +1529,50 @@ static unsigned int startup_level_ioapic return 0; /* don't check for pending */ } +/* + * In the preemptible case mask the IRQ first, then handle it and ack it. + * + * (In the non-preemptible case we keep the IRQ unacked in the local APIC + * and don't need to do the masking, because the code executes atomically.) + */ +#ifdef CONFIG_PREEMPT_HARDIRQS + +static void mask_and_ack_level_ioapic_irq(unsigned int irq) +{ + move_irq(irq); + mask_IO_APIC_irq(irq); + ack_APIC_irq(); +} + +static void end_level_ioapic_irq(unsigned int irq) +{ + if (!(irq_desc[irq].status & IRQ_INPROGRESS)) + unmask_IO_APIC_irq(irq); +} + +static void enable_level_ioapic_irq(unsigned int irq) +{ + unmask_IO_APIC_irq(irq); +} + +#else /* !CONFIG_PREEMPT_HARDIRQS */ + +static void mask_and_ack_level_ioapic_irq(unsigned int irq) +{ +} + static void end_level_ioapic_irq (unsigned int irq) { move_irq(irq); ack_APIC_irq(); } +static void enable_level_ioapic_irq(unsigned int irq) +{ + unmask_IO_APIC_irq(irq); +} +#endif /* !CONFIG_PREEMPT_HARDIRQS */ + #ifdef CONFIG_PCI_MSI static unsigned int startup_edge_ioapic_vector(unsigned int vector) { @@ -1557,6 +1596,13 @@ static unsigned int startup_level_ioapic return startup_level_ioapic_irq (irq); } +static void mask_and_ack_level_ioapic_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + mask_and_ack_level_ioapic_irq(irq); +} + static void end_level_ioapic_vector (unsigned int vector) { int irq = vector_to_irq(vector); @@ -1565,6 +1611,11 @@ static void end_level_ioapic_vector (uns end_level_ioapic_irq(irq); } +static void enable_level_ioapic_vector(unsigned int vector) +{ + enable_level_ioapic_irq(vector_to_irq(vector)); +} + static void mask_IO_APIC_vector (unsigned int vector) { int irq = vector_to_irq(vector); Index: linux-2.6.16/arch/x86_64/kernel/nmi.c =================================================================== --- linux-2.6.16.orig/arch/x86_64/kernel/nmi.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/x86_64/kernel/nmi.c 2006-10-19 16:52:30.000000000 +0200 @@ -44,7 +44,7 @@ * This is maintained separately from nmi_active because the NMI * watchdog may also be driven from the I/O APIC timer.
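
Editorial sketch (not patch content): the ordering the CONFIG_PREEMPT_HARDIRQS path above implements for level-triggered lines — mask at the I/O APIC and ack at the local APIC before the handler runs, unmask only once the now-preemptible handler has finished, so a still-asserted level line cannot retrigger endlessly. All names below are compile-alone stand-ins for the real io_apic.c/apic.c functions:

#include <stdbool.h>

static void mask_io_apic_line(unsigned int irq)		{ (void)irq; }
static void unmask_io_apic_line(unsigned int irq)	{ (void)irq; }
static void ack_local_apic(void)			{ }
static void run_threaded_handler(unsigned int irq)	{ (void)irq; }
static bool handler_in_progress(unsigned int irq)	{ (void)irq; return false; }

static void level_irq_flow_preemptible(unsigned int irq)
{
	mask_io_apic_line(irq);		/* line cannot retrigger while masked */
	ack_local_apic();		/* local APIC freed for other vectors */
	run_threaded_handler(irq);	/* may now sleep or be preempted */
	if (!handler_in_progress(irq))
		unmask_io_apic_line(irq);	/* reopen the line at the end */
}
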
*/ -static DEFINE_SPINLOCK(lapic_nmi_owner_lock); +static DEFINE_RAW_SPINLOCK(lapic_nmi_owner_lock); static unsigned int lapic_nmi_owner; #define LAPIC_NMI_WATCHDOG (1<<0) #define LAPIC_NMI_RESERVED (1<<1) @@ -472,12 +472,42 @@ void touch_nmi_watchdog (void) touch_softlockup_watchdog(); } +int nmi_show_regs[NR_CPUS]; + +void nmi_show_all_regs(void) +{ + int i; + + if (nmi_watchdog == NMI_NONE) + return; + if (system_state != SYSTEM_RUNNING) { + printk("nmi_show_all_regs(): system state %d, not doing.\n", + system_state); + return; + } + + for_each_online_cpu(i) + nmi_show_regs[i] = 1; + for_each_online_cpu(i) + while (nmi_show_regs[i] == 1) + barrier(); +} + +static DEFINE_RAW_SPINLOCK(nmi_print_lock); + void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { int sum; int touched = 0; + int cpu = safe_smp_processor_id(); sum = read_pda(apic_timer_irqs); + if (nmi_show_regs[cpu]) { + nmi_show_regs[cpu] = 0; + spin_lock(&nmi_print_lock); + show_regs(regs); + spin_unlock(&nmi_print_lock); + } if (__get_cpu_var(nmi_touch)) { __get_cpu_var(nmi_touch) = 0; touched = 1; @@ -489,6 +519,11 @@ void __kprobes nmi_watchdog_tick(struct */ local_inc(&__get_cpu_var(alert_counter)); if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) { + int i; + + for (i = 0; i < NR_CPUS; i++) + nmi_show_regs[i] = 1; + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) { local_set(&__get_cpu_var(alert_counter), 0); Index: linux-2.6.16/arch/x86_64/kernel/pmtimer.c =================================================================== --- linux-2.6.16.orig/arch/x86_64/kernel/pmtimer.c 2006-09-20 10:56:46.000000000 +0200 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,126 +0,0 @@ -/* Ported over from i386 by AK, original copyright was: - * - * (C) Dominik Brodowski 2003 - * - * Driver to use the Power Management Timer (PMTMR) available in some - * southbridges as primary timing source for the Linux kernel. - * - * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, - * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. - * - * This file is licensed under the GPL v2. - * - * Dropped all the hardware bug workarounds for now. Hopefully they - * are not needed on 64bit chipsets. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* The I/O port the PMTMR resides at. - * The location is detected during setup_arch(), - * in arch/i386/kernel/acpi/boot.c */ -u32 pmtmr_ioport; - -/* value of the Power timer at last timer interrupt */ -static u32 offset_delay; -static u32 last_pmtmr_tick; - -#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ - -static inline u32 cyc2us(u32 cycles) -{ - /* The Power Management Timer ticks at 3.579545 ticks per microsecond. - * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] - * - * Even with HZ = 100, delta is at maximum 35796 ticks, so it can - * easily be multiplied with 286 (=0x11E) without having to fear - * u32 overflows. 
- */ - cycles *= 286; - return (cycles >> 10); -} - -int pmtimer_mark_offset(void) -{ - static int first_run = 1; - unsigned long tsc; - u32 lost; - - u32 tick = inl(pmtmr_ioport); - u32 delta; - - delta = cyc2us((tick - last_pmtmr_tick) & ACPI_PM_MASK); - - last_pmtmr_tick = tick; - monotonic_base += delta * NSEC_PER_USEC; - - delta += offset_delay; - - lost = delta / (USEC_PER_SEC / HZ); - offset_delay = delta % (USEC_PER_SEC / HZ); - - rdtscll(tsc); - vxtime.last_tsc = tsc - offset_delay * cpu_khz; - - /* don't calculate delay for first run, - or if we've got less then a tick */ - if (first_run || (lost < 1)) { - first_run = 0; - offset_delay = 0; - } - - return lost - 1; -} - -static unsigned pmtimer_wait_tick(void) -{ - u32 a, b; - for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK; - a == b; - b = inl(pmtmr_ioport) & ACPI_PM_MASK) - ; - return b; -} - -/* note: wait time is rounded up to one tick */ -void pmtimer_wait(unsigned us) -{ - u32 a, b; - a = pmtimer_wait_tick(); - do { - b = inl(pmtmr_ioport); - } while (cyc2us(b - a) < us); -} - -void pmtimer_resume(void) -{ - last_pmtmr_tick = inl(pmtmr_ioport); -} - -unsigned int do_gettimeoffset_pm(void) -{ - u32 now, offset, delta = 0; - - offset = last_pmtmr_tick; - now = inl(pmtmr_ioport); - delta = (now - offset) & ACPI_PM_MASK; - - return offset_delay + cyc2us(delta); -} - - -static int __init nopmtimer_setup(char *s) -{ - pmtmr_ioport = 0; - return 0; -} - -__setup("nopmtimer", nopmtimer_setup); Index: linux-2.6.16/arch/x86_64/kernel/process.c =================================================================== --- linux-2.6.16.orig/arch/x86_64/kernel/process.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/x86_64/kernel/process.c 2006-10-19 16:52:30.000000000 +0200 @@ -120,9 +120,9 @@ void default_idle(void) clear_thread_flag(TIF_POLLING_NRFLAG); smp_mb__after_clear_bit(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { local_irq_disable(); - if (!need_resched()) + if (!need_resched() && !need_resched_delayed()) safe_halt(); else local_irq_enable(); @@ -214,7 +214,9 @@ void cpu_idle (void) /* endless idle loop with no priority at all */ while (1) { - while (!need_resched()) { + BUG_ON(irqs_disabled()); + + while (!need_resched() && !need_resched_delayed()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) @@ -226,14 +228,16 @@ void cpu_idle (void) idle = default_idle; if (cpu_is_offline(smp_processor_id())) play_dead(); + stop_critical_timing(); enter_idle(); idle(); __exit_idle(); } - - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } @@ -248,10 +252,10 @@ static void mwait_idle(void) { local_irq_enable(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); - if (need_resched()) + if (need_resched() || need_resched_delayed()) break; __mwait(0, 0); } @@ -342,7 +346,7 @@ void show_regs(struct pt_regs *regs) { printk("CPU %d:", smp_processor_id()); __show_regs(regs); - show_trace(®s->rsp); + show_trace(current, ®s->rsp); } /* @@ -361,13 +365,14 @@ void exit_thread(void) kprobe_flush_task(me); if (me->thread.io_bitmap_ptr) { - struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); + struct tss_struct *tss; kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; /* * Careful, clear this in the TSS too: */ + tss = &per_cpu(init_tss, get_cpu()); memset(tss->io_bitmap, 0xff, t->io_bitmap_max); 
t->io_bitmap_max = 0; put_cpu(); Index: linux-2.6.16/arch/x86_64/kernel/signal.c =================================================================== --- linux-2.6.16.orig/arch/x86_64/kernel/signal.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/x86_64/kernel/signal.c 2006-10-19 16:52:30.000000000 +0200 @@ -434,6 +434,13 @@ int do_signal(struct pt_regs *regs, sigs siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which * is why we may in certain cases get here from Index: linux-2.6.16/arch/x86_64/kernel/smp.c =================================================================== --- linux-2.6.16.orig/arch/x86_64/kernel/smp.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/x86_64/kernel/smp.c 2006-10-19 16:52:30.000000000 +0200 @@ -293,10 +293,20 @@ void smp_send_reschedule(int cpu) } /* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + send_IPI_allbutself(RESCHEDULE_VECTOR); +} + +/* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. */ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); struct call_data_struct { void (*func) (void *info); Index: linux-2.6.16/arch/x86_64/kernel/smpboot.c =================================================================== --- linux-2.6.16.orig/arch/x86_64/kernel/smpboot.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/x86_64/kernel/smpboot.c 2006-10-19 16:52:30.000000000 +0200 @@ -201,7 +201,7 @@ static void __cpuinit smp_store_cpu_info latency and low latency is the primary objective here. 
-AK */ #define no_cpu_relax() barrier() -static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock); +static __cpuinitdata __DEFINE_RAW_SPINLOCK(tsc_sync_lock); static volatile __cpuinitdata unsigned long go[SLAVE + 1]; static int notscsync __cpuinitdata; Index: linux-2.6.16/arch/x86_64/kernel/time.c =================================================================== --- linux-2.6.16.orig/arch/x86_64/kernel/time.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/x86_64/kernel/time.c 2006-10-19 16:52:30.000000000 +0200 @@ -26,6 +26,7 @@ #include #include #include + #ifdef CONFIG_ACPI #include /* for PM timer frequency */ #endif @@ -38,27 +39,28 @@ #include #include #include +#include +#include #ifdef CONFIG_X86_LOCAL_APIC #include #endif -#ifdef CONFIG_CPU_FREQ -static void cpufreq_delayed_get(void); -#endif extern void i8254_timer_resume(void); extern int using_apic_timer; static char *time_init_gtod(void); -DEFINE_SPINLOCK(rtc_lock); -DEFINE_SPINLOCK(i8253_lock); +DEFINE_RAW_SPINLOCK(rtc_lock); +DEFINE_RAW_SPINLOCK(i8253_lock); int nohpet __initdata = 0; static int notsc __initdata = 0; #undef HPET_HACK_ENABLE_DANGEROUS -unsigned int cpu_khz; /* TSC clocks / usec, not used here */ +unsigned int cpu_khz; /* CPU clocks / usec, not used here */ +unsigned int tsc_khz; /* TSC clocks / usec, not used here */ +unsigned long hpet_address; static unsigned long hpet_period; /* fsecs / HPET clock */ unsigned long hpet_tick; /* HPET clocks / interrupt */ int hpet_use_timer; /* Use counter of hpet for time keeping, otherwise PIT */ @@ -73,107 +75,14 @@ unsigned long __wall_jiffies __section_w struct timespec __xtime __section_xtime; struct timezone __sys_tz __section_sys_tz; -/* - * do_gettimeoffset() returns microseconds since last timer interrupt was - * triggered by hardware. A memory read of HPET is slower than a register read - * of TSC, but much more reliable. It's also synchronized to the timer - * interrupt. Note that do_gettimeoffset() may return more than hpet_tick, if a - * timer interrupt has happened already, but vxtime.trigger wasn't updated yet. - * This is not a problem, because jiffies hasn't updated either. They are bound - * together by xtime_lock. - */ - -static inline unsigned int do_gettimeoffset_tsc(void) -{ - unsigned long t; - unsigned long x; - t = get_cycles_sync(); - if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */ - x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32; - return x; -} - -static inline unsigned int do_gettimeoffset_hpet(void) -{ - /* cap counter read to one tick to avoid inconsistencies */ - unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last; - return (min(counter,hpet_tick) * vxtime.quot) >> 32; -} - -unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc; - -/* - * This version of gettimeofday() has microsecond resolution and better than - * microsecond precision, as we're using at least a 10 MHz (usually 14.31818 - * MHz) HPET timer. - */ - -void do_gettimeofday(struct timeval *tv) -{ - unsigned long seq, t; - unsigned int sec, usec; - - do { - seq = read_seqbegin(&xtime_lock); - - sec = xtime.tv_sec; - usec = xtime.tv_nsec / 1000; - - /* i386 does some correction here to keep the clock - monotonous even when ntpd is fixing drift. - But they didn't work for me, there is a non monotonic - clock anyways with ntp. - I dropped all corrections now until a real solution can - be found. Note when you fix it here you need to do the same - in arch/x86_64/kernel/vsyscall.c and export all needed - variables in vmlinux.lds. 
-AK */ - - t = (jiffies - wall_jiffies) * (1000000L / HZ) + - do_gettimeoffset(); - usec += t; - - } while (read_seqretry(&xtime_lock, seq)); - - tv->tv_sec = sec + usec / 1000000; - tv->tv_usec = usec % 1000000; -} - -EXPORT_SYMBOL(do_gettimeofday); - -/* - * settimeofday() first undoes the correction that gettimeofday would do - * on the time, and then saves it. This is ugly, but has been like this for - * ages already. - */ - -int do_settimeofday(struct timespec *tv) +static inline void rdtscll_sync(unsigned long *tsc) { - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - - nsec -= do_gettimeoffset() * 1000 + - (jiffies - wall_jiffies) * (NSEC_PER_SEC/HZ); - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; +#ifdef CONFIG_SMP + sync_core(); +#endif + rdtscll(*tsc); } -EXPORT_SYMBOL(do_settimeofday); - unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); @@ -274,88 +183,8 @@ static void set_rtc_mmss(unsigned long n } -/* monotonic_clock(): returns # of nanoseconds passed since time_init() - * Note: This function is required to return accurate - * time even in the absence of multiple timer ticks. - */ -unsigned long long monotonic_clock(void) -{ - unsigned long seq; - u32 last_offset, this_offset, offset; - unsigned long long base; - - if (vxtime.mode == VXTIME_HPET) { - do { - seq = read_seqbegin(&xtime_lock); - - last_offset = vxtime.last; - base = monotonic_base; - this_offset = hpet_readl(HPET_COUNTER); - } while (read_seqretry(&xtime_lock, seq)); - offset = (this_offset - last_offset); - offset *=(NSEC_PER_SEC/HZ)/hpet_tick; - return base + offset; - } else { - do { - seq = read_seqbegin(&xtime_lock); - - last_offset = vxtime.last_tsc; - base = monotonic_base; - } while (read_seqretry(&xtime_lock, seq)); - this_offset = get_cycles_sync(); - offset = (this_offset - last_offset)*1000/cpu_khz; - return base + offset; - } -} -EXPORT_SYMBOL(monotonic_clock); - -static noinline void handle_lost_ticks(int lost, struct pt_regs *regs) -{ - static long lost_count; - static int warned; - - if (report_lost_ticks) { - printk(KERN_WARNING "time.c: Lost %d timer " - "tick(s)! ", lost); - print_symbol("rip %s)\n", regs->rip); - } - - if (lost_count == 1000 && !warned) { - printk(KERN_WARNING - "warning: many lost ticks.\n" - KERN_WARNING "Your time source seems to be instable or " - "some driver is hogging interupts\n"); - print_symbol("rip %s\n", regs->rip); - if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) { - printk(KERN_WARNING "Falling back to HPET\n"); - if (hpet_use_timer) - vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - vxtime.last = hpet_readl(HPET_COUNTER); - vxtime.mode = VXTIME_HPET; - do_gettimeoffset = do_gettimeoffset_hpet; - } - /* else should fall back to PIT, but code missing. */ - warned = 1; - } else - lost_count++; - -#ifdef CONFIG_CPU_FREQ - /* In some cases the CPU can change frequency without us noticing - (like going into thermal throttle) - Give cpufreq a change to catch up. 
*/ - if ((lost_count+1) % 25 == 0) { - cpufreq_delayed_get(); - } -#endif -} - void main_timer_handler(struct pt_regs *regs) { - static unsigned long rtc_update = 0; - unsigned long tsc; - int delay, offset = 0, lost = 0; - /* * Here we are in the timer irq handler. We have irqs locally disabled (so we * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running @@ -365,67 +194,6 @@ void main_timer_handler(struct pt_regs * write_seqlock(&xtime_lock); - if (vxtime.hpet_address) - offset = hpet_readl(HPET_COUNTER); - - if (hpet_use_timer) { - /* if we're using the hpet timer functionality, - * we can more accurately know the counter value - * when the timer interrupt occured. - */ - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - delay = hpet_readl(HPET_COUNTER) - offset; - } else { - spin_lock(&i8253_lock); - outb_p(0x00, 0x43); - delay = inb_p(0x40); - delay |= inb(0x40) << 8; - spin_unlock(&i8253_lock); - delay = LATCH - 1 - delay; - } - - tsc = get_cycles_sync(); - - if (vxtime.mode == VXTIME_HPET) { - if (offset - vxtime.last > hpet_tick) { - lost = (offset - vxtime.last) / hpet_tick - 1; - } - - monotonic_base += - (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick; - - vxtime.last = offset; -#ifdef CONFIG_X86_PM_TIMER - } else if (vxtime.mode == VXTIME_PMTMR) { - lost = pmtimer_mark_offset(); -#endif - } else { - offset = (((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ); - - if (offset < 0) - offset = 0; - - if (offset > (USEC_PER_SEC / HZ)) { - lost = offset / (USEC_PER_SEC / HZ); - offset %= (USEC_PER_SEC / HZ); - } - - monotonic_base += (tsc - vxtime.last_tsc)*1000000/cpu_khz ; - - vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot; - - if ((((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> 32) < offset) - vxtime.last_tsc = tsc - - (((long) offset << 32) / vxtime.tsc_quot) - 1; - } - - if (lost > 0) { - handle_lost_ticks(lost, regs); - jiffies += lost; - } - /* * Do the timer stuff. */ @@ -448,20 +216,6 @@ void main_timer_handler(struct pt_regs * smp_local_timer_interrupt(regs); #endif -/* - * If we have an externally synchronized Linux clock, then update CMOS clock - * accordingly every ~11 minutes. set_rtc_mmss() will be called in the jiffy - * closest to exactly 500 ms before the next second. If the update fails, we - * don't care, as it'll be updated on the next turn, and the problem (time way - * off) isn't likely to go away much sooner anyway. 
- */ - - if (ntp_synced() && xtime.tv_sec > rtc_update && - abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) { - set_rtc_mmss(xtime.tv_sec); - rtc_update = xtime.tv_sec + 660; - } - write_sequnlock(&xtime_lock); } @@ -512,7 +266,20 @@ unsigned long long sched_clock(void) return cycles_2_ns(a); } -static unsigned long get_cmos_time(void) +static int tsc_unstable; + +static inline int check_tsc_unstable(void) +{ + return tsc_unstable; +} + +void mark_tsc_unstable(void) +{ + tsc_unstable = 1; +} +EXPORT_SYMBOL_GPL(mark_tsc_unstable); + +unsigned long get_cmos_time(void) { unsigned int timeout = 1000000, year, mon, day, hour, min, sec; unsigned char uip = 0, this = 0; @@ -568,6 +335,30 @@ static unsigned long get_cmos_time(void) return mktime(year, mon, day, hour, min, sec); } +/* arch specific timeofday hooks: */ +u64 read_persistent_clock(void) +{ + return (u64)get_cmos_time() * NSEC_PER_SEC; +} + +void sync_persistent_clock(struct timespec ts) +{ + static unsigned long rtc_update = 0; + /* + * If we have an externally synchronized Linux clock, then update + * CMOS clock accordingly every ~11 minutes. set_rtc_mmss() will + * be called in the jiffy closest to exactly 500 ms before the + * next second. If the update fails, we don't care, as it'll be + * updated on the next turn, and the problem (time way off) isn't + * likely to go away much sooner anyway. + */ + if (ts.tv_sec > rtc_update && + abs(ts.tv_nsec - 500000000) <= tick_nsec / 2) { + set_rtc_mmss(xtime.tv_sec); + rtc_update = xtime.tv_sec + 660; + } +} + #ifdef CONFIG_CPU_FREQ /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency @@ -595,23 +386,6 @@ static void handle_cpufreq_delayed_get(v cpufreq_delayed_issched = 0; } -/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries - * to verify the CPU frequency the timing core thinks the CPU is running - * at is still correct. - */ -static void cpufreq_delayed_get(void) -{ - static int warned; - if (cpufreq_init && !cpufreq_delayed_issched) { - cpufreq_delayed_issched = 1; - if (!warned) { - warned = 1; - printk(KERN_DEBUG "Losing some ticks... 
checking if CPU frequency changed.\n"); - } - schedule_work(&cpufreq_delayed_get_work); - } -} - static unsigned int ref_freq = 0; static unsigned long loops_per_jiffy_ref = 0; @@ -646,8 +420,11 @@ static int time_cpufreq_notifier(struct cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { vxtime.tsc_quot = (1000L << 32) / cpu_khz; + tsc_khz = cpu_khz; + } + } set_cyc2ns_scale(cpu_khz_ref); @@ -897,7 +674,7 @@ int __init time_setup(char *str) } static struct irqaction irq0 = { - timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL + timer_interrupt, SA_INTERRUPT | SA_NODELAY, CPU_MASK_NONE, "timer", NULL, NULL }; void __init time_init(void) @@ -935,18 +712,12 @@ void __init time_init(void) if (hpet_use_timer) { cpu_khz = hpet_calibrate_tsc(); timename = "HPET"; -#ifdef CONFIG_X86_PM_TIMER - } else if (pmtmr_ioport && !vxtime.hpet_address) { - vxtime_hz = PM_TIMER_FREQUENCY; - timename = "PM"; - pit_init(); - cpu_khz = pit_calibrate_tsc(); -#endif } else { pit_init(); cpu_khz = pit_calibrate_tsc(); timename = "PIT"; } + tsc_khz = cpu_khz; vxtime.mode = VXTIME_TSC; gtod = time_init_gtod(); @@ -986,33 +757,8 @@ __cpuinit int unsynchronized_tsc(void) */ __init static char *time_init_gtod(void) { - char *timetype; - if (unsynchronized_tsc()) - notsc = 1; - if (vxtime.hpet_address && notsc) { - timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; - if (hpet_use_timer) - vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - vxtime.last = hpet_readl(HPET_COUNTER); - vxtime.mode = VXTIME_HPET; - do_gettimeoffset = do_gettimeoffset_hpet; -#ifdef CONFIG_X86_PM_TIMER - /* Using PM for gettimeofday is quite slow, but we have no other - choice because the TSC is too unreliable on some systems. */ - } else if (pmtmr_ioport && !vxtime.hpet_address && notsc) { - timetype = "PM"; - do_gettimeoffset = do_gettimeoffset_pm; - vxtime.mode = VXTIME_PMTMR; - sysctl_vsyscall = 0; - printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n"); -#endif - } else { - timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC"; - vxtime.mode = VXTIME_TSC; - } - return timetype; + mark_tsc_unstable(); } __setup("report_lost_ticks", time_setup); @@ -1039,7 +785,6 @@ static int timer_suspend(struct sys_devi static int timer_resume(struct sys_device *dev) { - unsigned long flags; unsigned long sec; unsigned long ctime = get_cmos_time(); unsigned long sleep_length = (ctime - sleep_start) * HZ; @@ -1050,21 +795,6 @@ static int timer_resume(struct sys_devic i8254_timer_resume(); sec = ctime + clock_cmos_diff; - write_seqlock_irqsave(&xtime_lock,flags); - xtime.tv_sec = sec; - xtime.tv_nsec = 0; - if (vxtime.mode == VXTIME_HPET) { - if (hpet_use_timer) - vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - vxtime.last = hpet_readl(HPET_COUNTER); -#ifdef CONFIG_X86_PM_TIMER - } else if (vxtime.mode == VXTIME_PMTMR) { - pmtimer_resume(); -#endif - } else - vxtime.last_tsc = get_cycles_sync(); - write_sequnlock_irqrestore(&xtime_lock,flags); jiffies += sleep_length; wall_jiffies += sleep_length; monotonic_base += sleep_length * (NSEC_PER_SEC/HZ); @@ -1334,3 +1064,170 @@ int __init notsc_setup(char *s) } __setup("notsc", notsc_setup); + + +/* clock source code: */ + +static unsigned long current_tsc_khz = 0; + +static int tsc_update_callback(void); + +#ifdef CONFIG_PARANOID_GENERIC_TIME +/* This will hurt performance! 
*/ +static DEFINE_RAW_SPINLOCK(checktsc_lock); +static cycle_t last_tsc; + +static cycle_t read_tsc(void) +{ + unsigned long flags; + cycle_t ret; + + spin_lock_irqsave(&checktsc_lock, flags); + + rdtscll(ret); + + if (ret < last_tsc) { + last_tsc = ret; + spin_unlock_irqrestore(&checktsc_lock, flags); + printk("read_tsc: ACK! TSC went backward! Unsynced TSCs?\n"); + } else { + last_tsc = ret; + spin_unlock_irqrestore(&checktsc_lock, flags); + } + return ret; +} + +#else /* CONFIG_PARANOID_GENERIC_TIME */ + +static cycle_t read_tsc(void) +{ + cycle_t ret; + + rdtscll(ret); + + return ret; +} + +#endif /* CONFIG_PARANOID_GENERIC_TIME */ + +static cycle_t __vsyscall_fn vread_tsc(void* unused) +{ + cycle_t ret; + + rdtscll(ret); + + return ret; +} + +static struct clocksource clocksource_tsc = { + .name = "tsc", + .rating = 300, + .read = read_tsc, + .vread = vread_tsc, + .mask = (cycle_t)-1, + .mult = 0, /* to be set */ + .shift = 22, + .update_callback = tsc_update_callback, + .is_continuous = 1, +}; + +static int tsc_update_callback(void) +{ + int change = 0; + + /* check to see if we should switch to the safe clocksource: */ + if (clocksource_tsc.rating != 50 && check_tsc_unstable()) { + clocksource_tsc.rating = 50; + reselect_clocksource(); + change = 1; + } + + /* only update if tsc_khz has changed: */ + if (current_tsc_khz != tsc_khz) { + current_tsc_khz = tsc_khz; + clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, + clocksource_tsc.shift); + change = 1; + } + return change; +} + +static int __init init_tsc_clocksource(void) +{ + if (!notsc && tsc_khz) { + current_tsc_khz = tsc_khz; + clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, + clocksource_tsc.shift); + return register_clocksource(&clocksource_tsc); + } + return 0; +} + +module_init(init_tsc_clocksource); + + +#define HPET_MASK 0xFFFFFFFF +#define HPET_SHIFT 22 + +/* FSEC = 10^-15 NSEC = 10^-9 */ +#define FSEC_PER_NSEC 1000000 + +static void *hpet_ptr; + +static cycle_t read_hpet(void) +{ + return (cycle_t)readl(hpet_ptr); +} + +static cycle_t __vsyscall_fn vread_hpet(void* ptr) +{ + return (cycle_t)readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0); +} + +struct clocksource clocksource_hpet = { + .name = "hpet", + .rating = 250, + .read = read_hpet, + .vread = vread_hpet, + .mask = (cycle_t)HPET_MASK, + .mult = 0, /* set below */ + .shift = HPET_SHIFT, + .is_continuous = 1, +}; + +static int __init init_hpet_clocksource(void) +{ + unsigned long hpet_period; + void __iomem *hpet_base; + u64 tmp; + + if (!hpet_address) + return -ENODEV; + + /* calculate the hpet address: */ + hpet_base = + (void __iomem*)ioremap_nocache(hpet_address, HPET_MMAP_SIZE); + hpet_ptr = hpet_base + HPET_COUNTER; + + /* calculate the frequency: */ + hpet_period = readl(hpet_base + HPET_PERIOD); + + /* + * hpet period is in femtoseconds per cycle + * so we need to convert this to ns/cyc units + * approximated by mult/2^shift + * + * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift + * fsec/cyc * 1ns/1000000fsec * 2^shift = mult + * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult + * (fsec/cyc << shift)/1000000 = mult + * (hpet_period << shift)/FSEC_PER_NSEC = mult + */ + tmp = (u64)hpet_period << HPET_SHIFT; + do_div(tmp, FSEC_PER_NSEC); + clocksource_hpet.mult = (u32)tmp; + + return register_clocksource(&clocksource_hpet); +} + +module_init(init_hpet_clocksource);
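
Editorial sketch (not patch content): a standalone check of the mult/shift derivation in the comment above. The 14.31818 MHz frequency and the ~69841279 fs period are the canonical HPET example values, not numbers read from hardware by this patch:

#include <stdio.h>
#include <stdint.h>

#define HPET_SHIFT	22
#define FSEC_PER_NSEC	1000000

int main(void)
{
	uint64_t hpet_period = 69841279;	/* fs per cycle at ~14.31818 MHz */
	uint64_t mult = (hpet_period << HPET_SHIFT) / FSEC_PER_NSEC;
	uint64_t cycles = 14318180;		/* roughly one second of cycles */

	/* the conversion the timekeeping core applies: ns = (cyc * mult) >> shift */
	printf("mult=%llu, one second ~= %llu ns\n",
	       (unsigned long long)mult,
	       (unsigned long long)((cycles * mult) >> HPET_SHIFT));
	return 0;
}

Index: linux-2.6.16/arch/x86_64/kernel/traps.c =================================================================== --- linux-2.6.16.orig/arch/x86_64/kernel/traps.c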
2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/x86_64/kernel/traps.c 2006-10-19 16:52:30.000000000 +0200 @@ -195,7 +195,7 @@ static unsigned long *in_exception_stack * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -void show_trace(unsigned long *stack) +void show_trace(struct task_struct *task, unsigned long *stack) { const unsigned cpu = safe_smp_processor_id(); unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; @@ -208,12 +208,7 @@ void show_trace(unsigned long *stack) do while (cond) { \ unsigned long addr = *stack++; \ if (kernel_text_address(addr)) { \ - if (i > 50) { \ - printk("\n "); \ - i = 0; \ - } \ - else \ - i += printk(" "); \ + printk("\n "); \ /* \ * If the address is either in the text segment of the \ * kernel, or in the region which contains vmalloc'ed \ @@ -222,7 +217,7 @@ void show_trace(unsigned long *stack) * down the cause of the crash will be able to figure \ * out the call path that was taken. \ */ \ - i += printk_address(addr); \ + i = 7 + printk_address(addr); \ } \ } while (0) @@ -259,6 +254,7 @@ void show_trace(unsigned long *stack) HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0); #undef HANDLE_STACK printk("\n"); + print_traces(task); } void show_stack(struct task_struct *tsk, unsigned long * rsp) @@ -295,7 +291,7 @@ void show_stack(struct task_struct *tsk, printk("%016lx ", *stack++); touch_nmi_watchdog(); } - show_trace((unsigned long *)rsp); + show_trace(tsk, (unsigned long *)rsp); } /* @@ -304,7 +300,7 @@ void show_stack(struct task_struct *tsk, void dump_stack(void) { unsigned long dummy; - show_trace(&dummy); + show_trace(current, &dummy); } EXPORT_SYMBOL(dump_stack); @@ -383,7 +379,7 @@ void out_of_line_bug(void) } #endif -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); static int die_owner = -1; unsigned __kprobes long oops_begin(void) Index: linux-2.6.16/arch/x86_64/kernel/vmlinux.lds.S =================================================================== --- linux-2.6.16.orig/arch/x86_64/kernel/vmlinux.lds.S 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/x86_64/kernel/vmlinux.lds.S 2006-10-19 16:52:30.000000000 +0200 @@ -101,6 +101,15 @@ SECTIONS .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } jiffies = VVIRT(.jiffies); + .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } + .vsyscall_data : AT(VLOAD(.vsyscall_data)) { *(.vsyscall_data) } + + .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { *(.vsyscall_gtod_data) } + vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + + .vsyscall_gtod_lock : AT(VLOAD(.vsyscall_gtod_lock)) { *(.vsyscall_gtod_lock) } + vsyscall_gtod_lock = VVIRT(.vsyscall_gtod_lock); + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) } .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) } .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) } Index: linux-2.6.16/arch/x86_64/kernel/vsyscall.c =================================================================== --- linux-2.6.16.orig/arch/x86_64/kernel/vsyscall.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/x86_64/kernel/vsyscall.c 2006-10-19 16:52:30.000000000 +0200 @@ -19,6 +19,8 @@ * want per guest time just set the kernel.vsyscall64 sysctl to 0.
*/ +#include +#include #include #include #include @@ -27,19 +29,32 @@ #include #include + #include #include +#include #include #include #include #include -#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) +#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) notrace int __sysctl_vsyscall __section_sysctl_vsyscall = 1; -seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; +raw_seqlock_t __xtime_lock __section_xtime_lock = RAW_SEQLOCK_UNLOCKED; -#include +struct vsyscall_gtod_data_t { + struct timeval wall_time_tv; + struct timezone sys_tz; + cycle_t offset_base; + struct clocksource clock; +}; + +extern struct vsyscall_gtod_data_t vsyscall_gtod_data; +struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data; + +extern raw_seqlock_t vsyscall_gtod_lock; +raw_seqlock_t __vsyscall_gtod_lock __section_vsyscall_gtod_lock = RAW_SEQLOCK_UNLOCKED; static __always_inline void timeval_normalize(struct timeval * tv) { @@ -52,39 +67,68 @@ static __always_inline void timeval_norm } } +/* + * XXX - this is ugly. gettimeofday() has a label in it so we can't + * call it twice. + */ +static __always_inline int syscall_gtod(struct timeval *tv, struct timezone *tz) +{ + int ret; + + asm volatile("syscall" + : "=a" (ret) + : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) + : __syscall_clobber); + + return ret; +} + static __always_inline void do_vgettimeofday(struct timeval * tv) { - long sequence, t; - unsigned long sec, usec; + cycle_t now, base, mask, cycle_delta; + unsigned long mult, shift, seq; + s64 nsec_delta; do { - sequence = read_seqbegin(&__xtime_lock); - - sec = __xtime.tv_sec; - usec = (__xtime.tv_nsec / 1000) + - (__jiffies - __wall_jiffies) * (1000000 / HZ); - - if (__vxtime.mode != VXTIME_HPET) { - t = get_cycles_sync(); - if (t < __vxtime.last_tsc) - t = __vxtime.last_tsc; - usec += ((t - __vxtime.last_tsc) * - __vxtime.tsc_quot) >> 32; - /* See comment in x86_64 do_gettimeofday. */ - } else { - usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) - - __vxtime.last) * __vxtime.quot) >> 32; + seq = read_seqbegin(&__vsyscall_gtod_lock); + + if (!__vsyscall_gtod_data.clock.vread) { + syscall_gtod(tv, NULL); + return; } - } while (read_seqretry(&__xtime_lock, sequence)); - tv->tv_sec = sec + usec / 1000000; - tv->tv_usec = usec % 1000000; + /* read the clocksource and store state values */ + now = __vsyscall_gtod_data.clock.vread( + __vsyscall_gtod_data.clock.vdata); + + base = __vsyscall_gtod_data.offset_base; + mask = __vsyscall_gtod_data.clock.mask; + + mult = __vsyscall_gtod_data.clock.mult; + shift = __vsyscall_gtod_data.clock.shift; + + *tv = __vsyscall_gtod_data.wall_time_tv; + } while (read_seqretry(&__vsyscall_gtod_lock, seq)); + + /* calculate interval: */ + cycle_delta = (now - base) & mask; + /* convert to nsecs: */ + nsec_delta = (cycle_delta * mult) >> shift; + + /* convert to usecs and add to timeval: */ + do_div(nsec_delta, NSEC_PER_USEC); + tv->tv_usec += (unsigned long) nsec_delta; + + while (tv->tv_usec >= USEC_PER_SEC) { + tv->tv_sec += 1; + tv->tv_usec -= USEC_PER_SEC; + } } /* RED-PEN may want to readd seq locking, but then the variable should be write-once.
*/ static __always_inline void do_get_tz(struct timezone * tz) { - *tz = __sys_tz; + *tz = __vsyscall_gtod_data.sys_tz; } static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) @@ -120,11 +164,16 @@ int __vsyscall(0) vgettimeofday(struct t * unlikely */ time_t __vsyscall(1) vtime(time_t *t) { + struct timeval tv; + if (unlikely(!__sysctl_vsyscall)) return time_syscall(t); - else if (t) - *t = __xtime.tv_sec; - return __xtime.tv_sec; + + vgettimeofday(&tv, 0); + if (t) + *t = tv.tv_sec; + + return tv.tv_sec; } long __vsyscall(2) venosys_0(void) @@ -137,6 +186,38 @@ long __vsyscall(3) venosys_1(void) return -ENOSYS; } +struct clocksource *curr_clock; + +void arch_update_vsyscall_gtod(struct timespec wall_time, cycle_t offset_base, + struct clocksource *clock, int ntp_adj) +{ + unsigned long flags; + + write_seqlock_irqsave(&vsyscall_gtod_lock, flags); + + /* XXX - hackitty hack hack. this is terrible! */ + if (curr_clock != clock) + curr_clock = clock; + + /* save off wall time as timeval: */ + vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time.tv_sec; + vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time.tv_nsec/1000; + + /* save offset_base: */ + vsyscall_gtod_data.offset_base = offset_base; + + /* copy current clocksource: */ + vsyscall_gtod_data.clock = *clock; + + /* apply ntp adjustment to clocksource mult: */ + vsyscall_gtod_data.clock.mult += ntp_adj; + + /* save off current timezone: */ + vsyscall_gtod_data.sys_tz = sys_tz; + + write_sequnlock_irqrestore(&vsyscall_gtod_lock, flags); +} + #ifdef CONFIG_SYSCTL #define SYSCALL 0x050f @@ -215,6 +296,7 @@ static int __init vsyscall_init(void) BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); map_vsyscall(); + sysctl_vsyscall = 1; #ifdef CONFIG_SYSCTL register_sysctl_table(kernel_root_table2, 0); #endif Index: linux-2.6.16/arch/x86_64/kernel/x8664_ksyms.c =================================================================== --- linux-2.6.16.orig/arch/x86_64/kernel/x8664_ksyms.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/arch/x86_64/kernel/x8664_ksyms.c 2006-10-19 16:52:30.000000000 +0200 @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -32,8 +33,6 @@ #include #include -extern spinlock_t rtc_lock; - #ifdef CONFIG_SMP extern void __write_lock_failed(rwlock_t *rw); extern void __read_lock_failed(rwlock_t *rw); @@ -49,10 +48,12 @@ EXPORT_SYMBOL(kernel_thread); EXPORT_SYMBOL(pm_idle); EXPORT_SYMBOL(pm_power_off); -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK +EXPORT_SYMBOL(__compat_down_failed); +EXPORT_SYMBOL(__compat_down_failed_interruptible); +EXPORT_SYMBOL(__compat_down_failed_trylock); +EXPORT_SYMBOL(__compat_up_wakeup); +#endif /* Networking helper routines. 
 
 /* Networking helper routines. */
 EXPORT_SYMBOL(csum_partial_copy_nocheck);
 EXPORT_SYMBOL(ip_compute_csum);
Index: linux-2.6.16/arch/x86_64/lib/thunk.S
===================================================================
--- linux-2.6.16.orig/arch/x86_64/lib/thunk.S	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/arch/x86_64/lib/thunk.S	2006-10-19 16:52:30.000000000 +0200
@@ -43,11 +43,13 @@
 	thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
 #endif
 	thunk do_softirq_thunk,do_softirq
-
-	thunk __down_failed,__down
-	thunk_retrax __down_failed_interruptible,__down_interruptible
-	thunk_retrax __down_failed_trylock,__down_trylock
-	thunk __up_wakeup,__up
+
+#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
+	thunk __compat_down_failed,__compat_down
+	thunk_retrax __compat_down_failed_interruptible,__compat_down_interruptible
+	thunk_retrax __compat_down_failed_trylock,__compat_down_trylock
+	thunk __compat_up_wakeup,__compat_up
+#endif
 
 	/* SAVE_ARGS below is used only for the .cfi directives it contains. */
 	CFI_STARTPROC
Index: linux-2.6.16/arch/x86_64/mm/fault.c
===================================================================
--- linux-2.6.16.orig/arch/x86_64/mm/fault.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/arch/x86_64/mm/fault.c	2006-10-19 16:52:30.000000000 +0200
@@ -45,6 +45,7 @@ void bust_spinlocks(int yes)
 {
 	int loglevel_save = console_loglevel;
 	if (yes) {
+		stop_trace();
 		oops_in_progress = 1;
 	} else {
 #ifdef CONFIG_VT
Index: linux-2.6.16/arch/x86_64/mm/init.c
===================================================================
--- linux-2.6.16.orig/arch/x86_64/mm/init.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/arch/x86_64/mm/init.c	2006-10-19 16:52:30.000000000 +0200
@@ -53,7 +53,7 @@ EXPORT_SYMBOL(dma_ops);
 
 static unsigned long dma_reserve __initdata;
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers);
 
 /*
  * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
Index: linux-2.6.16/block/cfq-iosched.c
===================================================================
--- linux-2.6.16.orig/block/cfq-iosched.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/block/cfq-iosched.c	2006-10-19 16:52:30.000000000 +0200
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include
 
 /*
  * tunables
@@ -1215,7 +1216,7 @@ static void cfq_exit_single_io_context(s
 	struct cfq_data *cfqd = cic->cfqq->cfqd;
 	request_queue_t *q = cfqd->queue;
 
-	WARN_ON(!irqs_disabled());
+	WARN_ON_NONRT(!irqs_disabled());
 
 	spin_lock(q->queue_lock);
 
@@ -1237,7 +1238,9 @@ static void cfq_exit_io_context(struct c
 	struct list_head *entry;
 	unsigned long flags;
 
-	local_irq_save(flags);
+	// FIXME: I don't think this code is safe, upstream!
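	/*
	 * A sketch (an assumption, not part of this patch) of what the
	 * _nort and _NONRT helpers used throughout these hunks are
	 * expected to expand to: on a stock kernel they are the plain
	 * primitives, while on PREEMPT_RT they compile away, because
	 * these paths then run in preemptible process context:
	 *
	 *	#ifdef CONFIG_PREEMPT_RT
	 *	# define local_irq_save_nort(f)	local_save_flags(f)
	 *	# define local_irq_restore_nort(f)	do { (void)(f); } while (0)
	 *	# define WARN_ON_NONRT(c)		do { } while (0)
	 *	#else
	 *	# define local_irq_save_nort(f)	local_irq_save(f)
	 *	# define local_irq_restore_nort(f)	local_irq_restore(f)
	 *	# define WARN_ON_NONRT(c)		WARN_ON(c)
	 *	#endif
	 */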
+ + local_irq_save_nort(flags); /* * put the reference this task is holding to the various queues @@ -1248,7 +1251,7 @@ static void cfq_exit_io_context(struct c } cfq_exit_single_io_context(cic); - local_irq_restore(flags); + local_irq_restore_nort(flags); } static struct cfq_io_context * Index: linux-2.6.16/block/ll_rw_blk.c =================================================================== --- linux-2.6.16.orig/block/ll_rw_blk.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/block/ll_rw_blk.c 2006-10-19 16:52:30.000000000 +0200 @@ -1547,7 +1547,7 @@ static int ll_merge_requests_fn(request_ */ void blk_plug_device(request_queue_t *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); /* * don't plug a stopped queue, it must be paired with blk_start_queue() @@ -1568,7 +1568,7 @@ EXPORT_SYMBOL(blk_plug_device); */ int blk_remove_plug(request_queue_t *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) return 0; @@ -3493,13 +3493,15 @@ void exit_io_context(void) unsigned long flags; struct io_context *ioc; - local_irq_save(flags); + // FIXME: unsafe upstream too? + + local_irq_save_nort(flags); task_lock(current); ioc = current->io_context; current->io_context = NULL; ioc->task = NULL; task_unlock(current); - local_irq_restore(flags); + local_irq_restore_nort(flags); if (ioc->aic && ioc->aic->exit) ioc->aic->exit(ioc->aic); Index: linux-2.6.16/drivers/Makefile =================================================================== --- linux-2.6.16.orig/drivers/Makefile 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/Makefile 2006-10-19 16:52:30.000000000 +0200 @@ -73,3 +73,4 @@ obj-$(CONFIG_SGI_SN) += sn/ obj-y += firmware/ obj-$(CONFIG_CRYPTO) += crypto/ obj-$(CONFIG_SUPERH) += sh/ +obj-$(CONFIG_GENERIC_TIME) += clocksource/ Index: linux-2.6.16/drivers/acpi/osl.c =================================================================== --- linux-2.6.16.orig/drivers/acpi/osl.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/acpi/osl.c 2006-10-19 16:52:30.000000000 +0200 @@ -730,14 +730,14 @@ void acpi_os_delete_lock(acpi_handle han acpi_status acpi_os_create_semaphore(u32 max_units, u32 initial_units, acpi_handle * handle) { - struct semaphore *sem = NULL; + struct compat_semaphore *sem = NULL; ACPI_FUNCTION_TRACE("os_create_semaphore"); - sem = acpi_os_allocate(sizeof(struct semaphore)); + sem = acpi_os_allocate(sizeof(struct compat_semaphore)); if (!sem) return_ACPI_STATUS(AE_NO_MEMORY); - memset(sem, 0, sizeof(struct semaphore)); + memset(sem, 0, sizeof(struct compat_semaphore)); sema_init(sem, initial_units); @@ -760,7 +760,7 @@ EXPORT_SYMBOL(acpi_os_create_semaphore); acpi_status acpi_os_delete_semaphore(acpi_handle handle) { - struct semaphore *sem = (struct semaphore *)handle; + struct compat_semaphore *sem = (struct compat_semaphore *)handle; ACPI_FUNCTION_TRACE("os_delete_semaphore"); @@ -789,7 +789,7 @@ EXPORT_SYMBOL(acpi_os_delete_semaphore); acpi_status acpi_os_wait_semaphore(acpi_handle handle, u32 units, u16 timeout) { acpi_status status = AE_OK; - struct semaphore *sem = (struct semaphore *)handle; + struct compat_semaphore *sem = (struct compat_semaphore *)handle; int ret = 0; ACPI_FUNCTION_TRACE("os_wait_semaphore"); @@ -870,7 +870,7 @@ EXPORT_SYMBOL(acpi_os_wait_semaphore); */ acpi_status acpi_os_signal_semaphore(acpi_handle handle, u32 units) { - struct semaphore *sem = (struct semaphore *)handle; + struct compat_semaphore *sem = (struct compat_semaphore 
*)handle; ACPI_FUNCTION_TRACE("os_signal_semaphore"); Index: linux-2.6.16/drivers/acpi/processor_idle.c =================================================================== --- linux-2.6.16.orig/drivers/acpi/processor_idle.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/acpi/processor_idle.c 2006-10-19 16:52:30.000000000 +0200 @@ -369,6 +369,11 @@ static void acpi_processor_idle(void) t2 = inl(acpi_fadt.xpm_tmr_blk.address); /* Get end time (ticks) */ t2 = inl(acpi_fadt.xpm_tmr_blk.address); + +#ifdef CONFIG_GENERIC_TIME + /* TSC halts in C2, so notify users */ + mark_tsc_unstable(); +#endif /* Re-enable interrupts */ local_irq_enable(); set_thread_flag(TIF_POLLING_NRFLAG); @@ -409,6 +414,10 @@ static void acpi_processor_idle(void) ACPI_MTX_DO_NOT_LOCK); } +#ifdef CONFIG_GENERIC_TIME + /* TSC halts in C3, so notify users */ + mark_tsc_unstable(); +#endif /* Re-enable interrupts */ local_irq_enable(); set_thread_flag(TIF_POLLING_NRFLAG); Index: linux-2.6.16/drivers/base/class.c =================================================================== --- linux-2.6.16.orig/drivers/base/class.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/base/class.c 2006-10-19 16:52:30.000000000 +0200 @@ -555,8 +555,10 @@ int class_device_add(struct class_device class_name = make_class_name(class_dev); sysfs_create_link(&class_dev->kobj, &class_dev->dev->kobj, "device"); + /* sysfs_create_link(&class_dev->dev->kobj, &class_dev->kobj, class_name); + */ } kobject_uevent(&class_dev->kobj, KOBJ_ADD); @@ -667,7 +669,9 @@ void class_device_del(struct class_devic if (class_dev->dev) { class_name = make_class_name(class_dev); sysfs_remove_link(&class_dev->kobj, "device"); + /* sysfs_remove_link(&class_dev->dev->kobj, class_name); + */ } class_device_remove_file(class_dev, &class_dev->uevent_attr); if (class_dev->devt_attr) Index: linux-2.6.16/drivers/block/paride/pseudo.h =================================================================== --- linux-2.6.16.orig/drivers/block/paride/pseudo.h 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/block/paride/pseudo.h 2006-10-19 16:52:30.000000000 +0200 @@ -43,7 +43,7 @@ static unsigned long ps_timeout; static int ps_tq_active = 0; static int ps_nice = 0; -static DEFINE_SPINLOCK(ps_spinlock __attribute__((unused))); +static __attribute__((unused)) DEFINE_SPINLOCK(ps_spinlock); static DECLARE_WORK(ps_tq, ps_tq_int, NULL); Index: linux-2.6.16/drivers/char/Kconfig =================================================================== --- linux-2.6.16.orig/drivers/char/Kconfig 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/char/Kconfig 2006-10-19 16:52:30.000000000 +0200 @@ -719,6 +719,46 @@ config RTC To compile this driver as a module, choose M here: the module will be called rtc. +config RTC_HISTOGRAM + bool "Real Time Clock Histogram Support" + default n + depends on RTC + ---help--- + If you say Y here then the kernel will track the delivery and + wakeup latency of /dev/rtc using tasks and will report a + histogram to the kernel log when the application closes /dev/rtc. + +config BLOCKER + tristate "Priority Inheritance Debugging (Blocker) Device Support" + depends on X86 + default y + ---help--- + If you say Y here then a device will be created that the userspace + pi_test suite uses to test and measure kernel locking primitives. 
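A userspace client needs nothing beyond open() and two ioctls to drive this
device. The following is an illustration only, not part of the patch: the
command values mirror the BLOCK_IOCTL and BLOCK_SET_DEPTH constants defined in
the blocker.c added below, and a /dev/blocker node for the misc minor the
driver registers is assumed to exist.

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	#define BLOCK_IOCTL	4245	/* take the locks, spin, release */
	#define BLOCK_SET_DEPTH	4246	/* set the nesting depth (< 10) */

	int main(void)
	{
		int fd = open("/dev/blocker", O_RDONLY);

		if (fd < 0) {
			perror("open /dev/blocker");
			return 1;
		}
		/* nest up to three spinlocks, then busy-loop 100000 iterations */
		if (ioctl(fd, BLOCK_SET_DEPTH, 3) < 0)
			perror("BLOCK_SET_DEPTH");
		if (ioctl(fd, BLOCK_IOCTL, 100000) < 0)
			perror("BLOCK_IOCTL");

		close(fd);
		return 0;
	}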
+
+config LPPTEST
+	tristate "Parallel Port Based Latency Measurement Device"
+	depends on !PARPORT && X86
+	default y
+	---help---
+	  If you say Y here then a device will be created that the userspace
+	  testlpp utility uses to measure IRQ latencies of a target system
+	  from an independent measurement system.
+
+	  NOTE: this code assumes x86 PCs and that the parallel port is
+	  bidirectional and is on IRQ 7.
+
+	  To use the device, both the target and the source system need to
+	  run a kernel with CONFIG_LPPTEST enabled. To measure latencies,
+	  use the scripts/testlpp utility in your kernel source directory,
+	  and run it (as root) on the source system - it will start printing
+	  out the latencies it took to get a response from the target system:
+
+	  Latency of response: 12.2 usecs (121265 cycles)
+
+	  Then generate various workloads on the target system to see how
+	  (worst-case) latencies are impacted.
+
 config SGI_DS1286
 	tristate "SGI DS1286 RTC support"
 	depends on SGI_IP22
Index: linux-2.6.16/drivers/char/Makefile
===================================================================
--- linux-2.6.16.orig/drivers/char/Makefile	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/char/Makefile	2006-10-19 16:52:30.000000000 +0200
@@ -58,6 +58,8 @@ obj-$(CONFIG_R3964)	+= n_r3964.o
 obj-$(CONFIG_APPLICOM)	+= applicom.o
 obj-$(CONFIG_SONYPI)	+= sonypi.o
 obj-$(CONFIG_RTC)	+= rtc.o
+obj-$(CONFIG_BLOCKER)	+= blocker.o
+obj-$(CONFIG_LPPTEST)	+= lpptest.o
 obj-$(CONFIG_HPET)	+= hpet.o
 obj-$(CONFIG_GEN_RTC)	+= genrtc.o
 obj-$(CONFIG_EFI_RTC)	+= efirtc.o
Index: linux-2.6.16/drivers/char/blocker.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.16/drivers/char/blocker.c	2006-10-19 16:52:30.000000000 +0200
@@ -0,0 +1,107 @@
+/*
+ * priority inheritance testing device
+ */
+
+#include
+#include
+
+#define BLOCKER_MINOR		221
+
+#define BLOCK_IOCTL		4245
+#define BLOCK_SET_DEPTH		4246
+
+#define MAX_LOCK_DEPTH		10
+
+void loop(int loops)
+{
+	int i;
+
+	for (i = 0; i < loops; i++)
+		get_cycles();
+}
+
+static spinlock_t blocker_lock[MAX_LOCK_DEPTH];
+
+static unsigned int lock_depth = 1;
+
+void do_the_lock_and_loop(unsigned int args)
+{
+	int i, max;
+
+	if (rt_task(current))
+		max = lock_depth;
+	else if (lock_depth > 1)
+		max = (current->pid % lock_depth) + 1;
+	else
+		max = 1;
+
+	/* Always lock from the top down */
+	for (i = max-1; i >= 0; i--)
+		spin_lock(&blocker_lock[i]);
+	loop(args);
+	for (i = 0; i < max; i++)
+		spin_unlock(&blocker_lock[i]);
+}
+
+static int blocker_open(struct inode *in, struct file *file)
+{
+	printk(KERN_INFO "blocker_open called\n");
+
+	return 0;
+}
+
+static long blocker_ioctl(struct file *file,
+			  unsigned int cmd, unsigned long args)
+{
+	switch(cmd) {
+	case BLOCK_IOCTL:
+		do_the_lock_and_loop(args);
+		return 0;
+	case BLOCK_SET_DEPTH:
+		if (args >= MAX_LOCK_DEPTH)
+			return -EINVAL;
+		lock_depth = args;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static struct file_operations blocker_fops = {
+	.owner		= THIS_MODULE,
+	.llseek		= no_llseek,
+	.unlocked_ioctl	= blocker_ioctl,
+	.open		= blocker_open,
+};
+
+static struct miscdevice blocker_dev =
+{
+	BLOCKER_MINOR,
+	"blocker",
+	&blocker_fops
+};
+
+static int __init blocker_init(void)
+{
+	int i;
+
+	if (misc_register(&blocker_dev))
+		return -ENODEV;
+
+	for (i = 0; i < MAX_LOCK_DEPTH; i++)
+		spin_lock_init(blocker_lock + i);
+
+	return 0;
+}
+
+void __exit blocker_exit(void)
+{
+	printk(KERN_INFO "blocker device uninstalled\n");
misc_deregister(&blocker_dev); +} + +module_init(blocker_init); +module_exit(blocker_exit); + +MODULE_LICENSE("GPL"); + Index: linux-2.6.16/drivers/char/epca.c =================================================================== --- linux-2.6.16.orig/drivers/char/epca.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/char/epca.c 2006-10-19 16:52:30.000000000 +0200 @@ -80,7 +80,7 @@ static int invalid_lilo_config; /* The ISA boards do window flipping into the same spaces so its only sane with a single lock. It's still pretty efficient */ -static spinlock_t epca_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(epca_lock); /* ----------------------------------------------------------------------- MAXBOARDS is typically 12, but ISA and EISA cards are restricted to Index: linux-2.6.16/drivers/char/hangcheck-timer.c =================================================================== --- linux-2.6.16.orig/drivers/char/hangcheck-timer.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/char/hangcheck-timer.c 2006-10-19 16:52:30.000000000 +0200 @@ -49,6 +49,7 @@ #include #include #include +#include #define VERSION_STR "0.9.0" @@ -127,8 +128,12 @@ __setup("hcheck_dump_tasks", hangcheck_p #endif #ifdef HAVE_MONOTONIC +#ifndef CONFIG_GENERIC_TIME extern unsigned long long monotonic_clock(void); #else +#define monotonic_clock() ktime_to_ns(get_monotonic_clock()) +#endif +#else static inline unsigned long long monotonic_clock(void) { return get_cycles(); Index: linux-2.6.16/drivers/char/ipmi/ipmi_si_intf.c =================================================================== --- linux-2.6.16.orig/drivers/char/ipmi/ipmi_si_intf.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/char/ipmi/ipmi_si_intf.c 2006-10-19 16:52:30.000000000 +0200 @@ -54,7 +54,7 @@ #include #include #include -#ifdef CONFIG_HIGH_RES_TIMERS +#ifdef CONFIG_HIGH_RES_TIMERS_OLD #include # if defined(schedule_next_int) /* Old high-res timer code, do translations. */ @@ -824,7 +824,7 @@ static int initialized = 0; /* Must be called with interrupts off and with the si_lock held. */ static void si_restart_short_timer(struct smi_info *smi_info) { -#if defined(CONFIG_HIGH_RES_TIMERS) +#if defined(CONFIG_HIGH_RES_TIMERS_OLD) unsigned long flags; unsigned long jiffies_now; unsigned long seq; @@ -892,13 +892,13 @@ static void smi_timeout(unsigned long da /* If the state machine asks for a short delay, then shorten the timer timeout. 
*/ if (smi_result == SI_SM_CALL_WITH_DELAY) { -#if defined(CONFIG_HIGH_RES_TIMERS) +#if defined(CONFIG_HIGH_RES_TIMERS_OLD) unsigned long seq; #endif spin_lock_irqsave(&smi_info->count_lock, flags); smi_info->short_timeouts++; spin_unlock_irqrestore(&smi_info->count_lock, flags); -#if defined(CONFIG_HIGH_RES_TIMERS) +#if defined(CONFIG_HIGH_RES_TIMERS_OLD) do { seq = read_seqbegin_irqsave(&xtime_lock, flags); smi_info->si_timer.expires = jiffies; @@ -914,7 +914,7 @@ static void smi_timeout(unsigned long da smi_info->long_timeouts++; spin_unlock_irqrestore(&smi_info->count_lock, flags); smi_info->si_timer.expires = jiffies + SI_TIMEOUT_JIFFIES; -#if defined(CONFIG_HIGH_RES_TIMERS) +#if defined(CONFIG_HIGH_RES_TIMERS_OLD) smi_info->si_timer.arch_cycle_expires = 0; #endif } Index: linux-2.6.16/drivers/char/ipmi/ipmi_watchdog.c =================================================================== --- linux-2.6.16.orig/drivers/char/ipmi/ipmi_watchdog.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/char/ipmi/ipmi_watchdog.c 2006-10-19 16:52:30.000000000 +0200 @@ -459,7 +459,8 @@ static void panic_halt_ipmi_set_timeout( when both messages are free. */ static atomic_t heartbeat_tofree = ATOMIC_INIT(0); static DECLARE_MUTEX(heartbeat_lock); -static DECLARE_MUTEX_LOCKED(heartbeat_wait_lock); +/* PREEMPT_RT: should be a completion instead */ +static COMPAT_DECLARE_MUTEX_LOCKED(heartbeat_wait_lock); static void heartbeat_free_smi(struct ipmi_smi_msg *msg) { if (atomic_dec_and_test(&heartbeat_tofree)) Index: linux-2.6.16/drivers/char/lpptest.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.16/drivers/char/lpptest.c 2006-10-19 16:52:30.000000000 +0200 @@ -0,0 +1,179 @@ +/* + * /dev/lpptest device: test IRQ handling latencies over parallel port + * + * Copyright (C) 2005 Thomas Gleixner, Ingo Molnar + * + * licensed under the GPL + * + * You need to have CONFIG_PARPORT disabled for this device, it is a + * completely self-contained device that assumes sole ownership of the + * parallel port. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * API wrappers so that the code can be shared with the -rt tree: + */ +#ifndef local_irq_disable +# define local_irq_disable local_irq_disable +# define local_irq_enable local_irq_enable +#endif + +#ifndef IRQ_NODELAY +# define IRQ_NODELAY 0 +# define SA_NODELAY 0 +#endif + +/* + * Driver: + */ +#define LPPTEST_CHAR_MAJOR 245 +#define LPPTEST_DEVICE_NAME "lpptest" + +#define LPPTEST_IRQ 7 + +#define LPPTEST_TEST _IOR (LPPTEST_CHAR_MAJOR, 1, unsigned long long) +#define LPPTEST_DISABLE _IOR (LPPTEST_CHAR_MAJOR, 2, unsigned long long) +#define LPPTEST_ENABLE _IOR (LPPTEST_CHAR_MAJOR, 3, unsigned long long) + +static char dev_id[] = "lpptest"; + +#define INIT_PORT() outb(0x04, 0x37a) +#define ENABLE_IRQ() outb(0x10, 0x37a) +#define DISABLE_IRQ() outb(0, 0x37a) + +static unsigned char out = 0x5a; + +/** + * Interrupt handler. Flip a bit in the reply. 
+ */ +static int lpptest_irq (int irq, void *dev_id, struct pt_regs *regs) +{ + out ^= 0xff; + outb(out, 0x378); + + return IRQ_HANDLED; +} + +static cycles_t test_response(void) +{ + cycles_t now, end; + unsigned char in; + int timeout = 0; + + local_irq_disable(); + in = inb(0x379); + inb(0x378); + outb(0x08, 0x378); + now = get_cycles(); + while(1) { + if (inb(0x379) != in) + break; + if (timeout++ > 1000000) { + outb(0x00, 0x378); + local_irq_enable(); + + return 0; + } + } + end = get_cycles(); + outb(0x00, 0x378); + local_irq_enable(); + + return end - now; +} + +static int lpptest_open(struct inode *inode, struct file *file) +{ + return 0; +} + +static int lpptest_close(struct inode *inode, struct file *file) +{ + return 0; +} + +int lpptest_ioctl(struct inode *inode, struct file *file, unsigned int ioctl_num, unsigned long ioctl_param) +{ + int retval = 0; + + switch (ioctl_num) { + + case LPPTEST_DISABLE: + DISABLE_IRQ(); + break; + + case LPPTEST_ENABLE: + ENABLE_IRQ(); + break; + + case LPPTEST_TEST: { + + cycles_t diff = test_response(); + if (copy_to_user((void *)ioctl_param, (void*) &diff, sizeof(diff))) + goto errcpy; + break; + } + default: retval = -EINVAL; + } + + return retval; + + errcpy: + return -EFAULT; +} + +static struct file_operations lpptest_dev_fops = { + .ioctl = lpptest_ioctl, + .open = lpptest_open, + .release = lpptest_close, +}; + +static int __init lpptest_init (void) +{ + if (register_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME, &lpptest_dev_fops)) + { + printk(KERN_NOTICE "Can't allocate major number %d for lpptest.\n", + LPPTEST_CHAR_MAJOR); + return -EAGAIN; + } + + if (request_irq (LPPTEST_IRQ, lpptest_irq, 0, "lpptest", dev_id)) { + printk (KERN_WARNING "lpptest: irq %d in use. Unload parport module!\n", LPPTEST_IRQ); + unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME); + return -EAGAIN; + } + irq_desc[LPPTEST_IRQ].status |= IRQ_NODELAY; + irq_desc[LPPTEST_IRQ].action->flags |= SA_NODELAY | SA_INTERRUPT; + + INIT_PORT(); + ENABLE_IRQ(); + + return 0; +} +module_init (lpptest_init); + +static void __exit lpptest_exit (void) +{ + DISABLE_IRQ(); + + free_irq(LPPTEST_IRQ, dev_id); + unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME); +} +module_exit (lpptest_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("lpp test module"); + Index: linux-2.6.16/drivers/char/random.c =================================================================== --- linux-2.6.16.orig/drivers/char/random.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/char/random.c 2006-10-19 16:52:30.000000000 +0200 @@ -581,8 +581,11 @@ static void add_timer_randomness(struct preempt_disable(); /* if over the trickle threshold, use only 1 in 4096 samples */ if (input_pool.entropy_count > trickle_thresh && - (__get_cpu_var(trickle_count)++ & 0xfff)) - goto out; + (__get_cpu_var(trickle_count)++ & 0xfff)) { + preempt_enable(); + return; + } + preempt_enable(); sample.jiffies = jiffies; sample.cycles = get_cycles(); @@ -627,9 +630,6 @@ static void add_timer_randomness(struct if(input_pool.entropy_count >= random_read_wakeup_thresh) wake_up_interruptible(&random_read_wait); - -out: - preempt_enable(); } void add_input_randomness(unsigned int type, unsigned int code, Index: linux-2.6.16/drivers/char/rtc.c =================================================================== --- linux-2.6.16.orig/drivers/char/rtc.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/char/rtc.c 2006-10-19 16:52:30.000000000 +0200 @@ -84,10 +84,36 @@ #include #include +#ifdef 
CONFIG_MIPS +# include +#endif + #if defined(__i386__) #include #endif +#ifdef CONFIG_RTC_HISTOGRAM + +static cycles_t last_interrupt_time; + +#include + +#define CPU_MHZ (cpu_khz / 1000) + +#define HISTSIZE 10000 +static int histogram[HISTSIZE]; + +static int rtc_state; + +enum rtc_states { + S_STARTUP, /* First round - let the application start */ + S_IDLE, /* Waiting for an interrupt */ + S_WAITING_FOR_READ, /* Signal delivered. waiting for rtc_read() */ + S_READ_MISSED, /* Signal delivered, read() deadline missed */ +}; + +#endif + #ifdef __sparc__ #include #include @@ -149,22 +175,8 @@ static void get_rtc_alm_time (struct rtc #ifdef RTC_IRQ static void rtc_dropped_irq(unsigned long data); -static void set_rtc_irq_bit_locked(unsigned char bit); -static void mask_rtc_irq_bit_locked(unsigned char bit); - -static inline void set_rtc_irq_bit(unsigned char bit) -{ - spin_lock_irq(&rtc_lock); - set_rtc_irq_bit_locked(bit); - spin_unlock_irq(&rtc_lock); -} - -static void mask_rtc_irq_bit(unsigned char bit) -{ - spin_lock_irq(&rtc_lock); - mask_rtc_irq_bit_locked(bit); - spin_unlock_irq(&rtc_lock); -} +static void set_rtc_irq_bit(unsigned char bit); +static void mask_rtc_irq_bit(unsigned char bit); #endif static int rtc_proc_open(struct inode *inode, struct file *file); @@ -193,6 +205,7 @@ static unsigned long rtc_max_user_freq = * rtc_task_lock nests inside rtc_lock. */ static DEFINE_SPINLOCK(rtc_task_lock); +static DEFINE_SPINLOCK(rtc_timer_lock); static rtc_task_t *rtc_callback = NULL; #endif @@ -219,7 +232,146 @@ static inline unsigned char rtc_is_updat return uip; } +#ifndef RTC_IRQ +# undef CONFIG_RTC_HISTOGRAM +#endif + +static inline void rtc_open_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i; + + last_interrupt_time = 0; + rtc_state = S_STARTUP; + rtc_irq_data = 0; + + for (i = 0; i < HISTSIZE; i++) + histogram[i] = 0; +#endif +} + +static inline void rtc_wake_event(void) +{ +#ifndef CONFIG_RTC_HISTOGRAM + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); +#else + if (!(rtc_status & RTC_IS_OPEN)) + return; + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + break; + /* Waiting for an interrupt */ + case S_IDLE: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + last_interrupt_time = get_cycles(); + rtc_state = S_WAITING_FOR_READ; + break; + + /* Signal has been delivered. waiting for rtc_read() */ + case S_WAITING_FOR_READ: + /* + * Well foo. The usermode application didn't + * schedule and read in time. + */ + last_interrupt_time = get_cycles(); + rtc_state = S_READ_MISSED; + printk("Read missed before next interrupt\n"); + break; + /* Signal has been delivered, read() deadline was missed */ + case S_READ_MISSED: + /* + * Not much we can do here. We're waiting for the usermode + * application to read the rtc + */ + last_interrupt_time = get_cycles(); + break; + } +#endif +} + +static inline void rtc_read_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + cycles_t now = get_cycles(); + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + rtc_state = S_IDLE; + break; + + /* Waiting for an interrupt */ + case S_IDLE: + printk("bug in rtc_read(): called in state S_IDLE!\n"); + break; + case S_WAITING_FOR_READ: /* + * Signal has been delivered. + * waiting for rtc_read() + */ + /* + * Well done + */ + case S_READ_MISSED: /* + * Signal has been delivered, read() + * deadline was missed + */ + /* + * So, you finally got here. 
+ */ + if (!last_interrupt_time) + printk("bug in rtc_read(): last_interrupt_time = 0\n"); + rtc_state = S_IDLE; + { + cycles_t latency = now - last_interrupt_time; + unsigned long delta; /* Microseconds */ + + delta = latency; + delta /= CPU_MHZ; + + if (delta > 1000 * 1000) { + printk("rtc: eek\n"); + } else { + unsigned long slot = delta; + if (slot >= HISTSIZE) + slot = HISTSIZE - 1; + histogram[slot]++; + if (delta > 2000) + printk("wow! That was a " + "%ld millisec bump\n", + delta / 1000); + } + } + rtc_state = S_IDLE; + break; + } +#endif +} + +static inline void rtc_close_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i = 0; + unsigned long total = 0; + + for (i = 0; i < HISTSIZE; i++) + total += histogram[i]; + if (!total) + return; + + printk("\nrtc latency histogram of {%s/%d, %lu samples}:\n", + current->comm, current->pid, total); + for (i = 0; i < HISTSIZE; i++) { + if (histogram[i]) + printk("%d %d\n", i, histogram[i]); + } +#endif +} + #ifdef RTC_IRQ + /* * A very tiny interrupt handler. It runs with SA_INTERRUPT set, * but there is possibility of conflicting with the set_rtc_mmss() @@ -232,6 +384,8 @@ static inline unsigned char rtc_is_updat irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) { + int mod; + /* * Can be an alarm interrupt, update complete interrupt, * or a periodic interrupt. We store the status in the @@ -239,7 +393,8 @@ irqreturn_t rtc_interrupt(int irq, void * the last read in the remainder of rtc_irq_data. */ - spin_lock (&rtc_lock); + spin_lock(&rtc_timer_lock); + spin_lock(&rtc_lock); rtc_irq_data += 0x100; rtc_irq_data &= ~0xff; if (is_hpet_enabled()) { @@ -253,19 +408,23 @@ irqreturn_t rtc_interrupt(int irq, void rtc_irq_data |= (CMOS_READ(RTC_INTR_FLAGS) & 0xF0); } + mod = 0; if (rtc_status & RTC_TIMER_ON) - mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100); + mod = 1; - spin_unlock (&rtc_lock); + spin_unlock(&rtc_lock); + if (mod) + mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100); + spin_unlock(&rtc_timer_lock); /* Now do the rest of the actions */ spin_lock(&rtc_task_lock); if (rtc_callback) rtc_callback->func(rtc_callback->private_data); spin_unlock(&rtc_task_lock); - wake_up_interruptible(&rtc_wait); - kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + rtc_wake_event(); + wake_up_interruptible(&rtc_wait); return IRQ_HANDLED; } @@ -350,10 +509,10 @@ static ssize_t rtc_read(struct file *fil __set_current_state(TASK_INTERRUPTIBLE); - spin_lock_irq (&rtc_lock); + spin_lock_irq(&rtc_lock); data = rtc_irq_data; rtc_irq_data = 0; - spin_unlock_irq (&rtc_lock); + spin_unlock_irq(&rtc_lock); if (data != 0) break; @@ -369,6 +528,8 @@ static ssize_t rtc_read(struct file *fil schedule(); } while (1); + rtc_read_event(); + if (count < sizeof(unsigned long)) retval = put_user(data, (unsigned int __user *)buf) ?: sizeof(int); else @@ -383,7 +544,7 @@ static ssize_t rtc_read(struct file *fil static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel) { - struct rtc_time wtime; + struct rtc_time wtime; #ifdef RTC_IRQ if (rtc_has_irq == 0) { @@ -415,19 +576,24 @@ static int rtc_do_ioctl(unsigned int cmd } case RTC_PIE_OFF: /* Mask periodic int. enab. 
bit */ { - unsigned long flags; /* can be called from isr via rtc_control() */ - spin_lock_irqsave (&rtc_lock, flags); - mask_rtc_irq_bit_locked(RTC_PIE); + int del = 0; + mask_rtc_irq_bit(RTC_PIE); + spin_lock_irq(&rtc_timer_lock); + spin_lock(&rtc_lock); if (rtc_status & RTC_TIMER_ON) { + del = 1; rtc_status &= ~RTC_TIMER_ON; - del_timer(&rtc_irq_timer); } - spin_unlock_irqrestore (&rtc_lock, flags); + spin_unlock(&rtc_lock); + if (del) + del_timer(&rtc_irq_timer); + spin_unlock_irq(&rtc_timer_lock); return 0; } case RTC_PIE_ON: /* Allow periodic ints */ { - unsigned long flags; /* can be called from isr via rtc_control() */ + int add = 0; + /* * We don't really want Joe User enabling more * than 64Hz of interrupts on a multi-user machine. @@ -436,14 +602,18 @@ static int rtc_do_ioctl(unsigned int cmd (!capable(CAP_SYS_RESOURCE))) return -EACCES; - spin_lock_irqsave (&rtc_lock, flags); + spin_lock_irq(&rtc_timer_lock); + spin_lock(&rtc_lock); if (!(rtc_status & RTC_TIMER_ON)) { + add = 1; rtc_irq_timer.expires = jiffies + HZ/rtc_freq + 2*HZ/100; - add_timer(&rtc_irq_timer); rtc_status |= RTC_TIMER_ON; } - set_rtc_irq_bit_locked(RTC_PIE); - spin_unlock_irqrestore (&rtc_lock, flags); + spin_unlock(&rtc_lock); + if (add) + add_timer(&rtc_irq_timer); + spin_unlock_irq(&rtc_timer_lock); + set_rtc_irq_bit(RTC_PIE); return 0; } case RTC_UIE_OFF: /* Mask ints from RTC updates. */ @@ -599,6 +769,11 @@ static int rtc_do_ioctl(unsigned int cmd save_freq_select = CMOS_READ(RTC_FREQ_SELECT); CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); + /* + * Make CMOS date writes nonpreemptible even on PREEMPT_RT. + * There's a limit to everything! =B-) + */ + preempt_disable(); #ifdef CONFIG_MACH_DECSTATION CMOS_WRITE(real_yrs, RTC_DEC_YEAR); #endif @@ -608,6 +783,7 @@ static int rtc_do_ioctl(unsigned int cmd CMOS_WRITE(hrs, RTC_HOURS); CMOS_WRITE(min, RTC_MINUTES); CMOS_WRITE(sec, RTC_SECONDS); + preempt_enable(); CMOS_WRITE(save_control, RTC_CONTROL); CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); @@ -624,7 +800,6 @@ static int rtc_do_ioctl(unsigned int cmd { int tmp = 0; unsigned char val; - unsigned long flags; /* can be called from isr via rtc_control() */ /* * The max we can do is 8192Hz. 
@@ -647,9 +822,9 @@ static int rtc_do_ioctl(unsigned int cmd if (arg != (1<f_flags & FASYNC) { rtc_fasync (-1, file, 0); @@ -757,10 +939,11 @@ static int rtc_release(struct inode *ino no_irq: #endif - spin_lock_irq (&rtc_lock); + spin_lock_irq(&rtc_lock); rtc_irq_data = 0; rtc_status &= ~RTC_IS_OPEN; - spin_unlock_irq (&rtc_lock); + spin_unlock_irq(&rtc_lock); + rtc_close_event(); return 0; } @@ -775,9 +958,9 @@ static unsigned int rtc_poll(struct file poll_wait(file, &rtc_wait, wait); - spin_lock_irq (&rtc_lock); + spin_lock_irq(&rtc_lock); l = rtc_irq_data; - spin_unlock_irq (&rtc_lock); + spin_unlock_irq(&rtc_lock); if (l != 0) return POLLIN | POLLRDNORM; @@ -825,12 +1008,15 @@ int rtc_unregister(rtc_task_t *task) return -EIO; #else unsigned char tmp; + int del; - spin_lock_irq(&rtc_lock); + spin_lock_irq(&rtc_timer_lock); + spin_lock(&rtc_lock); spin_lock(&rtc_task_lock); if (rtc_callback != task) { spin_unlock(&rtc_task_lock); - spin_unlock_irq(&rtc_lock); + spin_unlock(&rtc_lock); + spin_unlock_irq(&rtc_timer_lock); return -ENXIO; } rtc_callback = NULL; @@ -844,13 +1030,17 @@ int rtc_unregister(rtc_task_t *task) CMOS_WRITE(tmp, RTC_CONTROL); CMOS_READ(RTC_INTR_FLAGS); } + del = 0; if (rtc_status & RTC_TIMER_ON) { rtc_status &= ~RTC_TIMER_ON; - del_timer(&rtc_irq_timer); + del = 1; } rtc_status &= ~RTC_IS_OPEN; spin_unlock(&rtc_task_lock); - spin_unlock_irq(&rtc_lock); + spin_unlock(&rtc_lock); + if (del) + del_timer(&rtc_irq_timer); + spin_unlock_irq(&rtc_timer_lock); return 0; #endif } @@ -860,15 +1050,12 @@ int rtc_control(rtc_task_t *task, unsign #ifndef RTC_IRQ return -EIO; #else - unsigned long flags; - if (cmd != RTC_PIE_ON && cmd != RTC_PIE_OFF && cmd != RTC_IRQP_SET) - return -EINVAL; - spin_lock_irqsave(&rtc_task_lock, flags); + spin_lock_irq(&rtc_task_lock); if (rtc_callback != task) { - spin_unlock_irqrestore(&rtc_task_lock, flags); + spin_unlock_irq(&rtc_task_lock); return -ENXIO; } - spin_unlock_irqrestore(&rtc_task_lock, flags); + spin_unlock_irq(&rtc_task_lock); return rtc_do_ioctl(cmd, arg, 1); #endif } @@ -1111,17 +1298,21 @@ module_exit(rtc_exit); static void rtc_dropped_irq(unsigned long data) { unsigned long freq; + int mod; - spin_lock_irq (&rtc_lock); + spin_lock_irq(&rtc_timer_lock); + spin_lock(&rtc_lock); if (hpet_rtc_dropped_irq()) { - spin_unlock_irq(&rtc_lock); + spin_unlock(&rtc_lock); + spin_unlock_irq(&rtc_timer_lock); return; } /* Just in case someone disabled the timer from behind our back... */ + mod = 0; if (rtc_status & RTC_TIMER_ON) - mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100); + mod = 1; rtc_irq_data += ((rtc_freq/HZ)<<8); rtc_irq_data &= ~0xff; @@ -1129,7 +1320,10 @@ static void rtc_dropped_irq(unsigned lon freq = rtc_freq; - spin_unlock_irq(&rtc_lock); + spin_unlock(&rtc_lock); + if (mod) + mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100); + spin_unlock_irq(&rtc_timer_lock); printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", freq); @@ -1329,32 +1523,40 @@ static void get_rtc_alm_time(struct rtc_ * meddles with the interrupt enable/disable bits. 
*/ -static void mask_rtc_irq_bit_locked(unsigned char bit) +static void mask_rtc_irq_bit(unsigned char bit) { unsigned char val; - if (hpet_mask_rtc_irq_bit(bit)) + spin_lock_irq(&rtc_lock); + if (hpet_mask_rtc_irq_bit(bit)) { + spin_unlock_irq(&rtc_lock); return; + } val = CMOS_READ(RTC_CONTROL); val &= ~bit; CMOS_WRITE(val, RTC_CONTROL); CMOS_READ(RTC_INTR_FLAGS); rtc_irq_data = 0; + spin_unlock_irq(&rtc_lock); } -static void set_rtc_irq_bit_locked(unsigned char bit) +static void set_rtc_irq_bit(unsigned char bit) { unsigned char val; - if (hpet_set_rtc_irq_bit(bit)) + spin_lock_irq(&rtc_lock); + if (hpet_set_rtc_irq_bit(bit)) { + spin_unlock_irq(&rtc_lock); return; + } val = CMOS_READ(RTC_CONTROL); val |= bit; CMOS_WRITE(val, RTC_CONTROL); CMOS_READ(RTC_INTR_FLAGS); rtc_irq_data = 0; + spin_unlock_irq(&rtc_lock); } #endif Index: linux-2.6.16/drivers/char/s3c2410-rtc.c =================================================================== --- linux-2.6.16.orig/drivers/char/s3c2410-rtc.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/char/s3c2410-rtc.c 2006-10-19 16:52:30.000000000 +0200 @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -29,7 +30,6 @@ #include #include #include -#include #include #include Index: linux-2.6.16/drivers/char/sysrq.c =================================================================== --- linux-2.6.16.orig/drivers/char/sysrq.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/char/sysrq.c 2006-10-19 16:52:30.000000000 +0200 @@ -184,6 +184,37 @@ static struct sysrq_key_op sysrq_showreg .enable_mask = SYSRQ_ENABLE_DUMP, }; +#ifdef CONFIG_DEBUG_RT_MUTEXES + +static void sysrq_handle_show_pi_locks(int key, struct pt_regs *pt_regs, + struct tty_struct *tty) +{ + rt_mutex_show_all_locks(); +} + +static struct sysrq_key_op sysrq_show_pi_locks_op = { + .handler = sysrq_handle_show_pi_locks, + .help_msg = "show-all-PI-locks(D)", + .action_msg = "Show PI Locks Held", +}; + +#endif + +#if defined(__i386__) + +static void sysrq_handle_showallregs(int key, struct pt_regs *pt_regs, + struct tty_struct *tty) +{ + nmi_show_all_regs(); +} + +static struct sysrq_key_op sysrq_showallregs_op = { + .handler = sysrq_handle_showallregs, + .help_msg = "showalLcpupc", + .action_msg = "Show Regs On All CPUs", +}; + +#endif static void sysrq_handle_showstate(int key, struct pt_regs *pt_regs, struct tty_struct *tty) @@ -325,7 +356,11 @@ static struct sysrq_key_op *sysrq_key_ta #else /* k */ NULL, #endif +#if defined(__i386__) +/* l */ &sysrq_showallregs_op, +#else /* l */ NULL, +#endif /* m */ &sysrq_showmem_op, /* n */ &sysrq_unrt_op, /* o */ NULL, /* This will often be registered @@ -342,7 +377,11 @@ static struct sysrq_key_op *sysrq_key_ta /* u */ &sysrq_mountro_op, /* v */ NULL, /* May be assigned at init time by SMP VOYAGER */ /* w */ NULL, +#ifdef CONFIG_DEBUG_RT_MUTEXES +/* x */ &sysrq_show_pi_locks_op, +#else /* x */ NULL, +#endif /* y */ NULL, /* z */ NULL }; Index: linux-2.6.16/drivers/char/tty_io.c =================================================================== --- linux-2.6.16.orig/drivers/char/tty_io.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/char/tty_io.c 2006-10-19 16:52:30.000000000 +0200 @@ -227,6 +227,7 @@ static int check_tty_count(struct tty_st printk(KERN_WARNING "Warning: dev (%s) tty->count(%d) " "!= #fd's(%d) in %s\n", tty->name, tty->count, count, routine); + dump_stack(); return count; } #endif @@ -1097,8 +1098,8 @@ static void do_tty_hangup(void *data) p->signal->tty = NULL; if 
 (!p->signal->leader)
 			continue;
-		send_group_sig_info(SIGHUP, SEND_SIG_PRIV, p);
-		send_group_sig_info(SIGCONT, SEND_SIG_PRIV, p);
+		group_send_sig_info(SIGHUP, SEND_SIG_PRIV, p);
+		group_send_sig_info(SIGCONT, SEND_SIG_PRIV, p);
 		if (tty->pgrp > 0)
 			p->signal->tty_old_pgrp = tty->pgrp;
 	} while_each_task_pid(tty->session, PIDTYPE_SID, p);
Index: linux-2.6.16/drivers/clocksource/Makefile
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.16/drivers/clocksource/Makefile	2006-10-19 16:52:30.000000000 +0200
@@ -0,0 +1,2 @@
+obj-$(CONFIG_X86_CYCLONE_TIMER)	+= cyclone.o
+obj-$(CONFIG_X86_PM_TIMER)	+= acpi_pm.o
Index: linux-2.6.16/drivers/clocksource/acpi_pm.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.16/drivers/clocksource/acpi_pm.c	2006-10-19 16:52:30.000000000 +0200
@@ -0,0 +1,89 @@
+/*
+ * linux/drivers/clocksource/acpi_pm.c
+ *
+ * This file contains the ACPI PM based clocksource.
+ *
+ * This code was largely moved from the i386 timer_pm.c file
+ * which was (C) Dominik Brodowski 2003
+ * and contained the following comments:
+ *
+ * Driver to use the Power Management Timer (PMTMR) available in some
+ * southbridges as primary timing source for the Linux kernel.
+ *
+ * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
+ * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
+ *
+ * This file is licensed under the GPL v2.
+ */
+
+#include
+#include
+#include
+#include
+
+/* Number of PMTMR ticks expected during calibration run */
+#define PMTMR_TICKS_PER_SEC 3579545
+
+/*
+ * The I/O port the PMTMR resides at.
+ * The location is detected during setup_arch(),
+ * in arch/i386/acpi/boot.c
+ */
+u32 pmtmr_ioport;
+
+#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
+
+static inline u32 read_pmtmr(void)
+{
+	/* mask the output to 24 bits */
+	return inl(pmtmr_ioport) & ACPI_PM_MASK;
+}
+
+static cycle_t acpi_pm_read(void)
+{
+	return (cycle_t)read_pmtmr();
+}
+
+static struct clocksource clocksource_acpi_pm = {
+	.name		= "acpi_pm",
+	.rating		= 200,
+	.read		= acpi_pm_read,
+	.mask		= (cycle_t)ACPI_PM_MASK,
+	.mult		= 0, /* to be calculated */
+	.shift		= 22,
+	.is_continuous	= 1,
+};
+
+static int __init init_acpi_pm_clocksource(void)
+{
+	u32 value1, value2;
+	unsigned int i;
+
+	if (!pmtmr_ioport)
+		return -ENODEV;
+
+	clocksource_acpi_pm.mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC,
+						clocksource_acpi_pm.shift);
+
+	/* "verify" this timing source: */
+	value1 = read_pmtmr();
+	for (i = 0; i < 10000; i++) {
+		value2 = read_pmtmr();
+		if (value2 == value1)
+			continue;
+		if (value2 > value1)
+			goto pm_good;
+		if ((value2 < value1) && ((value2) < 0xFFF))
+			goto pm_good;
+		printk(KERN_INFO "PM-Timer had inconsistent results: 0x%x, 0x%x - aborting.\n", value1, value2);
+		return -EINVAL;
+	}
+	printk(KERN_INFO "PM-Timer had no reasonable result: 0x%x - aborting.\n", value1);
+	return -ENODEV;
+
+pm_good:
+
+	return register_clocksource(&clocksource_acpi_pm);
+}
+
+module_init(init_acpi_pm_clocksource);
Index: linux-2.6.16/drivers/clocksource/cyclone.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.16/drivers/clocksource/cyclone.c	2006-10-19 16:52:30.000000000 +0200
@@ -0,0 +1,119 @@
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "mach_timer.h"
+
+#define CYCLONE_CBAR_ADDR
0xFEB00CD0 /* base address ptr */ +#define CYCLONE_PMCC_OFFSET 0x51A0 /* offset to control register */ +#define CYCLONE_MPCS_OFFSET 0x51A8 /* offset to select register */ +#define CYCLONE_MPMC_OFFSET 0x51D0 /* offset to count register */ +#define CYCLONE_TIMER_FREQ 99780000 /* 100Mhz, but not really */ +#define CYCLONE_TIMER_MASK 0xFFFFFFFF /* 32 bit mask */ + +int use_cyclone = 0; +static void __iomem *cyclone_ptr; + +static cycle_t read_cyclone(void) +{ + return (cycle_t)readl(cyclone_ptr); +} + +static struct clocksource clocksource_cyclone = { + .name = "cyclone", + .rating = 250, + .read = read_cyclone, + .mask = (cycle_t)CYCLONE_TIMER_MASK, + .mult = 10, + .shift = 0, + .is_continuous = 1, +}; + +static int __init init_cyclone_clocksource(void) +{ + unsigned long base; /* saved value from CBAR */ + unsigned long offset; + u32 __iomem* volatile cyclone_timer; /* Cyclone MPMC0 register */ + u32 __iomem* reg; + int i; + + /* make sure we're on a summit box: */ + if (!use_cyclone) + return -ENODEV; + + printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n"); + + /* find base address: */ + offset = CYCLONE_CBAR_ADDR; + reg = ioremap_nocache(offset, sizeof(reg)); + if (!reg) { + printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n"); + return -ENODEV; + } + /* even on 64bit systems, this is only 32bits: */ + base = readl(reg); + if (!base) { + printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n"); + return -ENODEV; + } + iounmap(reg); + + /* setup PMCC: */ + offset = base + CYCLONE_PMCC_OFFSET; + reg = ioremap_nocache(offset, sizeof(reg)); + if (!reg) { + printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n"); + return -ENODEV; + } + writel(0x00000001,reg); + iounmap(reg); + + /* setup MPCS: */ + offset = base + CYCLONE_MPCS_OFFSET; + reg = ioremap_nocache(offset, sizeof(reg)); + if (!reg) { + printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n"); + return -ENODEV; + } + writel(0x00000001,reg); + iounmap(reg); + + /* map in cyclone_timer: */ + offset = base + CYCLONE_MPMC_OFFSET; + cyclone_timer = ioremap_nocache(offset, sizeof(u64)); + if (!cyclone_timer) { + printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n"); + return -ENODEV; + } + + /* quick test to make sure its ticking: */ + for (i = 0; i < 3; i++){ + u32 old = readl(cyclone_timer); + int stall = 100; + + while (stall--) + barrier(); + + if (readl(cyclone_timer) == old) { + printk(KERN_ERR "Summit chipset: Counter not counting! 
DISABLED\n"); + iounmap(cyclone_timer); + cyclone_timer = NULL; + return -ENODEV; + } + } + cyclone_ptr = cyclone_timer; + + /* sort out mult/shift values: */ + clocksource_cyclone.shift = 22; + clocksource_cyclone.mult = clocksource_hz2mult(CYCLONE_TIMER_FREQ, + clocksource_cyclone.shift); + + return register_clocksource(&clocksource_cyclone); +} + +module_init(init_cyclone_clocksource); Index: linux-2.6.16/drivers/edac/edac_mc.c =================================================================== --- linux-2.6.16.orig/drivers/edac/edac_mc.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/edac/edac_mc.c 2006-10-19 16:52:30.000000000 +0200 @@ -64,6 +64,8 @@ static atomic_t pci_parity_count = ATOMI static DECLARE_MUTEX(mem_ctls_mutex); static struct list_head mc_devices = LIST_HEAD_INIT(mc_devices); +static DEFINE_RAW_SPINLOCK(lowlevel_mc_lock); + /* Structure of the whitelist and blacklist arrays */ struct edac_pci_device_list { unsigned int vendor; /* Vendor ID */ @@ -1577,7 +1579,7 @@ void edac_mc_scrub_block(unsigned long p pg = pfn_to_page(page); if (PageHighMem(pg)) - local_irq_save(flags); + spin_lock_irqsave(&lowlevel_mc_lock, flags); virt_addr = kmap_atomic(pg, KM_BOUNCE_READ); @@ -1588,7 +1590,7 @@ void edac_mc_scrub_block(unsigned long p kunmap_atomic(virt_addr, KM_BOUNCE_READ); if (PageHighMem(pg)) - local_irq_restore(flags); + spin_unlock_irqrestore(&lowlevel_mc_lock, flags); } @@ -1999,9 +2001,9 @@ static void do_pci_parity_check(void) /* scan all PCI devices looking for a Parity Error on devices and * bridges */ - local_irq_save(flags); + spin_lock_irqsave(&lowlevel_mc_lock, flags); edac_pci_dev_parity_iterator(edac_pci_dev_parity_test); - local_irq_restore(flags); + spin_unlock_irqrestore(&lowlevel_mc_lock, flags); /* Only if operator has selected panic on PCI Error */ if (panic_on_pci_parity) { @@ -2050,7 +2052,7 @@ static inline void check_mc_devices (voi debugf3("MC: " __FILE__ ": %s()\n", __func__); /* during poll, have interrupts off */ - local_irq_save(flags); + spin_lock_irqsave(&lowlevel_mc_lock, flags); list_for_each(item, &mc_devices) { mci = list_entry(item, struct mem_ctl_info, link); @@ -2059,7 +2061,7 @@ static inline void check_mc_devices (voi mci->edac_check(mci); } - local_irq_restore(flags); + spin_unlock_irqrestore(&lowlevel_mc_lock, flags); } Index: linux-2.6.16/drivers/i2c/chips/tps65010.c =================================================================== --- linux-2.6.16.orig/drivers/i2c/chips/tps65010.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/i2c/chips/tps65010.c 2006-10-19 16:52:30.000000000 +0200 @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -33,7 +34,6 @@ #include #include -#include #include #include Index: linux-2.6.16/drivers/ide/ide-floppy.c =================================================================== --- linux-2.6.16.orig/drivers/ide/ide-floppy.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/ide/ide-floppy.c 2006-10-19 16:52:30.000000000 +0200 @@ -838,7 +838,7 @@ static ide_startstop_t idefloppy_pc_intr "transferred\n", pc->actually_transferred); clear_bit(PC_DMA_IN_PROGRESS, &pc->flags); - local_irq_enable(); + local_irq_enable_nort(); if (status.b.check || test_bit(PC_DMA_ERROR, &pc->flags)) { /* Error detected */ @@ -1670,9 +1670,9 @@ static int idefloppy_get_format_progress atapi_status_t status; unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); status.all = HWIF(drive)->INB(IDE_STATUS_REG); - local_irq_restore(flags); + 
local_irq_restore_nort(flags); progress_indication = !status.b.dsc ? 0 : 0x10000; } Index: linux-2.6.16/drivers/ide/ide-io.c =================================================================== --- linux-2.6.16.orig/drivers/ide/ide-io.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/ide/ide-io.c 2006-10-19 16:52:30.000000000 +0200 @@ -633,7 +633,7 @@ static ide_startstop_t drive_cmd_intr (i u8 stat = hwif->INB(IDE_STATUS_REG); int retries = 10; - local_irq_enable(); + local_irq_enable_nort(); if ((stat & DRQ_STAT) && args && args[3]) { u8 io_32bit = drive->io_32bit; drive->io_32bit = 0; @@ -1104,7 +1104,7 @@ static void ide_do_request (ide_hwgroup_ ide_get_lock(ide_intr, hwgroup); /* caller must own ide_lock */ - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); while (!hwgroup->busy) { hwgroup->busy = 1; @@ -1216,8 +1216,7 @@ static void ide_do_request (ide_hwgroup_ */ if (masked_irq != IDE_NO_IRQ && hwif->irq != masked_irq) disable_irq_nosync(hwif->irq); - spin_unlock(&ide_lock); - local_irq_enable(); + spin_unlock_irq(&ide_lock); /* allow other IRQs while we start this request */ startstop = start_request(drive, rq); spin_lock_irq(&ide_lock); @@ -1365,7 +1364,7 @@ void ide_timer_expiry (unsigned long dat #endif /* DISABLE_IRQ_NOSYNC */ /* local CPU only, * as if we were handling an interrupt */ - local_irq_disable(); + local_irq_disable_nort(); if (hwgroup->polling) { startstop = handler(drive); } else if (drive_is_ready(drive)) { @@ -1562,7 +1561,7 @@ irqreturn_t ide_intr (int irq, void *dev spin_unlock(&ide_lock); if (drive->unmask) - local_irq_enable(); + local_irq_enable_nort(); /* service this interrupt, may set handler for next interrupt */ startstop = handler(drive); spin_lock_irq(&ide_lock); Index: linux-2.6.16/drivers/ide/ide-iops.c =================================================================== --- linux-2.6.16.orig/drivers/ide/ide-iops.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/ide/ide-iops.c 2006-10-19 16:52:30.000000000 +0200 @@ -244,10 +244,10 @@ static void ata_input_data(ide_drive_t * if (io_32bit) { if (io_32bit & 2) { unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(drive, IDE_NSECTOR_REG); hwif->INSL(IDE_DATA_REG, buffer, wcount); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else hwif->INSL(IDE_DATA_REG, buffer, wcount); } else { @@ -266,10 +266,10 @@ static void ata_output_data(ide_drive_t if (io_32bit) { if (io_32bit & 2) { unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(drive, IDE_NSECTOR_REG); hwif->OUTSL(IDE_DATA_REG, buffer, wcount); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else hwif->OUTSL(IDE_DATA_REG, buffer, wcount); } else { @@ -564,12 +564,12 @@ int ide_wait_stat (ide_startstop_t *star if (!(stat & BUSY_STAT)) break; - local_irq_restore(flags); + local_irq_restore_nort(flags); *startstop = ide_error(drive, "status timeout", stat); return 1; } } - local_irq_restore(flags); + local_irq_restore_nort(flags); } /* * Allow status to settle, then read it again. 
@@ -727,17 +727,15 @@ int ide_driveid_update (ide_drive_t *dri printk("%s: CHECK for good STATUS\n", drive->name); return 0; } - local_irq_save(flags); - SELECT_MASK(drive, 0); id = kmalloc(SECTOR_WORDS*4, GFP_ATOMIC); - if (!id) { - local_irq_restore(flags); + if (!id) return 0; - } + local_irq_save_nort(flags); + SELECT_MASK(drive, 0); ata_input_data(drive, id, SECTOR_WORDS); (void) hwif->INB(IDE_STATUS_REG); /* clear drive IRQ */ - local_irq_enable(); - local_irq_restore(flags); + local_irq_enable_nort(); + local_irq_restore_nort(flags); ide_fix_driveid(id); if (id) { drive->id->dma_ultra = id->dma_ultra; @@ -817,7 +815,7 @@ int ide_config_drive_speed (ide_drive_t if (time_after(jiffies, timeout)) break; } - local_irq_restore(flags); + local_irq_restore_nort(flags); } /* Index: linux-2.6.16/drivers/ide/ide-lib.c =================================================================== --- linux-2.6.16.orig/drivers/ide/ide-lib.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/ide/ide-lib.c 2006-10-19 16:52:30.000000000 +0200 @@ -447,15 +447,16 @@ EXPORT_SYMBOL_GPL(ide_set_xfer_rate); static void ide_dump_opcode(ide_drive_t *drive) { + unsigned long flags; struct request *rq; u8 opcode = 0; int found = 0; - spin_lock(&ide_lock); + spin_lock_irqsave(&ide_lock, flags); rq = NULL; if (HWGROUP(drive)) rq = HWGROUP(drive)->rq; - spin_unlock(&ide_lock); + spin_unlock_irqrestore(&ide_lock, flags); if (!rq) return; if (rq->flags & (REQ_DRIVE_CMD | REQ_DRIVE_TASK)) { @@ -483,10 +484,8 @@ static void ide_dump_opcode(ide_drive_t static u8 ide_dump_ata_status(ide_drive_t *drive, const char *msg, u8 stat) { ide_hwif_t *hwif = HWIF(drive); - unsigned long flags; u8 err = 0; - local_irq_set(flags); printk("%s: %s: status=0x%02x { ", drive->name, msg, stat); if (stat & BUSY_STAT) printk("Busy "); @@ -546,7 +545,7 @@ static u8 ide_dump_ata_status(ide_drive_ printk("\n"); } ide_dump_opcode(drive); - local_irq_restore(flags); + return err; } @@ -561,14 +560,12 @@ static u8 ide_dump_ata_status(ide_drive_ static u8 ide_dump_atapi_status(ide_drive_t *drive, const char *msg, u8 stat) { - unsigned long flags; - atapi_status_t status; atapi_error_t error; status.all = stat; error.all = 0; - local_irq_set(flags); + printk("%s: %s: status=0x%02x { ", drive->name, msg, stat); if (status.b.bsy) printk("Busy "); @@ -594,7 +591,7 @@ static u8 ide_dump_atapi_status(ide_driv printk("}\n"); } ide_dump_opcode(drive); - local_irq_restore(flags); + return error.all; } Index: linux-2.6.16/drivers/ide/ide-probe.c =================================================================== --- linux-2.6.16.orig/drivers/ide/ide-probe.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/ide/ide-probe.c 2006-10-19 16:52:30.000000000 +0200 @@ -145,7 +145,7 @@ static inline void do_identify (ide_driv hwif->ata_input_data(drive, id, SECTOR_WORDS); drive->id_read = 1; - local_irq_enable(); + local_irq_enable_nort(); ide_fix_driveid(id); #if defined (CONFIG_SCSI_EATA_DMA) || defined (CONFIG_SCSI_EATA_PIO) || defined (CONFIG_SCSI_EATA) @@ -327,14 +327,14 @@ static int actual_try_to_identify (ide_d unsigned long flags; /* local CPU only; some systems need this */ - local_irq_save(flags); + local_irq_save_nort(flags); /* drive returned ID */ do_identify(drive, cmd); /* drive responded with ID */ rc = 0; /* clear drive IRQ */ (void) hwif->INB(IDE_STATUS_REG); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else { /* drive refused ID */ rc = 2; @@ -806,7 +806,7 @@ static void probe_hwif(ide_hwif_t *hwif) } while 
((stat & BUSY_STAT) && time_after(timeout, jiffies)); } - local_irq_restore(flags); + local_irq_restore_nort(flags); /* * Use cached IRQ number. It might be (and is...) changed by probe * code above Index: linux-2.6.16/drivers/ide/ide-taskfile.c =================================================================== --- linux-2.6.16.orig/drivers/ide/ide-taskfile.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/ide/ide-taskfile.c 2006-10-19 16:52:30.000000000 +0200 @@ -224,7 +224,7 @@ ide_startstop_t task_no_data_intr (ide_d ide_hwif_t *hwif = HWIF(drive); u8 stat; - local_irq_enable(); + local_irq_enable_nort(); if (!OK_STAT(stat = hwif->INB(IDE_STATUS_REG),READY_STAT,BAD_STAT)) { return ide_error(drive, "task_no_data_intr", stat); /* calls ide_end_drive_cmd */ @@ -276,7 +276,7 @@ static void ide_pio_sector(ide_drive_t * offset %= PAGE_SIZE; #ifdef CONFIG_HIGHMEM - local_irq_save(flags); + local_irq_save_nort(flags); #endif buf = kmap_atomic(page, KM_BIO_SRC_IRQ) + offset; @@ -296,7 +296,7 @@ static void ide_pio_sector(ide_drive_t * kunmap_atomic(buf, KM_BIO_SRC_IRQ); #ifdef CONFIG_HIGHMEM - local_irq_restore(flags); + local_irq_restore_nort(flags); #endif } @@ -456,7 +456,7 @@ ide_startstop_t pre_task_out_intr (ide_d } if (!drive->unmask) - local_irq_disable(); + local_irq_disable_nort(); ide_set_handler(drive, &task_out_intr, WAIT_WORSTCASE, NULL); ide_pio_datablock(drive, rq, 1); Index: linux-2.6.16/drivers/ide/ide.c =================================================================== --- linux-2.6.16.orig/drivers/ide/ide.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/ide/ide.c 2006-10-19 16:52:30.000000000 +0200 @@ -1047,15 +1047,13 @@ int ide_spin_wait_hwgroup (ide_drive_t * spin_lock_irq(&ide_lock); while (hwgroup->busy) { - unsigned long lflags; spin_unlock_irq(&ide_lock); - local_irq_set(lflags); + if (time_after(jiffies, timeout)) { - local_irq_restore(lflags); printk(KERN_ERR "%s: channel busy\n", drive->name); return -EBUSY; } - local_irq_restore(lflags); + spin_lock_irq(&ide_lock); } return 0; Index: linux-2.6.16/drivers/ide/pci/alim15x3.c =================================================================== --- linux-2.6.16.orig/drivers/ide/pci/alim15x3.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/ide/pci/alim15x3.c 2006-10-19 16:52:30.000000000 +0200 @@ -296,7 +296,6 @@ static void ali15x3_tune_drive (ide_driv struct pci_dev *dev = hwif->pci_dev; int s_time, a_time, c_time; u8 s_clc, a_clc, r_clc; - unsigned long flags; int bus_speed = system_bus_clock(); int port = hwif->channel ? 0x5c : 0x58; int portFIFO = hwif->channel ? 
0x55 : 0x54; @@ -323,7 +322,6 @@ static void ali15x3_tune_drive (ide_driv if (r_clc >= 16) r_clc = 0; } - local_irq_save(flags); /* * PIO mode => ATA FIFO on, ATAPI FIFO off @@ -345,7 +343,6 @@ static void ali15x3_tune_drive (ide_driv pci_write_config_byte(dev, port, s_clc); pci_write_config_byte(dev, port+drive->select.b.unit+2, (a_clc << 4) | r_clc); - local_irq_restore(flags); /* * setup active rec @@ -585,7 +582,6 @@ static int ali15x3_dma_setup(ide_drive_t static unsigned int __devinit init_chipset_ali15x3 (struct pci_dev *dev, const char *name) { - unsigned long flags; u8 tmpbyte; struct pci_dev *north = pci_find_slot(0, PCI_DEVFN(0,0)); @@ -601,7 +597,6 @@ static unsigned int __devinit init_chips } #endif /* defined(DISPLAY_ALI_TIMINGS) && defined(CONFIG_PROC_FS) */ - local_irq_save(flags); if (m5229_revision < 0xC2) { /* @@ -614,7 +609,6 @@ static unsigned int __devinit init_chips * clear bit 7 */ pci_write_config_byte(dev, 0x4b, tmpbyte & 0x7F); - local_irq_restore(flags); return 0; } @@ -639,7 +633,6 @@ static unsigned int __devinit init_chips * 0:0.0 so if we didn't find one we know what is cooking. */ if (north && north->vendor != PCI_VENDOR_ID_AL) { - local_irq_restore(flags); return 0; } @@ -662,7 +655,6 @@ static unsigned int __devinit init_chips pci_write_config_byte(isa_dev, 0x79, tmpbyte | 0x02); } } - local_irq_restore(flags); return 0; } @@ -683,10 +675,8 @@ static unsigned int __devinit ata66_ali1 unsigned int ata66 = 0; u8 cable_80_pin[2] = { 0, 0 }; - unsigned long flags; u8 tmpbyte; - local_irq_save(flags); if (m5229_revision >= 0xC2) { /* @@ -736,7 +726,6 @@ static unsigned int __devinit ata66_ali1 pci_write_config_byte(dev, 0x53, tmpbyte); - local_irq_restore(flags); return(ata66); } Index: linux-2.6.16/drivers/ide/pci/cs5530.c =================================================================== --- linux-2.6.16.orig/drivers/ide/pci/cs5530.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/ide/pci/cs5530.c 2006-10-19 16:52:30.000000000 +0200 @@ -242,8 +242,8 @@ static unsigned int __devinit init_chips return 0; } - spin_lock_irqsave(&ide_lock, flags); - /* all CPUs (there should only be one CPU with this chipset) */ + /* Local CPU. ide_lock is acquired in do_ide_setup_pci_device. */ + local_irq_save(flags); /* * Enable BusMaster and MemoryWriteAndInvalidate for the cs5530: @@ -295,7 +295,7 @@ static unsigned int __devinit init_chips pci_write_config_byte(master_0, 0x42, 0x00); pci_write_config_byte(master_0, 0x43, 0xc1); - spin_unlock_irqrestore(&ide_lock, flags); + local_irq_restore(flags); return 0; } Index: linux-2.6.16/drivers/ide/pci/hpt366.c =================================================================== --- linux-2.6.16.orig/drivers/ide/pci/hpt366.c 2006-09-20 10:56:46.000000000 +0200 +++ linux-2.6.16/drivers/ide/pci/hpt366.c 2006-10-19 16:52:30.000000000 +0200 @@ -1485,7 +1485,6 @@ static void __devinit init_dma_hpt366(id u8 dma_new = 0, dma_old = 0; u8 primary = hwif->channel ? 0x4b : 0x43; u8 secondary = hwif->channel ? 
 	u8 secondary	= hwif->channel ? 0x4f : 0x47;
-	unsigned long flags;
 
 	if (!dmabase)
 		return;
@@ -1497,8 +1496,6 @@ static void __devinit init_dma_hpt366(id
 
 	dma_old = hwif->INB(dmabase+2);
 
-	local_irq_save(flags);
-
 	dma_new = dma_old;
 	pci_read_config_byte(hwif->pci_dev, primary, &masterdma);
 	pci_read_config_byte(hwif->pci_dev, secondary, &slavedma);
@@ -1508,8 +1505,6 @@ static void __devinit init_dma_hpt366(id
 	if (dma_new != dma_old)
 		hwif->OUTB(dma_new, dmabase+2);
 
-	local_irq_restore(flags);
-
 	ide_setup_dma(hwif, dmabase, 8);
 }
Index: linux-2.6.16/drivers/ide/setup-pci.c
===================================================================
--- linux-2.6.16.orig/drivers/ide/setup-pci.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/ide/setup-pci.c	2006-10-19 16:52:30.000000000 +0200
@@ -665,8 +665,11 @@ static int do_ide_setup_pci_device(struc
 {
 	static ata_index_t ata_index = { .b = { .low = 0xff, .high = 0xff } };
 	int tried_config = 0;
+	unsigned long flags;
 	int pciirq, ret;
 
+	spin_lock_irqsave(&ide_lock, flags);
+
 	ret = ide_setup_pci_controller(dev, d, noisy, &tried_config);
 	if (ret < 0)
 		goto out;
@@ -721,6 +724,8 @@ static int do_ide_setup_pci_device(struc
 	*index = ata_index;
 	ide_pci_setup_ports(dev, d, pciirq, index);
 out:
+	spin_unlock_irqrestore(&ide_lock, flags);
+
 	return ret;
 }
Index: linux-2.6.16/drivers/ieee1394/ieee1394_types.h
===================================================================
--- linux-2.6.16.orig/drivers/ieee1394/ieee1394_types.h	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/ieee1394/ieee1394_types.h	2006-10-19 16:52:30.000000000 +0200
@@ -19,7 +19,7 @@ struct hpsb_tlabel_pool {
 	spinlock_t lock;
 	u8 next;
 	u32 allocations;
-	struct semaphore count;
+	struct compat_semaphore count;
 };
 
 #define HPSB_TPOOL_INIT(_tp) \
Index: linux-2.6.16/drivers/ieee1394/nodemgr.c
===================================================================
--- linux-2.6.16.orig/drivers/ieee1394/nodemgr.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/ieee1394/nodemgr.c	2006-10-19 16:52:30.000000000 +0200
@@ -114,7 +114,7 @@ struct host_info {
 	struct hpsb_host *host;
 	struct list_head list;
 	struct completion exited;
-	struct semaphore reset_sem;
+	struct compat_semaphore reset_sem;
 	int pid;
 	char daemon_name[15];
 	int kill_me;
Index: linux-2.6.16/drivers/ieee1394/raw1394-private.h
===================================================================
--- linux-2.6.16.orig/drivers/ieee1394/raw1394-private.h	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/ieee1394/raw1394-private.h	2006-10-19 16:52:30.000000000 +0200
@@ -29,7 +29,7 @@ struct file_info {
 
 	struct list_head req_pending;
 	struct list_head req_complete;
-	struct semaphore complete_sem;
+	struct compat_semaphore complete_sem;
 	spinlock_t reqlists_lock;
 	wait_queue_head_t poll_wait_complete;
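On PREEMPT_RT a plain struct semaphore is converted into a priority-inheriting, rtmutex-based lock, which assumes mutex-like usage: the task that takes it also releases it. The ieee1394 semaphores above are used completion-style (up()ed from a different context than the one that sleeps), so they are switched to compat_semaphore, which stays a classic counting semaphore even on RT. Roughly, as a sketch and not the verbatim -rt definition:

        #ifdef CONFIG_PREEMPT_RT
        /*
         * 'struct semaphore' becomes a PI rtmutex-based type; the old
         * counting semaphore survives under the compat_ name for
         * completion-style and cross-context users.
         */
        struct compat_semaphore {
                atomic_t                count;
                int                     sleepers;
                wait_queue_head_t       wait;
        };
        #else
        /* On mainline the two are the same thing. */
        # define compat_semaphore       semaphore
        #endif

The same reasoning applies to every semaphore-to-compat_semaphore conversion in the driver hunks that follow.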
Index: linux-2.6.16/drivers/input/gameport/gameport.c
===================================================================
--- linux-2.6.16.orig/drivers/input/gameport/gameport.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/input/gameport/gameport.c	2006-10-19 16:52:30.000000000 +0200
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include
 #include	/* HZ */
 /*#include */
@@ -99,12 +100,12 @@ static int gameport_measure_speed(struct
 
 	tx = 1 << 30;
 	for(i = 0; i < 50; i++) {
-		local_irq_save(flags);
+		local_irq_save_nort(flags);
 		GET_TIME(t1);
 		for (t = 0; t < 50; t++) gameport_read(gameport);
 		GET_TIME(t2);
 		GET_TIME(t3);
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 		udelay(i * 10);
 		if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
 	}
@@ -123,11 +124,11 @@ static int gameport_measure_speed(struct
 
 	tx = 1 << 30;
 	for(i = 0; i < 50; i++) {
-		local_irq_save(flags);
+		local_irq_save_nort(flags);
 		rdtscl(t1);
 		for (t = 0; t < 50; t++) gameport_read(gameport);
 		rdtscl(t2);
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 		udelay(i * 10);
 		if (t2 - t1 < tx) tx = t2 - t1;
 	}
Index: linux-2.6.16/drivers/input/serio/sa1111ps2.c
===================================================================
--- linux-2.6.16.orig/drivers/input/serio/sa1111ps2.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/input/serio/sa1111ps2.c	2006-10-19 16:52:30.000000000 +0200
@@ -13,6 +13,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
Index: linux-2.6.16/drivers/input/touchscreen/corgi_ts.c
===================================================================
--- linux-2.6.16.orig/drivers/input/touchscreen/corgi_ts.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/input/touchscreen/corgi_ts.c	2006-10-19 16:52:30.000000000 +0200
@@ -15,9 +15,9 @@
 #include
 #include
 #include
+#include
 #include
 #include
-#include
 #include
 #include
Index: linux-2.6.16/drivers/media/dvb/dvb-core/dvb_frontend.c
===================================================================
--- linux-2.6.16.orig/drivers/media/dvb/dvb-core/dvb_frontend.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/media/dvb/dvb-core/dvb_frontend.c	2006-10-19 16:52:30.000000000 +0200
@@ -96,7 +96,7 @@ struct dvb_frontend_private {
 	struct dvb_device *dvbdev;
 	struct dvb_frontend_parameters parameters;
 	struct dvb_fe_events events;
-	struct semaphore sem;
+	struct compat_semaphore sem;
 	struct list_head list_head;
 	wait_queue_head_t wait_queue;
 	pid_t thread_pid;
Index: linux-2.6.16/drivers/media/dvb/dvb-core/dvb_frontend.h
===================================================================
--- linux-2.6.16.orig/drivers/media/dvb/dvb-core/dvb_frontend.h	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/media/dvb/dvb-core/dvb_frontend.h	2006-10-19 16:52:30.000000000 +0200
@@ -96,7 +96,7 @@ struct dvb_fe_events {
 	int			  eventr;
 	int			  overflow;
 	wait_queue_head_t	  wait_queue;
-	struct semaphore	  sem;
+	struct compat_semaphore	  sem;
 };
 
 struct dvb_frontend {
Index: linux-2.6.16/drivers/net/3c527.c
===================================================================
--- linux-2.6.16.orig/drivers/net/3c527.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/net/3c527.c	2006-10-19 16:52:30.000000000 +0200
@@ -182,7 +182,7 @@ struct mc32_local
 
 	u16 rx_ring_tail;       /* index to rx de-queue end */
 
-	struct semaphore cmd_mutex;     /* Serialises issuing of execute commands */
+	struct compat_semaphore cmd_mutex;     /* Serialises issuing of execute commands */
 	struct completion execution_cmd; /* Card has completed an execute command */
 	struct completion xceiver_cmd;   /* Card has completed a tx or rx command */
 };
Index: linux-2.6.16/drivers/net/3c59x.c
===================================================================
--- linux-2.6.16.orig/drivers/net/3c59x.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/net/3c59x.c	2006-10-19 16:52:30.000000000 +0200
@@ -967,9 +967,9 @@ static void poll_vortex(struct net_devic
 	struct vortex_private *vp = netdev_priv(dev);
 	unsigned long flags;
 	local_save_flags(flags);
-	local_irq_disable();
+	local_irq_disable_nort();
 	(vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev,NULL);
-	local_irq_restore(flags);
+	local_irq_restore_nort(flags);
 }
 #endif
@@ -1888,6 +1888,7 @@ vortex_timer(unsigned long data)
 	int next_tick = 60*HZ;
 	int ok = 0;
 	int media_status, mii_status, old_window;
+	unsigned long flags;
 
 	if (vortex_debug > 2) {
 		printk(KERN_DEBUG "%s: Media selection timer tick happened, %s.\n",
@@ -1897,7 +1898,7 @@ vortex_timer(unsigned long data)
 	if (vp->medialock)
 		goto leave_media_alone;
-	disable_irq(dev->irq);
+	spin_lock_irqsave(&vp->lock, flags);
 	old_window = ioread16(ioaddr + EL3_CMD) >> 13;
 	EL3WINDOW(4);
 	media_status = ioread16(ioaddr + Wn4_Media);
@@ -1919,7 +1920,6 @@ vortex_timer(unsigned long data)
 		break;
 	case XCVR_MII: case XCVR_NWAY:
 		{
-			spin_lock_bh(&vp->lock);
 			mii_status = mdio_read(dev, vp->phys[0], MII_BMSR);
 			if (!(mii_status & BMSR_LSTATUS)) {
 				/* Re-read to get actual link status */
@@ -1957,7 +1957,6 @@ vortex_timer(unsigned long data)
 			} else {
 				netif_carrier_off(dev);
 			}
-			spin_unlock_bh(&vp->lock);
 		}
 		break;
 	default:	/* Other media types handled by Tx timeouts. */
@@ -2000,7 +1999,7 @@ vortex_timer(unsigned long data)
 		/* AKPM: FIXME: Should reset Rx & Tx here. P60 of 3c90xc.pdf */
 	}
 	EL3WINDOW(old_window);
-	enable_irq(dev->irq);
+	spin_unlock_irqrestore(&vp->lock, flags);
 
 leave_media_alone:
 	if (vortex_debug > 2)
@@ -2039,13 +2038,17 @@ static void vortex_tx_timeout(struct net
 		/*
 		 * Block interrupts because vortex_interrupt does a bare spin_lock()
 		 */
+#ifndef CONFIG_PREEMPT_RT
 		unsigned long flags;
 		local_irq_save(flags);
+#endif
 		if (vp->full_bus_master_tx)
 			boomerang_interrupt(dev->irq, dev, NULL);
 		else
 			vortex_interrupt(dev->irq, dev, NULL);
+#ifndef CONFIG_PREEMPT_RT
 		local_irq_restore(flags);
+#endif
 	}
 }
Index: linux-2.6.16/drivers/net/e1000/e1000_main.c
===================================================================
--- linux-2.6.16.orig/drivers/net/e1000/e1000_main.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/net/e1000/e1000_main.c	2006-10-19 16:52:30.000000000 +0200
@@ -2961,10 +2961,8 @@ e1000_xmit_frame(struct sk_buff *skb, st
 	if (adapter->hw.tx_pkt_filtering && (adapter->hw.mac_type == e1000_82573) )
 		e1000_transfer_dhcp_info(adapter, skb);
 
-	local_irq_save(flags);
-	if (!spin_trylock(&tx_ring->tx_lock)) {
+	if (!spin_trylock_irqsave(&tx_ring->tx_lock, flags)) {
 		/* Collision - tell upper layer to requeue */
-		local_irq_restore(flags);
 		return NETDEV_TX_LOCKED;
 	}
Index: linux-2.6.16/drivers/net/hamradio/6pack.c
===================================================================
--- linux-2.6.16.orig/drivers/net/hamradio/6pack.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/net/hamradio/6pack.c	2006-10-19 16:52:30.000000000 +0200
@@ -124,7 +124,7 @@ struct sixpack {
 	struct timer_list	tx_t;
 	struct timer_list	resync_t;
 	atomic_t		refcnt;
-	struct semaphore	dead_sem;
+	struct compat_semaphore	dead_sem;
 	spinlock_t		lock;
 };
Index: linux-2.6.16/drivers/net/hamradio/mkiss.c
===================================================================
--- linux-2.6.16.orig/drivers/net/hamradio/mkiss.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/net/hamradio/mkiss.c	2006-10-19 16:52:30.000000000 +0200
@@ -85,7 +85,7 @@ struct mkiss {
 #define CRC_MODE_SMACK_TEST	4
 
 	atomic_t		refcnt;
-	struct semaphore	dead_sem;
+	struct compat_semaphore	dead_sem;
 };
 
 /*---------------------------------------------------------------------------*/
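The e1000 hunk above (and the skge hunk further down) replaces the open-coded local_irq_save() + spin_trylock() pair with spin_trylock_irqsave(). Beyond being shorter, the combined primitive is what RT needs: when spinlock_t is a sleeping lock, interrupts must not be hard-disabled around the trylock, and the macro lets the lock implementation decide what the flags save actually means. Mainline defines it along these lines:

        #define spin_trylock_irqsave(lock, flags) \
        ({ \
                local_irq_save(flags); \
                spin_trylock(lock) ? \
                1 : ({ local_irq_restore(flags); 0; }); \
        })

Note that the failure path restores flags itself, which is why the explicit local_irq_restore() in the collision branch can be dropped by the conversion.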
Index: linux-2.6.16/drivers/net/netconsole.c
===================================================================
--- linux-2.6.16.orig/drivers/net/netconsole.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/net/netconsole.c	2006-10-19 16:52:30.000000000 +0200
@@ -74,16 +74,22 @@ static void write_msg(struct console *co
 	if (!np.dev)
 		return;
 
-	local_irq_save(flags);
+	/*
+	 * A bit hairy. Netconsole uses mutexes (indirectly) and
+	 * thus must have interrupts enabled:
+	 */
+	local_irq_save_nort(flags);
 
 	for(left = len; left; ) {
 		frag = min(left, MAX_PRINT_CHUNK);
+		WARN_ON_RT(irqs_disabled());
 		netpoll_send_udp(&np, msg, frag);
+		WARN_ON_RT(irqs_disabled());
 		msg += frag;
 		left -= frag;
 	}
 
-	local_irq_restore(flags);
+	local_irq_restore_nort(flags);
 }
 
 static struct console netconsole = {
Index: linux-2.6.16/drivers/net/ns83820.c
===================================================================
--- linux-2.6.16.orig/drivers/net/ns83820.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/net/ns83820.c	2006-10-19 16:52:30.000000000 +0200
@@ -1013,8 +1013,6 @@ static void do_tx_done(struct net_device
 	struct ns83820 *dev = PRIV(ndev);
 	u32 cmdsts, tx_done_idx, *desc;
 
-	spin_lock_irq(&dev->tx_lock);
-
 	dprintk("do_tx_done(%p)\n", ndev);
 	tx_done_idx = dev->tx_done_idx;
 	desc = dev->tx_descs + (tx_done_idx * DESC_SIZE);
@@ -1070,7 +1068,6 @@ static void do_tx_done(struct net_device
 		netif_start_queue(ndev);
 		netif_wake_queue(ndev);
 	}
-	spin_unlock_irq(&dev->tx_lock);
 }
 
 static void ns83820_cleanup_tx(struct ns83820 *dev)
@@ -1371,7 +1368,9 @@ static void ns83820_do_isr(struct net_de
 	 * work has accumulated
 	 */
 	if ((ISR_TXDESC | ISR_TXIDLE | ISR_TXOK | ISR_TXERR) & isr) {
+		spin_lock_irq(&dev->tx_lock);
 		do_tx_done(ndev);
+		spin_unlock_irq(&dev->tx_lock);
 
 		/* Disable TxOk if there are no outstanding tx packets.
 		 */
@@ -1456,7 +1455,7 @@ static void ns83820_tx_timeout(struct ne
 	u32 tx_done_idx, *desc;
 	unsigned long flags;
 
-	local_irq_save(flags);
+	spin_lock_irqsave(&dev->tx_lock, flags);
 
 	tx_done_idx = dev->tx_done_idx;
 	desc = dev->tx_descs + (tx_done_idx * DESC_SIZE);
@@ -1483,7 +1482,7 @@ static void ns83820_tx_timeout(struct ne
 		ndev->name,
 		tx_done_idx, dev->tx_free_idx,
 		le32_to_cpu(desc[DESC_CMDSTS]));
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&dev->tx_lock, flags);
 }
 
 static void ns83820_tx_watch(unsigned long data)
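WARN_ON_RT(), used in the netconsole hunk above, is an RT-only assertion: it complains on a PREEMPT_RT build and compiles away otherwise. Presumably it is defined roughly as follows (a sketch; the real macro lives in the -rt headers):

        #ifdef CONFIG_PREEMPT_RT
        # define WARN_ON_RT(condition)  WARN_ON(condition)
        #else
        # define WARN_ON_RT(condition)  do { } while (0)
        #endif

Here it documents the invariant the comment states: on RT, netpoll_send_udp() must be entered, and left, with hardware interrupts enabled.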
Index: linux-2.6.16/drivers/net/plip.c
===================================================================
--- linux-2.6.16.orig/drivers/net/plip.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/net/plip.c	2006-10-19 16:52:30.000000000 +0200
@@ -227,7 +227,10 @@ struct net_local {
 						    struct hh_cache *hh);
 	spinlock_t lock;
 	atomic_t kill_timer;
-	struct semaphore killed_timer_sem;
+	/*
+	 * PREEMPT_RT: this isn't a mutex, it should be a struct completion.
+	 */
+	struct compat_semaphore killed_timer_sem;
 };
 
 static inline void enable_parport_interrupts (struct net_device *dev)
Index: linux-2.6.16/drivers/net/ppp_async.c
===================================================================
--- linux-2.6.16.orig/drivers/net/ppp_async.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/net/ppp_async.c	2006-10-19 16:52:30.000000000 +0200
@@ -66,7 +66,7 @@ struct asyncppp {
 	struct tasklet_struct tsk;
 
 	atomic_t	refcnt;
-	struct semaphore dead_sem;
+	struct compat_semaphore dead_sem;
 	struct ppp_channel chan;	/* interface to generic ppp layer */
 	unsigned char	obuf[OBUFSIZE];
 };
Index: linux-2.6.16/drivers/net/ppp_synctty.c
===================================================================
--- linux-2.6.16.orig/drivers/net/ppp_synctty.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/net/ppp_synctty.c	2006-10-19 16:52:30.000000000 +0200
@@ -70,7 +70,7 @@ struct syncppp {
 	struct tasklet_struct tsk;
 
 	atomic_t	refcnt;
-	struct semaphore dead_sem;
+	struct compat_semaphore dead_sem;
 	struct ppp_channel chan;	/* interface to generic ppp layer */
 };
Index: linux-2.6.16/drivers/net/skge.c
===================================================================
--- linux-2.6.16.orig/drivers/net/skge.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/net/skge.c	2006-10-19 16:52:30.000000000 +0200
@@ -2313,12 +2313,9 @@ static int skge_xmit_frame(struct sk_buf
 	if (!skb)
 		return NETDEV_TX_OK;
 
-	local_irq_save(flags);
-	if (!spin_trylock(&skge->tx_lock)) {
+	if (!spin_trylock_irqsave(&skge->tx_lock, flags))
 		/* Collision - tell upper layer to requeue */
-		local_irq_restore(flags);
 		return NETDEV_TX_LOCKED;
-	}
 
 	if (unlikely(skge->tx_avail < skb_shinfo(skb)->nr_frags +1)) {
 		if (!netif_queue_stopped(dev)) {
Index: linux-2.6.16/drivers/net/smc91x.c
===================================================================
--- linux-2.6.16.orig/drivers/net/smc91x.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/net/smc91x.c	2006-10-19 16:52:30.000000000 +0200
@@ -74,6 +74,7 @@ static const char version[] =
 #include
 #include
 #include
+#include
 #include
 #include
 #include
Index: linux-2.6.16/drivers/net/tulip/tulip_core.c
===================================================================
--- linux-2.6.16.orig/drivers/net/tulip/tulip_core.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/net/tulip/tulip_core.c	2006-10-19 16:52:30.000000000 +0200
@@ -1809,6 +1809,7 @@ static void __devexit tulip_remove_one (
 	pci_iounmap(pdev, tp->base_addr);
 	free_netdev (dev);
 	pci_release_regions (pdev);
+	pci_disable_device (pdev);
 	pci_set_drvdata (pdev, NULL);
 
 	/* pci_power_off (pdev, -1); */
Index: linux-2.6.16/drivers/oprofile/oprofilefs.c
===================================================================
--- linux-2.6.16.orig/drivers/oprofile/oprofilefs.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/oprofile/oprofilefs.c	2006-10-19 16:52:30.000000000 +0200
@@ -21,7 +21,7 @@
 
 #define OPROFILEFS_MAGIC 0x6f70726f
 
-DEFINE_SPINLOCK(oprofilefs_lock);
+DEFINE_RAW_SPINLOCK(oprofilefs_lock);
 
 static struct inode * oprofilefs_get_inode(struct super_block * sb, int mode)
 {
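oprofilefs_lock is promoted to a raw spinlock because oprofile can take it from NMI-like profiling context. On PREEMPT_RT an ordinary spinlock_t becomes a sleeping, priority-inheriting lock, which must never be taken from such contexts; a raw spinlock keeps the mainline semantics of genuinely spinning with interrupts hard-disabled. The call sites stay untouched, since the lock API dispatches on the lock type. A sketch of the distinction (assumed semantics, not the -rt implementation):

        static DEFINE_SPINLOCK(sleeps_on_rt);           /* rtmutex under PREEMPT_RT     */
        static DEFINE_RAW_SPINLOCK(oprofilefs_lock);    /* always a real spinning lock  */

        static void example(void)
        {
                unsigned long flags;

                /*
                 * Identical API; with a raw lock this hard-disables
                 * interrupts even on RT, so it stays safe in NMI and
                 * profiling paths. The critical section must therefore
                 * remain short and must not sleep.
                 */
                spin_lock_irqsave(&oprofilefs_lock, flags);
                /* ... critical section ... */
                spin_unlock_irqrestore(&oprofilefs_lock, flags);
        }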
Index: linux-2.6.16/drivers/pci/hotplug/cpci_hotplug_core.c
===================================================================
--- linux-2.6.16.orig/drivers/pci/hotplug/cpci_hotplug_core.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/pci/hotplug/cpci_hotplug_core.c	2006-10-19 16:52:30.000000000 +0200
@@ -60,8 +60,8 @@ static int slots;
 static atomic_t extracting;
 int cpci_debug;
 static struct cpci_hp_controller *controller;
-static struct semaphore event_semaphore;	/* mutex for process loop (up if something to process) */
-static struct semaphore thread_exit;		/* guard ensure thread has exited before calling it quits */
+static struct compat_semaphore event_semaphore;	/* mutex for process loop (up if something to process) */
+static struct compat_semaphore thread_exit;	/* guard ensure thread has exited before calling it quits */
 static int thread_finished = 1;
 
 static int enable_slot(struct hotplug_slot *slot);
Index: linux-2.6.16/drivers/pci/hotplug/cpqphp_ctrl.c
===================================================================
--- linux-2.6.16.orig/drivers/pci/hotplug/cpqphp_ctrl.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/pci/hotplug/cpqphp_ctrl.c	2006-10-19 16:52:30.000000000 +0200
@@ -45,8 +45,8 @@ static int configure_new_function(struct
 			u8 behind_bridge, struct resource_lists *resources);
 static void interrupt_event_handler(struct controller *ctrl);
 
-static struct semaphore event_semaphore;	/* mutex for process loop (up if something to process) */
-static struct semaphore event_exit;		/* guard ensure thread has exited before calling it quits */
+static struct compat_semaphore event_semaphore;	/* mutex for process loop (up if something to process) */
+static struct compat_semaphore event_exit;	/* guard ensure thread has exited before calling it quits */
 static int event_finished;
 static unsigned long pushbutton_pending;	/* = 0 */
Index: linux-2.6.16/drivers/pci/hotplug/ibmphp_hpc.c
===================================================================
--- linux-2.6.16.orig/drivers/pci/hotplug/ibmphp_hpc.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/pci/hotplug/ibmphp_hpc.c	2006-10-19 16:52:30.000000000 +0200
@@ -104,7 +104,7 @@ static int tid_poll;
 static struct semaphore sem_hpcaccess;	// lock access to HPC
 static struct semaphore semOperations;	// lock all operations and
 					// access to data structures
-static struct semaphore sem_exit;	// make sure polling thread goes away
+static struct compat_semaphore sem_exit;	// make sure polling thread goes away
 //----------------------------------------------------------------------------
 // local function prototypes
 //----------------------------------------------------------------------------
Index: linux-2.6.16/drivers/pci/hotplug/pciehp_ctrl.c
===================================================================
--- linux-2.6.16.orig/drivers/pci/hotplug/pciehp_ctrl.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/pci/hotplug/pciehp_ctrl.c	2006-10-19 16:52:30.000000000 +0200
@@ -37,8 +37,8 @@
 
 static void interrupt_event_handler(struct controller *ctrl);
 
-static struct semaphore event_semaphore;	/* mutex for process loop (up if something to process) */
-static struct semaphore event_exit;		/* guard ensure thread has exited before calling it quits */
+static struct compat_semaphore event_semaphore;	/* mutex for process loop (up if something to process) */
+static struct compat_semaphore event_exit;	/* guard ensure thread has exited before calling it quits */
 static int event_finished;
 static unsigned long pushbutton_pending;	/* = 0 */
 static unsigned long surprise_rm_pending;	/* = 0 */
Index: linux-2.6.16/drivers/pci/hotplug/shpchp_ctrl.c
===================================================================
--- linux-2.6.16.orig/drivers/pci/hotplug/shpchp_ctrl.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/pci/hotplug/shpchp_ctrl.c	2006-10-19 16:52:30.000000000 +0200
@@ -37,8 +37,8 @@
 
 static void interrupt_event_handler(struct controller *ctrl);
 
-static struct semaphore event_semaphore;	/* mutex for process loop (up if something to process) */
-static struct semaphore event_exit;		/* guard ensure thread has exited before calling it quits */
+static struct compat_semaphore event_semaphore;	/* mutex for process loop (up if something to process) */
+static struct compat_semaphore event_exit;	/* guard ensure thread has exited before calling it quits */
 static int event_finished;
 static unsigned long pushbutton_pending;	/* = 0 */
Index: linux-2.6.16/drivers/pcmcia/soc_common.c
===================================================================
--- linux-2.6.16.orig/drivers/pcmcia/soc_common.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/pcmcia/soc_common.c	2006-10-19 16:52:30.000000000 +0200
@@ -39,12 +39,12 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
-#include
 #include
 
 #include "soc_common.h"
Index: linux-2.6.16/drivers/scsi/aacraid/aacraid.h
===================================================================
--- linux-2.6.16.orig/drivers/scsi/aacraid/aacraid.h	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/scsi/aacraid/aacraid.h	2006-10-19 16:52:30.000000000 +0200
@@ -742,7 +742,7 @@ struct aac_fib_context {
 	u32			unique;		// unique value representing this context
 	ulong			jiffies;	// used for cleanup - dmb changed to ulong
 	struct list_head	next;		// used to link context's into a linked list
-	struct semaphore	wait_sem;	// this is used to wait for the next fib to arrive.
+	struct compat_semaphore	wait_sem;	// this is used to wait for the next fib to arrive.
 	int			wait;		// Set to true when thread is in WaitForSingleObject
 	unsigned long		count;		// total number of FIBs on FibList
 	struct list_head	fib_list;	// this holds fibs and their attachd hw_fibs
@@ -811,7 +811,7 @@ struct fib {
 	 * This is the event the sendfib routine will wait on if the
 	 * caller did not pass one and this is synch io.
 	 */
-	struct semaphore	event_wait;
+	struct compat_semaphore	event_wait;
 	spinlock_t		event_lock;
 
 	u32			done;	/* gets set to 1 when fib is complete */
Index: linux-2.6.16/drivers/scsi/aic7xxx/aic79xx_osm.h
===================================================================
--- linux-2.6.16.orig/drivers/scsi/aic7xxx/aic79xx_osm.h	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/scsi/aic7xxx/aic79xx_osm.h	2006-10-19 16:52:30.000000000 +0200
@@ -382,7 +382,7 @@ struct ahd_platform_data {
 
 	spinlock_t		 spin_lock;
 	u_int			 qfrozen;
-	struct semaphore	 eh_sem;
+	struct compat_semaphore	 eh_sem;
 	struct Scsi_Host	*host;		/* pointer to scsi host */
 #define AHD_LINUX_NOIRQ	((uint32_t)~0)
 	uint32_t		 irq;		/* IRQ for this adapter */
Index: linux-2.6.16/drivers/scsi/aic7xxx/aic7xxx_osm.h
===================================================================
--- linux-2.6.16.orig/drivers/scsi/aic7xxx/aic7xxx_osm.h	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/scsi/aic7xxx/aic7xxx_osm.h	2006-10-19 16:52:30.000000000 +0200
@@ -369,7 +369,7 @@ struct ahc_platform_data {
 
 	spinlock_t		 spin_lock;
 	u_int			 qfrozen;
-	struct semaphore	 eh_sem;
+	struct compat_semaphore	 eh_sem;
 	struct Scsi_Host	*host;		/* pointer to scsi host */
 #define AHC_LINUX_NOIRQ	((uint32_t)~0)
 	uint32_t		 irq;		/* IRQ for this adapter */
Index: linux-2.6.16/drivers/scsi/libata-core.c
===================================================================
--- linux-2.6.16.orig/drivers/scsi/libata-core.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/scsi/libata-core.c	2006-10-19 16:52:30.000000000 +0200
@@ -5110,7 +5110,7 @@ module_init(ata_init);
 module_exit(ata_exit);
 
 static unsigned long ratelimit_time;
-static spinlock_t ata_ratelimit_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(ata_ratelimit_lock);
 
 int ata_ratelimit(void)
 {
Index: linux-2.6.16/drivers/scsi/qla2xxx/qla_def.h
===================================================================
--- linux-2.6.16.orig/drivers/scsi/qla2xxx/qla_def.h	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/scsi/qla2xxx/qla_def.h	2006-10-19 16:52:30.000000000 +0200
@@ -2459,7 +2459,7 @@ typedef struct scsi_qla_host {
 	spinlock_t		mbx_reg_lock;	/* Mbx Cmd Register Lock */
 
 	struct semaphore	mbx_cmd_sem;	/* Serialialize mbx access */
-	struct semaphore	mbx_intr_sem;	/* Used for completion notification */
+	struct compat_semaphore	mbx_intr_sem;	/* Used for completion notification */
 
 	uint32_t	mbx_flags;
 #define	MBX_IN_PROGRESS	BIT_0
Index: linux-2.6.16/drivers/scsi/qla2xxx/qla_os.c
===================================================================
--- linux-2.6.16.orig/drivers/scsi/qla2xxx/qla_os.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/scsi/qla2xxx/qla_os.c	2006-10-19 16:52:30.000000000 +0200
@@ -2156,12 +2156,13 @@ qla2x00_free_sp_pool( scsi_qla_host_t *h
 static int
 qla2x00_do_dpc(void *data)
 {
-	DECLARE_MUTEX_LOCKED(sem);
+	DECLARE_MUTEX(sem);
 	scsi_qla_host_t *ha;
 	fc_port_t	*fcport;
 	uint8_t		status;
 	uint16_t	next_loopid;
 
+	down(&sem);
 	ha = (scsi_qla_host_t *)data;
 
 	lock_kernel();
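The qla2xxx conversion is subtler than the type swaps above. DECLARE_MUTEX_LOCKED() declares a semaphore that is born locked and is later up()ed by a task that never acquired it, so the lock has no well-defined owner, which a priority-inheriting implementation cannot express. Declaring it unlocked and letting the DPC thread take it itself before entering its wait loop is behaviourally equivalent while giving the lock an owner:

        DECLARE_MUTEX(sem);     /* count = 1: starts out unlocked           */

        down(&sem);             /* count = 0: now held by the DPC thread
                                 * itself, so the semaphore has a
                                 * well-defined owner from here on          */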
Index: linux-2.6.16/drivers/serial/8250.c
===================================================================
--- linux-2.6.16.orig/drivers/serial/8250.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/serial/8250.c	2006-10-19 16:52:30.000000000 +0200
@@ -1336,6 +1336,14 @@ static irqreturn_t serial8250_interrupt(
 				"irq%d\n", irq);
 			break;
 		}
+		/*
+		 * If we have a buggy TX line that doesn't notify us
+		 * via iir that we need to transmit, then force the
+		 * call:
+		 */
+		if (!handled && (up->bugs & UART_BUG_TXEN))
+			serial8250_handle_port(up, regs);
+
 	} while (l != end);
 
 	spin_unlock(&i->lock);
Index: linux-2.6.16/drivers/usb/core/devio.c
===================================================================
--- linux-2.6.16.orig/drivers/usb/core/devio.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/usb/core/devio.c	2006-10-19 16:52:30.000000000 +0200
@@ -307,10 +307,11 @@ static void async_completed(struct urb *
 	struct async *as = (struct async *)urb->context;
 	struct dev_state *ps = as->ps;
 	struct siginfo sinfo;
+	unsigned long flags;
 
-	spin_lock(&ps->lock);
-	list_move_tail(&as->asynclist, &ps->async_completed);
-	spin_unlock(&ps->lock);
+	spin_lock_irqsave(&ps->lock, flags);
+	list_move_tail(&as->asynclist, &ps->async_completed);
+	spin_unlock_irqrestore(&ps->lock, flags);
 	if (as->signr) {
 		sinfo.si_signo = as->signr;
 		sinfo.si_errno = as->urb->status;
Index: linux-2.6.16/drivers/usb/core/hcd.c
===================================================================
--- linux-2.6.16.orig/drivers/usb/core/hcd.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/usb/core/hcd.c	2006-10-19 16:52:30.000000000 +0200
@@ -497,13 +497,11 @@ error:
 	}
 
 	/* any errors get returned through the urb completion */
-	local_irq_save (flags);
-	spin_lock (&urb->lock);
+	spin_lock_irqsave(&urb->lock, flags);
 	if (urb->status == -EINPROGRESS)
 		urb->status = status;
-	spin_unlock (&urb->lock);
+	spin_unlock_irqrestore(&urb->lock, flags);
 	usb_hcd_giveback_urb (hcd, urb, NULL);
-	local_irq_restore (flags);
 	return 0;
 }
 
@@ -531,8 +529,7 @@ void usb_hcd_poll_rh_status(struct usb_h
 	if (length > 0) {
 
 		/* try to complete the status urb */
-		local_irq_save (flags);
-		spin_lock(&hcd_root_hub_lock);
+		spin_lock_irqsave(&hcd_root_hub_lock, flags);
 		urb = hcd->status_urb;
 		if (urb) {
 			spin_lock(&urb->lock);
@@ -548,14 +545,13 @@ void usb_hcd_poll_rh_status(struct usb_h
 			spin_unlock(&urb->lock);
 		} else
 			length = 0;
-		spin_unlock(&hcd_root_hub_lock);
+		spin_unlock_irqrestore(&hcd_root_hub_lock, flags);
 
 		/* local irqs are always blocked in completions */
 		if (length > 0)
 			usb_hcd_giveback_urb (hcd, urb, NULL);
 		else
 			hcd->poll_pending = 1;
-		local_irq_restore (flags);
 	}
 	/* The USB 2.0 spec says 256 ms.  This is close enough and won't
@@ -638,17 +634,15 @@ static int usb_rh_urb_dequeue (struct us
 	} else {				/* Status URB */
 		if (!hcd->uses_new_polling)
 			del_timer_sync (&hcd->rh_timer);
-		local_irq_disable ();
-		spin_lock (&hcd_root_hub_lock);
+		spin_lock_irq(&hcd_root_hub_lock);
 		if (urb == hcd->status_urb) {
 			hcd->status_urb = NULL;
 			urb->hcpriv = NULL;
 		} else
 			urb = NULL;		/* wasn't fully queued */
-		spin_unlock (&hcd_root_hub_lock);
+		spin_unlock_irq(&hcd_root_hub_lock);
 		if (urb)
 			usb_hcd_giveback_urb (hcd, urb, NULL);
-		local_irq_enable ();
 	}
 
 	return 0;
@@ -1359,15 +1353,13 @@ hcd_endpoint_disable (struct usb_device
 	WARN_ON (!HC_IS_RUNNING (hcd->state) && hcd->state != HC_STATE_HALT &&
 			udev->state != USB_STATE_NOTATTACHED);
 
-	local_irq_disable ();
-
 	/* FIXME move most of this into message.c as part of its
 	 * endpoint disable logic
 	 */
 
 	/* ep is already gone from udev->ep_{in,out}[]; no more submits */
rescan:
-	spin_lock (&hcd_data_lock);
+	spin_lock_irq(&hcd_data_lock);
 	list_for_each_entry (urb, &ep->urb_list, urb_list) {
 		int	tmp;
 
@@ -1380,13 +1372,13 @@ rescan:
 		if (urb->status != -EINPROGRESS)
 			continue;
 		usb_get_urb (urb);
-		spin_unlock (&hcd_data_lock);
+		spin_unlock_irq(&hcd_data_lock);
 
-		spin_lock (&urb->lock);
+		spin_lock_irq(&urb->lock);
 		tmp = urb->status;
 		if (tmp == -EINPROGRESS)
 			urb->status = -ESHUTDOWN;
-		spin_unlock (&urb->lock);
+		spin_unlock_irq(&urb->lock);
 
 		/* kick hcd unless it's already returning this */
 		if (tmp == -EINPROGRESS) {
@@ -1409,8 +1401,7 @@ rescan:
 		/* list contents may have changed */
 		goto rescan;
 	}
-	spin_unlock (&hcd_data_lock);
-	local_irq_enable ();
+	spin_unlock_irq(&hcd_data_lock);
 
 	/* synchronize with the hardware, so old configuration state
 	 * clears out immediately (and will be freed).
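The usbcore hunks above and below all replace the open-coded local_irq_save()/spin_lock() idiom with the combined spin_lock_irqsave() (or _irq) primitives. The two forms are equivalent on mainline, but on RT the open-coded version is broken twice over: hard-disabling interrupts around a sleeping spinlock_t is forbidden, and URB completion handlers no longer run in hardirq context (they run in the IRQ thread), so a bare spin_lock() may no longer assume the caller has masked interrupts. The pattern, schematically:

        /* Before: correct only if completions run in hardirq context. */
        local_irq_save(flags);
        spin_lock(&urb->lock);
        /* ... */
        spin_unlock(&urb->lock);
        local_irq_restore(flags);

        /*
         * After: one primitive; the lock implementation decides whether
         * interrupts are really disabled (mainline) or not (PREEMPT_RT).
         */
        spin_lock_irqsave(&urb->lock, flags);
        /* ... */
        spin_unlock_irqrestore(&urb->lock, flags);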
Index: linux-2.6.16/drivers/usb/core/message.c
===================================================================
--- linux-2.6.16.orig/drivers/usb/core/message.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/usb/core/message.c	2006-10-19 16:52:30.000000000 +0200
@@ -234,8 +234,9 @@ static void sg_clean (struct usb_sg_requ
 static void sg_complete (struct urb *urb, struct pt_regs *regs)
 {
 	struct usb_sg_request	*io = (struct usb_sg_request *) urb->context;
+	unsigned long flags;
 
-	spin_lock (&io->lock);
+	spin_lock_irqsave (&io->lock, flags);
 
 	/* In 2.5 we require hcds' endpoint queues not to progress after fault
 	 * reports, until the completion callback (this!) returns.  That lets
@@ -269,7 +270,7 @@ static void sg_complete (struct urb *urb
 		 * unlink pending urbs so they won't rx/tx bad data.
 		 * careful: unlink can sometimes be synchronous...
 		 */
-		spin_unlock (&io->lock);
+		spin_unlock_irqrestore (&io->lock, flags);
 		for (i = 0, found = 0; i < io->entries; i++) {
 			if (!io->urbs [i] || !io->urbs [i]->dev)
 				continue;
@@ -284,7 +285,7 @@ static void sg_complete (struct urb *urb
 			} else if (urb == io->urbs [i])
 				found = 1;
 		}
-		spin_lock (&io->lock);
+		spin_lock_irqsave (&io->lock, flags);
 	}
 	urb->dev = NULL;
 
@@ -294,7 +295,7 @@ static void sg_complete (struct urb *urb
 	if (!io->count)
 		complete (&io->complete);
 
-	spin_unlock (&io->lock);
+	spin_unlock_irqrestore (&io->lock, flags);
 }
 
@@ -556,7 +557,7 @@ void usb_sg_cancel (struct usb_sg_reques
 				dev_warn (&io->dev->dev, "%s, unlink --> %d\n",
 					__FUNCTION__, retval);
 		}
-		spin_lock (&io->lock);
+		spin_lock_irqsave (&io->lock, flags);
 	}
 	spin_unlock_irqrestore (&io->lock, flags);
 }
Index: linux-2.6.16/drivers/usb/net/usbnet.c
===================================================================
--- linux-2.6.16.orig/drivers/usb/net/usbnet.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/usb/net/usbnet.c	2006-10-19 16:52:30.000000000 +0200
@@ -819,6 +819,8 @@ static void tx_complete (struct urb *urb
 
 	urb->dev = NULL;
 	entry->state = tx_done;
+	spin_lock_rt(&dev->txq.lock);
+	spin_unlock_rt(&dev->txq.lock);
 	defer_bh(dev, skb, &dev->txq);
 }
Index: linux-2.6.16/drivers/usb/storage/usb.c
===================================================================
--- linux-2.6.16.orig/drivers/usb/storage/usb.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/usb/storage/usb.c	2006-10-19 16:52:30.000000000 +0200
@@ -282,6 +282,7 @@ static int usb_stor_control_thread(void
 		if (test_bit(US_FLIDX_DISCONNECTING, &us->flags)) {
 			US_DEBUGP("-- exiting\n");
 			up(&(us->dev_semaphore));
+			up(&us->sema);
 			break;
 		}
Index: linux-2.6.16/drivers/usb/storage/usb.h
===================================================================
--- linux-2.6.16.orig/drivers/usb/storage/usb.h	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/usb/storage/usb.h	2006-10-19 16:52:30.000000000 +0200
@@ -146,7 +146,7 @@ struct us_data {
 	dma_addr_t		iobuf_dma;
 
 	/* mutual exclusion and synchronization structures */
-	struct semaphore	sema;		 /* to sleep thread on	  */
+	struct compat_semaphore	sema;		 /* to sleep thread on	  */
 	struct completion	notify;		 /* thread begin/end	  */
 	wait_queue_head_t	delay_wait;	 /* wait during scan, reset */
Index: linux-2.6.16/drivers/video/console/fbcon.c
===================================================================
--- linux-2.6.16.orig/drivers/video/console/fbcon.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/video/console/fbcon.c	2006-10-19 16:52:30.000000000 +0200
@@ -1183,7 +1183,6 @@ static void fbcon_clear(struct vc_data *
 {
 	struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]];
 	struct fbcon_ops *ops = info->fbcon_par;
-	struct display *p = &fb_display[vc->vc_num];
 	u_int y_break;
 
@@ -1212,10 +1211,11 @@ static void fbcon_putcs(struct vc_data *
 	struct display *p = &fb_display[vc->vc_num];
 	struct fbcon_ops *ops = info->fbcon_par;
 
-	if (!fbcon_is_inactive(vc, info))
+	if (!fbcon_is_inactive(vc, info)) {
 		ops->putcs(vc, info, s, count, real_y(p, ypos), xpos,
 			   get_color(vc, info, scr_readw(s), 1),
 			   get_color(vc, info, scr_readw(s), 0));
+	}
 }
 
 static void fbcon_putc(struct vc_data *vc, int c, int ypos, int xpos)
@@ -2985,6 +2985,7 @@ static const struct consw fb_con = {
 	.con_screen_pos 	= fbcon_screen_pos,
 	.con_getxy 		= fbcon_getxy,
 	.con_resize             = fbcon_resize,
+	.con_preemptible	= 1,
 };
 
 static struct notifier_block fbcon_event_notifier = {
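The usbnet hunk above looks like a no-op: it takes and immediately drops txq.lock around an empty critical section. The point is ordering on RT: tx_complete() used to be excluded from the queue lock implicitly by hardirq context, but running as an IRQ thread it can race with a holder of txq.lock, so the empty lock/unlock pair makes it wait until any such holder is done before defer_bh() re-queues the skb. The _rt suffix marks operations that presumably exist only on PREEMPT_RT builds, roughly along these lines (an assumed sketch, not the -rt definition):

        #ifdef CONFIG_PREEMPT_RT
        # define spin_lock_rt(lock)     spin_lock(lock)
        # define spin_unlock_rt(lock)   spin_unlock(lock)
        #else
        # define spin_lock_rt(lock)     do { } while (0)
        # define spin_unlock_rt(lock)   do { } while (0)
        #endif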
Index: linux-2.6.16/drivers/video/console/vgacon.c
===================================================================
--- linux-2.6.16.orig/drivers/video/console/vgacon.c	2006-09-20 10:56:46.000000000 +0200
+++ linux-2.6.16/drivers/video/console/vgacon.c	2006-10-19 16:52:30.000000000 +0200
@@ -53,7 +53,7 @@
 #include