diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/arch/i386/boot/video.S linux-2.6.8.1-ck7/arch/i386/boot/video.S --- linux-2.6.8.1-ck6/arch/i386/boot/video.S 2004-03-11 21:28:53.000000000 +1100 +++ linux-2.6.8.1-ck7/arch/i386/boot/video.S 2004-09-09 22:56:38.668119359 +1000 @@ -164,10 +164,12 @@ basret: ret # parameters in the default 80x25 mode -- these are set directly, # because some very obscure BIOSes supply insane values. mode_params: +#ifdef CONFIG_FB_VESA_STD #ifdef CONFIG_VIDEO_SELECT cmpb $0, graphic_mode jnz mopar_gr #endif +#endif movb $0x03, %ah # Read cursor position xorb %bh, %bh int $0x10 @@ -200,6 +202,7 @@ mopar2: movb %al, %fs:(PARAM_VIDEO_LINES ret #ifdef CONFIG_VIDEO_SELECT +#ifdef CONFIG_FB_VESA_STD # Fetching of VESA frame buffer parameters mopar_gr: leaw modelist+1024, %di @@ -243,6 +246,7 @@ mopar_gr: movw %es, %fs:(PARAM_VESAPM_SEG) movw %di, %fs:(PARAM_VESAPM_OFF) no_pm: ret +#endif # The video mode menu mode_menu: @@ -457,10 +461,10 @@ mode_set: cmpb $VIDEO_FIRST_V7>>8, %ah jz setv7 - + cmpb $VIDEO_FIRST_VESA>>8, %ah jnc check_vesa - + orb %ah, %ah jz setmenu @@ -547,6 +551,7 @@ check_vesa: cmpb $0x09, %al jz setvesa # This is a text mode +#ifdef CONFIG_FB_VESA_STD movb (%di), %al # Check capabilities. andb $0x99, %al cmpb $0x99, %al @@ -563,6 +568,7 @@ check_vesa: movb $0, do_restore # no screen restore stc ret +#endif _setbad: jmp setbad # Ugly... diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/Documentation/fb/vesafb.txt linux-2.6.8.1-ck7/Documentation/fb/vesafb.txt --- linux-2.6.8.1-ck6/Documentation/fb/vesafb.txt 2004-06-16 17:35:30.000000000 +1000 +++ linux-2.6.8.1-ck7/Documentation/fb/vesafb.txt 2004-09-09 22:56:38.674118422 +1000 @@ -2,16 +2,18 @@ What is vesafb? =============== -This is a generic driver for a graphic framebuffer on intel boxes. +Vesafb is a generic framebuffer driver for x86 and x86_64 boxes. -The idea is simple: Turn on graphics mode at boot time with the help -of the BIOS, and use this as framebuffer device /dev/fb0, like the m68k -(and other) ports do. - -This means we decide at boot time whenever we want to run in text or -graphics mode. Switching mode later on (in protected mode) is -impossible; BIOS calls work in real mode only. VESA BIOS Extensions -Version 2.0 are required, because we need a linear frame buffer. +VESA BIOS Extensions Version 2.0 are required, because we need a linear +frame buffer. VBE 3.0 is required if you want to use modes with a higher +(than the standard 60Hz) refresh rate. + +The VESA framebuffer driver comes in two flavors - the standard vesafb +and vesafb-tng. Vesafb-tng is available only on 32-bit x86 due to the +technology it uses (vm86). Vesafb-tng has more features than vesafb +(adjusting the refresh rate on VBE3.0-compliant boards, switching the +video mode without rebooting, selecting a mode by providing its +modedb name, and more) but might be unstable on some systems. Advantages: @@ -29,16 +31,27 @@ Disadvantages: How to use it? ============== -Switching modes is done using the vga=... boot parameter. Read -Documentation/svga.txt for details. - -You should compile in both vgacon (for text mode) and vesafb (for -graphics mode). Which of them takes over the console depends on -whenever the specified mode is text or graphics. - -The graphic modes are NOT in the list which you get if you boot with -vga=ask and hit return. The mode you wish to use is derived from the -VESA mode number. 
Here are those VESA mode numbers:
+If you are running your system on a hardware platform where vm86 is supported
+(this is 32-bit x86 only as of the time of writing this document) and you
+decide to use vesafb-tng, you can either compile the driver into the kernel
+or use it as a module. The graphic mode you want to use is in both cases
+specified using the standard modedb format.
+
+If your system doesn't support vm86 calls yet (all 64-bit platforms), things
+get a little more tricky. Since on such systems you can't do BIOS calls from
+protected mode in which the kernel runs, you have to decide at boot time
+whether you want to run in text or in graphics mode. Switching mode later on
+is impossible. Switching modes is done using the vga=... boot parameter. Read
+Documentation/svga.txt for details. Below is a more detailed description of
+what to do on systems using the standard vesafb driver.
+
+You should compile in both vgacon (for text mode) and vesafb (for graphics mode).
+Which of them takes over the console depends on whether the specified mode is
+text or graphics.
+
+The graphic modes are NOT in the list which you get if you boot with vga=ask
+and hit return. The mode you wish to use is derived from the VESA mode number.
+Here are those VESA mode numbers:
 
     | 640x480 800x600 1024x768 1280x1024
 ----+-------------------------------------
@@ -47,8 +60,7 @@ VESA mode number. Here are those VESA mo
 64k |  0x111   0x114    0x117    0x11A
 16M |  0x112   0x115    0x118    0x11B
 
-The video mode number of the Linux kernel is the VESA mode number plus
-0x200.
+The video mode number of the Linux kernel is the VESA mode number plus 0x200.
 
   Linux_kernel_mode_number = VESA_mode_number + 0x200
 
@@ -61,10 +73,10 @@ So the table for the Kernel mode numbers
 64k |  0x311   0x314    0x317    0x31A
 16M |  0x312   0x315    0x318    0x31B
 
-To enable one of those modes you have to specify "vga=ask" in the
-lilo.conf file and rerun LILO. Then you can type in the desired
-mode at the "vga=ask" prompt. For example if you like to use
-1024x768x256 colors you have to say "305" at this prompt.
+To enable one of those modes you have to specify "vga=ask" in the lilo.conf
+file and rerun LILO. Then you can type in the desired mode at the "vga=ask"
+prompt. For example if you like to use 1024x768x256 colors you have to say
+"305" at this prompt.
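A quick worked example of the formula above: 1024x768 with 256 colors is VESA
mode 0x105, so the Linux kernel mode number is 0x105 + 0x200 = 0x305. At the
"vga=ask" prompt you type the hex digits without the 0x prefix, i.e. "305"; to
set the mode permanently you can instead put vga=773 in lilo.conf, 773 being
0x305 in decimal.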
 If this does not work, this might be because your BIOS does not support
 linear framebuffers or because it does not support this mode at all.
 
@@ -77,6 +89,7 @@ Extensions v2.0 are required, 1.2 is NOT
 2. Note: Some newer versions of LILO appear to work with those hex
    values, if you set the 0x in front of the numbers.
+
 X11
 ===
 
@@ -86,77 +99,177 @@ It depends on X-Server and graphics boar
 The X-Server must restore the video mode correctly, else you end up
 with a broken console (and vesafb cannot do anything about this).
+With vesafb-tng chances are that the console will be restored properly
+even if the X server messed up the video mode.
 
 Refresh rates
 =============
 
-There is no way to change the vesafb video mode and/or timings after
-booting linux. If you are not happy with the 60 Hz refresh rate, you
-have these options:
+With VBE3.0 compatible BIOSes and vesafb-tng it is possible to change
+the refresh rate either at boot time (by specifying the @ part of
+the mode name) or later, using the fbset utility.
+
+With VBE2.0 there is no way to change the mode timings after booting
+Linux. If you are not happy with the 60 Hz refresh rate, you have
+these options:
 
-    * configure and load the DOS-Tools for your the graphics board (if
-      available) and boot linux with loadlin.
+    * configure and load the DOS tools for your graphics board (if
+      available) and boot Linux with loadlin.
     * use a native driver (matroxfb/atyfb) instead if vesafb. If none
      is available, write a new one!
-    * VBE 3.0 might work too. I have neither a gfx board with VBE 3.0
-      support nor the specs, so I have not checked this yet.
+    * use a BIOS editor to change the default refresh rate (such an
+      editor does exist at least for ATI Radeon BIOSes).
+    * if you're running a non-vm86 and VBE3.0-compatible system, you can
+      use a kernel patch to hard-code some mode timings in the kernel and
+      use these while setting the graphic mode at boot time.
 
 Configuration
 =============
 
-The VESA BIOS provides protected mode interface for changing
-some parameters. vesafb can use it for palette changes and
-to pan the display. It is turned off by default because it
-seems not to work with some BIOS versions, but there are options
-to turn it on.
-You can pass options to vesafb using "video=vesafb:option" on
-the kernel command line. Multiple options should be separated
-by comma, like this: "video=vesafb:ypan,invers"
-Accepted options:
-invers	no comment...
-ypan	enable display panning using the VESA protected mode
-	interface. The visible screen is just a window of the
+The VESA BIOS provides a protected mode interface for changing some parameters.
+vesafb can use it for palette changes and to pan the display. It is turned
+off by default because it seems not to work with some BIOS versions, but there
+are options to turn it on.
+
+You can pass options to vesafb using "video=vesafb:option" on the kernel
+command line. Multiple options should be separated by comma, like this:
+"video=vesafb:ypan,1024x768-32@85"
+
+Accepted options (both vesafb and vesafb-tng):
+
+ypan	Enable display panning using the VESA protected mode
+	interface or vm86 calls. The visible screen is just a window of the
 	video memory, console scrolling is done by changing the
 	start of the window.
 	pro:	* scrolling (fullscreen) is fast, because there
 		  is no need to copy around data.
-		* You'll get scrollback (the Shift-PgUp thing),
+		* you'll get scrollback (the Shift-PgUp thing),
 		  the video memory can be used as scrollback buffer
-	kontra:	* scrolling only parts of the screen causes some
+	con:	* scrolling only parts of the screen causes some
 		  ugly flicker effects (boot logo flickers for example).
 
-ywrap	Same as ypan, but assumes your gfx board can wrap-around
-	the video memory (i.e. starts reading from top if it
-	reaches the end of video memory). Faster than ypan.
-redraw	scroll by redrawing the affected part of the screen, this
-	is the safe (and slow) default.
+ywrap	Same as ypan, but assumes your gfx board can wrap-around the video
+	memory (i.e. starts reading from top if it reaches the end of video
+	memory). Faster than ypan.
+redraw	Scroll by redrawing the affected part of the screen, this is the
+	safe (and slow) default.
 
-vgapal	Use the standard vga registers for palette changes.
+vgapal	Use the standard VGA registers for palette changes.
 	This is the default.
+
 pmipal	Use the protected mode interface for palette changes.
 
-mtrr	setup memory type range registers for the vesafb framebuffer.
+mtrr	Setup memory type range registers for the vesafb framebuffer.
+
+nomtrr	Do not use memory type range registers for vesafb.
 
 vram:n	remap 'n' MiB of video RAM. If 0 or not specified, remap memory
 	according to video mode. (2.5.66 patch/idea by Antonino Daplas
 	reversed to give override possibility (allocate more fb memory
 	than the kernel would) to 2.4 by tmb@iki.fi)
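To illustrate how the options above combine on the kernel command line (an
editorial example; the 8 MiB vram value is arbitrary, and every option shown
is documented above):

	video=vesafb:ywrap,mtrr,vram:8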
-Have fun!
 
+Options accepted only by vesafb-tng:
+
+	The mode you want to set, in the standard modedb format. Refer to
+	modedb.txt for a detailed description. If you specify a mode that is
+	not supported by your board's BIOS, vesafb will attempt to set a
+	similar mode. The list of supported modes can be found in
+	/proc/fbx/modes, where x is the framebuffer number (usually 0).
+	When vesafb is compiled as a module, the mode string should be
+	provided as a value of the parameter 'mode'.
+
+vbemode:x
+	Force the use of VBE mode x. The mode will only be set if it's
+	found in the VBE-provided list of supported modes.
+	NOTE: The mode number 'x' should be specified in VESA mode number
+	notation, not the Linux kernel one (i.e. 257 instead of 769).
+	HINT: If you use this option because the normal parameter does
+	not work for you and you use an X server, you'll probably want to
+	set the 'nocrtc' option to ensure that the video mode is properly
+	restored after console <-> X switches.
+
+nocrtc	Do not use CRTC timings while setting the graphic mode. This option
+	makes sense only with VBE3.0 compliant systems. Use it if you have
+	problems with the modes set in the standard way. Note that specifying
+	this option means the refresh rate will be ignored and will stay at
+	your BIOS' default (60 Hz).
+
+noedid	Do not try to fetch and use EDID-provided modes.
+
+gtf	Force the use of VESA's GTF (Generalized Timing Formula). Specifying
+	this will cause vesafb to skip its internal modedb and EDID-modedb
+	and jump straight to the GTF part of the code (normally used only if
+	everything else failed). This can be useful if you want to get as much
+	as possible from your graphics board but your BIOS doesn't support
+	modes with refresh rates you require. Note that you may need to
+	specify the maxhf, maxvf and maxclk parameters if they are not
+	provided by EDID.
+
+Additionally, the following parameters may be provided. They all override the
+EDID-provided values and BIOS defaults. Refer to your monitor's specs to get
+the correct values for maxhf, maxvf and maxclk for your hardware.
+
+maxhf:n	Maximum horizontal frequency (in kHz).
+maxvf:n	Maximum vertical frequency (in Hz).
+maxclk:n Maximum pixel clock (in MHz).
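As a sketch of how these limits are meant to be used together with the gtf
option described above (the mode string and all three limit values here are
placeholders; take the real numbers from your monitor's documentation):

	video=vesafb:1280x1024-32@75,gtf,maxhf:92,maxvf:85,maxclk:140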
+
+
+Vesafb-tng Technical details
+============================
+
+1. The driver architecture.
+
+The driver's code is stored in 3 files:
+ /drivers/video/vesafb-tng.c
+ /drivers/video/vesafb-thread.c
+ /include/video/vesa.h
+
+vesafb-tng.c contains the main code. vesafb-thread.c contains code for the
+vesafb service thread. A separate thread is necessary because we need to remap
+memory in order to be able to use the vm86 calls. The service thread is started
+regardless of whether vesafb is compiled into the kernel or compiled as a
+module. This is necessary because of the active_mm stuff, better described in
+the header of vesafb-thread.c.
+
+2. The driver initialization
+
+ o vesafb_vbe_init
+   - get basic info about the graphics BIOS
+   - fetch data about all modes supported by VBE
+   - get info about the protected mode interface
+   - get EDID data and attempt to create an EDID modedb
+
+ o vesafb_probe
+   - get service thread's PID (started earlier from fbmem.c)
+   - call vesafb_vbe_init
+   - try to find the specified mode in vesa_modes modedb
+   - if the previous step failed or was skipped:
+     - try to find a matching mode in the VBE modedb - identify VBE mode ID
+     - try to find a matching mode in the EDID modedb
+   - if the previous step failed or was skipped:
+     - try to calculate mode timings with GTF
+   - low level setup - request_mem_region, ioremap, etc.
+   - setup /proc/fb/modes and /proc/fb/vbe_info
 
-  Gerd
+3. Used hacks
+
+ o info->var.reserved[0] holds the VBE mode ID
+ o info->var.reserved[1] holds a pointer to the VBE mode data in vesafb's
+   mode database.
+
+Have fun!
+
 --
+Original document for the vesafb driver by Gerd Knorr
 
-Minor (mostly typo) changes
-by Nico Schmoigl
+Minor (mostly typo) changes by
+Nico Schmoigl
+
+Extended documentation for vm86, VBE3.0 and vesafb-tng by
+Michał Januszewski
+
diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/block/as-iosched.c linux-2.6.8.1-ck7/drivers/block/as-iosched.c
--- linux-2.6.8.1-ck6/drivers/block/as-iosched.c	2004-06-16 17:35:35.000000000 +1000
+++ linux-2.6.8.1-ck7/drivers/block/as-iosched.c	2004-09-09 22:56:38.677117954 +1000
@@ -1828,14 +1828,14 @@ static int as_set_request(request_queue_
 
 static int as_may_queue(request_queue_t *q, int rw)
 {
-	int ret = 0;
+	int ret = ELV_MQUEUE_MAY;
 	struct as_data *ad = q->elevator.elevator_data;
 	struct io_context *ioc;
 	if (ad->antic_status == ANTIC_WAIT_REQ ||
 	    ad->antic_status == ANTIC_WAIT_NEXT) {
 		ioc = as_get_io_context();
 		if (ad->io_context == ioc)
-			ret = 1;
+			ret = ELV_MQUEUE_MUST;
 		put_io_context(ioc);
 	}
 
diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/block/cfq-iosched.c linux-2.6.8.1-ck7/drivers/block/cfq-iosched.c
--- linux-2.6.8.1-ck6/drivers/block/cfq-iosched.c	2004-08-15 14:08:05.000000000 +1000
+++ linux-2.6.8.1-ck7/drivers/block/cfq-iosched.c	2004-09-09 22:56:38.679117642 +1000
@@ -22,96 +22,214 @@
 #include
 #include
 
+#undef CFQ_DEBUG
+
+#ifdef CFQ_DEBUG
+#define dprintk(fmt, args...)	printk(KERN_ERR "cfq: " fmt, ##args)
+#else
+#define dprintk(fmt, args...)
+#endif
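The empty #else definition above makes every dprintk() call vanish at
preprocessing time when CFQ_DEBUG is not defined. A minimal self-contained
sketch of the same GNU C variadic-macro pattern (userspace code, with fprintf
standing in for printk; not part of the patch itself):

	#include <stdio.h>

	/* #define CFQ_DEBUG */	/* uncomment to enable the debug output */

	#ifdef CFQ_DEBUG
	#define dprintk(fmt, args...)	fprintf(stderr, "cfq: " fmt, ##args)
	#else
	#define dprintk(fmt, args...)	/* expands to nothing */
	#endif

	int main(void)
	{
		/* compiled out entirely unless CFQ_DEBUG is defined */
		dprintk("dispatching request %d\n", 42);
		return 0;
	}

The ##args extension deletes the trailing comma when the argument list is
empty, which is why the kernel macro can also be called with a bare format
string.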
+
+static unsigned long max_elapsed_crq;
+static unsigned long max_elapsed_dispatch;
+
 /*
  * tunables
  */
-static int cfq_quantum = 4;
-static int cfq_queued = 8;
+static int cfq_quantum = 4;		/* max queue in one round of service */
+static int cfq_queued = 8;		/* minimum rq allocate limit per-queue */
+static int cfq_service = HZ;		/* period over which service is avg */
+static int cfq_fifo_expire_r = HZ / 2;	/* fifo timeout for sync requests */
+static int cfq_fifo_expire_w = 5 * HZ;	/* fifo timeout for async requests */
+static int cfq_fifo_rate = HZ / 8;	/* fifo expiry rate */
+static int cfq_back_max = 16 * 1024;	/* maximum backwards seek, in KiB */
+static int cfq_back_penalty = 2;	/* penalty of a backwards seek */
 
+/*
+ * for the hash of cfqq inside the cfqd
+ */
 #define CFQ_QHASH_SHIFT		6
 #define CFQ_QHASH_ENTRIES	(1 << CFQ_QHASH_SHIFT)
-#define list_entry_qhash(entry) list_entry((entry), struct cfq_queue, cfq_hash)
+#define list_entry_qhash(entry)	hlist_entry((entry), struct cfq_queue, cfq_hash)
 
-#define CFQ_MHASH_SHIFT		8
+/*
+ * for the hash of crq inside the cfqq
+ */
+#define CFQ_MHASH_SHIFT		6
 #define CFQ_MHASH_BLOCK(sec)	((sec) >> 3)
 #define CFQ_MHASH_ENTRIES	(1 << CFQ_MHASH_SHIFT)
-#define CFQ_MHASH_FN(sec)	(hash_long(CFQ_MHASH_BLOCK((sec)),CFQ_MHASH_SHIFT))
-#define ON_MHASH(crq)		!list_empty(&(crq)->hash)
+#define CFQ_MHASH_FN(sec)	hash_long(CFQ_MHASH_BLOCK(sec), CFQ_MHASH_SHIFT)
 #define rq_hash_key(rq)		((rq)->sector + (rq)->nr_sectors)
-#define list_entry_hash(ptr)	list_entry((ptr), struct cfq_rq, hash)
+#define list_entry_hash(ptr)	hlist_entry((ptr), struct cfq_rq, hash)
 
 #define list_entry_cfqq(ptr)	list_entry((ptr), struct cfq_queue, cfq_list)
 
-#define RQ_DATA(rq)		((struct cfq_rq *) (rq)->elevator_private)
+#define RQ_DATA(rq)		(rq)->elevator_private
+
+/*
+ * rb-tree defines
+ */
+#define RB_NONE			(2)
+#define RB_EMPTY(node)		((node)->rb_node == NULL)
+#define RB_CLEAR_COLOR(node)	(node)->rb_color = RB_NONE
+#define RB_CLEAR(node)		do {	\
+	(node)->rb_parent = NULL;	\
+	RB_CLEAR_COLOR((node));		\
+	(node)->rb_right = NULL;	\
+	(node)->rb_left = NULL;		\
+} while (0)
+#define RB_CLEAR_ROOT(root)	((root)->rb_node = NULL)
+#define ON_RB(node)		((node)->rb_color != RB_NONE)
+#define rb_entry_crq(node)	rb_entry((node), struct cfq_rq, rb_node)
+#define rq_rb_key(rq)		(rq)->sector
+
+/*
+ * sort key types and names
+ */
+enum {
+	CFQ_KEY_PGID,
+	CFQ_KEY_TGID,
+	CFQ_KEY_UID,
+	CFQ_KEY_GID,
+	CFQ_KEY_LAST,
+};
+
+static char *cfq_key_types[] = { "pgid", "tgid", "uid", "gid", NULL };
+
+/*
+ * spare queue
+ */
+#define CFQ_KEY_SPARE		(~0UL)
 
 static kmem_cache_t *crq_pool;
 static kmem_cache_t *cfq_pool;
-static mempool_t *cfq_mpool;
+static kmem_cache_t *cfq_ioc_pool;
 
 struct cfq_data {
 	struct list_head rr_list;
-	struct list_head *dispatch;
-	struct list_head *cfq_hash;
+	struct list_head empty_list;
 
-	struct list_head *crq_hash;
+	struct hlist_head *cfq_hash;
+	struct hlist_head *crq_hash;
 
+	/* queues on rr_list (ie they have pending requests) */
 	unsigned int busy_queues;
+	unsigned int max_queued;
+	int key_type;
+	mempool_t *crq_pool;
 
 	request_queue_t *queue;
+	sector_t last_sector;
+
 	/*
-	 * tunables
+	 * tunables, see top of file
	 */
 	unsigned int cfq_quantum;
 	unsigned int cfq_queued;
+	unsigned int cfq_tagged;
+	unsigned int cfq_fifo_expire_r;
+	unsigned int cfq_fifo_expire_w;
+	unsigned int cfq_fifo_batch_expire;
+	unsigned int cfq_back_penalty;
+	unsigned int cfq_back_max;
+	unsigned int find_best_crq;
 };
 
 struct cfq_queue {
-	struct list_head cfq_hash;
+	/* reference count */
+	atomic_t 
ref; + /* parent cfq_data */ + struct cfq_data *cfqd; + /* hash of mergeable requests */ + struct hlist_node cfq_hash; + /* hash key */ + unsigned long key; + /* whether queue is on rr (or empty) list */ + int on_rr; + /* on either rr or empty list of cfqd */ struct list_head cfq_list; + /* sorted list of pending requests */ struct rb_root sort_list; - int pid; + /* if fifo isn't expired, next request to serve */ + struct cfq_rq *next_crq; + /* requests queued in sort_list */ int queued[2]; -#if 0 - /* - * with a simple addition like this, we can do io priorities. almost. - * does need a split request free list, too. - */ - int io_prio + /* currently allocated requests */ + int allocated[2]; + /* fifo list of requests in sort_list */ + struct list_head fifo[2]; + /* last time fifo expired */ + unsigned long last_fifo_expire; + + int key_type; + + unsigned long service_start; + unsigned long service_used; + + /* number of requests that have been handed to the driver */ + int in_flight; + /* number of currently allocated requests */ + int alloc_limit[2]; + +#ifdef CFQ_DEBUG + char name[16]; #endif }; struct cfq_rq { struct rb_node rb_node; sector_t rb_key; - struct request *request; + struct hlist_node hash; struct cfq_queue *cfq_queue; + struct cfq_io_context *io_context; + + unsigned long service_start; + unsigned long queue_start; - struct list_head hash; + unsigned int in_flight : 1; + unsigned int accounted : 1; + unsigned int is_sync : 1; }; -static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq); -static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid); -static void cfq_dispatch_sort(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct cfq_rq *crq); +static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned long); +static void cfq_dispatch_sort(request_queue_t *, struct cfq_rq *); +static void cfq_update_next_crq(struct cfq_rq *); /* - * lots of deadline iosched dupes, can be abstracted later... + * what the fairness is based on (ie how processes are grouped and + * differentiated) */ -static inline void __cfq_del_crq_hash(struct cfq_rq *crq) +static inline unsigned long +cfq_hash_key(struct cfq_data *cfqd, struct task_struct *tsk) { - list_del_init(&crq->hash); + /* + * optimize this so that ->key_type is the offset into the struct + */ + switch (cfqd->key_type) { + case CFQ_KEY_PGID: + return process_group(tsk); + default: + case CFQ_KEY_TGID: + return tsk->tgid; + case CFQ_KEY_UID: + return tsk->uid; + case CFQ_KEY_GID: + return tsk->gid; + } } +/* + * lots of deadline iosched dupes, can be abstracted later... 
+ */ static inline void cfq_del_crq_hash(struct cfq_rq *crq) { - if (ON_MHASH(crq)) - __cfq_del_crq_hash(crq); + hlist_del_init(&crq->hash); } static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq) @@ -120,32 +238,32 @@ static void cfq_remove_merge_hints(reque if (q->last_merge == crq->request) q->last_merge = NULL; + + cfq_update_next_crq(crq); } static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq) { - struct request *rq = crq->request; + const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request)); - BUG_ON(ON_MHASH(crq)); + BUG_ON(!hlist_unhashed(&crq->hash)); - list_add(&crq->hash, &cfqd->crq_hash[CFQ_MHASH_FN(rq_hash_key(rq))]); + hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]); } static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset) { - struct list_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)]; - struct list_head *entry, *next = hash_list->next; + struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)]; + struct hlist_node *entry, *next; - while ((entry = next) != hash_list) { + hlist_for_each_safe(entry, next, hash_list) { struct cfq_rq *crq = list_entry_hash(entry); struct request *__rq = crq->request; - next = entry->next; - - BUG_ON(!ON_MHASH(crq)); + BUG_ON(hlist_unhashed(&crq->hash)); if (!rq_mergeable(__rq)) { - __cfq_del_crq_hash(crq); + cfq_del_crq_hash(crq); continue; } @@ -157,29 +275,234 @@ static struct request *cfq_find_rq_hash( } /* - * rb tree support functions + * Lifted from AS - choose which of crq1 and crq2 that is best served now. + * We choose the request that is closest to the head right now. Distance + * behind the head are penalized and only allowed to a certain extent. */ -#define RB_NONE (2) -#define RB_EMPTY(node) ((node)->rb_node == NULL) -#define RB_CLEAR(node) ((node)->rb_color = RB_NONE) -#define RB_CLEAR_ROOT(root) ((root)->rb_node = NULL) -#define ON_RB(node) ((node)->rb_color != RB_NONE) -#define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node) -#define rq_rb_key(rq) (rq)->sector +static struct cfq_rq * +cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2) +{ + sector_t last, s1, s2, d1 = 0, d2 = 0; + int r1_wrap = 0, r2_wrap = 0; /* requests are behind the disk head */ + unsigned long back_max; + + if (crq1 == NULL || crq1 == crq2) + return crq2; + if (crq2 == NULL) + return crq1; + + s1 = crq1->request->sector; + s2 = crq2->request->sector; + + last = cfqd->last_sector; + +#if 0 + if (!list_empty(&cfqd->queue->queue_head)) { + struct list_head *entry = &cfqd->queue->queue_head; + unsigned long distance = ~0UL; + struct request *rq; + + while ((entry = entry->prev) != &cfqd->queue->queue_head) { + rq = list_entry_rq(entry); + + if (blk_barrier_rq(rq)) + break; + + if (distance < abs(s1 - rq->sector + rq->nr_sectors)) { + distance = abs(s1 - rq->sector +rq->nr_sectors); + last = rq->sector + rq->nr_sectors; + } + if (distance < abs(s2 - rq->sector + rq->nr_sectors)) { + distance = abs(s2 - rq->sector +rq->nr_sectors); + last = rq->sector + rq->nr_sectors; + } + } + } +#endif + + /* + * by definition, 1KiB is 2 sectors + */ + back_max = cfqd->cfq_back_max * 2; + + /* + * Strict one way elevator _except_ in the case where we allow + * short backward seeks which are biased as twice the cost of a + * similar forward seek. 
+ */ + if (s1 >= last) + d1 = s1 - last; + else if (s1 + back_max >= last) + d1 = (last - s1) * cfqd->cfq_back_penalty; + else + r1_wrap = 1; + + if (s2 >= last) + d2 = s2 - last; + else if (s2 + back_max >= last) + d2 = (last - s2) * cfqd->cfq_back_penalty; + else + r2_wrap = 1; + + /* Found required data */ + if (!r1_wrap && r2_wrap) + return crq1; + else if (!r2_wrap && r1_wrap) + return crq2; + else if (r1_wrap && r2_wrap) { + /* both behind the head */ + if (s1 <= s2) + return crq1; + else + return crq2; + } + + /* Both requests in front of the head */ + if (d1 < d2) + return crq1; + else if (d2 < d1) + return crq2; + else { + if (s1 >= s2) + return crq1; + else + return crq2; + } +} + +/* + * would be nice to take fifo expire time into account as well + */ +static struct cfq_rq * +cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq, + struct cfq_rq *last) +{ + struct cfq_rq *crq_next = NULL, *crq_prev = NULL; + struct rb_node *rbnext, *rbprev; + + if (!ON_RB(&last->rb_node)) + return NULL; + + if ((rbnext = rb_next(&last->rb_node)) == NULL) + rbnext = rb_first(&cfqq->sort_list); + + rbprev = rb_prev(&last->rb_node); + + if (rbprev) + crq_prev = rb_entry_crq(rbprev); + if (rbnext) + crq_next = rb_entry_crq(rbnext); + + return cfq_choose_req(cfqd, crq_next, crq_prev); +} + +static void cfq_update_next_crq(struct cfq_rq *crq) +{ + struct cfq_queue *cfqq = crq->cfq_queue; + + if (cfqq->next_crq == crq) + cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq); +} + +static inline void +cfq_sort_rr_list(struct cfq_queue *cfqq) +{ + struct list_head *entry = &cfqq->cfqd->rr_list; + + list_del(&cfqq->cfq_list); + + /* + * sort by our mean service_used, sub-sort by in-flight requests + */ + while ((entry = entry->prev) != &cfqq->cfqd->rr_list) { + struct cfq_queue *__cfqq = list_entry_cfqq(entry); + + if (cfqq->service_used > __cfqq->service_used) + break; + else if (cfqq->service_used == __cfqq->service_used) { + struct list_head *prv; + + while ((prv = entry->prev) != &cfqq->cfqd->rr_list) { + __cfqq = list_entry_cfqq(prv); + + WARN_ON(__cfqq->service_used > cfqq->service_used); + if (cfqq->service_used != __cfqq->service_used) + break; + if (cfqq->in_flight > __cfqq->in_flight) + break; + + entry = prv; + } + } + } -static inline void cfq_del_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) + list_add(&cfqq->cfq_list, entry); +} + +/* + * add to busy list of queues for service, trying to be fair in ordering + * the pending list according to requests serviced + */ +static inline void +cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + BUG_ON(cfqq->on_rr); + + /* + * it's currently on the empty list + */ + cfq_sort_rr_list(cfqq); + cfqq->on_rr = 1; + cfqd->busy_queues++; + + /* + * if the queue is on the empty_list, service_start was the time + * where it was deleted from the rr_list. 
+ */ + if (time_after(jiffies, cfqq->service_start + cfq_service)) + cfqq->service_used >>= 3; +} + +static inline void +cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + list_move(&cfqq->cfq_list, &cfqd->empty_list); + cfqq->on_rr = 0; + cfqq->service_start = jiffies; + + BUG_ON(!cfqd->busy_queues); + cfqd->busy_queues--; +} + +/* + * rb tree support functions + */ +static inline void cfq_del_crq_rb(struct cfq_rq *crq) +{ + struct cfq_queue *cfqq = crq->cfq_queue; + if (ON_RB(&crq->rb_node)) { - cfqq->queued[rq_data_dir(crq->request)]--; + struct cfq_data *cfqd = cfqq->cfqd; + + BUG_ON(!cfqq->queued[crq->is_sync]); + + cfq_update_next_crq(crq); + + cfqq->queued[crq->is_sync]--; rb_erase(&crq->rb_node, &cfqq->sort_list); - crq->cfq_queue = NULL; + RB_CLEAR_COLOR(&crq->rb_node); + + if (RB_EMPTY(&cfqq->sort_list) && cfqq->on_rr) { + dprintk("moving 0x%p empty_list\n", cfqq); + cfq_del_cfqq_rr(cfqd, cfqq); + } } } static struct cfq_rq * -__cfq_add_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) +__cfq_add_crq_rb(struct cfq_rq *crq) { - struct rb_node **p = &cfqq->sort_list.rb_node; + struct rb_node **p = &crq->cfq_queue->sort_list.rb_node; struct rb_node *parent = NULL; struct cfq_rq *__crq; @@ -199,30 +522,53 @@ __cfq_add_crq_rb(struct cfq_queue *cfqq, return NULL; } -static void -cfq_add_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq,struct cfq_rq *crq) +static void cfq_add_crq_rb(struct cfq_rq *crq) { + struct cfq_queue *cfqq = crq->cfq_queue; + struct cfq_data *cfqd = cfqq->cfqd; struct request *rq = crq->request; struct cfq_rq *__alias; crq->rb_key = rq_rb_key(rq); - cfqq->queued[rq_data_dir(rq)]++; -retry: - __alias = __cfq_add_crq_rb(cfqq, crq); - if (!__alias) { - rb_insert_color(&crq->rb_node, &cfqq->sort_list); - crq->cfq_queue = cfqq; - return; + cfqq->queued[crq->is_sync]++; + + /* + * looks a little odd, but the first insert might return an alias. 
+ * if that happens, put the alias on the dispatch list + */ + while ((__alias = __cfq_add_crq_rb(crq)) != NULL) + cfq_dispatch_sort(cfqd->queue, __alias); + + rb_insert_color(&crq->rb_node, &cfqq->sort_list); + + if (!cfqq->on_rr) { + cfq_add_cfqq_rr(cfqd, cfqq); + dprintk("moving to rr list %d\n", cfqd->busy_queues); + } else + dprintk("already on rr list %d\n", cfqd->busy_queues); + + /* + * check if this request is a better next-serve candidate + */ + cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq); +} + +static inline void +cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) +{ + if (ON_RB(&crq->rb_node)) { + rb_erase(&crq->rb_node, &cfqq->sort_list); + cfqq->queued[crq->is_sync]--; } - cfq_dispatch_sort(cfqd, cfqq, __alias); - goto retry; + cfq_add_crq_rb(crq); } static struct request * cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector) { - struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->tgid); + const unsigned long key = cfq_hash_key(cfqd, current); + struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, key); struct rb_node *n; if (!cfqq) @@ -246,21 +592,16 @@ out: static void cfq_remove_request(request_queue_t *q, struct request *rq) { - struct cfq_data *cfqd = q->elevator.elevator_data; struct cfq_rq *crq = RQ_DATA(rq); - if (crq) { - struct cfq_queue *cfqq = crq->cfq_queue; + dprintk("removing 0x%p\n", rq); + if (crq) { cfq_remove_merge_hints(q, crq); list_del_init(&rq->queuelist); - if (cfqq) { - cfq_del_crq_rb(cfqq, crq); - - if (RB_EMPTY(&cfqq->sort_list)) - cfq_put_queue(cfqd, cfqq); - } + if (crq->cfq_queue) + cfq_del_crq_rb(crq); } } @@ -314,92 +655,228 @@ static void cfq_merged_request(request_q if (ON_RB(&crq->rb_node) && (rq_rb_key(req) != crq->rb_key)) { struct cfq_queue *cfqq = crq->cfq_queue; - cfq_del_crq_rb(cfqq, crq); - cfq_add_crq_rb(cfqd, cfqq, crq); + cfq_update_next_crq(crq); + cfq_reposition_crq_rb(cfqq, crq); } q->last_merge = req; } static void -cfq_merged_requests(request_queue_t *q, struct request *req, +cfq_merged_requests(request_queue_t *q, struct request *rq, struct request *next) { - cfq_merged_request(q, req); + struct cfq_rq *crq = RQ_DATA(rq); + struct cfq_rq *cnext = RQ_DATA(next); + + cfq_merged_request(q, rq); + + if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist)) { + if (time_before(cnext->queue_start, crq->queue_start)) { + list_move(&rq->queuelist, &next->queuelist); + crq->queue_start = cnext->queue_start; + } + } + + cfq_update_next_crq(cnext); cfq_remove_request(q, next); } -static void -cfq_dispatch_sort(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct cfq_rq *crq) +/* + * we dispatch cfqd->cfq_quantum requests in total from the rr_list queues, + * this function sector sorts the selected request to minimize seeks. we start + * at cfqd->last_sector, not 0. 
+ */ +static void cfq_dispatch_sort(request_queue_t *q, struct cfq_rq *crq) { - struct list_head *head = cfqd->dispatch, *entry = head; + struct cfq_data *cfqd = q->elevator.elevator_data; + struct cfq_queue *cfqq = crq->cfq_queue; + struct list_head *head = &q->queue_head, *entry = head; struct request *__rq; + sector_t last; - cfq_del_crq_rb(cfqq, crq); - cfq_remove_merge_hints(cfqd->queue, crq); + cfq_del_crq_rb(crq); + cfq_remove_merge_hints(q, crq); + list_del(&crq->request->queuelist); - if (!list_empty(head)) { - __rq = list_entry_rq(head->next); + last = cfqd->last_sector; + while ((entry = entry->prev) != head) { + __rq = list_entry_rq(entry); + + if (blk_barrier_rq(crq->request)) + break; + if (!blk_fs_request(crq->request)) + break; - if (crq->request->sector < __rq->sector) { - entry = head->prev; - goto link; + if (crq->request->sector > __rq->sector) + break; + if (__rq->sector > last && crq->request->sector < last) { + last = crq->request->sector; + break; } } - while ((entry = entry->prev) != head) { - __rq = list_entry_rq(entry); + cfqd->last_sector = last; + crq->in_flight = 1; + cfqq->in_flight++; + list_add(&crq->request->queuelist, entry); +} - if (crq->request->sector <= __rq->sector) - break; +/* + * return expired entry, or NULL to just start from scratch in rbtree + */ +static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq) +{ + struct cfq_data *cfqd = cfqq->cfqd; + const int reads = !list_empty(&cfqq->fifo[0]); + const int writes = !list_empty(&cfqq->fifo[1]); + struct cfq_rq *crq; + + if (jiffies - cfqq->last_fifo_expire < cfqd->cfq_fifo_batch_expire) + return NULL; + + crq = RQ_DATA(list_entry(cfqq->fifo[0].next, struct request, queuelist)); + if (reads && time_after(jiffies, crq->queue_start + cfqd->cfq_fifo_expire_r)) { + cfqq->last_fifo_expire = jiffies; + return crq; } -link: - list_add_tail(&crq->request->queuelist, entry); + crq = RQ_DATA(list_entry(cfqq->fifo[1].next, struct request, queuelist)); + if (writes && time_after(jiffies, crq->queue_start + cfqd->cfq_fifo_expire_w)) { + cfqq->last_fifo_expire = jiffies; + return crq; + } + + return NULL; } +/* + * dispatch a single request from given queue + */ static inline void -__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd, - struct cfq_queue *cfqq) +cfq_dispatch_request(request_queue_t *q, struct cfq_data *cfqd, + struct cfq_queue *cfqq) { - struct cfq_rq *crq = rb_entry_crq(rb_first(&cfqq->sort_list)); + struct cfq_rq *crq; - cfq_dispatch_sort(cfqd, cfqq, crq); + /* + * follow expired path, else get first next available + */ + if ((crq = cfq_check_fifo(cfqq)) == NULL) { + if (cfqd->find_best_crq) + crq = cfqq->next_crq; + else + crq = rb_entry_crq(rb_first(&cfqq->sort_list)); + } + + cfqd->last_sector = crq->request->sector + crq->request->nr_sectors; + + /* + * finally, insert request into driver list + */ + cfq_dispatch_sort(q, crq); } -static int cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd) +static int cfq_dispatch_requests(request_queue_t *q, int max_dispatch) { + struct cfq_data *cfqd = q->elevator.elevator_data; struct cfq_queue *cfqq; struct list_head *entry, *tmp; - int ret, queued, good_queues; + int queued, busy_queues, first_round; if (list_empty(&cfqd->rr_list)) return 0; - queued = ret = 0; + queued = 0; + first_round = 1; restart: - good_queues = 0; + busy_queues = 0; list_for_each_safe(entry, tmp, &cfqd->rr_list) { - cfqq = list_entry_cfqq(cfqd->rr_list.next); + cfqq = list_entry_cfqq(entry); BUG_ON(RB_EMPTY(&cfqq->sort_list)); - 
__cfq_dispatch_requests(q, cfqd, cfqq); + /* + * first round of queueing, only select from queues that + * don't already have io in-flight + */ + if (first_round && cfqq->in_flight) + continue; - if (RB_EMPTY(&cfqq->sort_list)) - cfq_put_queue(cfqd, cfqq); - else - good_queues++; + cfq_dispatch_request(q, cfqd, cfqq); + + if (!RB_EMPTY(&cfqq->sort_list)) + busy_queues++; queued++; - ret = 1; } - if ((queued < cfqd->cfq_quantum) && good_queues) + if ((queued < max_dispatch) && (busy_queues || first_round)) { + first_round = 0; goto restart; + } - return ret; + return queued; +} + +static inline void cfq_account_dispatch(struct cfq_rq *crq) +{ + struct cfq_queue *cfqq = crq->cfq_queue; + unsigned long elapsed = jiffies - crq->queue_start; + + /* + * accounted bit is necessary since some drivers will call + * elv_next_request() many times for the same request (eg ide) + */ + if (crq->accounted) + return; + + /* + * on drives with tagged command queueing, command turn-around time + * doesn't necessarily reflect the time spent processing this very + * command inside the drive. so do the accounting differently there, + * by just sorting on the number of requests + */ + if (cfqq->cfqd->cfq_tagged) { + if (time_after(jiffies, cfqq->service_start + cfq_service)) { + cfqq->service_start = jiffies; + cfqq->service_used /= 10; + } + + cfqq->service_used++; + } + + if (elapsed > max_elapsed_dispatch) + max_elapsed_dispatch = elapsed; + + crq->accounted = 1; + crq->service_start = jiffies; +} + +static inline void +cfq_account_completion(struct cfq_queue *cfqq, struct cfq_rq *crq) +{ + unsigned long start_val = cfqq->service_used; + + if (!cfqq->cfqd->cfq_tagged) { + unsigned long duration = jiffies - crq->service_start; + + if (time_after(jiffies, cfqq->service_start + cfq_service)) { + cfqq->service_start = jiffies; + cfqq->service_used >>= 3; + } + + cfqq->service_used += duration; + + if (duration > max_elapsed_crq) + max_elapsed_crq = duration; + } + + /* + * make sure list stays properly sorted, but only do so if necessary + */ + if (cfqq->on_rr && cfqq->service_used != start_val) + cfq_sort_rr_list(cfqq); } static struct request *cfq_next_request(request_queue_t *q) @@ -407,100 +884,309 @@ static struct request *cfq_next_request( struct cfq_data *cfqd = q->elevator.elevator_data; struct request *rq; - if (!list_empty(cfqd->dispatch)) { + if (!list_empty(&q->queue_head)) { struct cfq_rq *crq; dispatch: - rq = list_entry_rq(cfqd->dispatch->next); + rq = list_entry_rq(q->queue_head.next); - crq = RQ_DATA(rq); - if (crq) + if ((crq = RQ_DATA(rq)) != NULL) { cfq_remove_merge_hints(q, crq); + cfq_account_dispatch(crq); + } return rq; } - if (cfq_dispatch_requests(q, cfqd)) + if (cfq_dispatch_requests(q, cfqd->cfq_quantum)) goto dispatch; return NULL; } +/* + * task holds one reference to the queue, dropped when task exits. each crq + * in-flight on this queue also holds a reference, dropped when crq is freed. + * + * queue lock must be held here. 
+ */ +static void cfq_put_queue(struct cfq_queue *cfqq) +{ + BUG_ON(!atomic_read(&cfqq->ref)); + + dprintk("cfq_put_queue 0x%p, ref\n", atomic_read(&cfqq->ref)); + + if (!atomic_dec_and_test(&cfqq->ref)) + return; + + dprintk("killing queue 0x%p/%s\n", cfqq, cfqq->name); + + BUG_ON(rb_first(&cfqq->sort_list)); + BUG_ON(cfqq->on_rr); + + /* + * it's on the empty list and still hashed + */ + list_del(&cfqq->cfq_list); + hlist_del(&cfqq->cfq_hash); + kmem_cache_free(cfq_pool, cfqq); +} + static inline struct cfq_queue * -__cfq_find_cfq_hash(struct cfq_data *cfqd, int pid, const int hashval) +__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key, const int hashval) { - struct list_head *hash_list = &cfqd->cfq_hash[hashval]; - struct list_head *entry; + struct hlist_head *hash_list = &cfqd->cfq_hash[hashval]; + struct hlist_node *entry, *next; - list_for_each(entry, hash_list) { + hlist_for_each_safe(entry, next, hash_list) { struct cfq_queue *__cfqq = list_entry_qhash(entry); - if (__cfqq->pid == pid) + if (__cfqq->key == key) return __cfqq; } return NULL; } -static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid) +static struct cfq_queue * +cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key) { - const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT); + return __cfq_find_cfq_hash(cfqd, key, hash_long(key, CFQ_QHASH_SHIFT)); +} + +static inline void +cfq_rehash_cfqq(struct cfq_data *cfqd, struct cfq_queue **cfqq, + struct cfq_io_context *cic) +{ + unsigned long hashkey = cfq_hash_key(cfqd, current); + unsigned long hashval = hash_long(hashkey, CFQ_QHASH_SHIFT); + struct cfq_queue *__cfqq; + unsigned long flags; + + spin_lock_irqsave(cfqd->queue->queue_lock, flags); + + hlist_del(&(*cfqq)->cfq_hash); + + __cfqq = __cfq_find_cfq_hash(cfqd, hashkey, hashval); + if (!__cfqq || __cfqq == *cfqq) { + __cfqq = *cfqq; + hlist_add_head(&__cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); + __cfqq->key_type = cfqd->key_type; + } else { + atomic_inc(&__cfqq->ref); + cic->cfqq = __cfqq; + cfq_put_queue(*cfqq); + *cfqq = __cfqq; + } - return __cfq_find_cfq_hash(cfqd, pid, hashval); + cic->cfqq = __cfqq; + spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); } -static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) +static void cfq_free_io_context(struct cfq_io_context *cic) { - cfqd->busy_queues--; - list_del(&cfqq->cfq_list); - list_del(&cfqq->cfq_hash); - mempool_free(cfqq, cfq_mpool); + kmem_cache_free(cfq_ioc_pool, cic); +} + +/* + * locking hierarchy is: io_context lock -> queue locks + */ +static void cfq_exit_io_context(struct cfq_io_context *cic) +{ + struct cfq_queue *cfqq = cic->cfqq; + struct list_head *entry = &cic->list; + request_queue_t *q; + unsigned long flags; + + /* + * put the reference this task is holding to the various queues + */ + spin_lock_irqsave(&cic->ioc->lock, flags); + while ((entry = cic->list.next) != &cic->list) { + struct cfq_io_context *__cic; + + __cic = list_entry(entry, struct cfq_io_context, list); + list_del(entry); + + q = __cic->cfqq->cfqd->queue; + spin_lock(q->queue_lock); + cfq_put_queue(__cic->cfqq); + spin_unlock(q->queue_lock); + } + + q = cfqq->cfqd->queue; + spin_lock(q->queue_lock); + cfq_put_queue(cfqq); + spin_unlock(q->queue_lock); + + cic->cfqq = NULL; + spin_unlock_irqrestore(&cic->ioc->lock, flags); +} + +static struct cfq_io_context *cfq_alloc_io_context(int gfp_flags) +{ + struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_flags); + + if (cic) { + cic->dtor = cfq_free_io_context; + 
cic->exit = cfq_exit_io_context;
+		INIT_LIST_HEAD(&cic->list);
+		cic->cfqq = NULL;
+	}
+
+	return cic;
+}
+
+/*
+ * Setup general io context and cfq io context. There can be several cfq
+ * io contexts per general io context, if this process is doing io to more
+ * than one device managed by cfq. Note that caller is holding a reference to
+ * cfqq, so we don't need to worry about it disappearing
+ */
+static struct cfq_io_context *
+cfq_get_io_context(struct cfq_queue **cfqq, int gfp_flags)
+{
+	struct cfq_data *cfqd = (*cfqq)->cfqd;
+	struct cfq_queue *__cfqq = *cfqq;
+	struct cfq_io_context *cic;
+	struct io_context *ioc;
+
+	might_sleep_if(gfp_flags & __GFP_WAIT);
+
+	ioc = get_io_context(gfp_flags);
+	if (!ioc)
+		return NULL;
+
+	if ((cic = ioc->cic) == NULL) {
+		cic = cfq_alloc_io_context(gfp_flags);
+
+		if (cic == NULL)
+			goto err;
+
+		ioc->cic = cic;
+		cic->ioc = ioc;
+		cic->cfqq = __cfqq;
+		atomic_inc(&__cfqq->ref);
+	} else {
+		struct cfq_io_context *__cic;
+		unsigned long flags;
+
+		/*
+		 * since the first cic on the list is actually the head
+		 * itself, need to check this here or we'll duplicate a
+		 * cic per ioc for no reason
+		 */
+		if (cic->cfqq == __cfqq)
+			goto out;
+
+		/*
+		 * cic exists, check if we already are there. linear search
+		 * should be ok here, the list will usually not be more than
+		 * 1 or a few entries long
+		 */
+		spin_lock_irqsave(&ioc->lock, flags);
+		list_for_each_entry(__cic, &cic->list, list) {
+			/*
+			 * this process is already holding a reference to
+			 * this queue, so no need to get one more
+			 */
+			if (__cic->cfqq == __cfqq) {
+				cic = __cic;
+				spin_unlock_irqrestore(&ioc->lock, flags);
+				goto out;
+			}
+		}
+		spin_unlock_irqrestore(&ioc->lock, flags);
+
+		/*
+		 * nope, process doesn't have a cic associated with this
+		 * cfqq yet. 
get a new one and add to list + */ + __cic = cfq_alloc_io_context(gfp_flags); + if (__cic == NULL) + goto err; + + __cic->ioc = ioc; + __cic->cfqq = __cfqq; + atomic_inc(&__cfqq->ref); + spin_lock_irqsave(&ioc->lock, flags); + list_add(&__cic->list, &cic->list); + spin_unlock_irqrestore(&ioc->lock, flags); + + cic = __cic; + *cfqq = __cfqq; + } + +out: + /* + * if key_type has been changed on the fly, we lazily rehash + * each queue at lookup time + */ + if ((*cfqq)->key_type != cfqd->key_type) + cfq_rehash_cfqq(cfqd, cfqq, cic); + + return cic; +err: + put_io_context(ioc); + return NULL; } -static struct cfq_queue *__cfq_get_queue(struct cfq_data *cfqd, int pid, - int gfp_mask) +static struct cfq_queue * +__cfq_get_queue(struct cfq_data *cfqd, unsigned long key, int gfp_mask) { - const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT); + const int hashval = hash_long(key, CFQ_QHASH_SHIFT); struct cfq_queue *cfqq, *new_cfqq = NULL; - request_queue_t *q = cfqd->queue; retry: - cfqq = __cfq_find_cfq_hash(cfqd, pid, hashval); + cfqq = __cfq_find_cfq_hash(cfqd, key, hashval); if (!cfqq) { if (new_cfqq) { cfqq = new_cfqq; new_cfqq = NULL; } else if (gfp_mask & __GFP_WAIT) { - spin_unlock_irq(q->queue_lock); - new_cfqq = mempool_alloc(cfq_mpool, gfp_mask); - spin_lock_irq(q->queue_lock); + spin_unlock_irq(cfqd->queue->queue_lock); + new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask); + spin_lock_irq(cfqd->queue->queue_lock); goto retry; } else - return NULL; + goto out; + + memset(cfqq, 0, sizeof(*cfqq)); - INIT_LIST_HEAD(&cfqq->cfq_hash); + INIT_HLIST_NODE(&cfqq->cfq_hash); INIT_LIST_HEAD(&cfqq->cfq_list); RB_CLEAR_ROOT(&cfqq->sort_list); + INIT_LIST_HEAD(&cfqq->fifo[0]); + INIT_LIST_HEAD(&cfqq->fifo[1]); - cfqq->pid = pid; - cfqq->queued[0] = cfqq->queued[1] = 0; - list_add(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); + cfqq->key = key; + hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); + atomic_set(&cfqq->ref, 0); + cfqq->cfqd = cfqd; +#ifdef CFQ_DEBUG + strncpy(cfqq->name, current->comm, sizeof(cfqq->name)-1); +#endif + dprintk("cfqq set up for 0x%p/%s\n", cfqq, cfqq->name); + cfqq->key_type = cfqd->key_type; } if (new_cfqq) - mempool_free(new_cfqq, cfq_mpool); + kmem_cache_free(cfq_pool, new_cfqq); + atomic_inc(&cfqq->ref); +out: + WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq); return cfqq; } -static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int pid, - int gfp_mask) +static struct cfq_queue * +cfq_get_queue(struct cfq_data *cfqd, unsigned long key, int gfp_mask) { request_queue_t *q = cfqd->queue; struct cfq_queue *cfqq; spin_lock_irq(q->queue_lock); - cfqq = __cfq_get_queue(cfqd, pid, gfp_mask); + cfqq = __cfq_get_queue(cfqd, key, gfp_mask); spin_unlock_irq(q->queue_lock); return cfqq; @@ -508,24 +1194,14 @@ static struct cfq_queue *cfq_get_queue(s static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq) { - struct cfq_queue *cfqq; + crq->is_sync = 0; + if (rq_data_dir(crq->request) == READ || current->flags & PF_SYNCWRITE) + crq->is_sync = 1; - cfqq = __cfq_get_queue(cfqd, current->tgid, GFP_ATOMIC); - if (cfqq) { - cfq_add_crq_rb(cfqd, cfqq, crq); + cfq_add_crq_rb(crq); + crq->queue_start = jiffies; - if (list_empty(&cfqq->cfq_list)) { - list_add(&cfqq->cfq_list, &cfqd->rr_list); - cfqd->busy_queues++; - } - } else { - /* - * should can only happen if the request wasn't allocated - * through blk_alloc_request(), eg stack requests from ide-cd - * (those should be removed) _and_ we are in OOM. 
- */ - list_add_tail(&crq->request->queuelist, cfqd->dispatch); - } + list_add_tail(&crq->request->queuelist, &crq->cfq_queue->fifo[crq->is_sync]); } static void @@ -536,14 +1212,17 @@ cfq_insert_request(request_queue_t *q, s switch (where) { case ELEVATOR_INSERT_BACK: - while (cfq_dispatch_requests(q, cfqd)) + dprintk("adding back 0x%p\n", rq); + while (cfq_dispatch_requests(q, cfqd->cfq_quantum)) ; - list_add_tail(&rq->queuelist, cfqd->dispatch); + list_add_tail(&rq->queuelist, &q->queue_head); break; case ELEVATOR_INSERT_FRONT: - list_add(&rq->queuelist, cfqd->dispatch); + dprintk("adding front 0x%p\n", rq); + list_add(&rq->queuelist, &q->queue_head); break; case ELEVATOR_INSERT_SORT: + dprintk("adding sort 0x%p\n", rq); BUG_ON(!blk_fs_request(rq)); cfq_enqueue(cfqd, crq); break; @@ -564,10 +1243,25 @@ static int cfq_queue_empty(request_queue { struct cfq_data *cfqd = q->elevator.elevator_data; - if (list_empty(cfqd->dispatch) && list_empty(&cfqd->rr_list)) - return 1; + return list_empty(&q->queue_head) && list_empty(&cfqd->rr_list); +} + +static void cfq_completed_request(request_queue_t *q, struct request *rq) +{ + struct cfq_rq *crq = RQ_DATA(rq); + + if (unlikely(!blk_fs_request(rq))) + return; + + if (crq->in_flight) { + struct cfq_queue *cfqq = crq->cfq_queue; + + WARN_ON(!cfqq->in_flight); + cfqq->in_flight--; + + cfq_account_completion(cfqq, crq); + } - return 0; } static struct request * @@ -598,90 +1292,158 @@ static int cfq_may_queue(request_queue_t { struct cfq_data *cfqd = q->elevator.elevator_data; struct cfq_queue *cfqq; - int ret = 1; + int ret = ELV_MQUEUE_MAY; - if (!cfqd->busy_queues) - goto out; + if (current->flags & PF_MEMALLOC) + return ELV_MQUEUE_MAY; - cfqq = cfq_find_cfq_hash(cfqd, current->tgid); + cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(cfqd, current)); if (cfqq) { - int limit = (q->nr_requests - cfqd->cfq_queued) / cfqd->busy_queues; + int limit = cfqd->max_queued; + + if (cfqq->allocated[rw] < cfqd->cfq_queued) + return ELV_MQUEUE_MUST; + + if (cfqd->busy_queues) + limit = q->nr_requests / cfqd->busy_queues; - if (limit < 3) - limit = 3; + if (limit < cfqd->cfq_queued) + limit = cfqd->cfq_queued; else if (limit > cfqd->max_queued) limit = cfqd->max_queued; - if (cfqq->queued[rw] > limit) - ret = 0; + if (cfqq->allocated[rw] >= limit) { + if (limit > cfqq->alloc_limit[rw]) + cfqq->alloc_limit[rw] = limit; + + ret = ELV_MQUEUE_NO; + } } -out: + return ret; } +static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq) +{ + struct request_list *rl = &q->rq; + const int write = waitqueue_active(&rl->wait[WRITE]); + const int read = waitqueue_active(&rl->wait[READ]); + + if (read && cfqq->allocated[READ] < cfqq->alloc_limit[READ]) + wake_up(&rl->wait[READ]); + if (write && cfqq->allocated[WRITE] < cfqq->alloc_limit[WRITE]) + wake_up(&rl->wait[WRITE]); +} + +/* + * queue lock held here + */ static void cfq_put_request(request_queue_t *q, struct request *rq) { struct cfq_data *cfqd = q->elevator.elevator_data; struct cfq_rq *crq = RQ_DATA(rq); - struct request_list *rl; - int other_rw; + const int rw = rq_data_dir(rq); if (crq) { + struct cfq_queue *cfqq = crq->cfq_queue; + BUG_ON(q->last_merge == rq); - BUG_ON(ON_MHASH(crq)); + BUG_ON(!hlist_unhashed(&crq->hash)); + + if (crq->io_context) + put_io_context(crq->io_context->ioc); mempool_free(crq, cfqd->crq_pool); rq->elevator_private = NULL; - } - /* - * work-around for may_queue "bug": if a read gets issued and refused - * to queue because writes ate all the allowed slots and no other - * reads 
are pending for this queue, it could get stuck infinitely
-	 * since freed_request() only checks the waitqueue for writes when
-	 * freeing them. or vice versa for a single write vs many reads.
-	 * so check here whether "the other" data direction might be able
-	 * to queue and wake them
-	 */
-	rl = &q->rq;
-	other_rw = rq_data_dir(rq) ^ 1;
-	if (rl->count[other_rw] <= q->nr_requests) {
+		BUG_ON(!cfqq->allocated[rw]);
+		cfqq->allocated[rw]--;
+
 		smp_mb();
-		if (waitqueue_active(&rl->wait[other_rw]))
-			wake_up(&rl->wait[other_rw]);
+		cfq_check_waiters(q, cfqq);
+		cfq_put_queue(cfqq);
 	}
 }
 
+/*
+ * Allocate cfq data structures associated with this request. A queue and
+ */
 static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
 {
 	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct cfq_io_context *cic;
+	const int rw = rq_data_dir(rq);
 	struct cfq_queue *cfqq;
 	struct cfq_rq *crq;
+	unsigned long flags;
+
+	might_sleep_if(gfp_mask & __GFP_WAIT);
+
+	spin_lock_irqsave(q->queue_lock, flags);
+
+	cfqq = __cfq_get_queue(cfqd, cfq_hash_key(cfqd, current), gfp_mask);
+	if (!cfqq) {
+#if 0
+		cfqq = cfq_get_queue(cfqd, CFQ_KEY_SPARE, gfp_mask);
+		printk("%s: got spare queue\n", current->comm);
+#else
+		goto out_lock;
+#endif
+	}
+
+	if (cfqq->allocated[rw] >= cfqd->max_queued)
+		goto out_lock;
+
+	spin_unlock_irqrestore(q->queue_lock, flags);
 
 	/*
-	 * prepare a queue up front, so cfq_enqueue() doesn't have to
+	 * if hashing type has changed, the cfq_queue might change here. we
+	 * don't bother rechecking ->allocated since it should be a rare
+	 * event
 	 */
-	cfqq = cfq_get_queue(cfqd, current->tgid, gfp_mask);
-	if (!cfqq)
-		return 1;
+	cic = cfq_get_io_context(&cfqq, gfp_mask);
+	if (!cic)
+		goto err;
 
 	crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
 	if (crq) {
-		memset(crq, 0, sizeof(*crq));
 		RB_CLEAR(&crq->rb_node);
+		crq->rb_key = 0;
 		crq->request = rq;
-		crq->cfq_queue = NULL;
-		INIT_LIST_HEAD(&crq->hash);
+		crq->cfq_queue = cfqq;
+		crq->io_context = cic;
+		crq->service_start = crq->queue_start = 0;
+		crq->in_flight = crq->accounted = crq->is_sync = 0;
 		rq->elevator_private = crq;
+		cfqq->allocated[rw]++;
+		cfqq->alloc_limit[rw] = 0;
 		return 0;
 	}
 
+	put_io_context(cic->ioc);
+err:
+	spin_lock_irqsave(q->queue_lock, flags);
+	cfq_put_queue(cfqq);
+out_lock:
+	spin_unlock_irqrestore(q->queue_lock, flags);
 	return 1;
 }
 
 static void cfq_exit(request_queue_t *q, elevator_t *e)
 {
 	struct cfq_data *cfqd = e->elevator_data;
+	struct cfq_queue *cfqq;
+
+	/*
+	 * kill spare queue, getting it means we have two references to it. 
+ * drop both + */ + spin_lock_irq(q->queue_lock); + cfqq = __cfq_get_queue(cfqd, CFQ_KEY_SPARE, GFP_ATOMIC); + cfq_put_queue(cfqq); + cfq_put_queue(cfqq); + spin_unlock_irq(q->queue_lock); e->elevator_data = NULL; mempool_destroy(cfqd->crq_pool); @@ -693,6 +1455,7 @@ static void cfq_exit(request_queue_t *q, static int cfq_init(request_queue_t *q, elevator_t *e) { struct cfq_data *cfqd; + struct cfq_queue *cfqq; int i; cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL); @@ -701,12 +1464,13 @@ static int cfq_init(request_queue_t *q, memset(cfqd, 0, sizeof(*cfqd)); INIT_LIST_HEAD(&cfqd->rr_list); + INIT_LIST_HEAD(&cfqd->empty_list); - cfqd->crq_hash = kmalloc(sizeof(struct list_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); + cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); if (!cfqd->crq_hash) goto out_crqhash; - cfqd->cfq_hash = kmalloc(sizeof(struct list_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL); + cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL); if (!cfqd->cfq_hash) goto out_cfqhash; @@ -715,25 +1479,42 @@ static int cfq_init(request_queue_t *q, goto out_crqpool; for (i = 0; i < CFQ_MHASH_ENTRIES; i++) - INIT_LIST_HEAD(&cfqd->crq_hash[i]); + INIT_HLIST_HEAD(&cfqd->crq_hash[i]); for (i = 0; i < CFQ_QHASH_ENTRIES; i++) - INIT_LIST_HEAD(&cfqd->cfq_hash[i]); + INIT_HLIST_HEAD(&cfqd->cfq_hash[i]); - cfqd->dispatch = &q->queue_head; e->elevator_data = cfqd; cfqd->queue = q; /* + * setup spare failure queue + */ + cfqq = cfq_get_queue(cfqd, CFQ_KEY_SPARE, GFP_KERNEL); + if (!cfqq) + goto out_spare; + + /* * just set it to some high value, we want anyone to be able to queue * some requests. fairness is handled differently */ - cfqd->max_queued = q->nr_requests; - q->nr_requests = 8192; + q->nr_requests = 1024; + cfqd->max_queued = q->nr_requests / 16; + q->nr_batching = cfq_queued; + cfqd->key_type = CFQ_KEY_TGID; + cfqd->find_best_crq = 1; cfqd->cfq_queued = cfq_queued; cfqd->cfq_quantum = cfq_quantum; + cfqd->cfq_fifo_expire_r = cfq_fifo_expire_r; + cfqd->cfq_fifo_expire_w = cfq_fifo_expire_w; + cfqd->cfq_fifo_batch_expire = cfq_fifo_rate; + cfqd->cfq_back_max = cfq_back_max; + cfqd->cfq_back_penalty = cfq_back_penalty; + dprintk("cfq on queue 0x%p\n", q); return 0; +out_spare: + mempool_destroy(cfqd->crq_pool); out_crqpool: kfree(cfqd->cfq_hash); out_cfqhash: @@ -747,20 +1528,18 @@ static int __init cfq_slab_setup(void) { crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0, NULL, NULL); - if (!crq_pool) panic("cfq_iosched: can't init crq pool\n"); cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0, NULL, NULL); - if (!cfq_pool) panic("cfq_iosched: can't init cfq pool\n"); - cfq_mpool = mempool_create(64, mempool_alloc_slab, mempool_free_slab, cfq_pool); - - if (!cfq_mpool) - panic("cfq_iosched: can't init cfq mpool\n"); + cfq_ioc_pool = kmem_cache_create("cfq_ioc_pool", + sizeof(struct cfq_io_context), 0, 0, NULL, NULL); + if (!cfq_ioc_pool) + panic("cfq_iosched: can't init ioc pool\n"); return 0; } @@ -791,6 +1570,83 @@ cfq_var_store(unsigned int *var, const c return count; } +static ssize_t +cfq_clear_elapsed(struct cfq_data *cfqd, const char *page, size_t count) +{ + max_elapsed_dispatch = max_elapsed_crq = 0; + return count; +} + +static ssize_t +cfq_set_key_type(struct cfq_data *cfqd, const char *page, size_t count) +{ + spin_lock_irq(cfqd->queue->queue_lock); + if (!strncmp(page, "pgid", 4)) + cfqd->key_type = CFQ_KEY_PGID; + else if (!strncmp(page, "tgid", 4)) + cfqd->key_type = CFQ_KEY_TGID; + 
else if (!strncmp(page, "uid", 3)) + cfqd->key_type = CFQ_KEY_UID; + else if (!strncmp(page, "gid", 3)) + cfqd->key_type = CFQ_KEY_GID; + spin_unlock_irq(cfqd->queue->queue_lock); + return count; +} + +static ssize_t +cfq_read_key_type(struct cfq_data *cfqd, char *page) +{ + ssize_t len = 0; + int i; + + for (i = CFQ_KEY_PGID; i < CFQ_KEY_LAST; i++) { + if (cfqd->key_type == i) + len += sprintf(page+len, "[%s] ", cfq_key_types[i]); + else + len += sprintf(page+len, "%s ", cfq_key_types[i]); + } + len += sprintf(page+len, "\n"); + return len; +} + +static ssize_t +cfq_status_show(struct cfq_data *cfqd, char *page) +{ + struct list_head *entry; + struct cfq_queue *cfqq; + ssize_t len; + int i = 0, queues; + + len = sprintf(page, "Busy queues: %u\n", cfqd->busy_queues); + len += sprintf(page+len, "key type: %s\n", cfq_key_types[cfqd->key_type]); + len += sprintf(page+len, "last sector: %Lu\n", (u64) cfqd->last_sector); + len += sprintf(page+len, "max time in iosched: %lu\n", max_elapsed_dispatch); + len += sprintf(page+len, "max completion time: %lu\n", max_elapsed_crq); + + len += sprintf(page+len, "Busy queue list:\n"); + spin_lock_irq(cfqd->queue->queue_lock); + list_for_each(entry, &cfqd->rr_list) { + i++; + cfqq = list_entry_cfqq(entry); + len += sprintf(page+len, " cfqq: key=%lu alloc=%d/%d, queued=%d/%d, last_fifo=%lu, service_used=%lu\n", cfqq->key, cfqq->allocated[0], cfqq->allocated[1], cfqq->queued[0], cfqq->queued[1], cfqq->last_fifo_expire, cfqq->service_used); + } + len += sprintf(page+len, " busy queues total: %d\n", i); + queues = i; + + len += sprintf(page+len, "Empty queue list:\n"); + i = 0; + list_for_each(entry, &cfqd->empty_list) { + i++; + cfqq = list_entry_cfqq(entry); + len += sprintf(page+len, " cfqq: key=%lu alloc=%d/%d, queued=%d/%d, last_fifo=%lu, service_used=%lu\n", cfqq->key, cfqq->allocated[0], cfqq->allocated[1], cfqq->queued[0], cfqq->queued[1], cfqq->last_fifo_expire, cfqq->service_used); + } + len += sprintf(page+len, " empty queues total: %d\n", i); + queues += i; + len += sprintf(page+len, "Total queues: %d\n", queues); + spin_unlock_irq(cfqd->queue->queue_lock); + return len; +} + #define SHOW_FUNCTION(__FUNC, __VAR) \ static ssize_t __FUNC(struct cfq_data *cfqd, char *page) \ { \ @@ -798,6 +1654,13 @@ static ssize_t __FUNC(struct cfq_data *c } SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum); SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued); +SHOW_FUNCTION(cfq_tagged_show, cfqd->cfq_tagged); +SHOW_FUNCTION(cfq_fifo_expire_r_show, cfqd->cfq_fifo_expire_r); +SHOW_FUNCTION(cfq_fifo_expire_w_show, cfqd->cfq_fifo_expire_w); +SHOW_FUNCTION(cfq_fifo_batch_expire_show, cfqd->cfq_fifo_batch_expire); +SHOW_FUNCTION(cfq_find_best_show, cfqd->find_best_crq); +SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max); +SHOW_FUNCTION(cfq_back_penalty_show, cfqd->cfq_back_penalty); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ @@ -810,8 +1673,15 @@ static ssize_t __FUNC(struct cfq_data *c *(__PTR) = (MAX); \ return ret; \ } -STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, INT_MAX); -STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, INT_MAX); +STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX); +STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX); +STORE_FUNCTION(cfq_tagged_store, &cfqd->cfq_tagged, 0, 1); +STORE_FUNCTION(cfq_fifo_expire_r_store, &cfqd->cfq_fifo_expire_r, 1, UINT_MAX); +STORE_FUNCTION(cfq_fifo_expire_w_store, &cfqd->cfq_fifo_expire_w, 1, UINT_MAX); 
+STORE_FUNCTION(cfq_fifo_batch_expire_store, &cfqd->cfq_fifo_batch_expire, 0, UINT_MAX); +STORE_FUNCTION(cfq_find_best_store, &cfqd->find_best_crq, 0, 1); +STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX); +STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX); #undef STORE_FUNCTION static struct cfq_fs_entry cfq_quantum_entry = { @@ -824,10 +1694,68 @@ static struct cfq_fs_entry cfq_queued_en .show = cfq_queued_show, .store = cfq_queued_store, }; +static struct cfq_fs_entry cfq_tagged_entry = { + .attr = {.name = "tagged", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_tagged_show, + .store = cfq_tagged_store, +}; +static struct cfq_fs_entry cfq_fifo_expire_r_entry = { + .attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_fifo_expire_r_show, + .store = cfq_fifo_expire_r_store, +}; +static struct cfq_fs_entry cfq_fifo_expire_w_entry = { + .attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_fifo_expire_w_show, + .store = cfq_fifo_expire_w_store, +}; +static struct cfq_fs_entry cfq_fifo_batch_expire_entry = { + .attr = {.name = "fifo_batch_expire", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_fifo_batch_expire_show, + .store = cfq_fifo_batch_expire_store, +}; +static struct cfq_fs_entry cfq_find_best_entry = { + .attr = {.name = "find_best_crq", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_find_best_show, + .store = cfq_find_best_store, +}; +static struct cfq_fs_entry cfq_back_max_entry = { + .attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_back_max_show, + .store = cfq_back_max_store, +}; +static struct cfq_fs_entry cfq_back_penalty_entry = { + .attr = {.name = "back_seek_penalty", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_back_penalty_show, + .store = cfq_back_penalty_store, +}; +static struct cfq_fs_entry cfq_clear_elapsed_entry = { + .attr = {.name = "clear_elapsed", .mode = S_IWUSR }, + .store = cfq_clear_elapsed, +}; +static struct cfq_fs_entry cfq_misc_entry = { + .attr = {.name = "show_status", .mode = S_IRUGO }, + .show = cfq_status_show, +}; +static struct cfq_fs_entry cfq_key_type_entry = { + .attr = {.name = "key_type", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_read_key_type, + .store = cfq_set_key_type, +}; static struct attribute *default_attrs[] = { &cfq_quantum_entry.attr, &cfq_queued_entry.attr, + &cfq_tagged_entry.attr, + &cfq_fifo_expire_r_entry.attr, + &cfq_fifo_expire_w_entry.attr, + &cfq_fifo_batch_expire_entry.attr, + &cfq_key_type_entry.attr, + &cfq_find_best_entry.attr, + &cfq_back_max_entry.attr, + &cfq_back_penalty_entry.attr, + &cfq_clear_elapsed_entry.attr, + &cfq_misc_entry.attr, NULL, }; @@ -878,6 +1806,7 @@ elevator_t iosched_cfq = { .elevator_add_req_fn = cfq_insert_request, .elevator_remove_req_fn = cfq_remove_request, .elevator_queue_empty_fn = cfq_queue_empty, + .elevator_completed_req_fn = cfq_completed_request, .elevator_former_req_fn = cfq_former_request, .elevator_latter_req_fn = cfq_latter_request, .elevator_set_req_fn = cfq_set_request, diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/block/elevator.c linux-2.6.8.1-ck7/drivers/block/elevator.c --- linux-2.6.8.1-ck6/drivers/block/elevator.c 2004-08-15 14:08:05.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/block/elevator.c 2004-09-09 22:56:38.680117486 +1000 @@ -346,7 +346,7 @@ int elv_may_queue(request_queue_t *q, in if (e->elevator_may_queue_fn) return e->elevator_may_queue_fn(q, rw); - return 0; + return ELV_MQUEUE_MAY; } void 
elv_completed_request(request_queue_t *q, struct request *rq) diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/block/ll_rw_blk.c linux-2.6.8.1-ck7/drivers/block/ll_rw_blk.c --- linux-2.6.8.1-ck6/drivers/block/ll_rw_blk.c 2004-09-09 22:56:24.789284955 +1000 +++ linux-2.6.8.1-ck7/drivers/block/ll_rw_blk.c 2004-09-09 22:56:38.682117174 +1000 @@ -241,6 +241,7 @@ void blk_queue_make_request(request_queu blk_queue_hardsect_size(q, 512); blk_queue_dma_alignment(q, 511); blk_queue_congestion_threshold(q); + q->nr_batching = BLK_BATCH_REQ; q->unplug_thresh = 4; /* hmm */ q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ @@ -263,6 +264,45 @@ void blk_queue_make_request(request_queu EXPORT_SYMBOL(blk_queue_make_request); /** + * blk_queue_ordered - does this queue support ordered writes + * @q: the request queue + * @flag: see below + * + * Description: + * For journalled file systems, doing ordered writes on a commit + * block instead of explicitly doing wait_on_buffer (which is bad + * for performance) can be a big win. Block drivers supporting this + * feature should call this function and indicate so. + * + **/ +void blk_queue_ordered(request_queue_t *q, int flag) +{ + if (flag) + set_bit(QUEUE_FLAG_ORDERED, &q->queue_flags); + else + clear_bit(QUEUE_FLAG_ORDERED, &q->queue_flags); +} + +EXPORT_SYMBOL(blk_queue_ordered); + +/** + * blk_queue_issue_flush_fn - set function for issuing a flush + * @q: the request queue + * @iff: the function to be called when issuing a flush + * + * Description: + * If a driver supports issuing a flush command, it notifies the block + * layer of that support by registering the function through this call. + * + **/ +void blk_queue_issue_flush_fn(request_queue_t *q, issue_flush_fn *iff) +{ + q->issue_flush_fn = iff; +} + +EXPORT_SYMBOL(blk_queue_issue_flush_fn); + +/** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device * @dma_addr: bus address limit @@ -482,15 +522,14 @@ struct request *blk_queue_find_tag(reque EXPORT_SYMBOL(blk_queue_find_tag); /** - * blk_queue_free_tags - release tag maintenance info + * __blk_queue_free_tags - release tag maintenance info * @q: the request queue for the device * * Notes: * blk_cleanup_queue() will take care of calling this function, if tagging - * has been used. So there's usually no need to call this directly, unless - * tagging is just being disabled but the queue remains in function. + * has been used. So there's no need to call this directly. **/ -void blk_queue_free_tags(request_queue_t *q) +static void __blk_queue_free_tags(request_queue_t *q) { struct blk_queue_tag *bqt = q->queue_tags; @@ -514,12 +553,27 @@ void blk_queue_free_tags(request_queue_t q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED); } +/** + * blk_queue_free_tags - release tag maintenance info + * @q: the request queue for the device + * + * Notes: + * This is used to disable tagged queuing on a device, yet leave the + * queue functional. 
+ **/ +void blk_queue_free_tags(request_queue_t *q) +{ + clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); +} + EXPORT_SYMBOL(blk_queue_free_tags); static int init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth) { int bits, i; + struct request **tag_index; + unsigned long *tag_map; if (depth > q->nr_requests * 2) { depth = q->nr_requests * 2; @@ -527,32 +581,31 @@ init_tag_map(request_queue_t *q, struct __FUNCTION__, depth); } - tags->tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC); - if (!tags->tag_index) + tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC); + if (!tag_index) goto fail; bits = (depth / BLK_TAGS_PER_LONG) + 1; - tags->tag_map = kmalloc(bits * sizeof(unsigned long), GFP_ATOMIC); - if (!tags->tag_map) + tag_map = kmalloc(bits * sizeof(unsigned long), GFP_ATOMIC); + if (!tag_map) goto fail; - memset(tags->tag_index, 0, depth * sizeof(struct request *)); - memset(tags->tag_map, 0, bits * sizeof(unsigned long)); + memset(tag_index, 0, depth * sizeof(struct request *)); + memset(tag_map, 0, bits * sizeof(unsigned long)); tags->max_depth = depth; tags->real_max_depth = bits * BITS_PER_LONG; + tags->tag_index = tag_index; + tags->tag_map = tag_map; /* * set the upper bits if the depth isn't a multiple of the word size */ for (i = depth; i < bits * BLK_TAGS_PER_LONG; i++) - __set_bit(i, tags->tag_map); + __set_bit(i, tag_map); - INIT_LIST_HEAD(&tags->busy_list); - tags->busy = 0; - atomic_set(&tags->refcnt, 1); return 0; fail: - kfree(tags->tag_index); + kfree(tag_index); return -ENOMEM; } @@ -564,13 +617,26 @@ fail: int blk_queue_init_tags(request_queue_t *q, int depth, struct blk_queue_tag *tags) { - if (!tags) { + int rc; + + BUG_ON(tags && q->queue_tags && tags != q->queue_tags); + + if (!tags && !q->queue_tags) { tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC); if (!tags) goto fail; if (init_tag_map(q, tags, depth)) goto fail; + + INIT_LIST_HEAD(&tags->busy_list); + tags->busy = 0; + atomic_set(&tags->refcnt, 1); + } else if (q->queue_tags) { + if ((rc = blk_queue_resize_tags(q, depth))) + return rc; + set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); + return 0; } else atomic_inc(&tags->refcnt); @@ -1335,8 +1401,8 @@ void blk_cleanup_queue(request_queue_t * if (rl->rq_pool) mempool_destroy(rl->rq_pool); - if (blk_queue_tagged(q)) - blk_queue_free_tags(q); + if (q->queue_tags) + __blk_queue_free_tags(q); kmem_cache_free(requestq_cachep, q); } @@ -1487,8 +1553,10 @@ request_queue_t *blk_init_queue(request_ /* * all done */ - if (!elevator_init(q, chosen_elevator)) + if (!elevator_init(q, chosen_elevator)) { + blk_queue_congestion_threshold(q); return q; + } blk_cleanup_queue(q); out_init: @@ -1516,13 +1584,20 @@ static inline void blk_free_request(requ mempool_free(rq, q->rq.rq_pool); } -static inline struct request *blk_alloc_request(request_queue_t *q,int gfp_mask) +static inline struct request *blk_alloc_request(request_queue_t *q, int rw, + int gfp_mask) { struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); if (!rq) return NULL; + /* + * first three bits are identical in rq->flags and bio->bi_rw, + * see bio.h and blkdev.h + */ + rq->flags = rw; + if (!elv_set_request(q, rq, gfp_mask)) return rq; @@ -1534,7 +1609,7 @@ static inline struct request *blk_alloc_ * ioc_batching returns true if the ioc is a valid batching request and * should be given priority access to a request. 
*/ -static inline int ioc_batching(struct io_context *ioc) +static inline int ioc_batching(request_queue_t *q, struct io_context *ioc) { if (!ioc) return 0; @@ -1544,7 +1619,7 @@ static inline int ioc_batching(struct io * even if the batch times out, otherwise we could theoretically * lose wakeups. */ - return ioc->nr_batch_requests == BLK_BATCH_REQ || + return ioc->nr_batch_requests == q->nr_batching || (ioc->nr_batch_requests > 0 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); } @@ -1555,12 +1630,12 @@ static inline int ioc_batching(struct io * is the behaviour we want though - once it gets a wakeup it should be given * a nice run. */ -void ioc_set_batching(struct io_context *ioc) +void ioc_set_batching(request_queue_t *q, struct io_context *ioc) { - if (!ioc || ioc_batching(ioc)) + if (!ioc || ioc_batching(q, ioc)) return; - ioc->nr_batch_requests = BLK_BATCH_REQ; + ioc->nr_batch_requests = q->nr_batching; ioc->last_waited = jiffies; } @@ -1576,10 +1651,10 @@ static void freed_request(request_queue_ if (rl->count[rw] < queue_congestion_off_threshold(q)) clear_queue_congested(q, rw); if (rl->count[rw]+1 <= q->nr_requests) { + smp_mb(); if (waitqueue_active(&rl->wait[rw])) wake_up(&rl->wait[rw]); - if (!waitqueue_active(&rl->wait[rw])) - blk_clear_queue_full(q, rw); + blk_clear_queue_full(q, rw); } } @@ -1602,13 +1677,22 @@ static struct request *get_request(reque * will be blocked. */ if (!blk_queue_full(q, rw)) { - ioc_set_batching(ioc); + ioc_set_batching(q, ioc); blk_set_queue_full(q, rw); } } - if (blk_queue_full(q, rw) - && !ioc_batching(ioc) && !elv_may_queue(q, rw)) { + switch (elv_may_queue(q, rw)) { + case ELV_MQUEUE_NO: + spin_unlock_irq(q->queue_lock); + goto out; + case ELV_MQUEUE_MAY: + break; + case ELV_MQUEUE_MUST: + goto get_rq; + } + + if (blk_queue_full(q, rw) && !ioc_batching(q, ioc)) { /* * The queue is full and the allocating process is not a * "batcher", and not exempted by the IO scheduler @@ -1617,12 +1701,15 @@ static struct request *get_request(reque goto out; } +get_rq: rl->count[rw]++; +#if 0 if (rl->count[rw] >= queue_congestion_on_threshold(q)) set_queue_congested(q, rw); +#endif spin_unlock_irq(q->queue_lock); - rq = blk_alloc_request(q, gfp_mask); + rq = blk_alloc_request(q, rw, gfp_mask); if (!rq) { /* * Allocation failed presumably due to memory. 
Undo anything @@ -1637,17 +1724,11 @@ static struct request *get_request(reque goto out; } - if (ioc_batching(ioc)) + if (ioc_batching(q, ioc)) ioc->nr_batch_requests--; INIT_LIST_HEAD(&rq->queuelist); - /* - * first three bits are identical in rq->flags and bio->bi_rw, - * see bio.h and blkdev.h - */ - rq->flags = rw; - rq->errors = 0; rq->rq_status = RQ_ACTIVE; rq->bio = rq->biotail = NULL; @@ -1696,7 +1777,7 @@ static struct request *get_request_wait( * See ioc_batching, ioc_set_batching */ ioc = get_io_context(GFP_NOIO); - ioc_set_batching(ioc); + ioc_set_batching(q, ioc); put_io_context(ioc); } finish_wait(&rl->wait[rw], &wait); @@ -1925,10 +2006,11 @@ int blk_execute_rq(request_queue_t *q, s } rq->flags |= REQ_NOMERGE; - rq->waiting = &wait; + if (!rq->waiting) + rq->waiting = &wait; elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1); generic_unplug_device(q); - wait_for_completion(&wait); + wait_for_completion(rq->waiting); rq->waiting = NULL; if (rq->errors) @@ -1939,6 +2021,72 @@ int blk_execute_rq(request_queue_t *q, s EXPORT_SYMBOL(blk_execute_rq); +/** + * blkdev_issue_flush - queue a flush + * @bdev: blockdev to issue flush for + * @error_sector: error sector + * + * Description: + * Issue a flush for the block device in question. Caller can supply + * room for storing the error offset in case of a flush error, if they + * wish to. Caller must run wait_for_completion() on its own. + */ +int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) +{ + request_queue_t *q; + + if (bdev->bd_disk == NULL) + return -ENXIO; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + if (!q->issue_flush_fn) + return -EOPNOTSUPP; + + return q->issue_flush_fn(q, bdev->bd_disk, error_sector); +} + +EXPORT_SYMBOL(blkdev_issue_flush); + +/** + * blkdev_scsi_issue_flush_fn - issue flush for SCSI devices + * @q: device queue + * @disk: gendisk + * @error_sector: error offset + * + * Description: + * Devices understanding the SCSI command set, can use this function as + * a helper for issuing a cache flush. Note: driver is required to store + * the error offset (in case of error flushing) in ->sector of struct + * request. 
+ */ +int blkdev_scsi_issue_flush_fn(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + struct request *rq = blk_get_request(q, WRITE, __GFP_WAIT); + int ret; + + rq->flags |= REQ_BLOCK_PC | REQ_SOFTBARRIER; + rq->sector = 0; + memset(rq->cmd, 0, sizeof(rq->cmd)); + rq->cmd[0] = 0x35; + rq->cmd_len = 12; + rq->data = NULL; + rq->data_len = 0; + rq->timeout = 60 * HZ; + + ret = blk_execute_rq(q, disk, rq); + + if (ret && error_sector) + *error_sector = rq->sector; + + blk_put_request(rq); + return ret; +} + +EXPORT_SYMBOL(blkdev_scsi_issue_flush_fn); + void drive_stat_acct(struct request *rq, int nr_sectors, int new_io) { int rw = rq_data_dir(rq); @@ -2192,7 +2340,7 @@ EXPORT_SYMBOL(__blk_attempt_remerge); static int __make_request(request_queue_t *q, struct bio *bio) { struct request *req, *freereq = NULL; - int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, ra; + int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err; sector_t sector; sector = bio->bi_sector; @@ -2210,9 +2358,11 @@ static int __make_request(request_queue_ spin_lock_prefetch(q->queue_lock); - barrier = test_bit(BIO_RW_BARRIER, &bio->bi_rw); - - ra = bio->bi_rw & (1 << BIO_RW_AHEAD); + barrier = bio_barrier(bio); + if (barrier && !(q->queue_flags & (1 << QUEUE_FLAG_ORDERED))) { + err = -EOPNOTSUPP; + goto end_io; + } again: spin_lock_irq(q->queue_lock); @@ -2292,7 +2442,8 @@ get_rq: /* * READA bit set */ - if (ra) + err = -EWOULDBLOCK; + if (bio_rw_ahead(bio)) goto end_io; freereq = get_request_wait(q, rw); @@ -2303,10 +2454,9 @@ get_rq: req->flags |= REQ_CMD; /* - * inherit FAILFAST from bio and don't stack up - * retries for read ahead + * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) */ - if (ra || test_bit(BIO_RW_FAILFAST, &bio->bi_rw)) + if (bio_rw_ahead(bio) || bio_failfast(bio)) req->flags |= REQ_FAILFAST; /* @@ -2340,7 +2490,7 @@ out: return 0; end_io: - bio_endio(bio, nr_sectors << 9, -EWOULDBLOCK); + bio_endio(bio, nr_sectors << 9, err); return 0; } @@ -2647,10 +2797,17 @@ void blk_recalc_rq_sectors(struct reques static int __end_that_request_first(struct request *req, int uptodate, int nr_bytes) { - int total_bytes, bio_nbytes, error = 0, next_idx = 0; + int total_bytes, bio_nbytes, error, next_idx = 0; struct bio *bio; /* + * extend uptodate bool to allow < 0 value to be direct io error + */ + error = 0; + if (end_io_error(uptodate)) + error = !uptodate ? -EIO : uptodate; + + /* * for a REQ_BLOCK_PC request, we want to carry any eventual * sense key with us all the way through */ @@ -2658,7 +2815,6 @@ static int __end_that_request_first(stru req->errors = 0; if (!uptodate) { - error = -EIO; if (blk_fs_request(req) && !(req->flags & REQ_QUIET)) printk("end_request: I/O error, dev %s, sector %llu\n", req->rq_disk ? 
req->rq_disk->disk_name : "?", @@ -2741,7 +2897,7 @@ static int __end_that_request_first(stru /** * end_that_request_first - end I/O on a request * @req: the request being processed - * @uptodate: 0 for I/O error + * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error * @nr_sectors: number of sectors to end I/O on * * Description: @@ -2762,7 +2918,7 @@ EXPORT_SYMBOL(end_that_request_first); /** * end_that_request_chunk - end I/O on a request * @req: the request being processed - * @uptodate: 0 for I/O error + * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error * @nr_bytes: number of bytes to complete * * Description: @@ -2908,6 +3064,9 @@ void put_io_context(struct io_context *i if (atomic_dec_and_test(&ioc->refcount)) { if (ioc->aic && ioc->aic->dtor) ioc->aic->dtor(ioc->aic); + if (ioc->cic && ioc->cic->dtor) + ioc->cic->dtor(ioc->cic); + kmem_cache_free(iocontext_cachep, ioc); } } @@ -2920,14 +3079,15 @@ void exit_io_context(void) local_irq_save(flags); ioc = current->io_context; - if (ioc) { - if (ioc->aic && ioc->aic->exit) - ioc->aic->exit(ioc->aic); - put_io_context(ioc); - current->io_context = NULL; - } else - WARN_ON(1); + current->io_context = NULL; local_irq_restore(flags); + + if (ioc->aic && ioc->aic->exit) + ioc->aic->exit(ioc->aic); + if (ioc->cic && ioc->cic->exit) + ioc->cic->exit(ioc->cic); + + put_io_context(ioc); } /* @@ -2946,20 +3106,39 @@ struct io_context *get_io_context(int gf local_irq_save(flags); ret = tsk->io_context; - if (ret == NULL) { - ret = kmem_cache_alloc(iocontext_cachep, GFP_ATOMIC); - if (ret) { - atomic_set(&ret->refcount, 1); - ret->pid = tsk->pid; - ret->last_waited = jiffies; /* doesn't matter... */ - ret->nr_batch_requests = 0; /* because this is 0 */ - ret->aic = NULL; + if (ret) + goto out; + + local_irq_restore(flags); + + ret = kmem_cache_alloc(iocontext_cachep, gfp_flags); + if (ret) { + atomic_set(&ret->refcount, 1); + ret->pid = tsk->pid; + ret->last_waited = jiffies; /* doesn't matter... */ + ret->nr_batch_requests = 0; /* because this is 0 */ + ret->aic = NULL; + ret->cic = NULL; + spin_lock_init(&ret->lock); + + local_irq_save(flags); + + /* + * very unlikely, someone raced with us in setting up the task + * io context. free new context and just grab a reference. 
+ */ + if (!tsk->io_context) tsk->io_context = ret; + else { + kmem_cache_free(iocontext_cachep, ret); + ret = tsk->io_context; } - } - if (ret) + +out: atomic_inc(&ret->refcount); - local_irq_restore(flags); + local_irq_restore(flags); + } + return ret; } diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/ide/ide.c linux-2.6.8.1-ck7/drivers/ide/ide.c --- linux-2.6.8.1-ck6/drivers/ide/ide.c 2004-08-15 14:08:06.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/ide/ide.c 2004-09-09 22:56:38.699114522 +1000 @@ -437,6 +437,30 @@ u8 ide_dump_status (ide_drive_t *drive, #endif /* FANCY_STATUS_DUMPS */ printk("\n"); } + { + struct request *rq; + int opcode = 0x100; + + spin_lock(&ide_lock); + rq = HWGROUP(drive)->rq; + spin_unlock(&ide_lock); + if (!rq) + goto out; + if (rq->flags & (REQ_DRIVE_CMD | REQ_DRIVE_TASK)) { + char *args = rq->buffer; + if (args) + opcode = args[0]; + } else if (rq->flags & REQ_DRIVE_TASKFILE) { + ide_task_t *args = rq->special; + if (args) { + task_struct_t *tf = (task_struct_t *) args->tfRegister; + opcode = tf->command; + } + } + + printk("ide: failed opcode was %x\n", opcode); + } +out: local_irq_restore(flags); return err; } diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/ide/ide-disk.c linux-2.6.8.1-ck7/drivers/ide/ide-disk.c --- linux-2.6.8.1-ck6/drivers/ide/ide-disk.c 2004-08-15 14:08:06.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/ide/ide-disk.c 2004-09-09 22:56:38.702114054 +1000 @@ -702,6 +702,37 @@ static u8 idedisk_dump_status (ide_drive } #endif /* FANCY_STATUS_DUMPS */ printk("\n"); + { + struct request *rq; + unsigned char opcode = 0; + int found = 0; + + spin_lock(&ide_lock); + rq = HWGROUP(drive)->rq; + spin_unlock(&ide_lock); + if (!rq) + goto out; + if (rq->flags & (REQ_DRIVE_CMD | REQ_DRIVE_TASK)) { + char *args = rq->buffer; + if (args) { + opcode = args[0]; + found = 1; + } + } else if (rq->flags & REQ_DRIVE_TASKFILE) { + ide_task_t *args = rq->special; + if (args) { + task_struct_t *tf = (task_struct_t *) args->tfRegister; + opcode = tf->command; + found = 1; + } + } + printk("ide: failed opcode was: "); + if (!found) + printk("unknown\n"); + else + printk("0x%02x\n", opcode); + } +out: local_irq_restore(flags); return err; } @@ -1203,6 +1234,42 @@ static ide_proc_entry_t idedisk_proc[] = #endif /* CONFIG_PROC_FS */ +static int idedisk_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + ide_drive_t *drive = q->queuedata; + struct request *rq; + int ret; + + if (!drive->wcache) + return 0; + + rq = blk_get_request(q, WRITE, __GFP_WAIT); + + memset(rq->cmd, 0, sizeof(rq->cmd)); + + if (ide_id_has_flush_cache_ext(drive->id) && + (drive->capacity64 >= (1UL << 28))) + rq->cmd[0] = WIN_FLUSH_CACHE_EXT; + else + rq->cmd[0] = WIN_FLUSH_CACHE; + + + rq->flags |= REQ_DRIVE_TASK | REQ_SOFTBARRIER; + rq->buffer = rq->cmd; + + ret = blk_execute_rq(q, disk, rq); + + /* + * if we failed and caller wants error offset, get it + */ + if (ret && error_sector) + *error_sector = ide_get_error_location(drive, rq->cmd); + + blk_put_request(rq); + return ret; +} + /* * This is tightly woven into the driver->do_special can not touch. * DON'T do it again until a total personality rewrite is committed. 
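/*
 * [Editor's illustration -- not part of the patch.] A minimal sketch of how a
 * low-level driver opts in to the barrier/flush machinery added in this
 * patch. "mydrv" and its functions are hypothetical names; blk_queue_ordered()
 * and blk_queue_issue_flush_fn() are the helpers introduced in ll_rw_blk.c
 * above, and idedisk_issue_flush() just above is a real in-tree example.
 */
#include <linux/blkdev.h>	/* request_queue_t, blk_queue_*() */

static int mydrv_issue_flush(request_queue_t *q, struct gendisk *disk,
			     sector_t *error_sector)
{
	/*
	 * Tell the hardware to flush its write cache here. On failure, the
	 * driver may store the failed offset in *error_sector (if non-NULL)
	 * and return a negative errno.
	 */
	return 0;
}

static void mydrv_init_queue(request_queue_t *q)
{
	blk_queue_ordered(q, 1);			/* barrier writes are honored */
	blk_queue_issue_flush_fn(q, mydrv_issue_flush);	/* cache flush callback */
}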
@@ -1231,16 +1298,10 @@ static int set_nowerr(ide_drive_t *drive return 0; } -/* check if CACHE FLUSH (EXT) command is supported (bits defined in ATA-6) */ -#define ide_id_has_flush_cache(id) ((id)->cfs_enable_2 & 0x3000) - -/* some Maxtor disks have bit 13 defined incorrectly so check bit 10 too */ -#define ide_id_has_flush_cache_ext(id) \ - (((id)->cfs_enable_2 & 0x2400) == 0x2400) - static int write_cache (ide_drive_t *drive, int arg) { ide_task_t args; + int err; if (!ide_id_has_flush_cache(drive->id)) return 1; @@ -1251,7 +1312,10 @@ static int write_cache (ide_drive_t *dri args.tfRegister[IDE_COMMAND_OFFSET] = WIN_SETFEATURES; args.command_type = IDE_DRIVE_TASK_NO_DATA; args.handler = &task_no_data_intr; - (void) ide_raw_taskfile(drive, &args, NULL); + + err = ide_raw_taskfile(drive, &args, NULL); + if (err) + return err; drive->wcache = arg; return 0; @@ -1412,6 +1476,7 @@ static void idedisk_setup (ide_drive_t * { struct hd_driveid *id = drive->id; unsigned long long capacity; + int barrier; idedisk_add_settings(drive); @@ -1543,6 +1608,27 @@ static void idedisk_setup (ide_drive_t * drive->wcache = 1; write_cache(drive, 1); + + /* + * decide if we can sanely support flushes and barriers on + * this drive. unfortunately not all drives advertise FLUSH_CACHE + * support even if they support it. So assume FLUSH_CACHE is there + * always. LBA48 drives are newer, so expect it to flag support + * properly. We can safely support FLUSH_CACHE on lba48, if capacity + * doesn't exceed lba28 + */ + barrier = 1; + if (drive->addressing == 1) { + if (capacity > (1ULL << 28) && !ide_id_has_flush_cache_ext(id)) + barrier = 0; + } + + printk("%s: cache flushes %ssupported\n", + drive->name, barrier ? "" : "not "); + if (barrier) { + blk_queue_ordered(drive->queue, 1); + blk_queue_issue_flush_fn(drive->queue, idedisk_issue_flush); + } } static void ide_cacheflush_p(ide_drive_t *drive) diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/ide/ide-io.c linux-2.6.8.1-ck7/drivers/ide/ide-io.c --- linux-2.6.8.1-ck6/drivers/ide/ide-io.c 2004-06-16 17:35:36.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/ide/ide-io.c 2004-09-09 22:56:38.705113586 +1000 @@ -54,38 +54,77 @@ #include #include -/** - * ide_end_request - complete an IDE I/O - * @drive: IDE device for the I/O - * @uptodate: - * @nr_sectors: number of sectors completed - * - * This is our end_request wrapper function. We complete the I/O - * update random number input and dequeue the request, which if - * it was tagged may be out of order. 
+static void ide_fill_flush_cmd(ide_drive_t *drive, struct request *rq) +{ + char *buf = rq->cmd; + + /* + * reuse cdb space for ata command + */ + memset(buf, 0, sizeof(rq->cmd)); + + rq->flags |= REQ_DRIVE_TASK | REQ_STARTED; + rq->buffer = buf; + rq->buffer[0] = WIN_FLUSH_CACHE; + + if (ide_id_has_flush_cache_ext(drive->id) && + (drive->capacity64 >= (1UL << 28))) + rq->buffer[0] = WIN_FLUSH_CACHE_EXT; +} + +/* + * preempt pending requests, and store this cache flush for immediate + * execution */ - -int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors) +static struct request *ide_queue_flush_cmd(ide_drive_t *drive, + struct request *rq, int post) { - struct request *rq; - unsigned long flags; - int ret = 1; + struct request *flush_rq = &HWGROUP(drive)->wrq; - spin_lock_irqsave(&ide_lock, flags); - rq = HWGROUP(drive)->rq; + /* + * write cache disabled, clear the barrier bit and treat it like + * an ordinary write + */ + if (!drive->wcache) { + rq->flags |= REQ_BAR_PREFLUSH; + return rq; + } - BUG_ON(!(rq->flags & REQ_STARTED)); + ide_init_drive_cmd(flush_rq); + ide_fill_flush_cmd(drive, flush_rq); - if (!nr_sectors) - nr_sectors = rq->hard_cur_sectors; + flush_rq->special = rq; + flush_rq->nr_sectors = rq->nr_sectors; + + if (!post) { + drive->doing_barrier = 1; + flush_rq->flags |= REQ_BAR_PREFLUSH; + blkdev_dequeue_request(rq); + } else + flush_rq->flags |= REQ_BAR_POSTFLUSH; + + __elv_add_request(drive->queue, flush_rq, ELEVATOR_INSERT_FRONT, 0); + HWGROUP(drive)->rq = NULL; + return flush_rq; +} + +static int __ide_end_request(ide_drive_t *drive, struct request *rq, + int uptodate, int nr_sectors) +{ + int ret = 1; + + BUG_ON(!(rq->flags & REQ_STARTED)); /* * if failfast is set on a request, override number of sectors and * complete the whole request right now */ - if (blk_noretry_request(rq) && !uptodate) + if (blk_noretry_request(rq) && end_io_error(uptodate)) nr_sectors = rq->hard_nr_sectors; + if (!blk_fs_request(rq) && end_io_error(uptodate) && !rq->errors) + rq->errors = -EIO; + /* * decide whether to reenable DMA -- 3 is a random magic for now, * if we DMA timeout more than 3 times, just stay in PIO @@ -97,15 +136,56 @@ int ide_end_request (ide_drive_t *drive, if (!end_that_request_first(rq, uptodate, nr_sectors)) { add_disk_randomness(rq->rq_disk); + + if (blk_rq_tagged(rq)) + blk_queue_end_tag(drive->queue, rq); + blkdev_dequeue_request(rq); HWGROUP(drive)->rq = NULL; end_that_request_last(rq); ret = 0; } - spin_unlock_irqrestore(&ide_lock, flags); return ret; } +/** + * ide_end_request - complete an IDE I/O + * @drive: IDE device for the I/O + * @uptodate: + * @nr_sectors: number of sectors completed + * + * This is our end_request wrapper function. We complete the I/O + * update random number input and dequeue the request, which if + * it was tagged may be out of order. 
+ */ + +int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors) +{ + struct request *rq; + unsigned long flags; + int ret = 1; + + spin_lock_irqsave(&ide_lock, flags); + rq = HWGROUP(drive)->rq; + + if (!nr_sectors) + nr_sectors = rq->hard_cur_sectors; + + if (!blk_barrier_rq(rq) || !drive->wcache) + ret = __ide_end_request(drive, rq, uptodate, nr_sectors); + else { + struct request *flush_rq = &HWGROUP(drive)->wrq; + + flush_rq->nr_sectors -= nr_sectors; + if (!flush_rq->nr_sectors) { + ide_queue_flush_cmd(drive, rq, 1); + ret = 0; + } + } + + spin_unlock_irqrestore(&ide_lock, flags); + return ret; +} EXPORT_SYMBOL(ide_end_request); /** @@ -137,6 +217,113 @@ static void ide_complete_pm_request (ide spin_unlock_irqrestore(&ide_lock, flags); } +/* + * FIXME: probably move this somewhere else, name is bad too :) + */ +u64 ide_get_error_location(ide_drive_t *drive, char *args) +{ + u32 high, low; + u8 hcyl, lcyl, sect; + u64 sector; + + high = 0; + hcyl = args[5]; + lcyl = args[4]; + sect = args[3]; + + if (ide_id_has_flush_cache_ext(drive->id)) { + low = (hcyl << 16) | (lcyl << 8) | sect; + HWIF(drive)->OUTB(drive->ctl|0x80, IDE_CONTROL_REG); + high = ide_read_24(drive); + } else { + u8 cur = HWIF(drive)->INB(IDE_SELECT_REG); + if (cur & 0x40) + low = (hcyl << 16) | (lcyl << 8) | sect; + else { + low = hcyl * drive->head * drive->sect; + low += lcyl * drive->sect; + low += sect - 1; + } + } + + sector = ((u64) high << 24) | low; + return sector; +} +EXPORT_SYMBOL(ide_get_error_location); + +static void ide_complete_barrier(ide_drive_t *drive, struct request *rq, + int error) +{ + struct request *real_rq = rq->special; + int good_sectors, bad_sectors; + sector_t sector; + + if (!error) { + if (blk_barrier_postflush(rq)) { + /* + * this completes the barrier write + */ + __ide_end_request(drive, real_rq, 1, real_rq->hard_nr_sectors); + drive->doing_barrier = 0; + } else { + /* + * just indicate that we did the pre flush + */ + real_rq->flags |= REQ_BAR_PREFLUSH; + elv_requeue_request(drive->queue, real_rq); + } + /* + * all is fine, return + */ + return; + } + + /* + * we need to end real_rq, but it's not on the queue currently. 
+ * put it back on the queue, so we don't have to special case + * anything else for completing it + */ + if (!blk_barrier_postflush(rq)) + elv_requeue_request(drive->queue, real_rq); + + /* + * drive aborted flush command, assume FLUSH_CACHE_* doesn't + * work and disable barrier support + */ + if (error & ABRT_ERR) { + printk(KERN_ERR "%s: barrier support doesn't work\n", drive->name); + __ide_end_request(drive, real_rq, -EOPNOTSUPP, real_rq->hard_nr_sectors); + blk_queue_ordered(drive->queue, 0); + blk_queue_issue_flush_fn(drive->queue, NULL); + } else { + /* + * find out what part of the request failed + */ + good_sectors = 0; + if (blk_barrier_postflush(rq)) { + sector = ide_get_error_location(drive, rq->buffer); + + if ((sector >= real_rq->hard_sector) && + (sector < real_rq->hard_sector + real_rq->hard_nr_sectors)) + good_sectors = sector - real_rq->hard_sector; + } else + sector = real_rq->hard_sector; + + bad_sectors = real_rq->hard_nr_sectors - good_sectors; + if (good_sectors) + __ide_end_request(drive, real_rq, 1, good_sectors); + if (bad_sectors) + __ide_end_request(drive, real_rq, 0, bad_sectors); + + printk(KERN_ERR "%s: failed barrier write: " + "sector=%Lx(good=%d/bad=%d)\n", + drive->name, (unsigned long long)sector, + good_sectors, bad_sectors); + } + + drive->doing_barrier = 0; +} + /** * ide_end_drive_cmd - end an explicit drive command * @drive: command @@ -226,6 +413,10 @@ void ide_end_drive_cmd (ide_drive_t *dri spin_lock_irqsave(&ide_lock, flags); blkdev_dequeue_request(rq); + + if (blk_barrier_preflush(rq) || blk_barrier_postflush(rq)) + ide_complete_barrier(drive, rq, err); + HWGROUP(drive)->rq = NULL; end_that_request_last(rq); spin_unlock_irqrestore(&ide_lock, flags); @@ -712,6 +903,22 @@ static inline ide_drive_t *choose_drive repeat: best = NULL; drive = hwgroup->drive; + + /* + * drive is doing pre-flush, ordered write, post-flush sequence. even + * though that is 3 requests, it must be seen as a single transaction. + * we must not preempt this drive until that is complete + */ + if (drive->doing_barrier) { + /* + * small race where queue could get replugged during + * the 3-request flush cycle, just yank the plug since + * we want it to finish asap + */ + blk_remove_plug(drive->queue); + return drive; + } + do { if ((!drive->sleep || time_after_eq(jiffies, drive->sleep)) && !elv_queue_empty(drive->queue)) { @@ -868,6 +1075,13 @@ void ide_do_request (ide_hwgroup_t *hwgr } /* + * if rq is a barrier write, issue pre cache flush if not + * already done + */ + if (blk_barrier_rq(rq) && !blk_barrier_preflush(rq)) + rq = ide_queue_flush_cmd(drive, rq, 0); + + /* * Sanity: don't accept a request that isn't a PM request * if we are currently power managed. 
This is very important as * blk_stop_queue() doesn't prevent the elv_next_request() @@ -917,7 +1131,9 @@ EXPORT_SYMBOL(ide_do_request); */ void do_ide_request(request_queue_t *q) { - ide_do_request(q->queuedata, IDE_NO_IRQ); + ide_drive_t *drive = q->queuedata; + + ide_do_request(HWGROUP(drive), IDE_NO_IRQ); } /* @@ -1286,6 +1502,7 @@ void ide_init_drive_cmd (struct request { memset(rq, 0, sizeof(*rq)); rq->flags = REQ_DRIVE_CMD; + rq->ref_count = 1; } EXPORT_SYMBOL(ide_init_drive_cmd); diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/ide/ide-probe.c linux-2.6.8.1-ck7/drivers/ide/ide-probe.c --- linux-2.6.8.1-ck6/drivers/ide/ide-probe.c 2004-06-16 17:35:36.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/ide/ide-probe.c 2004-09-09 22:56:38.706113430 +1000 @@ -893,7 +893,7 @@ static int ide_init_queue(ide_drive_t *d if (!q) return 1; - q->queuedata = HWGROUP(drive); + q->queuedata = drive; blk_queue_segment_boundary(q, 0xffff); if (!hwif->rqsize) diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/md/dm.c linux-2.6.8.1-ck7/drivers/md/dm.c --- linux-2.6.8.1-ck6/drivers/md/dm.c 2004-08-15 14:08:06.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/md/dm.c 2004-09-09 22:56:38.707113274 +1000 @@ -597,6 +597,21 @@ static int dm_request(request_queue_t *q return 0; } +static int dm_flush_all(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + struct mapped_device *md = q->queuedata; + struct dm_table *map = dm_get_table(md); + int ret = -ENXIO; + + if (map) { + ret = dm_table_flush_all(md->map); + dm_table_put(map); + } + + return ret; +} + static void dm_unplug_all(request_queue_t *q) { struct mapped_device *md = q->queuedata; @@ -764,6 +779,7 @@ static struct mapped_device *alloc_dev(u md->queue->backing_dev_info.congested_data = md; blk_queue_make_request(md->queue, dm_request); md->queue->unplug_fn = dm_unplug_all; + md->queue->issue_flush_fn = dm_flush_all; md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab, mempool_free_slab, _io_cache); diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/md/dm.h linux-2.6.8.1-ck7/drivers/md/dm.h --- linux-2.6.8.1-ck6/drivers/md/dm.h 2004-06-16 17:35:36.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/md/dm.h 2004-09-09 22:56:38.707113274 +1000 @@ -113,6 +113,7 @@ void dm_table_suspend_targets(struct dm_ void dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); void dm_table_unplug_all(struct dm_table *t); +int dm_table_flush_all(struct dm_table *t); /*----------------------------------------------------------------- * A registry of target types. 
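/*
 * [Editor's illustration -- not part of the patch.] The consumer side of the
 * issue_flush hooks that dm and md wire up in the hunks around here: once a
 * journalled file system has written and waited on its commit block, it can
 * ask the device to flush its volatile cache instead of trusting
 * wait_on_buffer alone. commit_and_flush() is a hypothetical caller;
 * blkdev_issue_flush() is the interface added to ll_rw_blk.c above.
 */
#include <linux/blkdev.h>	/* blkdev_issue_flush() */

static int commit_and_flush(struct block_device *bdev)
{
	sector_t error_sector;
	int err;

	/* ... the commit block has been written and completed by now ... */

	err = blkdev_issue_flush(bdev, &error_sector);
	if (err == -EOPNOTSUPP)
		err = 0;	/* queue has no issue_flush_fn; nothing to flush */
	return err;
}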
diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/md/dm-table.c linux-2.6.8.1-ck7/drivers/md/dm-table.c --- linux-2.6.8.1-ck6/drivers/md/dm-table.c 2004-08-15 14:08:06.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/md/dm-table.c 2004-09-09 22:56:38.708113118 +1000 @@ -900,6 +900,28 @@ void dm_table_unplug_all(struct dm_table } } +int dm_table_flush_all(struct dm_table *t) +{ + struct list_head *d, *devices = dm_table_get_devices(t); + int ret = 0; + + for (d = devices->next; d != devices; d = d->next) { + struct dm_dev *dd = list_entry(d, struct dm_dev, list); + request_queue_t *q = bdev_get_queue(dd->bdev); + int err; + + if (!q->issue_flush_fn) + err = -EOPNOTSUPP; + else + err = q->issue_flush_fn(q, dd->bdev->bd_disk, NULL); + + if (!ret) + ret = err; + } + + return ret; +} + EXPORT_SYMBOL(dm_vcalloc); EXPORT_SYMBOL(dm_get_device); EXPORT_SYMBOL(dm_put_device); @@ -908,3 +930,4 @@ EXPORT_SYMBOL(dm_table_get_mode); EXPORT_SYMBOL(dm_table_put); EXPORT_SYMBOL(dm_table_get); EXPORT_SYMBOL(dm_table_unplug_all); +EXPORT_SYMBOL(dm_table_flush_all); diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/md/linear.c linux-2.6.8.1-ck7/drivers/md/linear.c --- linux-2.6.8.1-ck6/drivers/md/linear.c 2004-05-23 12:54:50.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/md/linear.c 2004-09-09 22:56:38.709112962 +1000 @@ -47,7 +47,6 @@ static inline dev_info_t *which_dev(mdde return hash->dev0; } - /** * linear_mergeable_bvec -- tell bio layer if a two requests can be merged * @q: request queue @@ -93,6 +92,27 @@ static void linear_unplug(request_queue_ } } +static int linear_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + linear_conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + for (i=0; i < mddev->raid_disks; i++) { + struct block_device *bdev = conf->disks[i].rdev->bdev; + request_queue_t *r_queue = bdev_get_queue(bdev); + + if (!r_queue->issue_flush_fn) { + ret = -EOPNOTSUPP; + break; + } + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + return ret; +} static int linear_run (mddev_t *mddev) { @@ -200,6 +220,7 @@ static int linear_run (mddev_t *mddev) blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); mddev->queue->unplug_fn = linear_unplug; + mddev->queue->issue_flush_fn = linear_issue_flush; return 0; out: diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/md/md.c linux-2.6.8.1-ck7/drivers/md/md.c --- linux-2.6.8.1-ck6/drivers/md/md.c 2004-06-16 17:35:36.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/md/md.c 2004-09-09 22:56:38.710112806 +1000 @@ -154,6 +154,39 @@ static spinlock_t all_mddevs_lock = SPIN tmp = tmp->next;}) \ ) +int md_flush_mddev(mddev_t *mddev, sector_t *error_sector) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + int ret = 0; + + /* + * this list iteration is done without any locking in md?! 
+ */ + ITERATE_RDEV(mddev, rdev, tmp) { + request_queue_t *r_queue = bdev_get_queue(rdev->bdev); + int err; + + if (!r_queue->issue_flush_fn) + err = -EOPNOTSUPP; + else + err = r_queue->issue_flush_fn(r_queue, rdev->bdev->bd_disk, error_sector); + + if (!ret) + ret = err; + } + + return ret; +} + +static int md_flush_all(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + + return md_flush_mddev(mddev, error_sector); +} + static int md_fail_request (request_queue_t *q, struct bio *bio) { bio_io_error(bio, bio->bi_size); @@ -1645,6 +1678,7 @@ static int do_md_run(mddev_t * mddev) */ mddev->queue->queuedata = mddev; mddev->queue->make_request_fn = mddev->pers->make_request; + mddev->queue->issue_flush_fn = md_flush_all; mddev->changed = 1; return 0; diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/md/multipath.c linux-2.6.8.1-ck7/drivers/md/multipath.c --- linux-2.6.8.1-ck6/drivers/md/multipath.c 2004-08-15 14:08:06.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/md/multipath.c 2004-09-09 22:56:38.711112650 +1000 @@ -120,7 +120,7 @@ int multipath_end_request(struct bio *bi if (uptodate) multipath_end_bh_io(mp_bh, uptodate); - else if ((bio->bi_rw & (1 << BIO_RW_AHEAD)) == 0) { + else if (!bio_rw_ahead(bio)) { /* * oops, IO error: */ @@ -217,6 +217,31 @@ static void multipath_status (struct seq seq_printf (seq, "]"); } +static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + multipath_conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + for (i=0; i<mddev->raid_disks; i++) { + mdk_rdev_t *rdev = conf->multipaths[i].rdev; + if (rdev && !rdev->faulty) { + struct block_device *bdev = rdev->bdev; + request_queue_t *r_queue = bdev_get_queue(bdev); + + if (!r_queue->issue_flush_fn) { + ret = -EOPNOTSUPP; + break; + } + + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + } + return ret; +} /* * Careful, this can execute in IRQ contexts as well! 
@@ -435,6 +460,8 @@ static int multipath_run (mddev_t *mddev mddev->queue->unplug_fn = multipath_unplug; + mddev->queue->issue_flush_fn = multipath_issue_flush; + conf->working_disks = 0; ITERATE_RDEV(mddev,rdev,tmp) { disk_idx = rdev->raid_disk; diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/md/raid0.c linux-2.6.8.1-ck7/drivers/md/raid0.c --- linux-2.6.8.1-ck6/drivers/md/raid0.c 2004-05-23 12:54:50.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/md/raid0.c 2004-09-09 22:56:38.712112494 +1000 @@ -40,6 +40,31 @@ static void raid0_unplug(request_queue_t } } +static int raid0_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + raid0_conf_t *conf = mddev_to_conf(mddev); + mdk_rdev_t **devlist = conf->strip_zone[0].dev; + int i, ret = 0; + + for (i=0; i<mddev->raid_disks; i++) { + struct block_device *bdev = devlist[i]->bdev; + request_queue_t *r_queue = bdev_get_queue(bdev); + + if (!r_queue->issue_flush_fn) { + ret = -EOPNOTSUPP; + break; + } + + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + return ret; +} + + static int create_strip_zones (mddev_t *mddev) { int i, c, j; @@ -219,6 +244,8 @@ static int create_strip_zones (mddev_t * mddev->queue->unplug_fn = raid0_unplug; + mddev->queue->issue_flush_fn = raid0_issue_flush; + printk("raid0: done.\n"); return 0; abort: diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/md/raid1.c linux-2.6.8.1-ck7/drivers/md/raid1.c --- linux-2.6.8.1-ck6/drivers/md/raid1.c 2004-08-15 14:08:06.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/md/raid1.c 2004-09-09 22:56:38.713112338 +1000 @@ -481,6 +481,32 @@ static void raid1_unplug(request_queue_t unplug_slaves(q->queuedata); } +static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + conf_t *conf = mddev_to_conf(mddev); + unsigned long flags; + int i, ret = 0; + + spin_lock_irqsave(&conf->device_lock, flags); + for (i=0; i<mddev->raid_disks; i++) { + mdk_rdev_t *rdev = conf->mirrors[i].rdev; + if (rdev && !rdev->faulty) { + struct block_device *bdev = rdev->bdev; + request_queue_t *r_queue = bdev_get_queue(bdev); + + if (r_queue->issue_flush_fn) { + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + } + } + spin_unlock_irqrestore(&conf->device_lock, flags); + return ret; +} + /* * Throttle resync depth, so that we can both get proper overlapping of * requests, but are still able to handle normal requests quickly. 
@@ -1168,6 +1194,7 @@ static int run(mddev_t *mddev) mddev->queue->unplug_fn = raid1_unplug; + mddev->queue->issue_flush_fn = raid1_issue_flush; ITERATE_RDEV(mddev, rdev, tmp) { disk_idx = rdev->raid_disk; diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/md/raid5.c linux-2.6.8.1-ck7/drivers/md/raid5.c --- linux-2.6.8.1-ck6/drivers/md/raid5.c 2004-08-15 14:08:06.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/md/raid5.c 2004-09-09 22:56:38.714112182 +1000 @@ -1339,6 +1339,39 @@ static void raid5_unplug_device(request_ unplug_slaves(mddev); } +static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + raid5_conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + for (i=0; i<mddev->raid_disks; i++) { + mdk_rdev_t *rdev = conf->disks[i].rdev; + if (rdev && !rdev->faulty) { + struct block_device *bdev = rdev->bdev; + request_queue_t *r_queue; + + if (!bdev) + continue; + + r_queue = bdev_get_queue(bdev); + if (!r_queue) + continue; + + if (!r_queue->issue_flush_fn) { + ret = -EOPNOTSUPP; + break; + } + + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + } + return ret; +} + static inline void raid5_plug_device(raid5_conf_t *conf) { spin_lock_irq(&conf->device_lock); @@ -1545,6 +1578,7 @@ static int run (mddev_t *mddev) atomic_set(&conf->preread_active_stripes, 0); mddev->queue->unplug_fn = raid5_unplug_device; + mddev->queue->issue_flush_fn = raid5_issue_flush; PRINTK("raid5: run(%s) called.\n", mdname(mddev)); diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/md/raid6main.c linux-2.6.8.1-ck7/drivers/md/raid6main.c --- linux-2.6.8.1-ck6/drivers/md/raid6main.c 2004-08-15 14:08:06.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/md/raid6main.c 2004-09-09 22:56:38.715112026 +1000 @@ -1501,6 +1501,39 @@ static void raid6_unplug_device(request_ unplug_slaves(mddev); } +static int raid6_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + raid6_conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + for (i=0; i<mddev->raid_disks; i++) { + mdk_rdev_t *rdev = conf->disks[i].rdev; + if (rdev && !rdev->faulty) { + struct block_device *bdev = rdev->bdev; + request_queue_t *r_queue; + + if (!bdev) + continue; + + r_queue = bdev_get_queue(bdev); + if (!r_queue) + continue; + + if (!r_queue->issue_flush_fn) { + ret = -EOPNOTSUPP; + break; + } + + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + } + return ret; +} + static inline void raid6_plug_device(raid6_conf_t *conf) { spin_lock_irq(&conf->device_lock); @@ -1708,6 +1741,7 @@ static int run (mddev_t *mddev) atomic_set(&conf->preread_active_stripes, 0); mddev->queue->unplug_fn = raid6_unplug_device; + mddev->queue->issue_flush_fn = raid6_issue_flush; PRINTK("raid6: run(%s) called.\n", mdname(mddev)); diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/scsi/scsi_lib.c linux-2.6.8.1-ck7/drivers/scsi/scsi_lib.c --- linux-2.6.8.1-ck6/drivers/scsi/scsi_lib.c 2004-08-15 14:08:08.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/scsi/scsi_lib.c 2004-09-09 22:56:38.716111870 +1000 @@ -954,6 +954,22 @@ static int scsi_init_io(struct scsi_cmnd return BLKPREP_KILL; } +static int scsi_issue_flush_fn(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + struct scsi_device *sdev = q->queuedata; + struct scsi_driver *drv; + + if (sdev->sdev_state != 
SDEV_RUNNING) + return -ENXIO; + + drv = *(struct scsi_driver **) disk->private_data; + if (drv->issue_flush) + return drv->issue_flush(&sdev->sdev_gendev, error_sector); + + return -EOPNOTSUPP; +} + static int scsi_prep_fn(struct request_queue *q, struct request *req) { struct scsi_device *sdev = q->queuedata; @@ -1335,7 +1351,8 @@ struct request_queue *scsi_alloc_queue(s blk_queue_max_sectors(q, shost->max_sectors); blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost)); blk_queue_segment_boundary(q, shost->dma_boundary); - + blk_queue_issue_flush_fn(q, scsi_issue_flush_fn); + if (!shost->use_clustering) clear_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); return q; diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/scsi/sd.c linux-2.6.8.1-ck7/drivers/scsi/sd.c --- linux-2.6.8.1-ck6/drivers/scsi/sd.c 2004-09-09 22:56:24.839277154 +1000 +++ linux-2.6.8.1-ck7/drivers/scsi/sd.c 2004-09-09 22:56:38.717111714 +1000 @@ -114,6 +114,7 @@ static int sd_remove(struct device *); static void sd_shutdown(struct device *dev); static void sd_rescan(struct device *); static int sd_init_command(struct scsi_cmnd *); +static int sd_issue_flush(struct device *, sector_t *); static void sd_read_capacity(struct scsi_disk *sdkp, char *diskname, struct scsi_request *SRpnt, unsigned char *buffer); @@ -127,6 +128,7 @@ static struct scsi_driver sd_template = }, .rescan = sd_rescan, .init_command = sd_init_command, + .issue_flush = sd_issue_flush, }; /* Device no to disk mapping: @@ -687,6 +689,62 @@ not_present: return 1; } +static int sd_sync_cache(struct scsi_device *sdp) +{ + struct scsi_request *sreq; + int retries, res; + + if (!scsi_device_online(sdp)) + return -ENODEV; + + sreq = scsi_allocate_request(sdp, GFP_KERNEL); + if (!sreq) { + printk("FAILED\n No memory for request\n"); + return -ENOMEM; + } + + sreq->sr_data_direction = DMA_NONE; + for (retries = 3; retries > 0; --retries) { + unsigned char cmd[10] = { 0 }; + + cmd[0] = SYNCHRONIZE_CACHE; + /* + * Leave the rest of the command zero to indicate + * flush everything. 
+ */ + scsi_wait_req(sreq, cmd, NULL, 0, SD_TIMEOUT, SD_MAX_RETRIES); + if (sreq->sr_result == 0) + break; + } + + res = sreq->sr_result; + if (res) { + printk(KERN_WARNING "FAILED\n status = %x, message = %02x, " + "host = %d, driver = %02x\n ", + status_byte(res), msg_byte(res), + host_byte(res), driver_byte(res)); + if (driver_byte(res) & DRIVER_SENSE) + scsi_print_req_sense("sd", sreq); + } + + scsi_release_request(sreq); + return res; +} + +static int sd_issue_flush(struct device *dev, sector_t *error_sector) +{ + struct scsi_device *sdp = to_scsi_device(dev); + struct scsi_disk *sdkp = dev_get_drvdata(dev); + + if (!sdkp) + return -ENODEV; + + if (!sdkp->WCE) + return 0; + + return sd_sync_cache(sdp); +} + static void sd_rescan(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); @@ -1562,52 +1620,17 @@ static void scsi_disk_release(struct kre static void sd_shutdown(struct device *dev) { struct scsi_device *sdp = to_scsi_device(dev); - struct scsi_disk *sdkp; - struct scsi_request *sreq; - int retries, res; + struct scsi_disk *sdkp = dev_get_drvdata(dev); - sdkp = dev_get_drvdata(dev); if (!sdkp) - return; /* this can happen */ + return; /* this can happen */ - if (!scsi_device_online(sdp) || !sdkp->WCE) + if (!sdkp->WCE) return; - printk(KERN_NOTICE "Synchronizing SCSI cache for disk %s: ", + printk(KERN_NOTICE "Synchronizing SCSI cache for disk %s: \n", sdkp->disk->disk_name); - - sreq = scsi_allocate_request(sdp, GFP_KERNEL); - if (!sreq) { - printk("FAILED\n No memory for request\n"); - return; - } - - sreq->sr_data_direction = DMA_NONE; - for (retries = 3; retries > 0; --retries) { - unsigned char cmd[10] = { 0 }; - - cmd[0] = SYNCHRONIZE_CACHE; - /* - * Leave the rest of the command zero to indicate - * flush everything. - */ - scsi_wait_req(sreq, cmd, NULL, 0, SD_TIMEOUT, SD_MAX_RETRIES); - if (sreq->sr_result == 0) - break; - } - - res = sreq->sr_result; - if (res) { - printk(KERN_WARNING "FAILED\n status = %x, message = %02x, " - "host = %d, driver = %02x\n ", - status_byte(res), msg_byte(res), - host_byte(res), driver_byte(res)); - if (driver_byte(res) & DRIVER_SENSE) - scsi_print_req_sense("sd", sreq); - } - - scsi_release_request(sreq); - printk("\n"); + sd_sync_cache(sdp); } /** diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/video/fbmem.c linux-2.6.8.1-ck7/drivers/video/fbmem.c --- linux-2.6.8.1-ck6/drivers/video/fbmem.c 2004-08-15 14:08:09.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/video/fbmem.c 2004-09-09 22:56:38.722110934 +1000 @@ -97,6 +97,7 @@ extern int virgefb_setup(char*); extern int resolver_video_setup(char*); extern int s3triofb_init(void); extern int vesafb_init(void); +extern int vesafb_init_thread(void); extern int vesafb_setup(char*); extern int vga16fb_init(void); extern int vga16fb_setup(char*); @@ -306,7 +307,6 @@ static struct { #ifdef CONFIG_FB_VESA { "vesafb", vesafb_init, vesafb_setup }, #endif - /* * Chipset specific drivers that don't use resource management (yet) */ @@ -1519,6 +1519,9 @@ fbmem_init(void) } #endif +#if defined(CONFIG_FB_VESA_TNG) || defined(CONFIG_FB_VESA_TNG_MODULE) + vesafb_init_thread(); +#endif /* * Probe for all builtin frame buffer devices */ diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/video/Kconfig linux-2.6.8.1-ck7/drivers/video/Kconfig --- linux-2.6.8.1-ck6/drivers/video/Kconfig 2004-09-09 22:56:24.845276218 +1000 +++ linux-2.6.8.1-ck7/drivers/video/Kconfig 2004-09-09 22:56:38.724110622 +1000 @@ -287,7 +287,7 @@ config FB_TGA cards. 
Say Y if you have one of those. config FB_VESA - bool "VESA VGA graphics support" + tristate "VESA VGA graphics support" depends on FB && (X86 || X86_64) help This is the frame buffer device driver for generic VESA 2.0 @@ -295,6 +295,46 @@ config FB_VESA You will get a boot time penguin logo at no additional cost. Please read <file:Documentation/fb/vesafb.txt>. If unsure, say Y. +choice + prompt "VESA driver type" + depends on FB_VESA + default FB_VESA_STD + +config FB_VESA_STD + bool "vesafb" + help + This is the frame buffer device driver for generic VESA 2.0 + compliant graphic cards. The older VESA 1.2 cards are not supported. + You will get a boot time penguin logo at no additional cost. Please + read <file:Documentation/fb/vesafb.txt>. Choose this driver if you + are experiencing problems with vesafb-tng or if you own a 64-bit system. + + Note that this driver cannot be compiled as a module. + +config FB_VESA_TNG + bool "vesafb-tng" + depends on !X86_64 + help + This is the frame buffer device driver for generic VESA 2.0 + compliant graphic cards. It is capable of taking advantage of + VBE 3.0 features. With this driver you will be able to adjust + the refresh rate (VBE 3.0 compliant boards only) and change + the graphic mode on-the-fly. + + You will also get a boot time penguin logo at no additional cost. Please + read <file:Documentation/fb/vesafb.txt>. + +endchoice + +config FB_VESA_DEFAULT_MODE + string "VESA default mode" + depends on FB_VESA_TNG + default "640x480@60" + help + This option is used to determine the default mode vesafb is + supposed to switch to in case no mode is provided as a kernel + command line parameter. + config VIDEO_SELECT bool depends on FB_VESA diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/video/Makefile linux-2.6.8.1-ck7/drivers/video/Makefile --- linux-2.6.8.1-ck6/drivers/video/Makefile 2004-09-09 22:56:24.845276218 +1000 +++ linux-2.6.8.1-ck7/drivers/video/Makefile 2004-09-09 22:56:38.724110622 +1000 @@ -44,7 +44,25 @@ obj-$(CONFIG_FB_CIRRUS) += cirrusfb.o obj-$(CONFIG_FB_TRIDENT) += tridentfb.o cfbfillrect.o cfbimgblt.o cfbcopyarea.o obj-$(CONFIG_FB_S3TRIO) += S3triofb.o obj-$(CONFIG_FB_TGA) += tgafb.o cfbfillrect.o cfbcopyarea.o cfbimgblt.o -obj-$(CONFIG_FB_VESA) += vesafb.o cfbfillrect.o cfbcopyarea.o cfbimgblt.o + +ifeq ($(CONFIG_FB_VESA),m) + ifeq ($(CONFIG_FB_VESA_STD),y) + obj-y += vesafb.o cfbfillrect.o cfbcopyarea.o cfbimgblt.o + else + obj-m += vesafb-tng.o cfbfillrect.o cfbcopyarea.o cfbimgblt.o + obj-y += vesafb-thread.o + endif +else + ifeq ($(CONFIG_FB_VESA),y) + ifeq ($(CONFIG_FB_VESA_STD),y) + obj-y += vesafb.o cfbfillrect.o cfbcopyarea.o cfbimgblt.o + else + obj-y += vesafb-tng.o vesafb-thread.o cfbfillrect.o \ + cfbcopyarea.o cfbimgblt.o + endif + endif +endif + obj-$(CONFIG_FB_VGA16) += vga16fb.o cfbfillrect.o cfbcopyarea.o \ cfbimgblt.o vgastate.o obj-$(CONFIG_FB_VIRGE) += virgefb.o diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/video/vesafb-thread.c linux-2.6.8.1-ck7/drivers/video/vesafb-thread.c --- linux-2.6.8.1-ck6/drivers/video/vesafb-thread.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/video/vesafb-thread.c 2004-09-09 22:56:38.725110466 +1000 @@ -0,0 +1,578 @@ +/* + * Framebuffer driver for VBE 2.0+ compliant graphic boards - kernel thread + * and vm86 routines. + * + * This code has to be compiled into the kernel even if vesafb is configured + * as a module. If vesafb_thread were to be started while the module is being + * initialized, it would share its active_mm with modprobe. 
This mm would be + * lost after modprobe finished its work, and we can't allow it, because we + * need it for as long as the vesafb thread is active. + * + * (c) 2004 Michał Januszewski + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include
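/*
 * [Editor's sketch -- not part of the patch; the remainder of vesafb-thread.c
 * is not shown in this hunk.] Given the rationale in the file comment above,
 * the thread-start path would look roughly like this with the 2.6-era API:
 * vesafb_init_thread() runs from the built-in fbmem_init() (see the fbmem.c
 * hunk earlier), so once daemonize() drops the thread's own mm, the borrowed
 * active_mm belongs to the boot-time kernel rather than to modprobe. All
 * names and the body below are illustrative assumptions, not the actual
 * implementation.
 */
#include <linux/sched.h>	/* kernel_thread(), daemonize(), CLONE_KERNEL */
#include <linux/init.h>

static int vesafb_thread(void *unused)
{
	daemonize("vesafb");	/* drop our mm; keep borrowing the starter's active_mm */

	/* ... loop here, servicing vesafb-tng mode-switch requests via vm86 ... */
	return 0;
}

int __init vesafb_init_thread(void)
{
	int pid = kernel_thread(vesafb_thread, NULL, CLONE_KERNEL);

	return pid < 0 ? pid : 0;
}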