diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/arch/i386/boot/video.S linux-2.6.8.1-ck7/arch/i386/boot/video.S --- linux-2.6.8.1-ck6/arch/i386/boot/video.S 2004-03-11 21:28:53.000000000 +1100 +++ linux-2.6.8.1-ck7/arch/i386/boot/video.S 2004-09-09 22:56:38.668119359 +1000 @@ -164,10 +164,12 @@ basret: ret # parameters in the default 80x25 mode -- these are set directly, # because some very obscure BIOSes supply insane values. mode_params: +#ifdef CONFIG_FB_VESA_STD #ifdef CONFIG_VIDEO_SELECT cmpb $0, graphic_mode jnz mopar_gr #endif +#endif movb $0x03, %ah # Read cursor position xorb %bh, %bh int $0x10 @@ -200,6 +202,7 @@ mopar2: movb %al, %fs:(PARAM_VIDEO_LINES ret #ifdef CONFIG_VIDEO_SELECT +#ifdef CONFIG_FB_VESA_STD # Fetching of VESA frame buffer parameters mopar_gr: leaw modelist+1024, %di @@ -243,6 +246,7 @@ mopar_gr: movw %es, %fs:(PARAM_VESAPM_SEG) movw %di, %fs:(PARAM_VESAPM_OFF) no_pm: ret +#endif # The video mode menu mode_menu: @@ -457,10 +461,10 @@ mode_set: cmpb $VIDEO_FIRST_V7>>8, %ah jz setv7 - + cmpb $VIDEO_FIRST_VESA>>8, %ah jnc check_vesa - + orb %ah, %ah jz setmenu @@ -547,6 +551,7 @@ check_vesa: cmpb $0x09, %al jz setvesa # This is a text mode +#ifdef CONFIG_FB_VESA_STD movb (%di), %al # Check capabilities. andb $0x99, %al cmpb $0x99, %al @@ -563,6 +568,7 @@ check_vesa: movb $0, do_restore # no screen restore stc ret +#endif _setbad: jmp setbad # Ugly... diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/Documentation/fb/vesafb.txt linux-2.6.8.1-ck7/Documentation/fb/vesafb.txt --- linux-2.6.8.1-ck6/Documentation/fb/vesafb.txt 2004-06-16 17:35:30.000000000 +1000 +++ linux-2.6.8.1-ck7/Documentation/fb/vesafb.txt 2004-09-09 22:56:38.674118422 +1000 @@ -2,16 +2,18 @@ What is vesafb? =============== -This is a generic driver for a graphic framebuffer on intel boxes. +Vesafb is a generic framebuffer driver for x86 and x86_64 boxes. -The idea is simple: Turn on graphics mode at boot time with the help -of the BIOS, and use this as framebuffer device /dev/fb0, like the m68k -(and other) ports do. - -This means we decide at boot time whenever we want to run in text or -graphics mode. Switching mode later on (in protected mode) is -impossible; BIOS calls work in real mode only. VESA BIOS Extensions -Version 2.0 are required, because we need a linear frame buffer. +VESA BIOS Extensions Version 2.0 are required, because we need a linear +frame buffer. VBE 3.0 is required if you want to use modes with a higher +(than the standard 60Hz) refresh rate. + +The VESA framebuffer driver comes in two flavors - the standard vesafb +and vesafb-tng. Vesafb-tng is available only on 32-bit x86 due to the +technology it uses (vm86). Vesafb-tng has more features than vesafb +(adjusting the refresh rate on VBE3.0-compliant boards, switching the +video mode without rebooting, selecting a mode by providing its +modedb name, and more) but might be unstable on some systems. Advantages: @@ -29,16 +31,27 @@ Disadvantages: How to use it? ============== -Switching modes is done using the vga=... boot parameter. Read -Documentation/svga.txt for details. - -You should compile in both vgacon (for text mode) and vesafb (for -graphics mode). Which of them takes over the console depends on -whenever the specified mode is text or graphics. - -The graphic modes are NOT in the list which you get if you boot with -vga=ask and hit return. The mode you wish to use is derived from the -VESA mode number. 
Here are those VESA mode numbers:
+If you are running your system on a hardware platform where vm86 is supported
+(this is 32-bit x86 only as of the time of writing this document) and you
+decide to use vesafb-tng, you can either compile the driver into the kernel
+or use it as a module. The graphic mode you want to use is in both cases
+specified using the standard modedb format.
+
+If your system doesn't support vm86 calls yet (all 64-bit platforms), things
+get a little more tricky. Since on such systems you can't do BIOS calls from
+protected mode in which the kernel runs, you have to decide at boot time
+whether you want to run in text or in graphics mode. Switching mode later on
+is impossible. Switching modes is done using the vga=... boot parameter. Read
+Documentation/svga.txt for details. Below is a more detailed description of
+what to do on systems using the standard vesafb driver.
+
+You should compile in both vgacon (for text mode) and vesafb (for graphics mode).
+Which of them takes over the console depends on whether the specified mode is
+text or graphics.
+
+The graphic modes are NOT in the list which you get if you boot with vga=ask
+and hit return. The mode you wish to use is derived from the VESA mode number.
+Here are those VESA mode numbers:
 
     | 640x480 800x600 1024x768 1280x1024
 ----+-------------------------------------
@@ -47,8 +60,7 @@ VESA mode number. Here are those VESA mo
 64k |  0x111   0x114    0x117    0x11A
 16M |  0x112   0x115    0x118    0x11B
 
-The video mode number of the Linux kernel is the VESA mode number plus
-0x200.
+The video mode number of the Linux kernel is the VESA mode number plus 0x200.
 
   Linux_kernel_mode_number = VESA_mode_number + 0x200
 
@@ -61,10 +73,10 @@ So the table for the Kernel mode numbers
 64k |  0x311   0x314    0x317    0x31A
 16M |  0x312   0x315    0x318    0x31B
 
-To enable one of those modes you have to specify "vga=ask" in the
-lilo.conf file and rerun LILO. Then you can type in the desired
-mode at the "vga=ask" prompt. For example if you like to use
-1024x768x256 colors you have to say "305" at this prompt.
+To enable one of those modes you have to specify "vga=ask" in the lilo.conf
+file and rerun LILO. Then you can type in the desired mode at the "vga=ask"
+prompt. For example if you like to use 1024x768x256 colors you have to say
+"305" at this prompt.
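A quick worked example of the formula above: 1024x768 with 256 colors is VESA
mode 0x105, so the Linux kernel mode number is 0x105 + 0x200 = 0x305. At the
"vga=ask" prompt you type the hex digits without the 0x prefix, i.e. "305"; to
set the mode permanently you can instead put vga=773 in lilo.conf, 773 being
0x305 in decimal.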
 If this does not work, this might be because your BIOS does not support
 linear framebuffers or because it does not support this mode at all.
 
@@ -77,6 +89,7 @@ Extensions v2.0 are required, 1.2 is NOT
 2. Note: Some newer versions of LILO appear to work with those hex
    values, if you set the 0x in front of the numbers.
+
 X11
 ===
 
@@ -86,77 +99,177 @@ It depends on X-Server and graphics boar
 The X-Server must restore the video mode correctly, else you end up
 with a broken console (and vesafb cannot do anything about this).
+With vesafb-tng chances are that the console will be restored properly
+even if the X server messed up the video mode.
 
 Refresh rates
 =============
 
-There is no way to change the vesafb video mode and/or timings after
-booting linux. If you are not happy with the 60 Hz refresh rate, you
-have these options:
+With VBE3.0 compatible BIOSes and vesafb-tng it is possible to change
+the refresh rate either at boot time (by specifying the @ part of
+the mode name) or later, using the fbset utility.
+
+With VBE2.0 there is no way to change the mode timings after booting
+Linux. If you are not happy with the 60 Hz refresh rate, you have
+these options:
 
-    * configure and load the DOS-Tools for your the graphics board (if
-      available) and boot linux with loadlin.
+    * configure and load the DOS tools for your graphics board (if
+      available) and boot Linux with loadlin.
     * use a native driver (matroxfb/atyfb) instead if vesafb. If none
      is available, write a new one!
-    * VBE 3.0 might work too. I have neither a gfx board with VBE 3.0
-      support nor the specs, so I have not checked this yet.
+    * use a BIOS editor to change the default refresh rate (such an
+      editor does exist at least for ATI Radeon BIOSes).
+    * if you're running a non-vm86 and VBE3.0-compatible system, you can
+      use a kernel patch to hard-code some mode timings in the kernel and
+      use these while setting the graphic mode at boot time.
 
 Configuration
 =============
 
-The VESA BIOS provides protected mode interface for changing
-some parameters. vesafb can use it for palette changes and
-to pan the display. It is turned off by default because it
-seems not to work with some BIOS versions, but there are options
-to turn it on.
-You can pass options to vesafb using "video=vesafb:option" on
-the kernel command line. Multiple options should be separated
-by comma, like this: "video=vesafb:ypan,invers"
-Accepted options:
-invers	no comment...
-ypan	enable display panning using the VESA protected mode
-	interface. The visible screen is just a window of the
+The VESA BIOS provides a protected mode interface for changing some parameters.
+vesafb can use it for palette changes and to pan the display. It is turned
+off by default because it seems not to work with some BIOS versions, but there
+are options to turn it on.
+
+You can pass options to vesafb using "video=vesafb:option" on the kernel
+command line. Multiple options should be separated by comma, like this:
+"video=vesafb:ypan,1024x768-32@85"
+
+Accepted options (both vesafb and vesafb-tng):
+
+ypan	Enable display panning using the VESA protected mode
+	interface or vm86 calls. The visible screen is just a window of the
 	video memory, console scrolling is done by changing the
 	start of the window.
 	pro:	* scrolling (fullscreen) is fast, because there
 		  is no need to copy around data.
-		* You'll get scrollback (the Shift-PgUp thing),
+		* you'll get scrollback (the Shift-PgUp thing),
 		  the video memory can be used as scrollback buffer
-	kontra:	* scrolling only parts of the screen causes some
+	con:	* scrolling only parts of the screen causes some
 		  ugly flicker effects (boot logo flickers for example).
 
-ywrap	Same as ypan, but assumes your gfx board can wrap-around
-	the video memory (i.e. starts reading from top if it
-	reaches the end of video memory). Faster than ypan.
-redraw	scroll by redrawing the affected part of the screen, this
-	is the safe (and slow) default.
+ywrap	Same as ypan, but assumes your gfx board can wrap-around the video
+	memory (i.e. starts reading from top if it reaches the end of video
+	memory). Faster than ypan.
+redraw	Scroll by redrawing the affected part of the screen, this is the
+	safe (and slow) default.
 
-vgapal	Use the standard vga registers for palette changes.
+vgapal	Use the standard VGA registers for palette changes.
 	This is the default.
+
 pmipal	Use the protected mode interface for palette changes.
 
-mtrr	setup memory type range registers for the vesafb framebuffer.
+mtrr	Setup memory type range registers for the vesafb framebuffer.
+
+nomtrr	Do not use memory type range registers for vesafb.
 
 vram:n	remap 'n' MiB of video RAM. If 0 or not specified, remap memory
 	according to video mode. (2.5.66 patch/idea by Antonino Daplas
 	reversed to give override possibility (allocate more fb memory
 	than the kernel would) to 2.4 by tmb@iki.fi)
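To illustrate how the options above combine on the kernel command line (an
editorial example; the 8 MiB vram value is arbitrary, and every option shown
is documented above):

	video=vesafb:ywrap,mtrr,vram:8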
-Have fun!
 
+Options accepted only by vesafb-tng:
+
+	The mode you want to set, in the standard modedb format. Refer to
+	modedb.txt for a detailed description. If you specify a mode that is
+	not supported by your board's BIOS, vesafb will attempt to set a
+	similar mode. The list of supported modes can be found in
+	/proc/fbx/modes, where x is the framebuffer number (usually 0).
+	When vesafb is compiled as a module, the mode string should be
+	provided as a value of the parameter 'mode'.
+
+vbemode:x
+	Force the use of VBE mode x. The mode will only be set if it's
+	found in the VBE-provided list of supported modes.
+	NOTE: The mode number 'x' should be specified in VESA mode number
+	notation, not the Linux kernel one (i.e. 257 instead of 769).
+	HINT: If you use this option because the normal parameter does
+	not work for you and you use an X server, you'll probably want to
+	set the 'nocrtc' option to ensure that the video mode is properly
+	restored after console <-> X switches.
+
+nocrtc	Do not use CRTC timings while setting the graphic mode. This option
+	makes sense only with VBE3.0 compliant systems. Use it if you have
+	problems with the modes set in the standard way. Note that specifying
+	this option means the refresh rate will be ignored and will stay at
+	your BIOS' default (60 Hz).
+
+noedid	Do not try to fetch and use EDID-provided modes.
+
+gtf	Force the use of VESA's GTF (Generalized Timing Formula). Specifying
+	this will cause vesafb to skip its internal modedb and EDID-modedb
+	and jump straight to the GTF part of the code (normally used only if
+	everything else failed). This can be useful if you want to get as much
+	as possible from your graphics board but your BIOS doesn't support
+	modes with refresh rates you require. Note that you may need to
+	specify the maxhf, maxvf and maxclk parameters if they are not
+	provided by EDID.
+
+Additionally, the following parameters may be provided. They all override the
+EDID-provided values and BIOS defaults. Refer to your monitor's specs to get
+the correct values for maxhf, maxvf and maxclk for your hardware.
+
+maxhf:n	Maximum horizontal frequency (in kHz).
+maxvf:n	Maximum vertical frequency (in Hz).
+maxclk:n Maximum pixel clock (in MHz).
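As a sketch of how these limits are meant to be used together with the gtf
option described above (the mode string and all three limit values here are
placeholders; take the real numbers from your monitor's documentation):

	video=vesafb:1280x1024-32@75,gtf,maxhf:92,maxvf:85,maxclk:140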
+
+
+Vesafb-tng Technical details
+============================
+
+1. The driver architecture.
+
+The driver's code is stored in 3 files:
+ /drivers/video/vesafb-tng.c
+ /drivers/video/vesafb-thread.c
+ /include/video/vesa.h
+
+vesafb-tng.c contains the main code. vesafb-thread.c contains code for the
+vesafb service thread. A separate thread is necessary because we need to remap
+memory in order to be able to use the vm86 calls. The service thread is started
+regardless of whether vesafb is compiled into the kernel or compiled as a
+module. This is necessary because of the active_mm stuff, better described in
+the header of vesafb-thread.c.
+
+2. The driver initialization
+
+ o vesafb_vbe_init
+   - get basic info about the graphics BIOS
+   - fetch data about all modes supported by VBE
+   - get info about the protected mode interface
+   - get EDID data and attempt to create an EDID modedb
+
+ o vesafb_probe
+   - get service thread's PID (started earlier from fbmem.c)
+   - call vesafb_vbe_init
+   - try to find the specified mode in vesa_modes modedb
+   - if the previous step failed or was skipped:
+     - try to find a matching mode in the VBE modedb - identify VBE mode ID
+     - try to find a matching mode in the EDID modedb
+   - if the previous step failed or was skipped:
+     - try to calculate mode timings with GTF
+   - low level setup - request_mem_region, ioremap, etc.
+   - setup /proc/fb/modes and /proc/fb/vbe_info
 
-  Gerd
+3. Used hacks
+
+ o info->var.reserved[0] holds the VBE mode ID
+ o info->var.reserved[1] holds a pointer to the VBE mode data in vesafb's
+   mode database.
+
+Have fun!
+
 --
+Original document for the vesafb driver by Gerd Knorr
 
-Minor (mostly typo) changes
-by Nico Schmoigl
+Minor (mostly typo) changes by
+Nico Schmoigl
+
+Extended documentation for vm86, VBE3.0 and vesafb-tng by
+Michał Januszewski
+
diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/block/as-iosched.c linux-2.6.8.1-ck7/drivers/block/as-iosched.c
--- linux-2.6.8.1-ck6/drivers/block/as-iosched.c	2004-06-16 17:35:35.000000000 +1000
+++ linux-2.6.8.1-ck7/drivers/block/as-iosched.c	2004-09-09 22:56:38.677117954 +1000
@@ -1828,14 +1828,14 @@ static int as_set_request(request_queue_
 
 static int as_may_queue(request_queue_t *q, int rw)
 {
-	int ret = 0;
+	int ret = ELV_MQUEUE_MAY;
 	struct as_data *ad = q->elevator.elevator_data;
 	struct io_context *ioc;
 	if (ad->antic_status == ANTIC_WAIT_REQ ||
 	    ad->antic_status == ANTIC_WAIT_NEXT) {
 		ioc = as_get_io_context();
 		if (ad->io_context == ioc)
-			ret = 1;
+			ret = ELV_MQUEUE_MUST;
 		put_io_context(ioc);
 	}
 
diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/block/cfq-iosched.c linux-2.6.8.1-ck7/drivers/block/cfq-iosched.c
--- linux-2.6.8.1-ck6/drivers/block/cfq-iosched.c	2004-08-15 14:08:05.000000000 +1000
+++ linux-2.6.8.1-ck7/drivers/block/cfq-iosched.c	2004-09-09 22:56:38.679117642 +1000
@@ -22,96 +22,214 @@
 #include
 #include
 
+#undef CFQ_DEBUG
+
+#ifdef CFQ_DEBUG
+#define dprintk(fmt, args...)	printk(KERN_ERR "cfq: " fmt, ##args)
+#else
+#define dprintk(fmt, args...)
+#endif
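The empty #else definition above makes every dprintk() call vanish at
preprocessing time when CFQ_DEBUG is not defined. A minimal self-contained
sketch of the same GNU C variadic-macro pattern (userspace code, with fprintf
standing in for printk; not part of the patch itself):

	#include <stdio.h>

	/* #define CFQ_DEBUG */	/* uncomment to enable the debug output */

	#ifdef CFQ_DEBUG
	#define dprintk(fmt, args...)	fprintf(stderr, "cfq: " fmt, ##args)
	#else
	#define dprintk(fmt, args...)	/* expands to nothing */
	#endif

	int main(void)
	{
		/* compiled out entirely unless CFQ_DEBUG is defined */
		dprintk("dispatching request %d\n", 42);
		return 0;
	}

The ##args extension deletes the trailing comma when the argument list is
empty, which is why the kernel macro can also be called with a bare format
string.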
+
+static unsigned long max_elapsed_crq;
+static unsigned long max_elapsed_dispatch;
+
 /*
  * tunables
  */
-static int cfq_quantum = 4;
-static int cfq_queued = 8;
+static int cfq_quantum = 4;		/* max queue in one round of service */
+static int cfq_queued = 8;		/* minimum rq allocate limit per-queue */
+static int cfq_service = HZ;		/* period over which service is avg */
+static int cfq_fifo_expire_r = HZ / 2;	/* fifo timeout for sync requests */
+static int cfq_fifo_expire_w = 5 * HZ;	/* fifo timeout for async requests */
+static int cfq_fifo_rate = HZ / 8;	/* fifo expiry rate */
+static int cfq_back_max = 16 * 1024;	/* maximum backwards seek, in KiB */
+static int cfq_back_penalty = 2;	/* penalty of a backwards seek */
 
+/*
+ * for the hash of cfqq inside the cfqd
+ */
 #define CFQ_QHASH_SHIFT		6
 #define CFQ_QHASH_ENTRIES	(1 << CFQ_QHASH_SHIFT)
-#define list_entry_qhash(entry) list_entry((entry), struct cfq_queue, cfq_hash)
+#define list_entry_qhash(entry)	hlist_entry((entry), struct cfq_queue, cfq_hash)
 
-#define CFQ_MHASH_SHIFT		8
+/*
+ * for the hash of crq inside the cfqq
+ */
+#define CFQ_MHASH_SHIFT		6
 #define CFQ_MHASH_BLOCK(sec)	((sec) >> 3)
 #define CFQ_MHASH_ENTRIES	(1 << CFQ_MHASH_SHIFT)
-#define CFQ_MHASH_FN(sec)	(hash_long(CFQ_MHASH_BLOCK((sec)),CFQ_MHASH_SHIFT))
-#define ON_MHASH(crq)		!list_empty(&(crq)->hash)
+#define CFQ_MHASH_FN(sec)	hash_long(CFQ_MHASH_BLOCK(sec), CFQ_MHASH_SHIFT)
 #define rq_hash_key(rq)		((rq)->sector + (rq)->nr_sectors)
-#define list_entry_hash(ptr)	list_entry((ptr), struct cfq_rq, hash)
+#define list_entry_hash(ptr)	hlist_entry((ptr), struct cfq_rq, hash)
 
 #define list_entry_cfqq(ptr)	list_entry((ptr), struct cfq_queue, cfq_list)
 
-#define RQ_DATA(rq)		((struct cfq_rq *) (rq)->elevator_private)
+#define RQ_DATA(rq)		(rq)->elevator_private
+
+/*
+ * rb-tree defines
+ */
+#define RB_NONE			(2)
+#define RB_EMPTY(node)		((node)->rb_node == NULL)
+#define RB_CLEAR_COLOR(node)	(node)->rb_color = RB_NONE
+#define RB_CLEAR(node)		do {	\
+	(node)->rb_parent = NULL;	\
+	RB_CLEAR_COLOR((node));		\
+	(node)->rb_right = NULL;	\
+	(node)->rb_left = NULL;		\
+} while (0)
+#define RB_CLEAR_ROOT(root)	((root)->rb_node = NULL)
+#define ON_RB(node)		((node)->rb_color != RB_NONE)
+#define rb_entry_crq(node)	rb_entry((node), struct cfq_rq, rb_node)
+#define rq_rb_key(rq)		(rq)->sector
+
+/*
+ * sort key types and names
+ */
+enum {
+	CFQ_KEY_PGID,
+	CFQ_KEY_TGID,
+	CFQ_KEY_UID,
+	CFQ_KEY_GID,
+	CFQ_KEY_LAST,
+};
+
+static char *cfq_key_types[] = { "pgid", "tgid", "uid", "gid", NULL };
+
+/*
+ * spare queue
+ */
+#define CFQ_KEY_SPARE		(~0UL)
 
 static kmem_cache_t *crq_pool;
 static kmem_cache_t *cfq_pool;
-static mempool_t *cfq_mpool;
+static kmem_cache_t *cfq_ioc_pool;
 
 struct cfq_data {
 	struct list_head rr_list;
-	struct list_head *dispatch;
-	struct list_head *cfq_hash;
+	struct list_head empty_list;
 
-	struct list_head *crq_hash;
+	struct hlist_head *cfq_hash;
+	struct hlist_head *crq_hash;
 
+	/* queues on rr_list (ie they have pending requests) */
 	unsigned int busy_queues;
+	unsigned int max_queued;
+	int key_type;
+	mempool_t *crq_pool;
 
 	request_queue_t *queue;
+	sector_t last_sector;
+
 	/*
-	 * tunables
+	 * tunables, see top of file
	 */
 	unsigned int cfq_quantum;
 	unsigned int cfq_queued;
+	unsigned int cfq_tagged;
+	unsigned int cfq_fifo_expire_r;
+	unsigned int cfq_fifo_expire_w;
+	unsigned int cfq_fifo_batch_expire;
+	unsigned int cfq_back_penalty;
+	unsigned int cfq_back_max;
+	unsigned int find_best_crq;
 };
 
 struct cfq_queue {
-	struct list_head cfq_hash;
+	/* reference count */
+	atomic_t 
ref; + /* parent cfq_data */ + struct cfq_data *cfqd; + /* hash of mergeable requests */ + struct hlist_node cfq_hash; + /* hash key */ + unsigned long key; + /* whether queue is on rr (or empty) list */ + int on_rr; + /* on either rr or empty list of cfqd */ struct list_head cfq_list; + /* sorted list of pending requests */ struct rb_root sort_list; - int pid; + /* if fifo isn't expired, next request to serve */ + struct cfq_rq *next_crq; + /* requests queued in sort_list */ int queued[2]; -#if 0 - /* - * with a simple addition like this, we can do io priorities. almost. - * does need a split request free list, too. - */ - int io_prio + /* currently allocated requests */ + int allocated[2]; + /* fifo list of requests in sort_list */ + struct list_head fifo[2]; + /* last time fifo expired */ + unsigned long last_fifo_expire; + + int key_type; + + unsigned long service_start; + unsigned long service_used; + + /* number of requests that have been handed to the driver */ + int in_flight; + /* number of currently allocated requests */ + int alloc_limit[2]; + +#ifdef CFQ_DEBUG + char name[16]; #endif }; struct cfq_rq { struct rb_node rb_node; sector_t rb_key; - struct request *request; + struct hlist_node hash; struct cfq_queue *cfq_queue; + struct cfq_io_context *io_context; + + unsigned long service_start; + unsigned long queue_start; - struct list_head hash; + unsigned int in_flight : 1; + unsigned int accounted : 1; + unsigned int is_sync : 1; }; -static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq); -static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid); -static void cfq_dispatch_sort(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct cfq_rq *crq); +static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned long); +static void cfq_dispatch_sort(request_queue_t *, struct cfq_rq *); +static void cfq_update_next_crq(struct cfq_rq *); /* - * lots of deadline iosched dupes, can be abstracted later... + * what the fairness is based on (ie how processes are grouped and + * differentiated) */ -static inline void __cfq_del_crq_hash(struct cfq_rq *crq) +static inline unsigned long +cfq_hash_key(struct cfq_data *cfqd, struct task_struct *tsk) { - list_del_init(&crq->hash); + /* + * optimize this so that ->key_type is the offset into the struct + */ + switch (cfqd->key_type) { + case CFQ_KEY_PGID: + return process_group(tsk); + default: + case CFQ_KEY_TGID: + return tsk->tgid; + case CFQ_KEY_UID: + return tsk->uid; + case CFQ_KEY_GID: + return tsk->gid; + } } +/* + * lots of deadline iosched dupes, can be abstracted later... 
+ */ static inline void cfq_del_crq_hash(struct cfq_rq *crq) { - if (ON_MHASH(crq)) - __cfq_del_crq_hash(crq); + hlist_del_init(&crq->hash); } static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq) @@ -120,32 +238,32 @@ static void cfq_remove_merge_hints(reque if (q->last_merge == crq->request) q->last_merge = NULL; + + cfq_update_next_crq(crq); } static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq) { - struct request *rq = crq->request; + const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request)); - BUG_ON(ON_MHASH(crq)); + BUG_ON(!hlist_unhashed(&crq->hash)); - list_add(&crq->hash, &cfqd->crq_hash[CFQ_MHASH_FN(rq_hash_key(rq))]); + hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]); } static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset) { - struct list_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)]; - struct list_head *entry, *next = hash_list->next; + struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)]; + struct hlist_node *entry, *next; - while ((entry = next) != hash_list) { + hlist_for_each_safe(entry, next, hash_list) { struct cfq_rq *crq = list_entry_hash(entry); struct request *__rq = crq->request; - next = entry->next; - - BUG_ON(!ON_MHASH(crq)); + BUG_ON(hlist_unhashed(&crq->hash)); if (!rq_mergeable(__rq)) { - __cfq_del_crq_hash(crq); + cfq_del_crq_hash(crq); continue; } @@ -157,29 +275,234 @@ static struct request *cfq_find_rq_hash( } /* - * rb tree support functions + * Lifted from AS - choose which of crq1 and crq2 that is best served now. + * We choose the request that is closest to the head right now. Distance + * behind the head are penalized and only allowed to a certain extent. */ -#define RB_NONE (2) -#define RB_EMPTY(node) ((node)->rb_node == NULL) -#define RB_CLEAR(node) ((node)->rb_color = RB_NONE) -#define RB_CLEAR_ROOT(root) ((root)->rb_node = NULL) -#define ON_RB(node) ((node)->rb_color != RB_NONE) -#define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node) -#define rq_rb_key(rq) (rq)->sector +static struct cfq_rq * +cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2) +{ + sector_t last, s1, s2, d1 = 0, d2 = 0; + int r1_wrap = 0, r2_wrap = 0; /* requests are behind the disk head */ + unsigned long back_max; + + if (crq1 == NULL || crq1 == crq2) + return crq2; + if (crq2 == NULL) + return crq1; + + s1 = crq1->request->sector; + s2 = crq2->request->sector; + + last = cfqd->last_sector; + +#if 0 + if (!list_empty(&cfqd->queue->queue_head)) { + struct list_head *entry = &cfqd->queue->queue_head; + unsigned long distance = ~0UL; + struct request *rq; + + while ((entry = entry->prev) != &cfqd->queue->queue_head) { + rq = list_entry_rq(entry); + + if (blk_barrier_rq(rq)) + break; + + if (distance < abs(s1 - rq->sector + rq->nr_sectors)) { + distance = abs(s1 - rq->sector +rq->nr_sectors); + last = rq->sector + rq->nr_sectors; + } + if (distance < abs(s2 - rq->sector + rq->nr_sectors)) { + distance = abs(s2 - rq->sector +rq->nr_sectors); + last = rq->sector + rq->nr_sectors; + } + } + } +#endif + + /* + * by definition, 1KiB is 2 sectors + */ + back_max = cfqd->cfq_back_max * 2; + + /* + * Strict one way elevator _except_ in the case where we allow + * short backward seeks which are biased as twice the cost of a + * similar forward seek. 
+ */ + if (s1 >= last) + d1 = s1 - last; + else if (s1 + back_max >= last) + d1 = (last - s1) * cfqd->cfq_back_penalty; + else + r1_wrap = 1; + + if (s2 >= last) + d2 = s2 - last; + else if (s2 + back_max >= last) + d2 = (last - s2) * cfqd->cfq_back_penalty; + else + r2_wrap = 1; + + /* Found required data */ + if (!r1_wrap && r2_wrap) + return crq1; + else if (!r2_wrap && r1_wrap) + return crq2; + else if (r1_wrap && r2_wrap) { + /* both behind the head */ + if (s1 <= s2) + return crq1; + else + return crq2; + } + + /* Both requests in front of the head */ + if (d1 < d2) + return crq1; + else if (d2 < d1) + return crq2; + else { + if (s1 >= s2) + return crq1; + else + return crq2; + } +} + +/* + * would be nice to take fifo expire time into account as well + */ +static struct cfq_rq * +cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq, + struct cfq_rq *last) +{ + struct cfq_rq *crq_next = NULL, *crq_prev = NULL; + struct rb_node *rbnext, *rbprev; + + if (!ON_RB(&last->rb_node)) + return NULL; + + if ((rbnext = rb_next(&last->rb_node)) == NULL) + rbnext = rb_first(&cfqq->sort_list); + + rbprev = rb_prev(&last->rb_node); + + if (rbprev) + crq_prev = rb_entry_crq(rbprev); + if (rbnext) + crq_next = rb_entry_crq(rbnext); + + return cfq_choose_req(cfqd, crq_next, crq_prev); +} + +static void cfq_update_next_crq(struct cfq_rq *crq) +{ + struct cfq_queue *cfqq = crq->cfq_queue; + + if (cfqq->next_crq == crq) + cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq); +} + +static inline void +cfq_sort_rr_list(struct cfq_queue *cfqq) +{ + struct list_head *entry = &cfqq->cfqd->rr_list; + + list_del(&cfqq->cfq_list); + + /* + * sort by our mean service_used, sub-sort by in-flight requests + */ + while ((entry = entry->prev) != &cfqq->cfqd->rr_list) { + struct cfq_queue *__cfqq = list_entry_cfqq(entry); + + if (cfqq->service_used > __cfqq->service_used) + break; + else if (cfqq->service_used == __cfqq->service_used) { + struct list_head *prv; + + while ((prv = entry->prev) != &cfqq->cfqd->rr_list) { + __cfqq = list_entry_cfqq(prv); + + WARN_ON(__cfqq->service_used > cfqq->service_used); + if (cfqq->service_used != __cfqq->service_used) + break; + if (cfqq->in_flight > __cfqq->in_flight) + break; + + entry = prv; + } + } + } -static inline void cfq_del_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) + list_add(&cfqq->cfq_list, entry); +} + +/* + * add to busy list of queues for service, trying to be fair in ordering + * the pending list according to requests serviced + */ +static inline void +cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + BUG_ON(cfqq->on_rr); + + /* + * it's currently on the empty list + */ + cfq_sort_rr_list(cfqq); + cfqq->on_rr = 1; + cfqd->busy_queues++; + + /* + * if the queue is on the empty_list, service_start was the time + * where it was deleted from the rr_list. 
+ */ + if (time_after(jiffies, cfqq->service_start + cfq_service)) + cfqq->service_used >>= 3; +} + +static inline void +cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + list_move(&cfqq->cfq_list, &cfqd->empty_list); + cfqq->on_rr = 0; + cfqq->service_start = jiffies; + + BUG_ON(!cfqd->busy_queues); + cfqd->busy_queues--; +} + +/* + * rb tree support functions + */ +static inline void cfq_del_crq_rb(struct cfq_rq *crq) +{ + struct cfq_queue *cfqq = crq->cfq_queue; + if (ON_RB(&crq->rb_node)) { - cfqq->queued[rq_data_dir(crq->request)]--; + struct cfq_data *cfqd = cfqq->cfqd; + + BUG_ON(!cfqq->queued[crq->is_sync]); + + cfq_update_next_crq(crq); + + cfqq->queued[crq->is_sync]--; rb_erase(&crq->rb_node, &cfqq->sort_list); - crq->cfq_queue = NULL; + RB_CLEAR_COLOR(&crq->rb_node); + + if (RB_EMPTY(&cfqq->sort_list) && cfqq->on_rr) { + dprintk("moving 0x%p empty_list\n", cfqq); + cfq_del_cfqq_rr(cfqd, cfqq); + } } } static struct cfq_rq * -__cfq_add_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) +__cfq_add_crq_rb(struct cfq_rq *crq) { - struct rb_node **p = &cfqq->sort_list.rb_node; + struct rb_node **p = &crq->cfq_queue->sort_list.rb_node; struct rb_node *parent = NULL; struct cfq_rq *__crq; @@ -199,30 +522,53 @@ __cfq_add_crq_rb(struct cfq_queue *cfqq, return NULL; } -static void -cfq_add_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq,struct cfq_rq *crq) +static void cfq_add_crq_rb(struct cfq_rq *crq) { + struct cfq_queue *cfqq = crq->cfq_queue; + struct cfq_data *cfqd = cfqq->cfqd; struct request *rq = crq->request; struct cfq_rq *__alias; crq->rb_key = rq_rb_key(rq); - cfqq->queued[rq_data_dir(rq)]++; -retry: - __alias = __cfq_add_crq_rb(cfqq, crq); - if (!__alias) { - rb_insert_color(&crq->rb_node, &cfqq->sort_list); - crq->cfq_queue = cfqq; - return; + cfqq->queued[crq->is_sync]++; + + /* + * looks a little odd, but the first insert might return an alias. 
+ * if that happens, put the alias on the dispatch list + */ + while ((__alias = __cfq_add_crq_rb(crq)) != NULL) + cfq_dispatch_sort(cfqd->queue, __alias); + + rb_insert_color(&crq->rb_node, &cfqq->sort_list); + + if (!cfqq->on_rr) { + cfq_add_cfqq_rr(cfqd, cfqq); + dprintk("moving to rr list %d\n", cfqd->busy_queues); + } else + dprintk("already on rr list %d\n", cfqd->busy_queues); + + /* + * check if this request is a better next-serve candidate + */ + cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq); +} + +static inline void +cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) +{ + if (ON_RB(&crq->rb_node)) { + rb_erase(&crq->rb_node, &cfqq->sort_list); + cfqq->queued[crq->is_sync]--; } - cfq_dispatch_sort(cfqd, cfqq, __alias); - goto retry; + cfq_add_crq_rb(crq); } static struct request * cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector) { - struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->tgid); + const unsigned long key = cfq_hash_key(cfqd, current); + struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, key); struct rb_node *n; if (!cfqq) @@ -246,21 +592,16 @@ out: static void cfq_remove_request(request_queue_t *q, struct request *rq) { - struct cfq_data *cfqd = q->elevator.elevator_data; struct cfq_rq *crq = RQ_DATA(rq); - if (crq) { - struct cfq_queue *cfqq = crq->cfq_queue; + dprintk("removing 0x%p\n", rq); + if (crq) { cfq_remove_merge_hints(q, crq); list_del_init(&rq->queuelist); - if (cfqq) { - cfq_del_crq_rb(cfqq, crq); - - if (RB_EMPTY(&cfqq->sort_list)) - cfq_put_queue(cfqd, cfqq); - } + if (crq->cfq_queue) + cfq_del_crq_rb(crq); } } @@ -314,92 +655,228 @@ static void cfq_merged_request(request_q if (ON_RB(&crq->rb_node) && (rq_rb_key(req) != crq->rb_key)) { struct cfq_queue *cfqq = crq->cfq_queue; - cfq_del_crq_rb(cfqq, crq); - cfq_add_crq_rb(cfqd, cfqq, crq); + cfq_update_next_crq(crq); + cfq_reposition_crq_rb(cfqq, crq); } q->last_merge = req; } static void -cfq_merged_requests(request_queue_t *q, struct request *req, +cfq_merged_requests(request_queue_t *q, struct request *rq, struct request *next) { - cfq_merged_request(q, req); + struct cfq_rq *crq = RQ_DATA(rq); + struct cfq_rq *cnext = RQ_DATA(next); + + cfq_merged_request(q, rq); + + if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist)) { + if (time_before(cnext->queue_start, crq->queue_start)) { + list_move(&rq->queuelist, &next->queuelist); + crq->queue_start = cnext->queue_start; + } + } + + cfq_update_next_crq(cnext); cfq_remove_request(q, next); } -static void -cfq_dispatch_sort(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct cfq_rq *crq) +/* + * we dispatch cfqd->cfq_quantum requests in total from the rr_list queues, + * this function sector sorts the selected request to minimize seeks. we start + * at cfqd->last_sector, not 0. 
+ */ +static void cfq_dispatch_sort(request_queue_t *q, struct cfq_rq *crq) { - struct list_head *head = cfqd->dispatch, *entry = head; + struct cfq_data *cfqd = q->elevator.elevator_data; + struct cfq_queue *cfqq = crq->cfq_queue; + struct list_head *head = &q->queue_head, *entry = head; struct request *__rq; + sector_t last; - cfq_del_crq_rb(cfqq, crq); - cfq_remove_merge_hints(cfqd->queue, crq); + cfq_del_crq_rb(crq); + cfq_remove_merge_hints(q, crq); + list_del(&crq->request->queuelist); - if (!list_empty(head)) { - __rq = list_entry_rq(head->next); + last = cfqd->last_sector; + while ((entry = entry->prev) != head) { + __rq = list_entry_rq(entry); + + if (blk_barrier_rq(crq->request)) + break; + if (!blk_fs_request(crq->request)) + break; - if (crq->request->sector < __rq->sector) { - entry = head->prev; - goto link; + if (crq->request->sector > __rq->sector) + break; + if (__rq->sector > last && crq->request->sector < last) { + last = crq->request->sector; + break; } } - while ((entry = entry->prev) != head) { - __rq = list_entry_rq(entry); + cfqd->last_sector = last; + crq->in_flight = 1; + cfqq->in_flight++; + list_add(&crq->request->queuelist, entry); +} - if (crq->request->sector <= __rq->sector) - break; +/* + * return expired entry, or NULL to just start from scratch in rbtree + */ +static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq) +{ + struct cfq_data *cfqd = cfqq->cfqd; + const int reads = !list_empty(&cfqq->fifo[0]); + const int writes = !list_empty(&cfqq->fifo[1]); + struct cfq_rq *crq; + + if (jiffies - cfqq->last_fifo_expire < cfqd->cfq_fifo_batch_expire) + return NULL; + + crq = RQ_DATA(list_entry(cfqq->fifo[0].next, struct request, queuelist)); + if (reads && time_after(jiffies, crq->queue_start + cfqd->cfq_fifo_expire_r)) { + cfqq->last_fifo_expire = jiffies; + return crq; } -link: - list_add_tail(&crq->request->queuelist, entry); + crq = RQ_DATA(list_entry(cfqq->fifo[1].next, struct request, queuelist)); + if (writes && time_after(jiffies, crq->queue_start + cfqd->cfq_fifo_expire_w)) { + cfqq->last_fifo_expire = jiffies; + return crq; + } + + return NULL; } +/* + * dispatch a single request from given queue + */ static inline void -__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd, - struct cfq_queue *cfqq) +cfq_dispatch_request(request_queue_t *q, struct cfq_data *cfqd, + struct cfq_queue *cfqq) { - struct cfq_rq *crq = rb_entry_crq(rb_first(&cfqq->sort_list)); + struct cfq_rq *crq; - cfq_dispatch_sort(cfqd, cfqq, crq); + /* + * follow expired path, else get first next available + */ + if ((crq = cfq_check_fifo(cfqq)) == NULL) { + if (cfqd->find_best_crq) + crq = cfqq->next_crq; + else + crq = rb_entry_crq(rb_first(&cfqq->sort_list)); + } + + cfqd->last_sector = crq->request->sector + crq->request->nr_sectors; + + /* + * finally, insert request into driver list + */ + cfq_dispatch_sort(q, crq); } -static int cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd) +static int cfq_dispatch_requests(request_queue_t *q, int max_dispatch) { + struct cfq_data *cfqd = q->elevator.elevator_data; struct cfq_queue *cfqq; struct list_head *entry, *tmp; - int ret, queued, good_queues; + int queued, busy_queues, first_round; if (list_empty(&cfqd->rr_list)) return 0; - queued = ret = 0; + queued = 0; + first_round = 1; restart: - good_queues = 0; + busy_queues = 0; list_for_each_safe(entry, tmp, &cfqd->rr_list) { - cfqq = list_entry_cfqq(cfqd->rr_list.next); + cfqq = list_entry_cfqq(entry); BUG_ON(RB_EMPTY(&cfqq->sort_list)); - 
__cfq_dispatch_requests(q, cfqd, cfqq); + /* + * first round of queueing, only select from queues that + * don't already have io in-flight + */ + if (first_round && cfqq->in_flight) + continue; - if (RB_EMPTY(&cfqq->sort_list)) - cfq_put_queue(cfqd, cfqq); - else - good_queues++; + cfq_dispatch_request(q, cfqd, cfqq); + + if (!RB_EMPTY(&cfqq->sort_list)) + busy_queues++; queued++; - ret = 1; } - if ((queued < cfqd->cfq_quantum) && good_queues) + if ((queued < max_dispatch) && (busy_queues || first_round)) { + first_round = 0; goto restart; + } - return ret; + return queued; +} + +static inline void cfq_account_dispatch(struct cfq_rq *crq) +{ + struct cfq_queue *cfqq = crq->cfq_queue; + unsigned long elapsed = jiffies - crq->queue_start; + + /* + * accounted bit is necessary since some drivers will call + * elv_next_request() many times for the same request (eg ide) + */ + if (crq->accounted) + return; + + /* + * on drives with tagged command queueing, command turn-around time + * doesn't necessarily reflect the time spent processing this very + * command inside the drive. so do the accounting differently there, + * by just sorting on the number of requests + */ + if (cfqq->cfqd->cfq_tagged) { + if (time_after(jiffies, cfqq->service_start + cfq_service)) { + cfqq->service_start = jiffies; + cfqq->service_used /= 10; + } + + cfqq->service_used++; + } + + if (elapsed > max_elapsed_dispatch) + max_elapsed_dispatch = elapsed; + + crq->accounted = 1; + crq->service_start = jiffies; +} + +static inline void +cfq_account_completion(struct cfq_queue *cfqq, struct cfq_rq *crq) +{ + unsigned long start_val = cfqq->service_used; + + if (!cfqq->cfqd->cfq_tagged) { + unsigned long duration = jiffies - crq->service_start; + + if (time_after(jiffies, cfqq->service_start + cfq_service)) { + cfqq->service_start = jiffies; + cfqq->service_used >>= 3; + } + + cfqq->service_used += duration; + + if (duration > max_elapsed_crq) + max_elapsed_crq = duration; + } + + /* + * make sure list stays properly sorted, but only do so if necessary + */ + if (cfqq->on_rr && cfqq->service_used != start_val) + cfq_sort_rr_list(cfqq); } static struct request *cfq_next_request(request_queue_t *q) @@ -407,100 +884,309 @@ static struct request *cfq_next_request( struct cfq_data *cfqd = q->elevator.elevator_data; struct request *rq; - if (!list_empty(cfqd->dispatch)) { + if (!list_empty(&q->queue_head)) { struct cfq_rq *crq; dispatch: - rq = list_entry_rq(cfqd->dispatch->next); + rq = list_entry_rq(q->queue_head.next); - crq = RQ_DATA(rq); - if (crq) + if ((crq = RQ_DATA(rq)) != NULL) { cfq_remove_merge_hints(q, crq); + cfq_account_dispatch(crq); + } return rq; } - if (cfq_dispatch_requests(q, cfqd)) + if (cfq_dispatch_requests(q, cfqd->cfq_quantum)) goto dispatch; return NULL; } +/* + * task holds one reference to the queue, dropped when task exits. each crq + * in-flight on this queue also holds a reference, dropped when crq is freed. + * + * queue lock must be held here. 
+ */ +static void cfq_put_queue(struct cfq_queue *cfqq) +{ + BUG_ON(!atomic_read(&cfqq->ref)); + + dprintk("cfq_put_queue 0x%p, ref\n", atomic_read(&cfqq->ref)); + + if (!atomic_dec_and_test(&cfqq->ref)) + return; + + dprintk("killing queue 0x%p/%s\n", cfqq, cfqq->name); + + BUG_ON(rb_first(&cfqq->sort_list)); + BUG_ON(cfqq->on_rr); + + /* + * it's on the empty list and still hashed + */ + list_del(&cfqq->cfq_list); + hlist_del(&cfqq->cfq_hash); + kmem_cache_free(cfq_pool, cfqq); +} + static inline struct cfq_queue * -__cfq_find_cfq_hash(struct cfq_data *cfqd, int pid, const int hashval) +__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key, const int hashval) { - struct list_head *hash_list = &cfqd->cfq_hash[hashval]; - struct list_head *entry; + struct hlist_head *hash_list = &cfqd->cfq_hash[hashval]; + struct hlist_node *entry, *next; - list_for_each(entry, hash_list) { + hlist_for_each_safe(entry, next, hash_list) { struct cfq_queue *__cfqq = list_entry_qhash(entry); - if (__cfqq->pid == pid) + if (__cfqq->key == key) return __cfqq; } return NULL; } -static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid) +static struct cfq_queue * +cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key) { - const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT); + return __cfq_find_cfq_hash(cfqd, key, hash_long(key, CFQ_QHASH_SHIFT)); +} + +static inline void +cfq_rehash_cfqq(struct cfq_data *cfqd, struct cfq_queue **cfqq, + struct cfq_io_context *cic) +{ + unsigned long hashkey = cfq_hash_key(cfqd, current); + unsigned long hashval = hash_long(hashkey, CFQ_QHASH_SHIFT); + struct cfq_queue *__cfqq; + unsigned long flags; + + spin_lock_irqsave(cfqd->queue->queue_lock, flags); + + hlist_del(&(*cfqq)->cfq_hash); + + __cfqq = __cfq_find_cfq_hash(cfqd, hashkey, hashval); + if (!__cfqq || __cfqq == *cfqq) { + __cfqq = *cfqq; + hlist_add_head(&__cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); + __cfqq->key_type = cfqd->key_type; + } else { + atomic_inc(&__cfqq->ref); + cic->cfqq = __cfqq; + cfq_put_queue(*cfqq); + *cfqq = __cfqq; + } - return __cfq_find_cfq_hash(cfqd, pid, hashval); + cic->cfqq = __cfqq; + spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); } -static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) +static void cfq_free_io_context(struct cfq_io_context *cic) { - cfqd->busy_queues--; - list_del(&cfqq->cfq_list); - list_del(&cfqq->cfq_hash); - mempool_free(cfqq, cfq_mpool); + kmem_cache_free(cfq_ioc_pool, cic); +} + +/* + * locking hierarchy is: io_context lock -> queue locks + */ +static void cfq_exit_io_context(struct cfq_io_context *cic) +{ + struct cfq_queue *cfqq = cic->cfqq; + struct list_head *entry = &cic->list; + request_queue_t *q; + unsigned long flags; + + /* + * put the reference this task is holding to the various queues + */ + spin_lock_irqsave(&cic->ioc->lock, flags); + while ((entry = cic->list.next) != &cic->list) { + struct cfq_io_context *__cic; + + __cic = list_entry(entry, struct cfq_io_context, list); + list_del(entry); + + q = __cic->cfqq->cfqd->queue; + spin_lock(q->queue_lock); + cfq_put_queue(__cic->cfqq); + spin_unlock(q->queue_lock); + } + + q = cfqq->cfqd->queue; + spin_lock(q->queue_lock); + cfq_put_queue(cfqq); + spin_unlock(q->queue_lock); + + cic->cfqq = NULL; + spin_unlock_irqrestore(&cic->ioc->lock, flags); +} + +static struct cfq_io_context *cfq_alloc_io_context(int gfp_flags) +{ + struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_flags); + + if (cic) { + cic->dtor = cfq_free_io_context; + 
cic->exit = cfq_exit_io_context;
+		INIT_LIST_HEAD(&cic->list);
+		cic->cfqq = NULL;
+	}
+
+	return cic;
+}
+
+/*
+ * Setup general io context and cfq io context. There can be several cfq
+ * io contexts per general io context, if this process is doing io to more
+ * than one device managed by cfq. Note that caller is holding a reference to
+ * cfqq, so we don't need to worry about it disappearing
+ */
+static struct cfq_io_context *
+cfq_get_io_context(struct cfq_queue **cfqq, int gfp_flags)
+{
+	struct cfq_data *cfqd = (*cfqq)->cfqd;
+	struct cfq_queue *__cfqq = *cfqq;
+	struct cfq_io_context *cic;
+	struct io_context *ioc;
+
+	might_sleep_if(gfp_flags & __GFP_WAIT);
+
+	ioc = get_io_context(gfp_flags);
+	if (!ioc)
+		return NULL;
+
+	if ((cic = ioc->cic) == NULL) {
+		cic = cfq_alloc_io_context(gfp_flags);
+
+		if (cic == NULL)
+			goto err;
+
+		ioc->cic = cic;
+		cic->ioc = ioc;
+		cic->cfqq = __cfqq;
+		atomic_inc(&__cfqq->ref);
+	} else {
+		struct cfq_io_context *__cic;
+		unsigned long flags;
+
+		/*
+		 * since the first cic on the list is actually the head
+		 * itself, need to check this here or we'll duplicate a
+		 * cic per ioc for no reason
+		 */
+		if (cic->cfqq == __cfqq)
+			goto out;
+
+		/*
+		 * cic exists, check if we already are there. linear search
+		 * should be ok here, the list will usually not be more than
+		 * 1 or a few entries long
+		 */
+		spin_lock_irqsave(&ioc->lock, flags);
+		list_for_each_entry(__cic, &cic->list, list) {
+			/*
+			 * this process is already holding a reference to
+			 * this queue, so no need to get one more
+			 */
+			if (__cic->cfqq == __cfqq) {
+				cic = __cic;
+				spin_unlock_irqrestore(&ioc->lock, flags);
+				goto out;
+			}
+		}
+		spin_unlock_irqrestore(&ioc->lock, flags);
+
+		/*
+		 * nope, process doesn't have a cic associated with this
+		 * cfqq yet. 
get a new one and add to list + */ + __cic = cfq_alloc_io_context(gfp_flags); + if (__cic == NULL) + goto err; + + __cic->ioc = ioc; + __cic->cfqq = __cfqq; + atomic_inc(&__cfqq->ref); + spin_lock_irqsave(&ioc->lock, flags); + list_add(&__cic->list, &cic->list); + spin_unlock_irqrestore(&ioc->lock, flags); + + cic = __cic; + *cfqq = __cfqq; + } + +out: + /* + * if key_type has been changed on the fly, we lazily rehash + * each queue at lookup time + */ + if ((*cfqq)->key_type != cfqd->key_type) + cfq_rehash_cfqq(cfqd, cfqq, cic); + + return cic; +err: + put_io_context(ioc); + return NULL; } -static struct cfq_queue *__cfq_get_queue(struct cfq_data *cfqd, int pid, - int gfp_mask) +static struct cfq_queue * +__cfq_get_queue(struct cfq_data *cfqd, unsigned long key, int gfp_mask) { - const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT); + const int hashval = hash_long(key, CFQ_QHASH_SHIFT); struct cfq_queue *cfqq, *new_cfqq = NULL; - request_queue_t *q = cfqd->queue; retry: - cfqq = __cfq_find_cfq_hash(cfqd, pid, hashval); + cfqq = __cfq_find_cfq_hash(cfqd, key, hashval); if (!cfqq) { if (new_cfqq) { cfqq = new_cfqq; new_cfqq = NULL; } else if (gfp_mask & __GFP_WAIT) { - spin_unlock_irq(q->queue_lock); - new_cfqq = mempool_alloc(cfq_mpool, gfp_mask); - spin_lock_irq(q->queue_lock); + spin_unlock_irq(cfqd->queue->queue_lock); + new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask); + spin_lock_irq(cfqd->queue->queue_lock); goto retry; } else - return NULL; + goto out; + + memset(cfqq, 0, sizeof(*cfqq)); - INIT_LIST_HEAD(&cfqq->cfq_hash); + INIT_HLIST_NODE(&cfqq->cfq_hash); INIT_LIST_HEAD(&cfqq->cfq_list); RB_CLEAR_ROOT(&cfqq->sort_list); + INIT_LIST_HEAD(&cfqq->fifo[0]); + INIT_LIST_HEAD(&cfqq->fifo[1]); - cfqq->pid = pid; - cfqq->queued[0] = cfqq->queued[1] = 0; - list_add(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); + cfqq->key = key; + hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); + atomic_set(&cfqq->ref, 0); + cfqq->cfqd = cfqd; +#ifdef CFQ_DEBUG + strncpy(cfqq->name, current->comm, sizeof(cfqq->name)-1); +#endif + dprintk("cfqq set up for 0x%p/%s\n", cfqq, cfqq->name); + cfqq->key_type = cfqd->key_type; } if (new_cfqq) - mempool_free(new_cfqq, cfq_mpool); + kmem_cache_free(cfq_pool, new_cfqq); + atomic_inc(&cfqq->ref); +out: + WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq); return cfqq; } -static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int pid, - int gfp_mask) +static struct cfq_queue * +cfq_get_queue(struct cfq_data *cfqd, unsigned long key, int gfp_mask) { request_queue_t *q = cfqd->queue; struct cfq_queue *cfqq; spin_lock_irq(q->queue_lock); - cfqq = __cfq_get_queue(cfqd, pid, gfp_mask); + cfqq = __cfq_get_queue(cfqd, key, gfp_mask); spin_unlock_irq(q->queue_lock); return cfqq; @@ -508,24 +1194,14 @@ static struct cfq_queue *cfq_get_queue(s static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq) { - struct cfq_queue *cfqq; + crq->is_sync = 0; + if (rq_data_dir(crq->request) == READ || current->flags & PF_SYNCWRITE) + crq->is_sync = 1; - cfqq = __cfq_get_queue(cfqd, current->tgid, GFP_ATOMIC); - if (cfqq) { - cfq_add_crq_rb(cfqd, cfqq, crq); + cfq_add_crq_rb(crq); + crq->queue_start = jiffies; - if (list_empty(&cfqq->cfq_list)) { - list_add(&cfqq->cfq_list, &cfqd->rr_list); - cfqd->busy_queues++; - } - } else { - /* - * should can only happen if the request wasn't allocated - * through blk_alloc_request(), eg stack requests from ide-cd - * (those should be removed) _and_ we are in OOM. 
- */ - list_add_tail(&crq->request->queuelist, cfqd->dispatch); - } + list_add_tail(&crq->request->queuelist, &crq->cfq_queue->fifo[crq->is_sync]); } static void @@ -536,14 +1212,17 @@ cfq_insert_request(request_queue_t *q, s switch (where) { case ELEVATOR_INSERT_BACK: - while (cfq_dispatch_requests(q, cfqd)) + dprintk("adding back 0x%p\n", rq); + while (cfq_dispatch_requests(q, cfqd->cfq_quantum)) ; - list_add_tail(&rq->queuelist, cfqd->dispatch); + list_add_tail(&rq->queuelist, &q->queue_head); break; case ELEVATOR_INSERT_FRONT: - list_add(&rq->queuelist, cfqd->dispatch); + dprintk("adding front 0x%p\n", rq); + list_add(&rq->queuelist, &q->queue_head); break; case ELEVATOR_INSERT_SORT: + dprintk("adding sort 0x%p\n", rq); BUG_ON(!blk_fs_request(rq)); cfq_enqueue(cfqd, crq); break; @@ -564,10 +1243,25 @@ static int cfq_queue_empty(request_queue { struct cfq_data *cfqd = q->elevator.elevator_data; - if (list_empty(cfqd->dispatch) && list_empty(&cfqd->rr_list)) - return 1; + return list_empty(&q->queue_head) && list_empty(&cfqd->rr_list); +} + +static void cfq_completed_request(request_queue_t *q, struct request *rq) +{ + struct cfq_rq *crq = RQ_DATA(rq); + + if (unlikely(!blk_fs_request(rq))) + return; + + if (crq->in_flight) { + struct cfq_queue *cfqq = crq->cfq_queue; + + WARN_ON(!cfqq->in_flight); + cfqq->in_flight--; + + cfq_account_completion(cfqq, crq); + } - return 0; } static struct request * @@ -598,90 +1292,158 @@ static int cfq_may_queue(request_queue_t { struct cfq_data *cfqd = q->elevator.elevator_data; struct cfq_queue *cfqq; - int ret = 1; + int ret = ELV_MQUEUE_MAY; - if (!cfqd->busy_queues) - goto out; + if (current->flags & PF_MEMALLOC) + return ELV_MQUEUE_MAY; - cfqq = cfq_find_cfq_hash(cfqd, current->tgid); + cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(cfqd, current)); if (cfqq) { - int limit = (q->nr_requests - cfqd->cfq_queued) / cfqd->busy_queues; + int limit = cfqd->max_queued; + + if (cfqq->allocated[rw] < cfqd->cfq_queued) + return ELV_MQUEUE_MUST; + + if (cfqd->busy_queues) + limit = q->nr_requests / cfqd->busy_queues; - if (limit < 3) - limit = 3; + if (limit < cfqd->cfq_queued) + limit = cfqd->cfq_queued; else if (limit > cfqd->max_queued) limit = cfqd->max_queued; - if (cfqq->queued[rw] > limit) - ret = 0; + if (cfqq->allocated[rw] >= limit) { + if (limit > cfqq->alloc_limit[rw]) + cfqq->alloc_limit[rw] = limit; + + ret = ELV_MQUEUE_NO; + } } -out: + return ret; } +static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq) +{ + struct request_list *rl = &q->rq; + const int write = waitqueue_active(&rl->wait[WRITE]); + const int read = waitqueue_active(&rl->wait[READ]); + + if (read && cfqq->allocated[READ] < cfqq->alloc_limit[READ]) + wake_up(&rl->wait[READ]); + if (write && cfqq->allocated[WRITE] < cfqq->alloc_limit[WRITE]) + wake_up(&rl->wait[WRITE]); +} + +/* + * queue lock held here + */ static void cfq_put_request(request_queue_t *q, struct request *rq) { struct cfq_data *cfqd = q->elevator.elevator_data; struct cfq_rq *crq = RQ_DATA(rq); - struct request_list *rl; - int other_rw; + const int rw = rq_data_dir(rq); if (crq) { + struct cfq_queue *cfqq = crq->cfq_queue; + BUG_ON(q->last_merge == rq); - BUG_ON(ON_MHASH(crq)); + BUG_ON(!hlist_unhashed(&crq->hash)); + + if (crq->io_context) + put_io_context(crq->io_context->ioc); mempool_free(crq, cfqd->crq_pool); rq->elevator_private = NULL; - } - /* - * work-around for may_queue "bug": if a read gets issued and refused - * to queue because writes ate all the allowed slots and no other - * reads 
are pending for this queue, it could get stuck infinitely
-	 * since freed_request() only checks the waitqueue for writes when
-	 * freeing them. or vice versa for a single write vs many reads.
-	 * so check here whether "the other" data direction might be able
-	 * to queue and wake them
-	 */
-	rl = &q->rq;
-	other_rw = rq_data_dir(rq) ^ 1;
-	if (rl->count[other_rw] <= q->nr_requests) {
+		BUG_ON(!cfqq->allocated[rw]);
+		cfqq->allocated[rw]--;
+
 		smp_mb();
-		if (waitqueue_active(&rl->wait[other_rw]))
-			wake_up(&rl->wait[other_rw]);
+		cfq_check_waiters(q, cfqq);
+		cfq_put_queue(cfqq);
 	}
 }
 
+/*
+ * Allocate cfq data structures associated with this request. A queue and
+ */
 static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
 {
 	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct cfq_io_context *cic;
+	const int rw = rq_data_dir(rq);
 	struct cfq_queue *cfqq;
 	struct cfq_rq *crq;
+	unsigned long flags;
+
+	might_sleep_if(gfp_mask & __GFP_WAIT);
+
+	spin_lock_irqsave(q->queue_lock, flags);
+
+	cfqq = __cfq_get_queue(cfqd, cfq_hash_key(cfqd, current), gfp_mask);
+	if (!cfqq) {
+#if 0
+		cfqq = cfq_get_queue(cfqd, CFQ_KEY_SPARE, gfp_mask);
+		printk("%s: got spare queue\n", current->comm);
+#else
+		goto out_lock;
+#endif
+	}
+
+	if (cfqq->allocated[rw] >= cfqd->max_queued)
+		goto out_lock;
+
+	spin_unlock_irqrestore(q->queue_lock, flags);
 
 	/*
-	 * prepare a queue up front, so cfq_enqueue() doesn't have to
+	 * if hashing type has changed, the cfq_queue might change here. we
+	 * don't bother rechecking ->allocated since it should be a rare
+	 * event
 	 */
-	cfqq = cfq_get_queue(cfqd, current->tgid, gfp_mask);
-	if (!cfqq)
-		return 1;
+	cic = cfq_get_io_context(&cfqq, gfp_mask);
+	if (!cic)
+		goto err;
 
 	crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
 	if (crq) {
-		memset(crq, 0, sizeof(*crq));
 		RB_CLEAR(&crq->rb_node);
+		crq->rb_key = 0;
 		crq->request = rq;
-		crq->cfq_queue = NULL;
-		INIT_LIST_HEAD(&crq->hash);
+		crq->cfq_queue = cfqq;
+		crq->io_context = cic;
+		crq->service_start = crq->queue_start = 0;
+		crq->in_flight = crq->accounted = crq->is_sync = 0;
 		rq->elevator_private = crq;
+		cfqq->allocated[rw]++;
+		cfqq->alloc_limit[rw] = 0;
 		return 0;
 	}
 
+	put_io_context(cic->ioc);
+err:
+	spin_lock_irqsave(q->queue_lock, flags);
+	cfq_put_queue(cfqq);
+out_lock:
+	spin_unlock_irqrestore(q->queue_lock, flags);
 	return 1;
 }
 
 static void cfq_exit(request_queue_t *q, elevator_t *e)
 {
 	struct cfq_data *cfqd = e->elevator_data;
+	struct cfq_queue *cfqq;
+
+	/*
+	 * kill spare queue, getting it means we have two references to it. 
+ * drop both + */ + spin_lock_irq(q->queue_lock); + cfqq = __cfq_get_queue(cfqd, CFQ_KEY_SPARE, GFP_ATOMIC); + cfq_put_queue(cfqq); + cfq_put_queue(cfqq); + spin_unlock_irq(q->queue_lock); e->elevator_data = NULL; mempool_destroy(cfqd->crq_pool); @@ -693,6 +1455,7 @@ static void cfq_exit(request_queue_t *q, static int cfq_init(request_queue_t *q, elevator_t *e) { struct cfq_data *cfqd; + struct cfq_queue *cfqq; int i; cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL); @@ -701,12 +1464,13 @@ static int cfq_init(request_queue_t *q, memset(cfqd, 0, sizeof(*cfqd)); INIT_LIST_HEAD(&cfqd->rr_list); + INIT_LIST_HEAD(&cfqd->empty_list); - cfqd->crq_hash = kmalloc(sizeof(struct list_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); + cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); if (!cfqd->crq_hash) goto out_crqhash; - cfqd->cfq_hash = kmalloc(sizeof(struct list_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL); + cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL); if (!cfqd->cfq_hash) goto out_cfqhash; @@ -715,25 +1479,42 @@ static int cfq_init(request_queue_t *q, goto out_crqpool; for (i = 0; i < CFQ_MHASH_ENTRIES; i++) - INIT_LIST_HEAD(&cfqd->crq_hash[i]); + INIT_HLIST_HEAD(&cfqd->crq_hash[i]); for (i = 0; i < CFQ_QHASH_ENTRIES; i++) - INIT_LIST_HEAD(&cfqd->cfq_hash[i]); + INIT_HLIST_HEAD(&cfqd->cfq_hash[i]); - cfqd->dispatch = &q->queue_head; e->elevator_data = cfqd; cfqd->queue = q; /* + * setup spare failure queue + */ + cfqq = cfq_get_queue(cfqd, CFQ_KEY_SPARE, GFP_KERNEL); + if (!cfqq) + goto out_spare; + + /* * just set it to some high value, we want anyone to be able to queue * some requests. fairness is handled differently */ - cfqd->max_queued = q->nr_requests; - q->nr_requests = 8192; + q->nr_requests = 1024; + cfqd->max_queued = q->nr_requests / 16; + q->nr_batching = cfq_queued; + cfqd->key_type = CFQ_KEY_TGID; + cfqd->find_best_crq = 1; cfqd->cfq_queued = cfq_queued; cfqd->cfq_quantum = cfq_quantum; + cfqd->cfq_fifo_expire_r = cfq_fifo_expire_r; + cfqd->cfq_fifo_expire_w = cfq_fifo_expire_w; + cfqd->cfq_fifo_batch_expire = cfq_fifo_rate; + cfqd->cfq_back_max = cfq_back_max; + cfqd->cfq_back_penalty = cfq_back_penalty; + dprintk("cfq on queue 0x%p\n", q); return 0; +out_spare: + mempool_destroy(cfqd->crq_pool); out_crqpool: kfree(cfqd->cfq_hash); out_cfqhash: @@ -747,20 +1528,18 @@ static int __init cfq_slab_setup(void) { crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0, NULL, NULL); - if (!crq_pool) panic("cfq_iosched: can't init crq pool\n"); cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0, NULL, NULL); - if (!cfq_pool) panic("cfq_iosched: can't init cfq pool\n"); - cfq_mpool = mempool_create(64, mempool_alloc_slab, mempool_free_slab, cfq_pool); - - if (!cfq_mpool) - panic("cfq_iosched: can't init cfq mpool\n"); + cfq_ioc_pool = kmem_cache_create("cfq_ioc_pool", + sizeof(struct cfq_io_context), 0, 0, NULL, NULL); + if (!cfq_ioc_pool) + panic("cfq_iosched: can't init ioc pool\n"); return 0; } @@ -791,6 +1570,83 @@ cfq_var_store(unsigned int *var, const c return count; } +static ssize_t +cfq_clear_elapsed(struct cfq_data *cfqd, const char *page, size_t count) +{ + max_elapsed_dispatch = max_elapsed_crq = 0; + return count; +} + +static ssize_t +cfq_set_key_type(struct cfq_data *cfqd, const char *page, size_t count) +{ + spin_lock_irq(cfqd->queue->queue_lock); + if (!strncmp(page, "pgid", 4)) + cfqd->key_type = CFQ_KEY_PGID; + else if (!strncmp(page, "tgid", 4)) + cfqd->key_type = CFQ_KEY_TGID; + 
else if (!strncmp(page, "uid", 3)) + cfqd->key_type = CFQ_KEY_UID; + else if (!strncmp(page, "gid", 3)) + cfqd->key_type = CFQ_KEY_GID; + spin_unlock_irq(cfqd->queue->queue_lock); + return count; +} + +static ssize_t +cfq_read_key_type(struct cfq_data *cfqd, char *page) +{ + ssize_t len = 0; + int i; + + for (i = CFQ_KEY_PGID; i < CFQ_KEY_LAST; i++) { + if (cfqd->key_type == i) + len += sprintf(page+len, "[%s] ", cfq_key_types[i]); + else + len += sprintf(page+len, "%s ", cfq_key_types[i]); + } + len += sprintf(page+len, "\n"); + return len; +} + +static ssize_t +cfq_status_show(struct cfq_data *cfqd, char *page) +{ + struct list_head *entry; + struct cfq_queue *cfqq; + ssize_t len; + int i = 0, queues; + + len = sprintf(page, "Busy queues: %u\n", cfqd->busy_queues); + len += sprintf(page+len, "key type: %s\n", cfq_key_types[cfqd->key_type]); + len += sprintf(page+len, "last sector: %Lu\n", (u64) cfqd->last_sector); + len += sprintf(page+len, "max time in iosched: %lu\n", max_elapsed_dispatch); + len += sprintf(page+len, "max completion time: %lu\n", max_elapsed_crq); + + len += sprintf(page+len, "Busy queue list:\n"); + spin_lock_irq(cfqd->queue->queue_lock); + list_for_each(entry, &cfqd->rr_list) { + i++; + cfqq = list_entry_cfqq(entry); + len += sprintf(page+len, " cfqq: key=%lu alloc=%d/%d, queued=%d/%d, last_fifo=%lu, service_used=%lu\n", cfqq->key, cfqq->allocated[0], cfqq->allocated[1], cfqq->queued[0], cfqq->queued[1], cfqq->last_fifo_expire, cfqq->service_used); + } + len += sprintf(page+len, " busy queues total: %d\n", i); + queues = i; + + len += sprintf(page+len, "Empty queue list:\n"); + i = 0; + list_for_each(entry, &cfqd->empty_list) { + i++; + cfqq = list_entry_cfqq(entry); + len += sprintf(page+len, " cfqq: key=%lu alloc=%d/%d, queued=%d/%d, last_fifo=%lu, service_used=%lu\n", cfqq->key, cfqq->allocated[0], cfqq->allocated[1], cfqq->queued[0], cfqq->queued[1], cfqq->last_fifo_expire, cfqq->service_used); + } + len += sprintf(page+len, " empty queues total: %d\n", i); + queues += i; + len += sprintf(page+len, "Total queues: %d\n", queues); + spin_unlock_irq(cfqd->queue->queue_lock); + return len; +} + #define SHOW_FUNCTION(__FUNC, __VAR) \ static ssize_t __FUNC(struct cfq_data *cfqd, char *page) \ { \ @@ -798,6 +1654,13 @@ static ssize_t __FUNC(struct cfq_data *c } SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum); SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued); +SHOW_FUNCTION(cfq_tagged_show, cfqd->cfq_tagged); +SHOW_FUNCTION(cfq_fifo_expire_r_show, cfqd->cfq_fifo_expire_r); +SHOW_FUNCTION(cfq_fifo_expire_w_show, cfqd->cfq_fifo_expire_w); +SHOW_FUNCTION(cfq_fifo_batch_expire_show, cfqd->cfq_fifo_batch_expire); +SHOW_FUNCTION(cfq_find_best_show, cfqd->find_best_crq); +SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max); +SHOW_FUNCTION(cfq_back_penalty_show, cfqd->cfq_back_penalty); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ @@ -810,8 +1673,15 @@ static ssize_t __FUNC(struct cfq_data *c *(__PTR) = (MAX); \ return ret; \ } -STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, INT_MAX); -STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, INT_MAX); +STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX); +STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX); +STORE_FUNCTION(cfq_tagged_store, &cfqd->cfq_tagged, 0, 1); +STORE_FUNCTION(cfq_fifo_expire_r_store, &cfqd->cfq_fifo_expire_r, 1, UINT_MAX); +STORE_FUNCTION(cfq_fifo_expire_w_store, &cfqd->cfq_fifo_expire_w, 1, UINT_MAX); 
+STORE_FUNCTION(cfq_fifo_batch_expire_store, &cfqd->cfq_fifo_batch_expire, 0, UINT_MAX); +STORE_FUNCTION(cfq_find_best_store, &cfqd->find_best_crq, 0, 1); +STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX); +STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX); #undef STORE_FUNCTION static struct cfq_fs_entry cfq_quantum_entry = { @@ -824,10 +1694,68 @@ static struct cfq_fs_entry cfq_queued_en .show = cfq_queued_show, .store = cfq_queued_store, }; +static struct cfq_fs_entry cfq_tagged_entry = { + .attr = {.name = "tagged", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_tagged_show, + .store = cfq_tagged_store, +}; +static struct cfq_fs_entry cfq_fifo_expire_r_entry = { + .attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_fifo_expire_r_show, + .store = cfq_fifo_expire_r_store, +}; +static struct cfq_fs_entry cfq_fifo_expire_w_entry = { + .attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_fifo_expire_w_show, + .store = cfq_fifo_expire_w_store, +}; +static struct cfq_fs_entry cfq_fifo_batch_expire_entry = { + .attr = {.name = "fifo_batch_expire", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_fifo_batch_expire_show, + .store = cfq_fifo_batch_expire_store, +}; +static struct cfq_fs_entry cfq_find_best_entry = { + .attr = {.name = "find_best_crq", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_find_best_show, + .store = cfq_find_best_store, +}; +static struct cfq_fs_entry cfq_back_max_entry = { + .attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_back_max_show, + .store = cfq_back_max_store, +}; +static struct cfq_fs_entry cfq_back_penalty_entry = { + .attr = {.name = "back_seek_penalty", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_back_penalty_show, + .store = cfq_back_penalty_store, +}; +static struct cfq_fs_entry cfq_clear_elapsed_entry = { + .attr = {.name = "clear_elapsed", .mode = S_IWUSR }, + .store = cfq_clear_elapsed, +}; +static struct cfq_fs_entry cfq_misc_entry = { + .attr = {.name = "show_status", .mode = S_IRUGO }, + .show = cfq_status_show, +}; +static struct cfq_fs_entry cfq_key_type_entry = { + .attr = {.name = "key_type", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_read_key_type, + .store = cfq_set_key_type, +}; static struct attribute *default_attrs[] = { &cfq_quantum_entry.attr, &cfq_queued_entry.attr, + &cfq_tagged_entry.attr, + &cfq_fifo_expire_r_entry.attr, + &cfq_fifo_expire_w_entry.attr, + &cfq_fifo_batch_expire_entry.attr, + &cfq_key_type_entry.attr, + &cfq_find_best_entry.attr, + &cfq_back_max_entry.attr, + &cfq_back_penalty_entry.attr, + &cfq_clear_elapsed_entry.attr, + &cfq_misc_entry.attr, NULL, }; @@ -878,6 +1806,7 @@ elevator_t iosched_cfq = { .elevator_add_req_fn = cfq_insert_request, .elevator_remove_req_fn = cfq_remove_request, .elevator_queue_empty_fn = cfq_queue_empty, + .elevator_completed_req_fn = cfq_completed_request, .elevator_former_req_fn = cfq_former_request, .elevator_latter_req_fn = cfq_latter_request, .elevator_set_req_fn = cfq_set_request, diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/block/elevator.c linux-2.6.8.1-ck7/drivers/block/elevator.c --- linux-2.6.8.1-ck6/drivers/block/elevator.c 2004-08-15 14:08:05.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/block/elevator.c 2004-09-09 22:56:38.680117486 +1000 @@ -346,7 +346,7 @@ int elv_may_queue(request_queue_t *q, in if (e->elevator_may_queue_fn) return e->elevator_may_queue_fn(q, rw); - return 0; + return ELV_MQUEUE_MAY; } void 
elv_completed_request(request_queue_t *q, struct request *rq) diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/block/ll_rw_blk.c linux-2.6.8.1-ck7/drivers/block/ll_rw_blk.c --- linux-2.6.8.1-ck6/drivers/block/ll_rw_blk.c 2004-09-09 22:56:24.789284955 +1000 +++ linux-2.6.8.1-ck7/drivers/block/ll_rw_blk.c 2004-09-09 22:56:38.682117174 +1000 @@ -241,6 +241,7 @@ void blk_queue_make_request(request_queu blk_queue_hardsect_size(q, 512); blk_queue_dma_alignment(q, 511); blk_queue_congestion_threshold(q); + q->nr_batching = BLK_BATCH_REQ; q->unplug_thresh = 4; /* hmm */ q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ @@ -263,6 +264,45 @@ void blk_queue_make_request(request_queu EXPORT_SYMBOL(blk_queue_make_request); /** + * blk_queue_ordered - does this queue support ordered writes + * @q: the request queue + * @flag: see below + * + * Description: + * For journalled file systems, doing ordered writes on a commit + * block instead of explicitly doing wait_on_buffer (which is bad + * for performance) can be a big win. Block drivers supporting this + * feature should call this function and indicate so. + * + **/ +void blk_queue_ordered(request_queue_t *q, int flag) +{ + if (flag) + set_bit(QUEUE_FLAG_ORDERED, &q->queue_flags); + else + clear_bit(QUEUE_FLAG_ORDERED, &q->queue_flags); +} + +EXPORT_SYMBOL(blk_queue_ordered); + +/** + * blk_queue_issue_flush_fn - set function for issuing a flush + * @q: the request queue + * @iff: the function to be called when issuing a flush + * + * Description: + * If a driver supports issuing a flush command, it notifies the block + * layer of that support by registering the function through this call. + * + **/ +void blk_queue_issue_flush_fn(request_queue_t *q, issue_flush_fn *iff) +{ + q->issue_flush_fn = iff; +} + +EXPORT_SYMBOL(blk_queue_issue_flush_fn); + +/** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device * @dma_addr: bus address limit @@ -482,15 +522,14 @@ struct request *blk_queue_find_tag(reque EXPORT_SYMBOL(blk_queue_find_tag); /** - * blk_queue_free_tags - release tag maintenance info + * __blk_queue_free_tags - release tag maintenance info * @q: the request queue for the device * * Notes: * blk_cleanup_queue() will take care of calling this function, if tagging - * has been used. So there's usually no need to call this directly, unless - * tagging is just being disabled but the queue remains in function. + * has been used. So there's no need to call this directly. **/ -void blk_queue_free_tags(request_queue_t *q) +static void __blk_queue_free_tags(request_queue_t *q) { struct blk_queue_tag *bqt = q->queue_tags; @@ -514,12 +553,27 @@ void blk_queue_free_tags(request_queue_t q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED); } +/** + * blk_queue_free_tags - release tag maintenance info + * @q: the request queue for the device + * + * Notes: + * This is used to disable tagged queuing on a device, yet leave the + * queue functional. 
+ **/ +void blk_queue_free_tags(request_queue_t *q) +{ + clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); +} + EXPORT_SYMBOL(blk_queue_free_tags); static int init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth) { int bits, i; + struct request **tag_index; + unsigned long *tag_map; if (depth > q->nr_requests * 2) { depth = q->nr_requests * 2; @@ -527,32 +581,31 @@ init_tag_map(request_queue_t *q, struct __FUNCTION__, depth); } - tags->tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC); - if (!tags->tag_index) + tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC); + if (!tag_index) goto fail; bits = (depth / BLK_TAGS_PER_LONG) + 1; - tags->tag_map = kmalloc(bits * sizeof(unsigned long), GFP_ATOMIC); - if (!tags->tag_map) + tag_map = kmalloc(bits * sizeof(unsigned long), GFP_ATOMIC); + if (!tag_map) goto fail; - memset(tags->tag_index, 0, depth * sizeof(struct request *)); - memset(tags->tag_map, 0, bits * sizeof(unsigned long)); + memset(tag_index, 0, depth * sizeof(struct request *)); + memset(tag_map, 0, bits * sizeof(unsigned long)); tags->max_depth = depth; tags->real_max_depth = bits * BITS_PER_LONG; + tags->tag_index = tag_index; + tags->tag_map = tag_map; /* * set the upper bits if the depth isn't a multiple of the word size */ for (i = depth; i < bits * BLK_TAGS_PER_LONG; i++) - __set_bit(i, tags->tag_map); + __set_bit(i, tag_map); - INIT_LIST_HEAD(&tags->busy_list); - tags->busy = 0; - atomic_set(&tags->refcnt, 1); return 0; fail: - kfree(tags->tag_index); + kfree(tag_index); return -ENOMEM; } @@ -564,13 +617,26 @@ fail: int blk_queue_init_tags(request_queue_t *q, int depth, struct blk_queue_tag *tags) { - if (!tags) { + int rc; + + BUG_ON(tags && q->queue_tags && tags != q->queue_tags); + + if (!tags && !q->queue_tags) { tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC); if (!tags) goto fail; if (init_tag_map(q, tags, depth)) goto fail; + + INIT_LIST_HEAD(&tags->busy_list); + tags->busy = 0; + atomic_set(&tags->refcnt, 1); + } else if (q->queue_tags) { + if ((rc = blk_queue_resize_tags(q, depth))) + return rc; + set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); + return 0; } else atomic_inc(&tags->refcnt); @@ -1335,8 +1401,8 @@ void blk_cleanup_queue(request_queue_t * if (rl->rq_pool) mempool_destroy(rl->rq_pool); - if (blk_queue_tagged(q)) - blk_queue_free_tags(q); + if (q->queue_tags) + __blk_queue_free_tags(q); kmem_cache_free(requestq_cachep, q); } @@ -1487,8 +1553,10 @@ request_queue_t *blk_init_queue(request_ /* * all done */ - if (!elevator_init(q, chosen_elevator)) + if (!elevator_init(q, chosen_elevator)) { + blk_queue_congestion_threshold(q); return q; + } blk_cleanup_queue(q); out_init: @@ -1516,13 +1584,20 @@ static inline void blk_free_request(requ mempool_free(rq, q->rq.rq_pool); } -static inline struct request *blk_alloc_request(request_queue_t *q,int gfp_mask) +static inline struct request *blk_alloc_request(request_queue_t *q, int rw, + int gfp_mask) { struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); if (!rq) return NULL; + /* + * first three bits are identical in rq->flags and bio->bi_rw, + * see bio.h and blkdev.h + */ + rq->flags = rw; + if (!elv_set_request(q, rq, gfp_mask)) return rq; @@ -1534,7 +1609,7 @@ static inline struct request *blk_alloc_ * ioc_batching returns true if the ioc is a valid batching request and * should be given priority access to a request. 
*/ -static inline int ioc_batching(struct io_context *ioc) +static inline int ioc_batching(request_queue_t *q, struct io_context *ioc) { if (!ioc) return 0; @@ -1544,7 +1619,7 @@ static inline int ioc_batching(struct io * even if the batch times out, otherwise we could theoretically * lose wakeups. */ - return ioc->nr_batch_requests == BLK_BATCH_REQ || + return ioc->nr_batch_requests == q->nr_batching || (ioc->nr_batch_requests > 0 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); } @@ -1555,12 +1630,12 @@ static inline int ioc_batching(struct io * is the behaviour we want though - once it gets a wakeup it should be given * a nice run. */ -void ioc_set_batching(struct io_context *ioc) +void ioc_set_batching(request_queue_t *q, struct io_context *ioc) { - if (!ioc || ioc_batching(ioc)) + if (!ioc || ioc_batching(q, ioc)) return; - ioc->nr_batch_requests = BLK_BATCH_REQ; + ioc->nr_batch_requests = q->nr_batching; ioc->last_waited = jiffies; } @@ -1576,10 +1651,10 @@ static void freed_request(request_queue_ if (rl->count[rw] < queue_congestion_off_threshold(q)) clear_queue_congested(q, rw); if (rl->count[rw]+1 <= q->nr_requests) { + smp_mb(); if (waitqueue_active(&rl->wait[rw])) wake_up(&rl->wait[rw]); - if (!waitqueue_active(&rl->wait[rw])) - blk_clear_queue_full(q, rw); + blk_clear_queue_full(q, rw); } } @@ -1602,13 +1677,22 @@ static struct request *get_request(reque * will be blocked. */ if (!blk_queue_full(q, rw)) { - ioc_set_batching(ioc); + ioc_set_batching(q, ioc); blk_set_queue_full(q, rw); } } - if (blk_queue_full(q, rw) - && !ioc_batching(ioc) && !elv_may_queue(q, rw)) { + switch (elv_may_queue(q, rw)) { + case ELV_MQUEUE_NO: + spin_unlock_irq(q->queue_lock); + goto out; + case ELV_MQUEUE_MAY: + break; + case ELV_MQUEUE_MUST: + goto get_rq; + } + + if (blk_queue_full(q, rw) && !ioc_batching(q, ioc)) { /* * The queue is full and the allocating process is not a * "batcher", and not exempted by the IO scheduler @@ -1617,12 +1701,15 @@ static struct request *get_request(reque goto out; } +get_rq: rl->count[rw]++; +#if 0 if (rl->count[rw] >= queue_congestion_on_threshold(q)) set_queue_congested(q, rw); +#endif spin_unlock_irq(q->queue_lock); - rq = blk_alloc_request(q, gfp_mask); + rq = blk_alloc_request(q, rw, gfp_mask); if (!rq) { /* * Allocation failed presumably due to memory. 
Undo anything @@ -1637,17 +1724,11 @@ static struct request *get_request(reque goto out; } - if (ioc_batching(ioc)) + if (ioc_batching(q, ioc)) ioc->nr_batch_requests--; INIT_LIST_HEAD(&rq->queuelist); - /* - * first three bits are identical in rq->flags and bio->bi_rw, - * see bio.h and blkdev.h - */ - rq->flags = rw; - rq->errors = 0; rq->rq_status = RQ_ACTIVE; rq->bio = rq->biotail = NULL; @@ -1696,7 +1777,7 @@ static struct request *get_request_wait( * See ioc_batching, ioc_set_batching */ ioc = get_io_context(GFP_NOIO); - ioc_set_batching(ioc); + ioc_set_batching(q, ioc); put_io_context(ioc); } finish_wait(&rl->wait[rw], &wait); @@ -1925,10 +2006,11 @@ int blk_execute_rq(request_queue_t *q, s } rq->flags |= REQ_NOMERGE; - rq->waiting = &wait; + if (!rq->waiting) + rq->waiting = &wait; elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1); generic_unplug_device(q); - wait_for_completion(&wait); + wait_for_completion(rq->waiting); rq->waiting = NULL; if (rq->errors) @@ -1939,6 +2021,72 @@ int blk_execute_rq(request_queue_t *q, s EXPORT_SYMBOL(blk_execute_rq); +/** + * blkdev_issue_flush - queue a flush + * @bdev: blockdev to issue flush for + * @error_sector: error sector + * + * Description: + * Issue a flush for the block device in question. Caller can supply + * room for storing the error offset in case of a flush error, if they + * wish to. Caller must run wait_for_completion() on its own. + */ +int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) +{ + request_queue_t *q; + + if (bdev->bd_disk == NULL) + return -ENXIO; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + if (!q->issue_flush_fn) + return -EOPNOTSUPP; + + return q->issue_flush_fn(q, bdev->bd_disk, error_sector); +} + +EXPORT_SYMBOL(blkdev_issue_flush); + +/** + * blkdev_scsi_issue_flush_fn - issue flush for SCSI devices + * @q: device queue + * @disk: gendisk + * @error_sector: error offset + * + * Description: + * Devices understanding the SCSI command set, can use this function as + * a helper for issuing a cache flush. Note: driver is required to store + * the error offset (in case of error flushing) in ->sector of struct + * request. 
+ */ +int blkdev_scsi_issue_flush_fn(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + struct request *rq = blk_get_request(q, WRITE, __GFP_WAIT); + int ret; + + rq->flags |= REQ_BLOCK_PC | REQ_SOFTBARRIER; + rq->sector = 0; + memset(rq->cmd, 0, sizeof(rq->cmd)); + rq->cmd[0] = 0x35; + rq->cmd_len = 12; + rq->data = NULL; + rq->data_len = 0; + rq->timeout = 60 * HZ; + + ret = blk_execute_rq(q, disk, rq); + + if (ret && error_sector) + *error_sector = rq->sector; + + blk_put_request(rq); + return ret; +} + +EXPORT_SYMBOL(blkdev_scsi_issue_flush_fn); + void drive_stat_acct(struct request *rq, int nr_sectors, int new_io) { int rw = rq_data_dir(rq); @@ -2192,7 +2340,7 @@ EXPORT_SYMBOL(__blk_attempt_remerge); static int __make_request(request_queue_t *q, struct bio *bio) { struct request *req, *freereq = NULL; - int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, ra; + int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err; sector_t sector; sector = bio->bi_sector; @@ -2210,9 +2358,11 @@ static int __make_request(request_queue_ spin_lock_prefetch(q->queue_lock); - barrier = test_bit(BIO_RW_BARRIER, &bio->bi_rw); - - ra = bio->bi_rw & (1 << BIO_RW_AHEAD); + barrier = bio_barrier(bio); + if (barrier && !(q->queue_flags & (1 << QUEUE_FLAG_ORDERED))) { + err = -EOPNOTSUPP; + goto end_io; + } again: spin_lock_irq(q->queue_lock); @@ -2292,7 +2442,8 @@ get_rq: /* * READA bit set */ - if (ra) + err = -EWOULDBLOCK; + if (bio_rw_ahead(bio)) goto end_io; freereq = get_request_wait(q, rw); @@ -2303,10 +2454,9 @@ get_rq: req->flags |= REQ_CMD; /* - * inherit FAILFAST from bio and don't stack up - * retries for read ahead + * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) */ - if (ra || test_bit(BIO_RW_FAILFAST, &bio->bi_rw)) + if (bio_rw_ahead(bio) || bio_failfast(bio)) req->flags |= REQ_FAILFAST; /* @@ -2340,7 +2490,7 @@ out: return 0; end_io: - bio_endio(bio, nr_sectors << 9, -EWOULDBLOCK); + bio_endio(bio, nr_sectors << 9, err); return 0; } @@ -2647,10 +2797,17 @@ void blk_recalc_rq_sectors(struct reques static int __end_that_request_first(struct request *req, int uptodate, int nr_bytes) { - int total_bytes, bio_nbytes, error = 0, next_idx = 0; + int total_bytes, bio_nbytes, error, next_idx = 0; struct bio *bio; /* + * extend uptodate bool to allow < 0 value to be direct io error + */ + error = 0; + if (end_io_error(uptodate)) + error = !uptodate ? -EIO : uptodate; + + /* * for a REQ_BLOCK_PC request, we want to carry any eventual * sense key with us all the way through */ @@ -2658,7 +2815,6 @@ static int __end_that_request_first(stru req->errors = 0; if (!uptodate) { - error = -EIO; if (blk_fs_request(req) && !(req->flags & REQ_QUIET)) printk("end_request: I/O error, dev %s, sector %llu\n", req->rq_disk ? 
req->rq_disk->disk_name : "?", @@ -2741,7 +2897,7 @@ static int __end_that_request_first(stru /** * end_that_request_first - end I/O on a request * @req: the request being processed - * @uptodate: 0 for I/O error + * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error * @nr_sectors: number of sectors to end I/O on * * Description: @@ -2762,7 +2918,7 @@ EXPORT_SYMBOL(end_that_request_first); /** * end_that_request_chunk - end I/O on a request * @req: the request being processed - * @uptodate: 0 for I/O error + * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error * @nr_bytes: number of bytes to complete * * Description: @@ -2908,6 +3064,9 @@ void put_io_context(struct io_context *i if (atomic_dec_and_test(&ioc->refcount)) { if (ioc->aic && ioc->aic->dtor) ioc->aic->dtor(ioc->aic); + if (ioc->cic && ioc->cic->dtor) + ioc->cic->dtor(ioc->cic); + kmem_cache_free(iocontext_cachep, ioc); } } @@ -2920,14 +3079,15 @@ void exit_io_context(void) local_irq_save(flags); ioc = current->io_context; - if (ioc) { - if (ioc->aic && ioc->aic->exit) - ioc->aic->exit(ioc->aic); - put_io_context(ioc); - current->io_context = NULL; - } else - WARN_ON(1); + current->io_context = NULL; local_irq_restore(flags); + + if (ioc->aic && ioc->aic->exit) + ioc->aic->exit(ioc->aic); + if (ioc->cic && ioc->cic->exit) + ioc->cic->exit(ioc->cic); + + put_io_context(ioc); } /* @@ -2946,20 +3106,39 @@ struct io_context *get_io_context(int gf local_irq_save(flags); ret = tsk->io_context; - if (ret == NULL) { - ret = kmem_cache_alloc(iocontext_cachep, GFP_ATOMIC); - if (ret) { - atomic_set(&ret->refcount, 1); - ret->pid = tsk->pid; - ret->last_waited = jiffies; /* doesn't matter... */ - ret->nr_batch_requests = 0; /* because this is 0 */ - ret->aic = NULL; + if (ret) + goto out; + + local_irq_restore(flags); + + ret = kmem_cache_alloc(iocontext_cachep, gfp_flags); + if (ret) { + atomic_set(&ret->refcount, 1); + ret->pid = tsk->pid; + ret->last_waited = jiffies; /* doesn't matter... */ + ret->nr_batch_requests = 0; /* because this is 0 */ + ret->aic = NULL; + ret->cic = NULL; + spin_lock_init(&ret->lock); + + local_irq_save(flags); + + /* + * very unlikely, someone raced with us in setting up the task + * io context. free new context and just grab a reference. 
+ */ + if (!tsk->io_context) tsk->io_context = ret; + else { + kmem_cache_free(iocontext_cachep, ret); + ret = tsk->io_context; } - } - if (ret) + +out: atomic_inc(&ret->refcount); - local_irq_restore(flags); + local_irq_restore(flags); + } + return ret; } diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/ide/ide.c linux-2.6.8.1-ck7/drivers/ide/ide.c --- linux-2.6.8.1-ck6/drivers/ide/ide.c 2004-08-15 14:08:06.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/ide/ide.c 2004-09-09 22:56:38.699114522 +1000 @@ -437,6 +437,30 @@ u8 ide_dump_status (ide_drive_t *drive, #endif /* FANCY_STATUS_DUMPS */ printk("\n"); } + { + struct request *rq; + int opcode = 0x100; + + spin_lock(&ide_lock); + rq = HWGROUP(drive)->rq; + spin_unlock(&ide_lock); + if (!rq) + goto out; + if (rq->flags & (REQ_DRIVE_CMD | REQ_DRIVE_TASK)) { + char *args = rq->buffer; + if (args) + opcode = args[0]; + } else if (rq->flags & REQ_DRIVE_TASKFILE) { + ide_task_t *args = rq->special; + if (args) { + task_struct_t *tf = (task_struct_t *) args->tfRegister; + opcode = tf->command; + } + } + + printk("ide: failed opcode was %x\n", opcode); + } +out: local_irq_restore(flags); return err; } diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/ide/ide-disk.c linux-2.6.8.1-ck7/drivers/ide/ide-disk.c --- linux-2.6.8.1-ck6/drivers/ide/ide-disk.c 2004-08-15 14:08:06.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/ide/ide-disk.c 2004-09-09 22:56:38.702114054 +1000 @@ -702,6 +702,37 @@ static u8 idedisk_dump_status (ide_drive } #endif /* FANCY_STATUS_DUMPS */ printk("\n"); + { + struct request *rq; + unsigned char opcode = 0; + int found = 0; + + spin_lock(&ide_lock); + rq = HWGROUP(drive)->rq; + spin_unlock(&ide_lock); + if (!rq) + goto out; + if (rq->flags & (REQ_DRIVE_CMD | REQ_DRIVE_TASK)) { + char *args = rq->buffer; + if (args) { + opcode = args[0]; + found = 1; + } + } else if (rq->flags & REQ_DRIVE_TASKFILE) { + ide_task_t *args = rq->special; + if (args) { + task_struct_t *tf = (task_struct_t *) args->tfRegister; + opcode = tf->command; + found = 1; + } + } + printk("ide: failed opcode was: "); + if (!found) + printk("unknown\n"); + else + printk("0x%02x\n", opcode); + } +out: local_irq_restore(flags); return err; } @@ -1203,6 +1234,42 @@ static ide_proc_entry_t idedisk_proc[] = #endif /* CONFIG_PROC_FS */ +static int idedisk_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + ide_drive_t *drive = q->queuedata; + struct request *rq; + int ret; + + if (!drive->wcache) + return 0; + + rq = blk_get_request(q, WRITE, __GFP_WAIT); + + memset(rq->cmd, 0, sizeof(rq->cmd)); + + if (ide_id_has_flush_cache_ext(drive->id) && + (drive->capacity64 >= (1UL << 28))) + rq->cmd[0] = WIN_FLUSH_CACHE_EXT; + else + rq->cmd[0] = WIN_FLUSH_CACHE; + + + rq->flags |= REQ_DRIVE_TASK | REQ_SOFTBARRIER; + rq->buffer = rq->cmd; + + ret = blk_execute_rq(q, disk, rq); + + /* + * if we failed and caller wants error offset, get it + */ + if (ret && error_sector) + *error_sector = ide_get_error_location(drive, rq->cmd); + + blk_put_request(rq); + return ret; +} + /* * This is tightly woven into the driver->do_special can not touch. * DON'T do it again until a total personality rewrite is committed. 
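/*
 * [Editor's illustration -- not part of the patch.] A minimal sketch of how a
 * low-level driver opts in to the barrier/flush machinery added in this
 * patch. "mydrv" and its functions are hypothetical names; blk_queue_ordered()
 * and blk_queue_issue_flush_fn() are the helpers introduced in ll_rw_blk.c
 * above, and idedisk_issue_flush() just above is a real in-tree example.
 */
#include <linux/blkdev.h>	/* request_queue_t, blk_queue_*() */

static int mydrv_issue_flush(request_queue_t *q, struct gendisk *disk,
			     sector_t *error_sector)
{
	/*
	 * Tell the hardware to flush its write cache here. On failure, the
	 * driver may store the failed offset in *error_sector (if non-NULL)
	 * and return a negative errno.
	 */
	return 0;
}

static void mydrv_init_queue(request_queue_t *q)
{
	blk_queue_ordered(q, 1);			/* barrier writes are honored */
	blk_queue_issue_flush_fn(q, mydrv_issue_flush);	/* cache flush callback */
}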
@@ -1231,16 +1298,10 @@ static int set_nowerr(ide_drive_t *drive return 0; } -/* check if CACHE FLUSH (EXT) command is supported (bits defined in ATA-6) */ -#define ide_id_has_flush_cache(id) ((id)->cfs_enable_2 & 0x3000) - -/* some Maxtor disks have bit 13 defined incorrectly so check bit 10 too */ -#define ide_id_has_flush_cache_ext(id) \ - (((id)->cfs_enable_2 & 0x2400) == 0x2400) - static int write_cache (ide_drive_t *drive, int arg) { ide_task_t args; + int err; if (!ide_id_has_flush_cache(drive->id)) return 1; @@ -1251,7 +1312,10 @@ static int write_cache (ide_drive_t *dri args.tfRegister[IDE_COMMAND_OFFSET] = WIN_SETFEATURES; args.command_type = IDE_DRIVE_TASK_NO_DATA; args.handler = &task_no_data_intr; - (void) ide_raw_taskfile(drive, &args, NULL); + + err = ide_raw_taskfile(drive, &args, NULL); + if (err) + return err; drive->wcache = arg; return 0; @@ -1412,6 +1476,7 @@ static void idedisk_setup (ide_drive_t * { struct hd_driveid *id = drive->id; unsigned long long capacity; + int barrier; idedisk_add_settings(drive); @@ -1543,6 +1608,27 @@ static void idedisk_setup (ide_drive_t * drive->wcache = 1; write_cache(drive, 1); + + /* + * decide if we can sanely support flushes and barriers on + * this drive. unfortunately not all drives advertise FLUSH_CACHE + * support even if they support it. So assume FLUSH_CACHE is there + * always. LBA48 drives are newer, so expect it to flag support + * properly. We can safely support FLUSH_CACHE on lba48, if capacity + * doesn't exceed lba28 + */ + barrier = 1; + if (drive->addressing == 1) { + if (capacity > (1ULL << 28) && !ide_id_has_flush_cache_ext(id)) + barrier = 0; + } + + printk("%s: cache flushes %ssupported\n", + drive->name, barrier ? "" : "not "); + if (barrier) { + blk_queue_ordered(drive->queue, 1); + blk_queue_issue_flush_fn(drive->queue, idedisk_issue_flush); + } } static void ide_cacheflush_p(ide_drive_t *drive) diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/ide/ide-io.c linux-2.6.8.1-ck7/drivers/ide/ide-io.c --- linux-2.6.8.1-ck6/drivers/ide/ide-io.c 2004-06-16 17:35:36.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/ide/ide-io.c 2004-09-09 22:56:38.705113586 +1000 @@ -54,38 +54,77 @@ #include #include -/** - * ide_end_request - complete an IDE I/O - * @drive: IDE device for the I/O - * @uptodate: - * @nr_sectors: number of sectors completed - * - * This is our end_request wrapper function. We complete the I/O - * update random number input and dequeue the request, which if - * it was tagged may be out of order. 
+static void ide_fill_flush_cmd(ide_drive_t *drive, struct request *rq) +{ + char *buf = rq->cmd; + + /* + * reuse cdb space for ata command + */ + memset(buf, 0, sizeof(rq->cmd)); + + rq->flags |= REQ_DRIVE_TASK | REQ_STARTED; + rq->buffer = buf; + rq->buffer[0] = WIN_FLUSH_CACHE; + + if (ide_id_has_flush_cache_ext(drive->id) && + (drive->capacity64 >= (1UL << 28))) + rq->buffer[0] = WIN_FLUSH_CACHE_EXT; +} + +/* + * preempt pending requests, and store this cache flush for immediate + * execution */ - -int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors) +static struct request *ide_queue_flush_cmd(ide_drive_t *drive, + struct request *rq, int post) { - struct request *rq; - unsigned long flags; - int ret = 1; + struct request *flush_rq = &HWGROUP(drive)->wrq; - spin_lock_irqsave(&ide_lock, flags); - rq = HWGROUP(drive)->rq; + /* + * write cache disabled, clear the barrier bit and treat it like + * an ordinary write + */ + if (!drive->wcache) { + rq->flags |= REQ_BAR_PREFLUSH; + return rq; + } - BUG_ON(!(rq->flags & REQ_STARTED)); + ide_init_drive_cmd(flush_rq); + ide_fill_flush_cmd(drive, flush_rq); - if (!nr_sectors) - nr_sectors = rq->hard_cur_sectors; + flush_rq->special = rq; + flush_rq->nr_sectors = rq->nr_sectors; + + if (!post) { + drive->doing_barrier = 1; + flush_rq->flags |= REQ_BAR_PREFLUSH; + blkdev_dequeue_request(rq); + } else + flush_rq->flags |= REQ_BAR_POSTFLUSH; + + __elv_add_request(drive->queue, flush_rq, ELEVATOR_INSERT_FRONT, 0); + HWGROUP(drive)->rq = NULL; + return flush_rq; +} + +static int __ide_end_request(ide_drive_t *drive, struct request *rq, + int uptodate, int nr_sectors) +{ + int ret = 1; + + BUG_ON(!(rq->flags & REQ_STARTED)); /* * if failfast is set on a request, override number of sectors and * complete the whole request right now */ - if (blk_noretry_request(rq) && !uptodate) + if (blk_noretry_request(rq) && end_io_error(uptodate)) nr_sectors = rq->hard_nr_sectors; + if (!blk_fs_request(rq) && end_io_error(uptodate) && !rq->errors) + rq->errors = -EIO; + /* * decide whether to reenable DMA -- 3 is a random magic for now, * if we DMA timeout more than 3 times, just stay in PIO @@ -97,15 +136,56 @@ int ide_end_request (ide_drive_t *drive, if (!end_that_request_first(rq, uptodate, nr_sectors)) { add_disk_randomness(rq->rq_disk); + + if (blk_rq_tagged(rq)) + blk_queue_end_tag(drive->queue, rq); + blkdev_dequeue_request(rq); HWGROUP(drive)->rq = NULL; end_that_request_last(rq); ret = 0; } - spin_unlock_irqrestore(&ide_lock, flags); return ret; } +/** + * ide_end_request - complete an IDE I/O + * @drive: IDE device for the I/O + * @uptodate: + * @nr_sectors: number of sectors completed + * + * This is our end_request wrapper function. We complete the I/O + * update random number input and dequeue the request, which if + * it was tagged may be out of order. 
+ */ + +int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors) +{ + struct request *rq; + unsigned long flags; + int ret = 1; + + spin_lock_irqsave(&ide_lock, flags); + rq = HWGROUP(drive)->rq; + + if (!nr_sectors) + nr_sectors = rq->hard_cur_sectors; + + if (!blk_barrier_rq(rq) || !drive->wcache) + ret = __ide_end_request(drive, rq, uptodate, nr_sectors); + else { + struct request *flush_rq = &HWGROUP(drive)->wrq; + + flush_rq->nr_sectors -= nr_sectors; + if (!flush_rq->nr_sectors) { + ide_queue_flush_cmd(drive, rq, 1); + ret = 0; + } + } + + spin_unlock_irqrestore(&ide_lock, flags); + return ret; +} EXPORT_SYMBOL(ide_end_request); /** @@ -137,6 +217,113 @@ static void ide_complete_pm_request (ide spin_unlock_irqrestore(&ide_lock, flags); } +/* + * FIXME: probably move this somewhere else, name is bad too :) + */ +u64 ide_get_error_location(ide_drive_t *drive, char *args) +{ + u32 high, low; + u8 hcyl, lcyl, sect; + u64 sector; + + high = 0; + hcyl = args[5]; + lcyl = args[4]; + sect = args[3]; + + if (ide_id_has_flush_cache_ext(drive->id)) { + low = (hcyl << 16) | (lcyl << 8) | sect; + HWIF(drive)->OUTB(drive->ctl|0x80, IDE_CONTROL_REG); + high = ide_read_24(drive); + } else { + u8 cur = HWIF(drive)->INB(IDE_SELECT_REG); + if (cur & 0x40) + low = (hcyl << 16) | (lcyl << 8) | sect; + else { + low = hcyl * drive->head * drive->sect; + low += lcyl * drive->sect; + low += sect - 1; + } + } + + sector = ((u64) high << 24) | low; + return sector; +} +EXPORT_SYMBOL(ide_get_error_location); + +static void ide_complete_barrier(ide_drive_t *drive, struct request *rq, + int error) +{ + struct request *real_rq = rq->special; + int good_sectors, bad_sectors; + sector_t sector; + + if (!error) { + if (blk_barrier_postflush(rq)) { + /* + * this completes the barrier write + */ + __ide_end_request(drive, real_rq, 1, real_rq->hard_nr_sectors); + drive->doing_barrier = 0; + } else { + /* + * just indicate that we did the pre flush + */ + real_rq->flags |= REQ_BAR_PREFLUSH; + elv_requeue_request(drive->queue, real_rq); + } + /* + * all is fine, return + */ + return; + } + + /* + * we need to end real_rq, but it's not on the queue currently. 
+ * put it back on the queue, so we don't have to special case + * anything else for completing it + */ + if (!blk_barrier_postflush(rq)) + elv_requeue_request(drive->queue, real_rq); + + /* + * drive aborted flush command, assume FLUSH_CACHE_* doesn't + * work and disable barrier support + */ + if (error & ABRT_ERR) { + printk(KERN_ERR "%s: barrier support doesn't work\n", drive->name); + __ide_end_request(drive, real_rq, -EOPNOTSUPP, real_rq->hard_nr_sectors); + blk_queue_ordered(drive->queue, 0); + blk_queue_issue_flush_fn(drive->queue, NULL); + } else { + /* + * find out what part of the request failed + */ + good_sectors = 0; + if (blk_barrier_postflush(rq)) { + sector = ide_get_error_location(drive, rq->buffer); + + if ((sector >= real_rq->hard_sector) && + (sector < real_rq->hard_sector + real_rq->hard_nr_sectors)) + good_sectors = sector - real_rq->hard_sector; + } else + sector = real_rq->hard_sector; + + bad_sectors = real_rq->hard_nr_sectors - good_sectors; + if (good_sectors) + __ide_end_request(drive, real_rq, 1, good_sectors); + if (bad_sectors) + __ide_end_request(drive, real_rq, 0, bad_sectors); + + printk(KERN_ERR "%s: failed barrier write: " + "sector=%Lx(good=%d/bad=%d)\n", + drive->name, (unsigned long long)sector, + good_sectors, bad_sectors); + } + + drive->doing_barrier = 0; +} + /** * ide_end_drive_cmd - end an explicit drive command * @drive: command @@ -226,6 +413,10 @@ void ide_end_drive_cmd (ide_drive_t *dri spin_lock_irqsave(&ide_lock, flags); blkdev_dequeue_request(rq); + + if (blk_barrier_preflush(rq) || blk_barrier_postflush(rq)) + ide_complete_barrier(drive, rq, err); + HWGROUP(drive)->rq = NULL; end_that_request_last(rq); spin_unlock_irqrestore(&ide_lock, flags); @@ -712,6 +903,22 @@ static inline ide_drive_t *choose_drive repeat: best = NULL; drive = hwgroup->drive; + + /* + * drive is doing pre-flush, ordered write, post-flush sequence. even + * though that is 3 requests, it must be seen as a single transaction. + * we must not preempt this drive until that is complete + */ + if (drive->doing_barrier) { + /* + * small race where queue could get replugged during + * the 3-request flush cycle, just yank the plug since + * we want it to finish asap + */ + blk_remove_plug(drive->queue); + return drive; + } + do { if ((!drive->sleep || time_after_eq(jiffies, drive->sleep)) && !elv_queue_empty(drive->queue)) { @@ -868,6 +1075,13 @@ void ide_do_request (ide_hwgroup_t *hwgr } /* + * if rq is a barrier write, issue pre cache flush if not + * already done + */ + if (blk_barrier_rq(rq) && !blk_barrier_preflush(rq)) + rq = ide_queue_flush_cmd(drive, rq, 0); + + /* * Sanity: don't accept a request that isn't a PM request * if we are currently power managed. 
This is very important as * blk_stop_queue() doesn't prevent the elv_next_request() @@ -917,7 +1131,9 @@ EXPORT_SYMBOL(ide_do_request); */ void do_ide_request(request_queue_t *q) { - ide_do_request(q->queuedata, IDE_NO_IRQ); + ide_drive_t *drive = q->queuedata; + + ide_do_request(HWGROUP(drive), IDE_NO_IRQ); } /* @@ -1286,6 +1502,7 @@ void ide_init_drive_cmd (struct request { memset(rq, 0, sizeof(*rq)); rq->flags = REQ_DRIVE_CMD; + rq->ref_count = 1; } EXPORT_SYMBOL(ide_init_drive_cmd); diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/ide/ide-probe.c linux-2.6.8.1-ck7/drivers/ide/ide-probe.c --- linux-2.6.8.1-ck6/drivers/ide/ide-probe.c 2004-06-16 17:35:36.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/ide/ide-probe.c 2004-09-09 22:56:38.706113430 +1000 @@ -893,7 +893,7 @@ static int ide_init_queue(ide_drive_t *d if (!q) return 1; - q->queuedata = HWGROUP(drive); + q->queuedata = drive; blk_queue_segment_boundary(q, 0xffff); if (!hwif->rqsize) diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/md/dm.c linux-2.6.8.1-ck7/drivers/md/dm.c --- linux-2.6.8.1-ck6/drivers/md/dm.c 2004-08-15 14:08:06.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/md/dm.c 2004-09-09 22:56:38.707113274 +1000 @@ -597,6 +597,21 @@ static int dm_request(request_queue_t *q return 0; } +static int dm_flush_all(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + struct mapped_device *md = q->queuedata; + struct dm_table *map = dm_get_table(md); + int ret = -ENXIO; + + if (map) { + ret = dm_table_flush_all(md->map); + dm_table_put(map); + } + + return ret; +} + static void dm_unplug_all(request_queue_t *q) { struct mapped_device *md = q->queuedata; @@ -764,6 +779,7 @@ static struct mapped_device *alloc_dev(u md->queue->backing_dev_info.congested_data = md; blk_queue_make_request(md->queue, dm_request); md->queue->unplug_fn = dm_unplug_all; + md->queue->issue_flush_fn = dm_flush_all; md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab, mempool_free_slab, _io_cache); diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/md/dm.h linux-2.6.8.1-ck7/drivers/md/dm.h --- linux-2.6.8.1-ck6/drivers/md/dm.h 2004-06-16 17:35:36.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/md/dm.h 2004-09-09 22:56:38.707113274 +1000 @@ -113,6 +113,7 @@ void dm_table_suspend_targets(struct dm_ void dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); void dm_table_unplug_all(struct dm_table *t); +int dm_table_flush_all(struct dm_table *t); /*----------------------------------------------------------------- * A registry of target types. 
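/*
 * [Editor's illustration -- not part of the patch.] The consumer side of the
 * issue_flush hooks that dm and md wire up in the hunks around here: once a
 * journalled file system has written and waited on its commit block, it can
 * ask the device to flush its volatile cache instead of trusting
 * wait_on_buffer alone. commit_and_flush() is a hypothetical caller;
 * blkdev_issue_flush() is the interface added to ll_rw_blk.c above.
 */
#include <linux/blkdev.h>	/* blkdev_issue_flush() */

static int commit_and_flush(struct block_device *bdev)
{
	sector_t error_sector;
	int err;

	/* ... the commit block has been written and completed by now ... */

	err = blkdev_issue_flush(bdev, &error_sector);
	if (err == -EOPNOTSUPP)
		err = 0;	/* queue has no issue_flush_fn; nothing to flush */
	return err;
}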
diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/md/dm-table.c linux-2.6.8.1-ck7/drivers/md/dm-table.c --- linux-2.6.8.1-ck6/drivers/md/dm-table.c 2004-08-15 14:08:06.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/md/dm-table.c 2004-09-09 22:56:38.708113118 +1000 @@ -900,6 +900,28 @@ void dm_table_unplug_all(struct dm_table } } +int dm_table_flush_all(struct dm_table *t) +{ + struct list_head *d, *devices = dm_table_get_devices(t); + int ret = 0; + + for (d = devices->next; d != devices; d = d->next) { + struct dm_dev *dd = list_entry(d, struct dm_dev, list); + request_queue_t *q = bdev_get_queue(dd->bdev); + int err; + + if (!q->issue_flush_fn) + err = -EOPNOTSUPP; + else + err = q->issue_flush_fn(q, dd->bdev->bd_disk, NULL); + + if (!ret) + ret = err; + } + + return ret; +} + EXPORT_SYMBOL(dm_vcalloc); EXPORT_SYMBOL(dm_get_device); EXPORT_SYMBOL(dm_put_device); @@ -908,3 +930,4 @@ EXPORT_SYMBOL(dm_table_get_mode); EXPORT_SYMBOL(dm_table_put); EXPORT_SYMBOL(dm_table_get); EXPORT_SYMBOL(dm_table_unplug_all); +EXPORT_SYMBOL(dm_table_flush_all); diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/md/linear.c linux-2.6.8.1-ck7/drivers/md/linear.c --- linux-2.6.8.1-ck6/drivers/md/linear.c 2004-05-23 12:54:50.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/md/linear.c 2004-09-09 22:56:38.709112962 +1000 @@ -47,7 +47,6 @@ static inline dev_info_t *which_dev(mdde return hash->dev0; } - /** * linear_mergeable_bvec -- tell bio layer if a two requests can be merged * @q: request queue @@ -93,6 +92,27 @@ static void linear_unplug(request_queue_ } } +static int linear_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + linear_conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + for (i=0; i < mddev->raid_disks; i++) { + struct block_device *bdev = conf->disks[i].rdev->bdev; + request_queue_t *r_queue = bdev_get_queue(bdev); + + if (!r_queue->issue_flush_fn) { + ret = -EOPNOTSUPP; + break; + } + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + return ret; +} static int linear_run (mddev_t *mddev) { @@ -200,6 +220,7 @@ static int linear_run (mddev_t *mddev) blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); mddev->queue->unplug_fn = linear_unplug; + mddev->queue->issue_flush_fn = linear_issue_flush; return 0; out: diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/md/md.c linux-2.6.8.1-ck7/drivers/md/md.c --- linux-2.6.8.1-ck6/drivers/md/md.c 2004-06-16 17:35:36.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/md/md.c 2004-09-09 22:56:38.710112806 +1000 @@ -154,6 +154,39 @@ static spinlock_t all_mddevs_lock = SPIN tmp = tmp->next;}) \ ) +int md_flush_mddev(mddev_t *mddev, sector_t *error_sector) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + int ret = 0; + + /* + * this list iteration is done without any locking in md?! 
+ */ + ITERATE_RDEV(mddev, rdev, tmp) { + request_queue_t *r_queue = bdev_get_queue(rdev->bdev); + int err; + + if (!r_queue->issue_flush_fn) + err = -EOPNOTSUPP; + else + err = r_queue->issue_flush_fn(r_queue, rdev->bdev->bd_disk, error_sector); + + if (!ret) + ret = err; + } + + return ret; +} + +static int md_flush_all(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + + return md_flush_mddev(mddev, error_sector); +} + static int md_fail_request (request_queue_t *q, struct bio *bio) { bio_io_error(bio, bio->bi_size); @@ -1645,6 +1678,7 @@ static int do_md_run(mddev_t * mddev) */ mddev->queue->queuedata = mddev; mddev->queue->make_request_fn = mddev->pers->make_request; + mddev->queue->issue_flush_fn = md_flush_all; mddev->changed = 1; return 0; diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/md/multipath.c linux-2.6.8.1-ck7/drivers/md/multipath.c --- linux-2.6.8.1-ck6/drivers/md/multipath.c 2004-08-15 14:08:06.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/md/multipath.c 2004-09-09 22:56:38.711112650 +1000 @@ -120,7 +120,7 @@ int multipath_end_request(struct bio *bi if (uptodate) multipath_end_bh_io(mp_bh, uptodate); - else if ((bio->bi_rw & (1 << BIO_RW_AHEAD)) == 0) { + else if (!bio_rw_ahead(bio)) { /* * oops, IO error: */ @@ -217,6 +217,31 @@ static void multipath_status (struct seq seq_printf (seq, "]"); } +static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + multipath_conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + for (i=0; i<mddev->raid_disks; i++) { + mdk_rdev_t *rdev = conf->multipaths[i].rdev; + if (rdev && !rdev->faulty) { + struct block_device *bdev = rdev->bdev; + request_queue_t *r_queue = bdev_get_queue(bdev); + + if (!r_queue->issue_flush_fn) { + ret = -EOPNOTSUPP; + break; + } + + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + } + return ret; +} /* * Careful, this can execute in IRQ contexts as well! 
@@ -435,6 +460,8 @@ static int multipath_run (mddev_t *mddev mddev->queue->unplug_fn = multipath_unplug; + mddev->queue->issue_flush_fn = multipath_issue_flush; + conf->working_disks = 0; ITERATE_RDEV(mddev,rdev,tmp) { disk_idx = rdev->raid_disk; diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/md/raid0.c linux-2.6.8.1-ck7/drivers/md/raid0.c --- linux-2.6.8.1-ck6/drivers/md/raid0.c 2004-05-23 12:54:50.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/md/raid0.c 2004-09-09 22:56:38.712112494 +1000 @@ -40,6 +40,31 @@ static void raid0_unplug(request_queue_t } } +static int raid0_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + raid0_conf_t *conf = mddev_to_conf(mddev); + mdk_rdev_t **devlist = conf->strip_zone[0].dev; + int i, ret = 0; + + for (i=0; i<mddev->raid_disks; i++) { + struct block_device *bdev = devlist[i]->bdev; + request_queue_t *r_queue = bdev_get_queue(bdev); + + if (!r_queue->issue_flush_fn) { + ret = -EOPNOTSUPP; + break; + } + + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + return ret; +} + + static int create_strip_zones (mddev_t *mddev) { int i, c, j; @@ -219,6 +244,8 @@ static int create_strip_zones (mddev_t * mddev->queue->unplug_fn = raid0_unplug; + mddev->queue->issue_flush_fn = raid0_issue_flush; + printk("raid0: done.\n"); return 0; abort: diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/md/raid1.c linux-2.6.8.1-ck7/drivers/md/raid1.c --- linux-2.6.8.1-ck6/drivers/md/raid1.c 2004-08-15 14:08:06.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/md/raid1.c 2004-09-09 22:56:38.713112338 +1000 @@ -481,6 +481,32 @@ static void raid1_unplug(request_queue_t unplug_slaves(q->queuedata); } +static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + conf_t *conf = mddev_to_conf(mddev); + unsigned long flags; + int i, ret = 0; + + spin_lock_irqsave(&conf->device_lock, flags); + for (i=0; i<mddev->raid_disks; i++) { + mdk_rdev_t *rdev = conf->mirrors[i].rdev; + if (rdev && !rdev->faulty) { + struct block_device *bdev = rdev->bdev; + request_queue_t *r_queue = bdev_get_queue(bdev); + + if (r_queue->issue_flush_fn) { + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + } + } + spin_unlock_irqrestore(&conf->device_lock, flags); + return ret; +} + /* * Throttle resync depth, so that we can both get proper overlapping of * requests, but are still able to handle normal requests quickly. 
@@ -1168,6 +1194,7 @@ static int run(mddev_t *mddev) mddev->queue->unplug_fn = raid1_unplug; + mddev->queue->issue_flush_fn = raid1_issue_flush; ITERATE_RDEV(mddev, rdev, tmp) { disk_idx = rdev->raid_disk; diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/md/raid5.c linux-2.6.8.1-ck7/drivers/md/raid5.c --- linux-2.6.8.1-ck6/drivers/md/raid5.c 2004-08-15 14:08:06.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/md/raid5.c 2004-09-09 22:56:38.714112182 +1000 @@ -1339,6 +1339,39 @@ static void raid5_unplug_device(request_ unplug_slaves(mddev); } +static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + raid5_conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + for (i=0; i<mddev->raid_disks; i++) { + mdk_rdev_t *rdev = conf->disks[i].rdev; + if (rdev && !rdev->faulty) { + struct block_device *bdev = rdev->bdev; + request_queue_t *r_queue; + + if (!bdev) + continue; + + r_queue = bdev_get_queue(bdev); + if (!r_queue) + continue; + + if (!r_queue->issue_flush_fn) { + ret = -EOPNOTSUPP; + break; + } + + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + } + return ret; +} + static inline void raid5_plug_device(raid5_conf_t *conf) { spin_lock_irq(&conf->device_lock); @@ -1545,6 +1578,7 @@ static int run (mddev_t *mddev) atomic_set(&conf->preread_active_stripes, 0); mddev->queue->unplug_fn = raid5_unplug_device; + mddev->queue->issue_flush_fn = raid5_issue_flush; PRINTK("raid5: run(%s) called.\n", mdname(mddev)); diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/md/raid6main.c linux-2.6.8.1-ck7/drivers/md/raid6main.c --- linux-2.6.8.1-ck6/drivers/md/raid6main.c 2004-08-15 14:08:06.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/md/raid6main.c 2004-09-09 22:56:38.715112026 +1000 @@ -1501,6 +1501,39 @@ static void raid6_unplug_device(request_ unplug_slaves(mddev); } +static int raid6_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + raid6_conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + for (i=0; i<mddev->raid_disks; i++) { + mdk_rdev_t *rdev = conf->disks[i].rdev; + if (rdev && !rdev->faulty) { + struct block_device *bdev = rdev->bdev; + request_queue_t *r_queue; + + if (!bdev) + continue; + + r_queue = bdev_get_queue(bdev); + if (!r_queue) + continue; + + if (!r_queue->issue_flush_fn) { + ret = -EOPNOTSUPP; + break; + } + + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + } + return ret; +} + static inline void raid6_plug_device(raid6_conf_t *conf) { spin_lock_irq(&conf->device_lock); @@ -1708,6 +1741,7 @@ static int run (mddev_t *mddev) atomic_set(&conf->preread_active_stripes, 0); mddev->queue->unplug_fn = raid6_unplug_device; + mddev->queue->issue_flush_fn = raid6_issue_flush; PRINTK("raid6: run(%s) called.\n", mdname(mddev)); diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/scsi/scsi_lib.c linux-2.6.8.1-ck7/drivers/scsi/scsi_lib.c --- linux-2.6.8.1-ck6/drivers/scsi/scsi_lib.c 2004-08-15 14:08:08.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/scsi/scsi_lib.c 2004-09-09 22:56:38.716111870 +1000 @@ -954,6 +954,22 @@ static int scsi_init_io(struct scsi_cmnd return BLKPREP_KILL; } +static int scsi_issue_flush_fn(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + struct scsi_device *sdev = q->queuedata; + struct scsi_driver *drv; + + if (sdev->sdev_state != 
SDEV_RUNNING) + return -ENXIO; + + drv = *(struct scsi_driver **) disk->private_data; + if (drv->issue_flush) + return drv->issue_flush(&sdev->sdev_gendev, error_sector); + + return -EOPNOTSUPP; +} + static int scsi_prep_fn(struct request_queue *q, struct request *req) { struct scsi_device *sdev = q->queuedata; @@ -1335,7 +1351,8 @@ struct request_queue *scsi_alloc_queue(s blk_queue_max_sectors(q, shost->max_sectors); blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost)); blk_queue_segment_boundary(q, shost->dma_boundary); - + blk_queue_issue_flush_fn(q, scsi_issue_flush_fn); + if (!shost->use_clustering) clear_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); return q; diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/scsi/sd.c linux-2.6.8.1-ck7/drivers/scsi/sd.c --- linux-2.6.8.1-ck6/drivers/scsi/sd.c 2004-09-09 22:56:24.839277154 +1000 +++ linux-2.6.8.1-ck7/drivers/scsi/sd.c 2004-09-09 22:56:38.717111714 +1000 @@ -114,6 +114,7 @@ static int sd_remove(struct device *); static void sd_shutdown(struct device *dev); static void sd_rescan(struct device *); static int sd_init_command(struct scsi_cmnd *); +static int sd_issue_flush(struct device *, sector_t *); static void sd_read_capacity(struct scsi_disk *sdkp, char *diskname, struct scsi_request *SRpnt, unsigned char *buffer); @@ -127,6 +128,7 @@ static struct scsi_driver sd_template = }, .rescan = sd_rescan, .init_command = sd_init_command, + .issue_flush = sd_issue_flush, }; /* Device no to disk mapping: @@ -687,6 +689,62 @@ not_present: return 1; } +static int sd_sync_cache(struct scsi_device *sdp) +{ + struct scsi_request *sreq; + int retries, res; + + if (!scsi_device_online(sdp)) + return -ENODEV; + + sreq = scsi_allocate_request(sdp, GFP_KERNEL); + if (!sreq) { + printk("FAILED\n No memory for request\n"); + return -ENOMEM; + } + + sreq->sr_data_direction = DMA_NONE; + for (retries = 3; retries > 0; --retries) { + unsigned char cmd[10] = { 0 }; + + cmd[0] = SYNCHRONIZE_CACHE; + /* + * Leave the rest of the command zero to indicate + * flush everything. 
+ */ + scsi_wait_req(sreq, cmd, NULL, 0, SD_TIMEOUT, SD_MAX_RETRIES); + if (sreq->sr_result == 0) + break; + } + + res = sreq->sr_result; + if (res) { + printk(KERN_WARNING "FAILED\n status = %x, message = %02x, " + "host = %d, driver = %02x\n ", + status_byte(res), msg_byte(res), + host_byte(res), driver_byte(res)); + if (driver_byte(res) & DRIVER_SENSE) + scsi_print_req_sense("sd", sreq); + } + + scsi_release_request(sreq); + return res; +} + +static int sd_issue_flush(struct device *dev, sector_t *error_sector) +{ + struct scsi_device *sdp = to_scsi_device(dev); + struct scsi_disk *sdkp = dev_get_drvdata(dev); + + if (!sdkp) + return -ENODEV; + + if (!sdkp->WCE) + return 0; + + return sd_sync_cache(sdp); +} + static void sd_rescan(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); @@ -1562,52 +1620,17 @@ static void scsi_disk_release(struct kre static void sd_shutdown(struct device *dev) { struct scsi_device *sdp = to_scsi_device(dev); - struct scsi_disk *sdkp; - struct scsi_request *sreq; - int retries, res; + struct scsi_disk *sdkp = dev_get_drvdata(dev); - sdkp = dev_get_drvdata(dev); if (!sdkp) - return; /* this can happen */ + return; /* this can happen */ - if (!scsi_device_online(sdp) || !sdkp->WCE) + if (!sdkp->WCE) return; - printk(KERN_NOTICE "Synchronizing SCSI cache for disk %s: ", + printk(KERN_NOTICE "Synchronizing SCSI cache for disk %s: \n", sdkp->disk->disk_name); - - sreq = scsi_allocate_request(sdp, GFP_KERNEL); - if (!sreq) { - printk("FAILED\n No memory for request\n"); - return; - } - - sreq->sr_data_direction = DMA_NONE; - for (retries = 3; retries > 0; --retries) { - unsigned char cmd[10] = { 0 }; - - cmd[0] = SYNCHRONIZE_CACHE; - /* - * Leave the rest of the command zero to indicate - * flush everything. - */ - scsi_wait_req(sreq, cmd, NULL, 0, SD_TIMEOUT, SD_MAX_RETRIES); - if (sreq->sr_result == 0) - break; - } - - res = sreq->sr_result; - if (res) { - printk(KERN_WARNING "FAILED\n status = %x, message = %02x, " - "host = %d, driver = %02x\n ", - status_byte(res), msg_byte(res), - host_byte(res), driver_byte(res)); - if (driver_byte(res) & DRIVER_SENSE) - scsi_print_req_sense("sd", sreq); - } - - scsi_release_request(sreq); - printk("\n"); + sd_sync_cache(sdp); } /** diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/video/fbmem.c linux-2.6.8.1-ck7/drivers/video/fbmem.c --- linux-2.6.8.1-ck6/drivers/video/fbmem.c 2004-08-15 14:08:09.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/video/fbmem.c 2004-09-09 22:56:38.722110934 +1000 @@ -97,6 +97,7 @@ extern int virgefb_setup(char*); extern int resolver_video_setup(char*); extern int s3triofb_init(void); extern int vesafb_init(void); +extern int vesafb_init_thread(void); extern int vesafb_setup(char*); extern int vga16fb_init(void); extern int vga16fb_setup(char*); @@ -306,7 +307,6 @@ static struct { #ifdef CONFIG_FB_VESA { "vesafb", vesafb_init, vesafb_setup }, #endif - /* * Chipset specific drivers that don't use resource management (yet) */ @@ -1519,6 +1519,9 @@ fbmem_init(void) } #endif +#if defined(CONFIG_FB_VESA_TNG) || defined(CONFIG_FB_VESA_TNG_MODULE) + vesafb_init_thread(); +#endif /* * Probe for all builtin frame buffer devices */ diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/video/Kconfig linux-2.6.8.1-ck7/drivers/video/Kconfig --- linux-2.6.8.1-ck6/drivers/video/Kconfig 2004-09-09 22:56:24.845276218 +1000 +++ linux-2.6.8.1-ck7/drivers/video/Kconfig 2004-09-09 22:56:38.724110622 +1000 @@ -287,7 +287,7 @@ config FB_TGA cards. 
Say Y if you have one of those. config FB_VESA - bool "VESA VGA graphics support" + tristate "VESA VGA graphics support" depends on FB && (X86 || X86_64) help This is the frame buffer device driver for generic VESA 2.0 @@ -295,6 +295,46 @@ config FB_VESA You will get a boot time penguin logo at no additional cost. Please read <file:Documentation/fb/vesafb.txt>. If unsure, say Y. +choice + prompt "VESA driver type" + depends on FB_VESA + default FB_VESA_STD + +config FB_VESA_STD + bool "vesafb" + help + This is the frame buffer device driver for generic VESA 2.0 + compliant graphic cards. The older VESA 1.2 cards are not supported. + You will get a boot time penguin logo at no additional cost. Please + read <file:Documentation/fb/vesafb.txt>. Choose this driver if you + are experiencing problems with vesafb-tng or if you own a 64-bit system. + + Note that this driver cannot be compiled as a module. + +config FB_VESA_TNG + bool "vesafb-tng" + depends on !X86_64 + help + This is the frame buffer device driver for generic VESA 2.0 + compliant graphic cards. It is capable of taking advantage of + VBE 3.0 features. With this driver you will be able to adjust + the refresh rate (VBE 3.0 compliant boards only) and change + the graphic mode on-the-fly. + + You will also get a boot time penguin logo at no additional cost. Please + read <file:Documentation/fb/vesafb.txt>. + +endchoice + +config FB_VESA_DEFAULT_MODE + string "VESA default mode" + depends on FB_VESA_TNG + default "640x480@60" + help + This option is used to determine the default mode vesafb is + supposed to switch to in case no mode is provided as a kernel + command line parameter. + config VIDEO_SELECT bool depends on FB_VESA diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/video/Makefile linux-2.6.8.1-ck7/drivers/video/Makefile --- linux-2.6.8.1-ck6/drivers/video/Makefile 2004-09-09 22:56:24.845276218 +1000 +++ linux-2.6.8.1-ck7/drivers/video/Makefile 2004-09-09 22:56:38.724110622 +1000 @@ -44,7 +44,25 @@ obj-$(CONFIG_FB_CIRRUS) += cirrusfb.o obj-$(CONFIG_FB_TRIDENT) += tridentfb.o cfbfillrect.o cfbimgblt.o cfbcopyarea.o obj-$(CONFIG_FB_S3TRIO) += S3triofb.o obj-$(CONFIG_FB_TGA) += tgafb.o cfbfillrect.o cfbcopyarea.o cfbimgblt.o -obj-$(CONFIG_FB_VESA) += vesafb.o cfbfillrect.o cfbcopyarea.o cfbimgblt.o + +ifeq ($(CONFIG_FB_VESA),m) + ifeq ($(CONFIG_FB_VESA_STD),y) + obj-y += vesafb.o cfbfillrect.o cfbcopyarea.o cfbimgblt.o + else + obj-m += vesafb-tng.o cfbfillrect.o cfbcopyarea.o cfbimgblt.o + obj-y += vesafb-thread.o + endif +else + ifeq ($(CONFIG_FB_VESA),y) + ifeq ($(CONFIG_FB_VESA_STD),y) + obj-y += vesafb.o cfbfillrect.o cfbcopyarea.o cfbimgblt.o + else + obj-y += vesafb-tng.o vesafb-thread.o cfbfillrect.o \ + cfbcopyarea.o cfbimgblt.o + endif + endif +endif + obj-$(CONFIG_FB_VGA16) += vga16fb.o cfbfillrect.o cfbcopyarea.o \ cfbimgblt.o vgastate.o obj-$(CONFIG_FB_VIRGE) += virgefb.o diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.8.1-ck6/drivers/video/vesafb-thread.c linux-2.6.8.1-ck7/drivers/video/vesafb-thread.c --- linux-2.6.8.1-ck6/drivers/video/vesafb-thread.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.8.1-ck7/drivers/video/vesafb-thread.c 2004-09-09 22:56:38.725110466 +1000 @@ -0,0 +1,578 @@ +/* + * Framebuffer driver for VBE 2.0+ compliant graphic boards - kernel thread + * and vm86 routines. + * + * This code has to be compiled into the kernel even if vesafb is configured + * as a module. If vesafb_thread were to be started while the module is being + * initialized, it would share its active_mm with modprobe. 
This mm would be + * lost after modprobe finished its work, and we can't allow it, because we + * need it for as long as the vesafb thread is active. + * + * (c) 2004 Michał Januszewski + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include
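/*
 * [Editor's sketch -- not part of the patch; the remainder of vesafb-thread.c
 * is not shown in this hunk.] Given the rationale in the file comment above,
 * the thread-start path would look roughly like this with the 2.6-era API:
 * vesafb_init_thread() runs from the built-in fbmem_init() (see the fbmem.c
 * hunk earlier), so once daemonize() drops the thread's own mm, the borrowed
 * active_mm belongs to the boot-time kernel rather than to modprobe. All
 * names and the body below are illustrative assumptions, not the actual
 * implementation.
 */
#include <linux/sched.h>	/* kernel_thread(), daemonize(), CLONE_KERNEL */
#include <linux/init.h>

static int vesafb_thread(void *unused)
{
	daemonize("vesafb");	/* drop our mm; keep borrowing the starter's active_mm */

	/* ... loop here, servicing vesafb-tng mode-switch requests via vm86 ... */
	return 0;
}

int __init vesafb_init_thread(void)
{
	int pid = kernel_thread(vesafb_thread, NULL, CLONE_KERNEL);

	return pid < 0 ? pid : 0;
}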