GIT b58c7251486fbfd2d67454a6e5d45bd96da74625 git+ssh://master.kernel.org/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git#for-akpm

commit 6c650636c1071f36878a941fec4218f5e08b184b
Author: Jens Axboe
Date:   Thu Jul 19 11:08:32 2007 +0200

    splice: add tcp_splice_read() to IPV6

    Thanks to YOSHIFUJI Hideaki for the hint!

    Signed-off-by: Jens Axboe

commit c5bd68122f2ce99946f3aef34e86bbf073aafbca
Author: Jens Axboe
Date:   Wed Jun 20 09:54:14 2007 +0200

    TCP splice receive support

    Support for network splice receive.

    Signed-off-by: Jens Axboe

commit fabc96dc07d129dfe2986582f11ee8d6053549db
Author: Jens Axboe
Date:   Mon Jun 11 13:00:32 2007 +0200

    splice: don't assume regular pages in splice_to_pipe()

    Allow the caller to pass in a release function; there might be other
    resources that need releasing as well. Needed for network receive.

    Signed-off-by: Jens Axboe

commit 897630c19ae4dba0fad57278bde508e8daec7701
Author: Jens Axboe
Date:   Fri Jul 13 14:23:32 2007 +0200

    [PATCH] vmsplice: implement real vmsplice to userspace

    I'm sure this is riddled with bugs currently; it's a 3-hour hack, but it
    does appear to work.

    It attempts to map pipe pages into a vma, so that it appears in the
    application's address space without having to copy data.

    It changes the sys_vmsplice() prototype to NOT have a const iov
    argument anymore. I think this is alright, as existing vmsplice usage
    will treat the iov as const. Only the now introduced vmsplice-to-user
    functionality would modify iov->iov_base.

    Signed-off-by: Jens Axboe

commit d05b213df65f971d9697244b071740a5470b9c61
Author: Jens Axboe
Date:   Tue Aug 14 09:22:15 2007 +0200

    block: convert blkdev_issue_flush() to use empty barriers

    Then we can get rid of ->issue_flush_fn() and all the driver-private
    implementations of that.

    Signed-off-by: Jens Axboe

commit 84ebb7733d190583c8a2c202f64cc7e0ebb27df1
Author: Jens Axboe
Date:   Wed Jul 18 14:28:03 2007 +0200

    block: Initial support for data-less (or empty) barrier support

    This implements functionality to pass down or insert a barrier in a
    queue, without having data attached to it. The ->prepare_flush_fn()
    infrastructure from data barriers is reused to provide this
    functionality.

    Signed-off-by: Jens Axboe

commit 2300142fd9def049027eb3d775b191b899951a0e
Author: Jens Axboe
Date:   Wed Jul 18 13:27:58 2007 +0200

    block: factor out bio_check_eod()

    The end-of-device check is done twice in __generic_make_request(), and
    it's fully inlined each time. Factor out bio_check_eod().

    Signed-off-by: Tejun Heo
    Signed-off-by: Jens Axboe

commit b8ddb3acf60fb5eb00c663bb8f81bece2f036fd2
Author: Jens Axboe
Date:   Wed Jul 25 08:05:53 2007 +0200

    block: add end_queued_request() helper

    We can use this helper in the elevator core for BLKPREP_KILL, and it'll
    also be useful for the empty barrier patch.

    Signed-off-by: Jens Axboe

commit 21b58f03e4ad6fe42162ac572f173db1f62fed4e
Author: Jens Axboe
Date:   Wed Jul 18 13:18:08 2007 +0200

    bio: make freeing of ->bi_io_vec conditional in bio_free()

    Empty barriers do not carry data, so they have no iovec attached.

    Signed-off-by: Jens Axboe

commit c94f1c4ac87862675c8d70941973bc3a69aff5d8
Author: Jens Axboe
Date:   Wed Jul 18 13:14:03 2007 +0200

    bio: use memset() in bio_init()

    Use memset() to clear the bio, instead of doing each field manually.

    Signed-off-by: Jens Axboe

commit 884a898149609e30dbdf6fcb0b16c4c65ad8b8d8
Author: Jens Axboe
Date:   Wed Jul 18 13:13:10 2007 +0200

    block: ll_rw_blk.c: cosmetics

    Fix a ?: construct, a typo, whitespace, and similar.

    Signed-off-by: Jens Axboe
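For readers following the empty-barrier commits above: the new interface
boils down to submitting a zero-length bio flagged as a barrier. The sketch
below is condensed from this patch's new blkdev_issue_flush() implementation
(the *_example names are illustrative stand-ins, not symbols from the patch):

	/* completion callback: record failure, wake up the waiter */
	static int bio_end_empty_barrier_example(struct bio *bio,
						 unsigned int bytes_done, int err)
	{
		if (err)
			clear_bit(BIO_UPTODATE, &bio->bi_flags);
		complete(bio->bi_private);
		return 0;
	}

	static int issue_empty_barrier_example(struct block_device *bdev)
	{
		DECLARE_COMPLETION_ONSTACK(wait);
		struct bio *bio;

		bio = bio_alloc(GFP_KERNEL, 0);		/* no data pages attached */
		if (!bio)
			return -ENOMEM;

		bio->bi_bdev = bdev;
		bio->bi_end_io = bio_end_empty_barrier_example;
		bio->bi_private = &wait;

		submit_bio(1 << BIO_RW_BARRIER, bio);	/* data-less barrier */
		wait_for_completion(&wait);

		return bio_flagged(bio, BIO_UPTODATE) ? 0 : -EIO;
	}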
commit 35802e740effa3fb823dd5ce246e5e131b340a73
Author: Jens Axboe
Date:   Tue Aug 7 09:37:10 2007 +0200

    SPARC64: sg chaining support

    This updates the sparc64 iommu/pci dma mappers to sg chaining.

    Acked-by: David S. Miller

    Later updated to newer kernel with unified sparc64 iommu sg handling.

    Signed-off-by: Jens Axboe

commit e11de8b50295179035a80b6b00cbbf451e9ce3c7
Author: Jens Axboe
Date:   Mon May 14 15:44:38 2007 +0200

    SPARC: sg chaining support

    This updates the sparc iommu/pci dma mappers to sg chaining.

    Acked-by: David S. Miller
    Signed-off-by: Jens Axboe

commit 105046eab912cdd1bee9dba947ab6d11b00e497f
Author: Jens Axboe
Date:   Thu Aug 16 08:50:57 2007 +0200

    PPC: sg chaining support

    This updates the ppc iommu/pci dma mappers to sg chaining. Includes
    further fixes from FUJITA Tomonori.

    Signed-off-by: Jens Axboe

commit 4186ecbb7eb83f3f1b964fda97d1b30430a6f414
Author: Jens Axboe
Date:   Thu Jul 19 08:22:17 2007 +0200

    PS3: sg chaining support

    Acked-by: Geoff Levand
    Signed-off-by: Jens Axboe

commit 2eb7e5c468111b3c2b32a96214bff00165ae5834
Author: Jens Axboe
Date:   Mon May 14 13:01:35 2007 +0200

    IA64: sg chaining support

    This updates the ia64 iommu/pci dma mappers to sg chaining.

    Signed-off-by: Jens Axboe

commit ba1cfc2ac2643c07b740730ecfc41a2a03516ad4
Author: Jens Axboe
Date:   Thu May 10 11:59:55 2007 +0200

    x86-64: enable sg chaining

    Signed-off-by: Jens Axboe

commit 836a4db9a09ea09da3a58f7483cd84dd0af8707a
Author: Jens Axboe
Date:   Tue Jul 24 12:40:44 2007 +0200

    x86-64: update pci-gart iommu to sg helpers

    Signed-off-by: Jens Axboe

commit 75d30cbc3d1e75fe6b131e0550156c86a83fb53a
Author: Jens Axboe
Date:   Tue Jul 24 12:39:27 2007 +0200

    x86-64: update nommu to sg helpers

    Acked-by: Muli Ben-Yehuda
    Signed-off-by: Jens Axboe

commit 35efad63d050e171cf1dec2ba8ffe3f6aa87eaa0
Author: Jens Axboe
Date:   Tue Jul 24 12:38:15 2007 +0200

    x86-64: update calgary iommu to sg helpers

    Acked-by: Muli Ben-Yehuda
    Signed-off-by: Jens Axboe

commit a7f15ddc541f96cc96b210f9b1313e76040414ef
Author: Jens Axboe
Date:   Fri May 11 14:56:18 2007 +0200

    swiotlb: sg chaining support

    Signed-off-by: Jens Axboe

commit c451be6df98361d6a18baac507a3655a78388f3a
Author: Jens Axboe
Date:   Fri May 11 12:57:30 2007 +0200

    i386: enable sg chaining

    We don't need to do more on x86; there's no iommu to be worried about.

    Signed-off-by: Jens Axboe

commit a1ff33df522c40e1ef9049336ae6af2253ec6d92
Author: Jens Axboe
Date:   Wed May 9 09:15:27 2007 +0200

    i386 dma_map_sg: convert to using sg helpers

    The dma mapping helpers need to be converted to using sg helpers as
    well, so they will work with a chained sglist setup.

    Signed-off-by: Jens Axboe

commit e795ae676bb30c2881f5abf389bf0e5053ccb564
Author: FUJITA Tomonori
Date:   Sat Aug 18 18:27:36 2007 +0900

    libata sg chaining support fix

    Signed-off-by: FUJITA Tomonori
    Signed-off-by: Jens Axboe

commit e1fbe98960f8fa9b92f2f2bfea5bd161b7761ee5
Author: FUJITA Tomonori
Date:   Mon Jul 30 23:01:32 2007 +0900

    zfcp: sg chaining support

    Signed-off-by: FUJITA Tomonori
    Signed-off-by: Jens Axboe

commit 628c38d9b88fa72736e554d5c48d1f778e80422a
Author: FUJITA Tomonori
Date:   Mon Jul 16 15:24:50 2007 +0200

    iscsi_tcp: sg chaining support

    Signed-off-by: FUJITA Tomonori
    Signed-off-by: Jens Axboe

commit 8a0a42f045094adcb28e827a89ee4c94ebf2b555
Author: FUJITA Tomonori
Date:   Mon Jul 16 15:24:14 2007 +0200

    ips: sg chaining support

    ips properly uses scsi_for_each_sg for the normal I/O path; the breakup
    path, however, doesn't.

    Signed-off-by: FUJITA Tomonori
    Signed-off-by: Jens Axboe
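All of the arch conversions above follow the same mechanical pattern: stop
stepping through the scatterlist with pointer arithmetic or array indexing,
and use the chain-aware iterator instead. A minimal before/after sketch,
modeled on the powerpc dma_direct_map_sg() conversion (the loop body is
illustrative, not lifted verbatim from any one driver):

	/* before: assumes one flat array, breaks once sglists can be chained */
	for (i = 0; i < nents; i++, sg++) {
		sg->dma_address = page_to_phys(sg->page) + sg->offset;
		sg->dma_length = sg->length;
	}

	/* after: for_each_sg() advances via sg_next(), so chained tables work */
	for_each_sg(sgl, sg, nents, i) {
		sg->dma_address = page_to_phys(sg->page) + sg->offset;
		sg->dma_length = sg->length;
	}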
commit c3616f4fc12c74470eb23825a782aaebc4023526
Author: Jens Axboe
Date:   Wed Jul 25 08:13:56 2007 +0200

    IDE: sg chaining support

    Acked-by: Bartlomiej Zolnierkiewicz
    Signed-off-by: Jens Axboe

commit 83ebcb83f5d62ce29c42fcc437c61b7d82f98838
Author: Jens Axboe
Date:   Tue Jul 24 14:42:11 2007 +0200

    i2o: sg chaining support

    Acked-by: Alan Cox
    Signed-off-by: Jens Axboe

commit 33274a3761cb15640fcf83972ab7c5c13043879a
Author: Jens Axboe
Date:   Mon Jul 16 15:30:33 2007 +0200

    Fusion: sg chaining support

    Signed-off-by: Jens Axboe

commit e579fb2044760617e5244647da0a5a33e94ad761
Author: Jens Axboe
Date:   Fri May 11 12:33:09 2007 +0200

    USB storage: sg chaining support

    Modify usb_stor_access_xfer_buf() to take a pointer to an sg entry
    pointer, so we can keep track of that instead of passing around an
    integer index (which we can't use when dealing with multiple
    scatterlist arrays).

    Signed-off-by: Jens Axboe

commit d0df5cc34b981d44ee44930492b1e4dcacec35bd
Author: Jens Axboe
Date:   Tue Jul 24 14:41:13 2007 +0200

    infiniband: sg chaining support

    Signed-off-by: Jens Axboe

commit 88700324b9abd29f9deee8d96cbc5d0f38310e6e
Author: Jens Axboe
Date:   Mon Jul 30 16:10:30 2007 +0200

    advansys: convert to use the data buffer accessors

    Signed-off-by: Jens Axboe

commit 8bbbf329be4af410829faa9ae180368773765022
Author: Jens Axboe
Date:   Mon Jul 16 10:00:31 2007 +0200

    aha1542: convert to use the data buffer accessors

    Signed-off-by: Jens Axboe

commit ff0217cd975a72aa72b5a22688e6ae1348e20c27
Author: Jens Axboe
Date:   Wed May 9 14:42:23 2007 +0200

    gdth: sg chaining support

    Signed-off-by: Jens Axboe

commit b7f6c8b6f4be173917e40db41d5c1df88df558de
Author: Jens Axboe
Date:   Wed May 9 14:34:52 2007 +0200

    ide-scsi: sg chaining support

    Acked-by: Bartlomiej Zolnierkiewicz
    Signed-off-by: Jens Axboe

commit eb2b968d5fe84f2869b04d622615b39322b77051
Author: Jens Axboe
Date:   Wed May 9 13:43:12 2007 +0200

    qlogicpti: sg chaining support

    Signed-off-by: Jens Axboe

commit 6efa92d9135530e8be9880a29b4c51a4152b48c7
Author: Jens Axboe
Date:   Wed May 9 13:18:54 2007 +0200

    aic94xx: sg chaining support

    Signed-off-by: Jens Axboe

commit 3cd05172ed660e85ba0c97fc001b84562b6a7ec3
Author: Jens Axboe
Date:   Wed May 9 13:02:43 2007 +0200

    qla1280: sg chaining support

    Interesting hardware setup...

    Signed-off-by: Jens Axboe

commit 7d627c2d7e963990739a3dfe218fe3f61d2ab1ad
Author: Jens Axboe
Date:   Fri May 11 15:01:01 2007 +0200

    scsi generic: sg chaining support

    Signed-off-by: Jens Axboe

commit 63499aaafa6ec0c107a925574c2af84a2cd86703
Author: Jens Axboe
Date:   Mon Jul 16 10:19:24 2007 +0200

    scsi_debug: support sg chaining

    Signed-off-by: Jens Axboe

commit e43881f0f7ca4b9695949f8e44460944869ffa0c
Author: Jens Axboe
Date:   Wed May 9 09:08:18 2007 +0200

    libata: convert to using sg helpers

    This converts libata to using the sg helpers for looking up sg
    elements, instead of doing it manually.

    Signed-off-by: Jens Axboe

commit d6beb57f48231f5c012fb7d55b369bc0af6b0c41
Author: Jens Axboe
Date:   Tue Aug 7 09:02:51 2007 +0200

    SCSI: support for allocating large scatterlists

    This is what enables large commands. If we need to allocate an sgtable
    that doesn't fit in a single page, allocate several
    SCSI_MAX_SG_SEGMENTS sized tables and chain them together.

    SCSI defaults to large chained sg tables, if the arch supports it.

    Signed-off-by: Jens Axboe
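The chaining mechanics behind the SCSI commit above: when one table isn't
enough, additional SCSI_MAX_SG_SEGMENTS-sized tables are allocated and the
last slot of each table is turned into a link. A simplified sketch of the
linking step (the helper name is illustrative; the LSB trick is described in
the scatterlist.h commit below):

	/*
	 * Make the final entry of 'prv' point at the table 'nxt'.
	 * The LSB of ->page marks the entry as a chain link, not data.
	 */
	static void sg_chain_example(struct scatterlist *prv,
				     unsigned int prv_nents,
				     struct scatterlist *nxt)
	{
		prv[prv_nents - 1].page =
			(struct page *)((unsigned long)nxt | 0x01);
	}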
commit 7f9a5100db130cf3751ce89e1ee74ed257bb2b4a
Author: Jens Axboe
Date:   Mon Jul 16 09:30:38 2007 +0200

    scsi: simplify scsi_free_sgtable()

    Just pass in the command; there's no point in passing in the
    scatterlist and scatterlist pool index separately.

    Signed-off-by: Jens Axboe

commit c4d9b51d2579a88677850dda08aebeb66fbbf8e8
Author: saeed bishara
Date:   Wed Aug 8 13:09:00 2007 +0200

    use sg helper function in DMA mapping documentation

    Signed-off-by: Saeed Bishara
    Signed-off-by: Jens Axboe

commit 95aea7a481224d65e1e70040607da4662cacf143
Author: Jens Axboe
Date:   Thu May 10 13:29:51 2007 +0200

    ll_rw_blk: temporarily enable max_segments tweaking

    Expose this setting for now, so that users can play with enabling large
    commands without defaulting it to on globally. This is a debug patch;
    it will be dropped for the final versions.

    Signed-off-by: Jens Axboe

commit f93308b57f6a11f33992b41eb80334959c296b21
Author: Jens Axboe
Date:   Mon Jul 16 21:17:16 2007 +0200

    Add chained sg support to linux/scatterlist.h

    The core of the patch - allow the last sg element in a scatterlist
    table to point to the start of a new table. We overload the LSB of the
    page pointer to indicate whether this is a valid sg entry, or merely a
    link to the next list.

    Includes a fix from Bartlomiej Zolnierkiewicz correcting the ifdef
    ARCH_HAS_SG_CHAIN guarding sg_last().

    Signed-off-by: Jens Axboe

commit a25a2c20827147c0c7374021ee5a871736f50c0d
Author: Jens Axboe
Date:   Tue Jul 24 12:31:42 2007 +0200

    scsi: convert to using sg helpers

    This converts the SCSI mid layer to using the sg helpers for looking up
    sg elements, instead of doing it manually.

    Signed-off-by: Jens Axboe

commit b53dd4ba1794ad87657d5677f564f629e2b1e0e9
Author: Jens Axboe
Date:   Wed Jul 25 08:23:24 2007 +0200

    block: convert to using sg helpers

    Convert the main rq mapper (blk_rq_map_sg()) to the sg helper setup.

    Signed-off-by: Jens Axboe

commit d77cfbb8b2da6ee39fd7d3be287018c355b6e597
Author: Jens Axboe
Date:   Wed May 9 09:02:57 2007 +0200

    Add sg helpers for iterating over a scatterlist table

    First step to being able to change the scatterlist setup without
    having to modify drivers (a lot :-)

    Signed-off-by: Jens Axboe

commit 9f9572a5048e52acf7481cec9b9e99afd3e383c9
Author: Jens Axboe
Date:   Wed May 9 08:58:32 2007 +0200

    crypto: don't pollute the global namespace with sg_next()

    It's a subsystem function; prefix it as such.

    Acked-by: Herbert Xu
    Signed-off-by: Jens Axboe

Signed-off-by: Andrew Morton
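To make the LSB overloading from the scatterlist.h commit concrete, here is a
simplified model of the traversal helper. The real sg_next()/for_each_sg()
definitions live in include/linux/scatterlist.h in this patch; the names
below carry a _sketch suffix to mark them as an illustration, not the actual
code:

	static inline struct scatterlist *sg_next_sketch(struct scatterlist *sg)
	{
		sg++;
		/* LSB set in ->page => this slot is a link to the next table */
		if ((unsigned long)sg->page & 0x01)
			sg = (struct scatterlist *)((unsigned long)sg->page & ~0x01);
		return sg;
	}

	#define for_each_sg_sketch(sglist, sg, nr, i)			\
		for (i = 0, sg = (sglist); i < (nr); i++, sg = sg_next_sketch(sg))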
---

 Documentation/DMA-mapping.txt             |    2 
 arch/ia64/hp/common/sba_iommu.c           |   14 
 arch/ia64/sn/pci/pci_dma.c                |   11 
 arch/powerpc/kernel/dma_64.c              |    5 
 arch/powerpc/kernel/ibmebus.c             |   11 
 arch/powerpc/kernel/iommu.c               |   23 -
 arch/powerpc/platforms/ps3/system-bus.c   |    7 
 arch/sparc/kernel/ioport.c                |   25 -
 arch/sparc/mm/io-unit.c                   |   12 
 arch/sparc/mm/iommu.c                     |   10 
 arch/sparc/mm/sun4c.c                     |   10 
 arch/sparc64/kernel/iommu.c               |   39 +
 arch/sparc64/kernel/pci_sun4v.c           |   32 -
 arch/x86_64/kernel/pci-calgary.c          |   24 -
 arch/x86_64/kernel/pci-gart.c             |   63 +-
 arch/x86_64/kernel/pci-nommu.c            |    5 
 block/elevator.c                          |   17 
 block/ll_rw_blk.c                         |  308 +++++++++----
 crypto/digest.c                           |    2 
 crypto/scatterwalk.c                      |    2 
 crypto/scatterwalk.h                      |    2 
 drivers/ata/libata-core.c                 |   35 -
 drivers/ide/cris/ide-cris.c               |    3 
 drivers/ide/ide-disk.c                    |   29 -
 drivers/ide/ide-dma.c                     |    2 
 drivers/ide/ide-io.c                      |    3 
 drivers/ide/ide-probe.c                   |    2 
 drivers/ide/ide-taskfile.c                |   18 
 drivers/ide/mips/au1xxx-ide.c             |    2 
 drivers/ide/pci/sgiioc4.c                 |    3 
 drivers/ide/ppc/pmac.c                    |    2 
 drivers/infiniband/hw/ipath/ipath_dma.c   |   10 
 drivers/infiniband/ulp/iser/iser_memory.c |   76 +--
 drivers/md/dm-table.c                     |   28 -
 drivers/md/dm.c                           |   16 
 drivers/md/dm.h                           |    1 
 drivers/md/linear.c                       |   20 
 drivers/md/md.c                           |    1 
 drivers/md/multipath.c                    |   30 -
 drivers/md/raid0.c                        |   21 
 drivers/md/raid1.c                        |   31 -
 drivers/md/raid10.c                       |   31 -
 drivers/md/raid5.c                        |   31 -
 drivers/message/fusion/mptscsih.c         |    6 
 drivers/message/i2o/i2o_block.c           |   24 -
 drivers/s390/scsi/zfcp_def.h              |    1 
 drivers/s390/scsi/zfcp_qdio.c             |    6 
 drivers/scsi/advansys.c                   |   12 
 drivers/scsi/aha1542.c                    |   32 -
 drivers/scsi/aic94xx/aic94xx_task.c       |    6 
 drivers/scsi/gdth.c                       |   45 +-
 drivers/scsi/ide-scsi.c                   |   31 -
 drivers/scsi/ips.c                        |   14 
 drivers/scsi/iscsi_tcp.c                  |   41 +
 drivers/scsi/qla1280.c                    |   66 +--
 drivers/scsi/qlogicpti.c                  |   15 
 drivers/scsi/scsi_debug.c                 |   30 -
 drivers/scsi/scsi_lib.c                   |  230 +++++++---
 drivers/scsi/scsi_tgt_lib.c               |    4 
 drivers/scsi/sd.c                         |   23 -
 drivers/scsi/sg.c                         |   16 
 drivers/usb/storage/alauda.c              |   16 
 drivers/usb/storage/datafab.c             |   10 
 drivers/usb/storage/jumpshot.c            |   10 
 drivers/usb/storage/protocol.c            |   20 
 drivers/usb/storage/protocol.h            |    2 
 drivers/usb/storage/sddr09.c              |   16 
 drivers/usb/storage/sddr55.c              |   16 
 drivers/usb/storage/shuttle_usbat.c       |   17 
 fs/bio.c                                  |   23 -
 fs/splice.c                               |  439 ++++++++++++++------
 include/asm-i386/dma-mapping.h            |   13 
 include/asm-i386/scatterlist.h            |    2 
 include/asm-ia64/dma-mapping.h            |    1 
 include/asm-ia64/scatterlist.h            |    2 
 include/asm-powerpc/dma-mapping.h         |   17 
 include/asm-powerpc/scatterlist.h         |    2 
 include/asm-sparc/scatterlist.h           |    2 
 include/asm-sparc64/scatterlist.h         |    2 
 include/asm-x86_64/dma-mapping.h          |    3 
 include/asm-x86_64/scatterlist.h          |    2 
 include/linux/bio.h                       |   19 
 include/linux/blkdev.h                    |    7 
 include/linux/i2o.h                       |    3 
 include/linux/ide.h                       |    2 
 include/linux/libata.h                    |   16 
 include/linux/net.h                       |    3 
 include/linux/scatterlist.h               |   84 +++
 include/linux/skbuff.h                    |    5 
 include/linux/splice.h                    |    2 
 include/linux/syscalls.h                  |    2 
 include/net/tcp.h                         |    3 
 include/scsi/scsi.h                       |    7 
 include/scsi/scsi_cmnd.h                  |    6 
 lib/swiotlb.c                             |   19 
 mm/bounce.c                               |    6 
 net/core/skbuff.c                         |  246 +++++++++++
 net/ipv4/af_inet.c                        |    1 
 net/ipv4/tcp.c                            |  129 +++++
 net/ipv6/af_inet6.c                       |    1 
 net/socket.c                              |   13 
 101 files changed, 1804 insertions(+), 1006 deletions(-)

diff -puN Documentation/DMA-mapping.txt~git-block Documentation/DMA-mapping.txt
--- a/Documentation/DMA-mapping.txt~git-block
+++ a/Documentation/DMA-mapping.txt
@@ -514,7 +514,7 @@ With scatterlists, you map a region gath
 	int i, count = pci_map_sg(dev, sglist, nents, direction);
 	struct scatterlist
*sg; - for (i = 0, sg = sglist; i < count; i++, sg++) { + for_each_sg(sglist, sg, count, i) { hw_address[i] = sg_dma_address(sg); hw_len[i] = sg_dma_len(sg); } diff -puN arch/ia64/hp/common/sba_iommu.c~git-block arch/ia64/hp/common/sba_iommu.c --- a/arch/ia64/hp/common/sba_iommu.c~git-block +++ a/arch/ia64/hp/common/sba_iommu.c @@ -396,7 +396,7 @@ sba_dump_sg( struct ioc *ioc, struct sca printk(KERN_DEBUG " %d : DMA %08lx/%05x CPU %p\n", nents, startsg->dma_address, startsg->dma_length, sba_sg_address(startsg)); - startsg++; + startsg = sg_next(startsg); } } @@ -409,7 +409,7 @@ sba_check_sg( struct ioc *ioc, struct sc while (the_nents-- > 0) { if (sba_sg_address(the_sg) == 0x0UL) sba_dump_sg(NULL, startsg, nents); - the_sg++; + the_sg = sg_next(the_sg); } } @@ -1201,7 +1201,7 @@ sba_fill_pdir( u32 pide = startsg->dma_address & ~PIDE_FLAG; dma_offset = (unsigned long) pide & ~iovp_mask; startsg->dma_address = 0; - dma_sg++; + dma_sg = sg_next(dma_sg); dma_sg->dma_address = pide | ioc->ibase; pdirp = &(ioc->pdir_base[pide >> iovp_shift]); n_mappings++; @@ -1228,7 +1228,7 @@ sba_fill_pdir( pdirp++; } while (cnt > 0); } - startsg++; + startsg = sg_next(startsg); } /* force pdir update */ wmb(); @@ -1297,7 +1297,7 @@ sba_coalesce_chunks( struct ioc *ioc, while (--nents > 0) { unsigned long vaddr; /* tmp */ - startsg++; + startsg = sg_next(startsg); /* PARANOID */ startsg->dma_address = startsg->dma_length = 0; @@ -1407,7 +1407,7 @@ int sba_map_sg(struct device *dev, struc #ifdef ALLOW_IOV_BYPASS_SG ASSERT(to_pci_dev(dev)->dma_mask); if (likely((ioc->dma_mask & ~to_pci_dev(dev)->dma_mask) == 0)) { - for (sg = sglist ; filled < nents ; filled++, sg++){ + for_each_sg(sglist, sg, nents, filled) { sg->dma_length = sg->length; sg->dma_address = virt_to_phys(sba_sg_address(sg)); } @@ -1501,7 +1501,7 @@ void sba_unmap_sg (struct device *dev, s while (nents && sglist->dma_length) { sba_unmap_single(dev, sglist->dma_address, sglist->dma_length, dir); - sglist++; + sglist = sg_next(sglist); nents--; } diff -puN arch/ia64/sn/pci/pci_dma.c~git-block arch/ia64/sn/pci/pci_dma.c --- a/arch/ia64/sn/pci/pci_dma.c~git-block +++ a/arch/ia64/sn/pci/pci_dma.c @@ -218,16 +218,17 @@ EXPORT_SYMBOL(sn_dma_unmap_single); * * Unmap a set of streaming mode DMA translations. */ -void sn_dma_unmap_sg(struct device *dev, struct scatterlist *sg, +void sn_dma_unmap_sg(struct device *dev, struct scatterlist *sgl, int nhwentries, int direction) { int i; struct pci_dev *pdev = to_pci_dev(dev); struct sn_pcibus_provider *provider = SN_PCIDEV_BUSPROVIDER(pdev); + struct scatterlist *sg; BUG_ON(dev->bus != &pci_bus_type); - for (i = 0; i < nhwentries; i++, sg++) { + for_each_sg(sgl, sg, nhwentries, i) { provider->dma_unmap(pdev, sg->dma_address, direction); sg->dma_address = (dma_addr_t) NULL; sg->dma_length = 0; @@ -244,11 +245,11 @@ EXPORT_SYMBOL(sn_dma_unmap_sg); * * Maps each entry of @sg for DMA. */ -int sn_dma_map_sg(struct device *dev, struct scatterlist *sg, int nhwentries, +int sn_dma_map_sg(struct device *dev, struct scatterlist *sgl, int nhwentries, int direction) { unsigned long phys_addr; - struct scatterlist *saved_sg = sg; + struct scatterlist *saved_sg = sgl, *sg; struct pci_dev *pdev = to_pci_dev(dev); struct sn_pcibus_provider *provider = SN_PCIDEV_BUSPROVIDER(pdev); int i; @@ -258,7 +259,7 @@ int sn_dma_map_sg(struct device *dev, st /* * Setup a DMA address for each entry in the scatterlist. 
*/ - for (i = 0; i < nhwentries; i++, sg++) { + for_each_sg(sgl, sg, nhwentries, i) { phys_addr = SG_ENT_PHYS_ADDRESS(sg); sg->dma_address = provider->dma_map(pdev, phys_addr, sg->length, diff -puN arch/powerpc/kernel/dma_64.c~git-block arch/powerpc/kernel/dma_64.c --- a/arch/powerpc/kernel/dma_64.c~git-block +++ a/arch/powerpc/kernel/dma_64.c @@ -154,12 +154,13 @@ static void dma_direct_unmap_single(stru { } -static int dma_direct_map_sg(struct device *dev, struct scatterlist *sg, +static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction direction) { + struct scatterlist *sg; int i; - for (i = 0; i < nents; i++, sg++) { + for_each_sg(sgl, sg, nents, i) { sg->dma_address = (page_to_phys(sg->page) + sg->offset) | dma_direct_offset; sg->dma_length = sg->length; diff -puN arch/powerpc/kernel/ibmebus.c~git-block arch/powerpc/kernel/ibmebus.c --- a/arch/powerpc/kernel/ibmebus.c~git-block +++ a/arch/powerpc/kernel/ibmebus.c @@ -87,15 +87,16 @@ static void ibmebus_unmap_single(struct } static int ibmebus_map_sg(struct device *dev, - struct scatterlist *sg, + struct scatterlist *sgl, int nents, enum dma_data_direction direction) { + struct scatterlist *sg; int i; - for (i = 0; i < nents; i++) { - sg[i].dma_address = (dma_addr_t)page_address(sg[i].page) - + sg[i].offset; - sg[i].dma_length = sg[i].length; + for_each_sg(sgl, sg, nents, i) { + sg->dma_address = (dma_addr_t)page_address(sg->page) + + sg->offset; + sg->dma_length = sg->length; } return nents; diff -puN arch/powerpc/kernel/iommu.c~git-block arch/powerpc/kernel/iommu.c --- a/arch/powerpc/kernel/iommu.c~git-block +++ a/arch/powerpc/kernel/iommu.c @@ -277,7 +277,7 @@ int iommu_map_sg(struct iommu_table *tbl dma_addr_t dma_next = 0, dma_addr; unsigned long flags; struct scatterlist *s, *outs, *segstart; - int outcount, incount; + int outcount, incount, i; unsigned long handle; BUG_ON(direction == DMA_NONE); @@ -297,7 +297,7 @@ int iommu_map_sg(struct iommu_table *tbl spin_lock_irqsave(&(tbl->it_lock), flags); - for (s = outs; nelems; nelems--, s++) { + for_each_sg(sglist, s, nelems, i) { unsigned long vaddr, npages, entry, slen; slen = s->length; @@ -341,7 +341,8 @@ int iommu_map_sg(struct iommu_table *tbl if (novmerge || (dma_addr != dma_next)) { /* Can't merge: create a new segment */ segstart = s; - outcount++; outs++; + outcount++; + outs = sg_next(outs); DBG(" can't merge, new segment.\n"); } else { outs->dma_length += s->length; @@ -374,7 +375,7 @@ int iommu_map_sg(struct iommu_table *tbl * next entry of the sglist if we didn't fill the list completely */ if (outcount < incount) { - outs++; + outs = sg_next(outs); outs->dma_address = DMA_ERROR_CODE; outs->dma_length = 0; } @@ -385,7 +386,7 @@ int iommu_map_sg(struct iommu_table *tbl return outcount; failure: - for (s = &sglist[0]; s <= outs; s++) { + for_each_sg(sglist, s, nelems, i) { if (s->dma_length != 0) { unsigned long vaddr, npages; @@ -395,6 +396,8 @@ int iommu_map_sg(struct iommu_table *tbl s->dma_address = DMA_ERROR_CODE; s->dma_length = 0; } + if (s == outs) + break; } spin_unlock_irqrestore(&(tbl->it_lock), flags); return 0; @@ -404,6 +407,7 @@ int iommu_map_sg(struct iommu_table *tbl void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist, int nelems, enum dma_data_direction direction) { + struct scatterlist *sg; unsigned long flags; BUG_ON(direction == DMA_NONE); @@ -413,15 +417,16 @@ void iommu_unmap_sg(struct iommu_table * spin_lock_irqsave(&(tbl->it_lock), flags); + sg = sglist; while (nelems--) { 
unsigned int npages; - dma_addr_t dma_handle = sglist->dma_address; + dma_addr_t dma_handle = sg->dma_address; - if (sglist->dma_length == 0) + if (sg->dma_length == 0) break; - npages = iommu_num_pages(dma_handle,sglist->dma_length); + npages = iommu_num_pages(dma_handle, sg->dma_length); __iommu_free(tbl, dma_handle, npages); - sglist++; + sg = sg_next(sg); } /* Flush/invalidate TLBs if necessary. As for iommu_free(), we diff -puN arch/powerpc/platforms/ps3/system-bus.c~git-block arch/powerpc/platforms/ps3/system-bus.c --- a/arch/powerpc/platforms/ps3/system-bus.c~git-block +++ a/arch/powerpc/platforms/ps3/system-bus.c @@ -617,17 +617,18 @@ static void ps3_unmap_single(struct devi } } -static int ps3_sb_map_sg(struct device *_dev, struct scatterlist *sg, int nents, - enum dma_data_direction direction) +static int ps3_sb_map_sg(struct device *_dev, struct scatterlist *sgl, + int nents, enum dma_data_direction direction) { #if defined(CONFIG_PS3_DYNAMIC_DMA) BUG_ON("do"); return -EPERM; #else struct ps3_system_bus_device *dev = ps3_dev_to_system_bus_dev(_dev); + struct scatterlist *sg; int i; - for (i = 0; i < nents; i++, sg++) { + for_each_sg(sgl, sg, nents, i) { int result = ps3_dma_map(dev->d_region, page_to_phys(sg->page) + sg->offset, sg->length, &sg->dma_address, 0); diff -puN arch/sparc/kernel/ioport.c~git-block arch/sparc/kernel/ioport.c --- a/arch/sparc/kernel/ioport.c~git-block +++ a/arch/sparc/kernel/ioport.c @@ -35,6 +35,7 @@ #include #include /* struct pci_dev */ #include +#include #include #include @@ -717,19 +718,19 @@ void pci_unmap_page(struct pci_dev *hwde * Device ownership issues as mentioned above for pci_map_single are * the same here. */ -int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg, int nents, +int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sgl, int nents, int direction) { + struct scatterlist *sg; int n; BUG_ON(direction == PCI_DMA_NONE); /* IIep is write-through, not flushing. */ - for (n = 0; n < nents; n++) { + for_each_sg(sgl, sg, nents, n) { BUG_ON(page_address(sg->page) == NULL); sg->dvma_address = virt_to_phys(page_address(sg->page)) + sg->offset; sg->dvma_length = sg->length; - sg++; } return nents; } @@ -738,19 +739,19 @@ int pci_map_sg(struct pci_dev *hwdev, st * Again, cpu read rules concerning calls here are the same as for * pci_unmap_single() above. */ -void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg, int nents, +void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sgl, int nents, int direction) { + struct scatterlist *sg; int n; BUG_ON(direction == PCI_DMA_NONE); if (direction != PCI_DMA_TODEVICE) { - for (n = 0; n < nents; n++) { + for_each_sg(sgl, sg, nents, n) { BUG_ON(page_address(sg->page) == NULL); mmu_inval_dma_area( (unsigned long) page_address(sg->page), (sg->length + PAGE_SIZE-1) & PAGE_MASK); - sg++; } } } @@ -789,34 +790,34 @@ void pci_dma_sync_single_for_device(stru * The same as pci_dma_sync_single_* but for a scatter-gather list, * same rules and usage. 
*/ -void pci_dma_sync_sg_for_cpu(struct pci_dev *hwdev, struct scatterlist *sg, int nents, int direction) +void pci_dma_sync_sg_for_cpu(struct pci_dev *hwdev, struct scatterlist *sgl, int nents, int direction) { + struct scatterlist *sg; int n; BUG_ON(direction == PCI_DMA_NONE); if (direction != PCI_DMA_TODEVICE) { - for (n = 0; n < nents; n++) { + for_each_sg(sgl, sg, nents, n) { BUG_ON(page_address(sg->page) == NULL); mmu_inval_dma_area( (unsigned long) page_address(sg->page), (sg->length + PAGE_SIZE-1) & PAGE_MASK); - sg++; } } } -void pci_dma_sync_sg_for_device(struct pci_dev *hwdev, struct scatterlist *sg, int nents, int direction) +void pci_dma_sync_sg_for_device(struct pci_dev *hwdev, struct scatterlist *sgl, int nents, int direction) { + struct scatterlist *sg; int n; BUG_ON(direction == PCI_DMA_NONE); if (direction != PCI_DMA_TODEVICE) { - for (n = 0; n < nents; n++) { + for_each_sg(sgl, sg, nents, n) { BUG_ON(page_address(sg->page) == NULL); mmu_inval_dma_area( (unsigned long) page_address(sg->page), (sg->length + PAGE_SIZE-1) & PAGE_MASK); - sg++; } } } diff -puN arch/sparc/mm/io-unit.c~git-block arch/sparc/mm/io-unit.c --- a/arch/sparc/mm/io-unit.c~git-block +++ a/arch/sparc/mm/io-unit.c @@ -11,8 +11,8 @@ #include #include /* pte_offset_map => kmap_atomic */ #include +#include -#include #include #include #include @@ -144,8 +144,9 @@ static void iounit_get_scsi_sgl(struct s spin_lock_irqsave(&iounit->lock, flags); while (sz != 0) { --sz; - sg[sz].dvma_address = iounit_get_area(iounit, (unsigned long)page_address(sg[sz].page) + sg[sz].offset, sg[sz].length); - sg[sz].dvma_length = sg[sz].length; + sg->dvma_address = iounit_get_area(iounit, (unsigned long)page_address(sg->page) + sg->offset, sg->length); + sg->dvma_length = sg->length; + sg = sg_next(sg); } spin_unlock_irqrestore(&iounit->lock, flags); } @@ -173,11 +174,12 @@ static void iounit_release_scsi_sgl(stru spin_lock_irqsave(&iounit->lock, flags); while (sz != 0) { --sz; - len = ((sg[sz].dvma_address & ~PAGE_MASK) + sg[sz].length + (PAGE_SIZE-1)) >> PAGE_SHIFT; - vaddr = (sg[sz].dvma_address - IOUNIT_DMA_BASE) >> PAGE_SHIFT; + len = ((sg->dvma_address & ~PAGE_MASK) + sg->length + (PAGE_SIZE-1)) >> PAGE_SHIFT; + vaddr = (sg->dvma_address - IOUNIT_DMA_BASE) >> PAGE_SHIFT; IOD(("iounit_release %08lx-%08lx\n", (long)vaddr, (long)len+vaddr)); for (len += vaddr; vaddr < len; vaddr++) clear_bit(vaddr, iounit->bmap); + sg = sg_next(sg); } spin_unlock_irqrestore(&iounit->lock, flags); } diff -puN arch/sparc/mm/iommu.c~git-block arch/sparc/mm/iommu.c --- a/arch/sparc/mm/iommu.c~git-block +++ a/arch/sparc/mm/iommu.c @@ -12,8 +12,8 @@ #include #include #include /* pte_offset_map => kmap_atomic */ +#include -#include #include #include #include @@ -240,7 +240,7 @@ static void iommu_get_scsi_sgl_noflush(s n = (sg->length + sg->offset + PAGE_SIZE-1) >> PAGE_SHIFT; sg->dvma_address = iommu_get_one(sg->page, n, sbus) + sg->offset; sg->dvma_length = (__u32) sg->length; - sg++; + sg = sg_next(sg); } } @@ -254,7 +254,7 @@ static void iommu_get_scsi_sgl_gflush(st n = (sg->length + sg->offset + PAGE_SIZE-1) >> PAGE_SHIFT; sg->dvma_address = iommu_get_one(sg->page, n, sbus) + sg->offset; sg->dvma_length = (__u32) sg->length; - sg++; + sg = sg_next(sg); } } @@ -285,7 +285,7 @@ static void iommu_get_scsi_sgl_pflush(st sg->dvma_address = iommu_get_one(sg->page, n, sbus) + sg->offset; sg->dvma_length = (__u32) sg->length; - sg++; + sg = sg_next(sg); } } @@ -325,7 +325,7 @@ static void iommu_release_scsi_sgl(struc n = (sg->length + sg->offset + 
PAGE_SIZE-1) >> PAGE_SHIFT; iommu_release_one(sg->dvma_address & PAGE_MASK, n, sbus); sg->dvma_address = 0x21212121; - sg++; + sg = sg_next(sg); } } diff -puN arch/sparc/mm/sun4c.c~git-block arch/sparc/mm/sun4c.c --- a/arch/sparc/mm/sun4c.c~git-block +++ a/arch/sparc/mm/sun4c.c @@ -17,8 +17,8 @@ #include #include #include +#include -#include #include #include #include @@ -1228,8 +1228,9 @@ static void sun4c_get_scsi_sgl(struct sc { while (sz != 0) { --sz; - sg[sz].dvma_address = (__u32)sun4c_lockarea(page_address(sg[sz].page) + sg[sz].offset, sg[sz].length); - sg[sz].dvma_length = sg[sz].length; + sg->dvma_address = (__u32)sun4c_lockarea(page_address(sg->page) + sg->offset, sg->length); + sg->dvma_length = sg->length; + sg = sg_next(sg); } } @@ -1244,7 +1245,8 @@ static void sun4c_release_scsi_sgl(struc { while (sz != 0) { --sz; - sun4c_unlockarea((char *)sg[sz].dvma_address, sg[sz].length); + sun4c_unlockarea((char *)sg->dvma_address, sg->length); + sg = sg_next(sg); } } diff -puN arch/sparc64/kernel/iommu.c~git-block arch/sparc64/kernel/iommu.c --- a/arch/sparc64/kernel/iommu.c~git-block +++ a/arch/sparc64/kernel/iommu.c @@ -10,6 +10,7 @@ #include #include #include +#include #ifdef CONFIG_PCI #include @@ -480,7 +481,7 @@ static inline void fill_sg(iopte_t *iopt unsigned long iopte_protection) { struct scatterlist *dma_sg = sg; - struct scatterlist *sg_end = sg + nelems; + struct scatterlist *sg_end = sg_last(sg, nelems); int i; for (i = 0; i < nused; i++) { @@ -515,7 +516,7 @@ static inline void fill_sg(iopte_t *iopt len -= (IO_PAGE_SIZE - (tmp & (IO_PAGE_SIZE - 1UL))); break; } - sg++; + sg = sg_next(sg); } pteval = iopte_protection | (pteval & IOPTE_PAGE); @@ -528,24 +529,24 @@ static inline void fill_sg(iopte_t *iopt } pteval = (pteval & IOPTE_PAGE) + len; - sg++; + sg = sg_next(sg); /* Skip over any tail mappings we've fully mapped, * adjusting pteval along the way. Stop when we * detect a page crossing event. 
*/ - while (sg < sg_end && + while (sg != sg_end && (pteval << (64 - IO_PAGE_SHIFT)) != 0UL && (pteval == SG_ENT_PHYS_ADDRESS(sg)) && ((pteval ^ (SG_ENT_PHYS_ADDRESS(sg) + sg->length - 1UL)) >> IO_PAGE_SHIFT) == 0UL) { pteval += sg->length; - sg++; + sg = sg_next(sg); } if ((pteval << (64 - IO_PAGE_SHIFT)) == 0UL) pteval = ~0UL; } while (dma_npages != 0); - dma_sg++; + dma_sg = sg_next(dma_sg); } } @@ -606,7 +607,7 @@ static int dma_4u_map_sg(struct device * sgtmp = sglist; while (used && sgtmp->dma_length) { sgtmp->dma_address += dma_base; - sgtmp++; + sgtmp = sg_next(sgtmp); used--; } used = nelems - used; @@ -642,6 +643,7 @@ static void dma_4u_unmap_sg(struct devic struct strbuf *strbuf; iopte_t *base; unsigned long flags, ctx, i, npages; + struct scatterlist *sg, *sgprv; u32 bus_addr; if (unlikely(direction == DMA_NONE)) { @@ -654,11 +656,14 @@ static void dma_4u_unmap_sg(struct devic bus_addr = sglist->dma_address & IO_PAGE_MASK; - for (i = 1; i < nelems; i++) - if (sglist[i].dma_length == 0) + sgprv = NULL; + for_each_sg(sglist, sg, nelems, i) { + if (sg->dma_length == 0) break; - i--; - npages = (IO_PAGE_ALIGN(sglist[i].dma_address + sglist[i].dma_length) - + sgprv = sg; + } + + npages = (IO_PAGE_ALIGN(sgprv->dma_address + sgprv->dma_length) - bus_addr) >> IO_PAGE_SHIFT; base = iommu->page_table + @@ -730,6 +735,7 @@ static void dma_4u_sync_sg_for_cpu(struc struct iommu *iommu; struct strbuf *strbuf; unsigned long flags, ctx, npages, i; + struct scatterlist *sg, *sgprv; u32 bus_addr; iommu = dev->archdata.iommu; @@ -753,11 +759,14 @@ static void dma_4u_sync_sg_for_cpu(struc /* Step 2: Kick data out of streaming buffers. */ bus_addr = sglist[0].dma_address & IO_PAGE_MASK; - for(i = 1; i < nelems; i++) - if (!sglist[i].dma_length) + sgprv = NULL; + for_each_sg(sglist, sg, nelems, i) { + if (sg->dma_length == 0) break; - i--; - npages = (IO_PAGE_ALIGN(sglist[i].dma_address + sglist[i].dma_length) + sgprv = sg; + } + + npages = (IO_PAGE_ALIGN(sgprv->dma_address + sgprv->dma_length) - bus_addr) >> IO_PAGE_SHIFT; strbuf_flush(strbuf, iommu, bus_addr, ctx, npages, direction); diff -puN arch/sparc64/kernel/pci_sun4v.c~git-block arch/sparc64/kernel/pci_sun4v.c --- a/arch/sparc64/kernel/pci_sun4v.c~git-block +++ a/arch/sparc64/kernel/pci_sun4v.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -373,7 +374,7 @@ static inline long fill_sg(long entry, s int nused, int nelems, unsigned long prot) { struct scatterlist *dma_sg = sg; - struct scatterlist *sg_end = sg + nelems; + struct scatterlist *sg_end = sg_last(sg, nelems); unsigned long flags; int i; @@ -413,7 +414,7 @@ static inline long fill_sg(long entry, s len -= (IO_PAGE_SIZE - (tmp & (IO_PAGE_SIZE - 1UL))); break; } - sg++; + sg = sg_next(sg); } pteval = (pteval & IOPTE_PAGE); @@ -431,24 +432,25 @@ static inline long fill_sg(long entry, s } pteval = (pteval & IOPTE_PAGE) + len; - sg++; + sg = sg_next(sg); /* Skip over any tail mappings we've fully mapped, * adjusting pteval along the way. Stop when we * detect a page crossing event. 
*/ - while (sg < sg_end && - (pteval << (64 - IO_PAGE_SHIFT)) != 0UL && + while ((pteval << (64 - IO_PAGE_SHIFT)) != 0UL && (pteval == SG_ENT_PHYS_ADDRESS(sg)) && ((pteval ^ (SG_ENT_PHYS_ADDRESS(sg) + sg->length - 1UL)) >> IO_PAGE_SHIFT) == 0UL) { pteval += sg->length; - sg++; + if (sg == sg_end) + break; + sg = sg_next(sg); } if ((pteval << (64 - IO_PAGE_SHIFT)) == 0UL) pteval = ~0UL; } while (dma_npages != 0); - dma_sg++; + dma_sg = sg_next(dma_sg); } if (unlikely(iommu_batch_end() < 0L)) @@ -510,7 +512,7 @@ static int dma_4v_map_sg(struct device * sgtmp = sglist; while (used && sgtmp->dma_length) { sgtmp->dma_address += dma_base; - sgtmp++; + sgtmp = sg_next(sgtmp); used--; } used = nelems - used; @@ -545,6 +547,7 @@ static void dma_4v_unmap_sg(struct devic struct pci_pbm_info *pbm; struct iommu *iommu; unsigned long flags, i, npages; + struct scatterlist *sg, *sgprv; long entry; u32 devhandle, bus_addr; @@ -558,12 +561,15 @@ static void dma_4v_unmap_sg(struct devic devhandle = pbm->devhandle; bus_addr = sglist->dma_address & IO_PAGE_MASK; - - for (i = 1; i < nelems; i++) - if (sglist[i].dma_length == 0) + sgprv = NULL; + for_each_sg(sglist, sg, nelems, i) { + if (sg->dma_length == 0) break; - i--; - npages = (IO_PAGE_ALIGN(sglist[i].dma_address + sglist[i].dma_length) - + + sgprv = sg; + } + + npages = (IO_PAGE_ALIGN(sgprv->dma_address + sgprv->dma_length) - bus_addr) >> IO_PAGE_SHIFT; entry = ((bus_addr - iommu->page_table_map_base) >> IO_PAGE_SHIFT); diff -puN arch/x86_64/kernel/pci-calgary.c~git-block arch/x86_64/kernel/pci-calgary.c --- a/arch/x86_64/kernel/pci-calgary.c~git-block +++ a/arch/x86_64/kernel/pci-calgary.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -384,31 +385,32 @@ static void calgary_unmap_sg(struct devi struct scatterlist *sglist, int nelems, int direction) { struct iommu_table *tbl = find_iommu_table(dev); + struct scatterlist *s; + int i; if (!translate_phb(to_pci_dev(dev))) return; - while (nelems--) { + for_each_sg(sglist, s, nelems, i) { unsigned int npages; - dma_addr_t dma = sglist->dma_address; - unsigned int dmalen = sglist->dma_length; + dma_addr_t dma = s->dma_address; + unsigned int dmalen = s->dma_length; if (dmalen == 0) break; npages = num_dma_pages(dma, dmalen); iommu_free(tbl, dma, npages); - sglist++; } } static int calgary_nontranslate_map_sg(struct device* dev, struct scatterlist *sg, int nelems, int direction) { + struct scatterlist *s; int i; - for (i = 0; i < nelems; i++ ) { - struct scatterlist *s = &sg[i]; + for_each_sg(sg, s, nelems, i) { BUG_ON(!s->page); s->dma_address = virt_to_bus(page_address(s->page) +s->offset); s->dma_length = s->length; @@ -420,6 +422,7 @@ static int calgary_map_sg(struct device int nelems, int direction) { struct iommu_table *tbl = find_iommu_table(dev); + struct scatterlist *s; unsigned long vaddr; unsigned int npages; unsigned long entry; @@ -428,8 +431,7 @@ static int calgary_map_sg(struct device if (!translate_phb(to_pci_dev(dev))) return calgary_nontranslate_map_sg(dev, sg, nelems, direction); - for (i = 0; i < nelems; i++ ) { - struct scatterlist *s = &sg[i]; + for_each_sg(sg, s, nelems, i) { BUG_ON(!s->page); vaddr = (unsigned long)page_address(s->page) + s->offset; @@ -454,9 +456,9 @@ static int calgary_map_sg(struct device return nelems; error: calgary_unmap_sg(dev, sg, nelems, direction); - for (i = 0; i < nelems; i++) { - sg[i].dma_address = bad_dma_address; - sg[i].dma_length = 0; + for_each_sg(sg, s, nelems, i) { + sg->dma_address = bad_dma_address; + 
sg->dma_length = 0; } return 0; } diff -puN arch/x86_64/kernel/pci-gart.c~git-block arch/x86_64/kernel/pci-gart.c --- a/arch/x86_64/kernel/pci-gart.c~git-block +++ a/arch/x86_64/kernel/pci-gart.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -278,10 +279,10 @@ static void gart_unmap_single(struct dev */ static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) { + struct scatterlist *s; int i; - for (i = 0; i < nents; i++) { - struct scatterlist *s = &sg[i]; + for_each_sg(sg, s, nents, i) { if (!s->dma_length || !s->length) break; gart_unmap_single(dev, s->dma_address, s->dma_length, dir); @@ -292,14 +293,14 @@ static void gart_unmap_sg(struct device static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, int nents, int dir) { + struct scatterlist *s; int i; #ifdef CONFIG_IOMMU_DEBUG printk(KERN_DEBUG "dma_map_sg overflow\n"); #endif - for (i = 0; i < nents; i++ ) { - struct scatterlist *s = &sg[i]; + for_each_sg(sg, s, nents, i) { unsigned long addr = page_to_phys(s->page) + s->offset; if (nonforced_iommu(dev, addr, s->length)) { addr = dma_map_area(dev, addr, s->length, dir); @@ -319,23 +320,23 @@ static int dma_map_sg_nonforce(struct de } /* Map multiple scatterlist entries continuous into the first. */ -static int __dma_map_cont(struct scatterlist *sg, int start, int stopat, +static int __dma_map_cont(struct scatterlist *start, int nelems, struct scatterlist *sout, unsigned long pages) { unsigned long iommu_start = alloc_iommu(pages); unsigned long iommu_page = iommu_start; + struct scatterlist *s; int i; if (iommu_start == -1) return -1; - - for (i = start; i < stopat; i++) { - struct scatterlist *s = &sg[i]; + + for_each_sg(start, s, nelems, i) { unsigned long pages, addr; unsigned long phys_addr = s->dma_address; - BUG_ON(i > start && s->offset); - if (i == start) { + BUG_ON(s != start && s->offset); + if (s == start) { *sout = *s; sout->dma_address = iommu_bus_base; sout->dma_address += iommu_page*PAGE_SIZE + s->offset; @@ -357,17 +358,17 @@ static int __dma_map_cont(struct scatter return 0; } -static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat, +static inline int dma_map_cont(struct scatterlist *start, int nelems, struct scatterlist *sout, unsigned long pages, int need) { - if (!need) { - BUG_ON(stopat - start != 1); - *sout = sg[start]; - sout->dma_length = sg[start].length; + if (!need) { + BUG_ON(nelems != 1); + *sout = *start; + sout->dma_length = start->length; return 0; - } - return __dma_map_cont(sg, start, stopat, sout, pages); + } + return __dma_map_cont(start, nelems, sout, pages); } /* @@ -381,6 +382,7 @@ int gart_map_sg(struct device *dev, stru int start; unsigned long pages = 0; int need = 0, nextneed; + struct scatterlist *s, *ps, *start_sg, *sgmap; if (nents == 0) return 0; @@ -390,8 +392,9 @@ int gart_map_sg(struct device *dev, stru out = 0; start = 0; - for (i = 0; i < nents; i++) { - struct scatterlist *s = &sg[i]; + start_sg = sgmap = sg; + ps = NULL; /* shut up gcc */ + for_each_sg(sg, s, nents, i) { dma_addr_t addr = page_to_phys(s->page) + s->offset; s->dma_address = addr; BUG_ON(s->length == 0); @@ -400,29 +403,31 @@ int gart_map_sg(struct device *dev, stru /* Handle the previous not yet processed entries */ if (i > start) { - struct scatterlist *ps = &sg[i-1]; /* Can only merge when the last chunk ends on a page boundary and the new one doesn't have an offset. 
*/ if (!iommu_merge || !nextneed || !need || s->offset || - (ps->offset + ps->length) % PAGE_SIZE) { - if (dma_map_cont(sg, start, i, sg+out, pages, - need) < 0) + (ps->offset + ps->length) % PAGE_SIZE) { + if (dma_map_cont(start_sg, i - start, sgmap, + pages, need) < 0) goto error; out++; + sgmap = sg_next(sgmap); pages = 0; - start = i; + start = i; + start_sg = s; } } need = nextneed; pages += to_pages(s->offset, s->length); + ps = s; } - if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0) + if (dma_map_cont(start_sg, i - start, sgmap, pages, need) < 0) goto error; out++; flush_gart(); - if (out < nents) - sg[out].dma_length = 0; + if (out < nents) + ps->dma_length = 0; return out; error: @@ -437,8 +442,8 @@ error: if (panic_on_overflow) panic("dma_map_sg: overflow on %lu pages\n", pages); iommu_full(dev, pages << PAGE_SHIFT, dir); - for (i = 0; i < nents; i++) - sg[i].dma_address = bad_dma_address; + for_each_sg(sg, s, nents, i) + s->dma_address = bad_dma_address; return 0; } diff -puN arch/x86_64/kernel/pci-nommu.c~git-block arch/x86_64/kernel/pci-nommu.c --- a/arch/x86_64/kernel/pci-nommu.c~git-block +++ a/arch/x86_64/kernel/pci-nommu.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -57,10 +58,10 @@ static void nommu_unmap_single(struct de static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, int direction) { + struct scatterlist *s; int i; - for (i = 0; i < nents; i++ ) { - struct scatterlist *s = &sg[i]; + for_each_sg(sg, s, nents, i) { BUG_ON(!s->page); s->dma_address = virt_to_bus(page_address(s->page) +s->offset); if (!check_addr("map_sg", hwdev, s->dma_address, s->length)) diff -puN block/elevator.c~git-block block/elevator.c --- a/block/elevator.c~git-block +++ a/block/elevator.c @@ -712,6 +712,14 @@ struct request *elv_next_request(struct int ret; while ((rq = __elv_next_request(q)) != NULL) { + /* + * Kill the empty barrier place holder, the driver must + * not ever see it. + */ + if (blk_empty_barrier(rq)) { + end_queued_request(rq, 1); + continue; + } if (!(rq->cmd_flags & REQ_STARTED)) { /* * This is the first time the device driver @@ -751,15 +759,8 @@ struct request *elv_next_request(struct rq = NULL; break; } else if (ret == BLKPREP_KILL) { - int nr_bytes = rq->hard_nr_sectors << 9; - - if (!nr_bytes) - nr_bytes = rq->data_len; - - blkdev_dequeue_request(rq); rq->cmd_flags |= REQ_QUIET; - end_that_request_chunk(rq, 0, nr_bytes); - end_that_request_last(rq, 0); + end_queued_request(rq, 0); } else { printk(KERN_ERR "%s: bad return=%d\n", __FUNCTION__, ret); diff -puN block/ll_rw_blk.c~git-block block/ll_rw_blk.c --- a/block/ll_rw_blk.c~git-block +++ a/block/ll_rw_blk.c @@ -30,6 +30,7 @@ #include #include #include +#include /* * for max sense size @@ -301,23 +302,6 @@ int blk_queue_ordered(struct request_que EXPORT_SYMBOL(blk_queue_ordered); -/** - * blk_queue_issue_flush_fn - set function for issuing a flush - * @q: the request queue - * @iff: the function to be called issuing the flush - * - * Description: - * If a driver supports issuing a flush command, the support is notified - * to the block layer by defining it through this call. - * - **/ -void blk_queue_issue_flush_fn(struct request_queue *q, issue_flush_fn *iff) -{ - q->issue_flush_fn = iff; -} - -EXPORT_SYMBOL(blk_queue_issue_flush_fn); - /* * Cache flushing for ordered writes handling */ @@ -374,10 +358,12 @@ void blk_ordered_complete_seq(struct req /* * Okay, sequence complete. */ - rq = q->orig_bar_rq; - uptodate = q->orderr ? 
q->orderr : 1; + uptodate = 1; + if (q->orderr) + uptodate = q->orderr; q->ordseq = 0; + rq = q->orig_bar_rq; end_that_request_first(rq, uptodate, rq->hard_nr_sectors); end_that_request_last(rq, uptodate); @@ -443,7 +429,8 @@ static inline struct request *start_orde rq_init(q, rq); if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) rq->cmd_flags |= REQ_RW; - rq->cmd_flags |= q->ordered & QUEUE_ORDERED_FUA ? REQ_FUA : 0; + if (q->ordered & QUEUE_ORDERED_FUA) + rq->cmd_flags |= REQ_FUA; rq->elevator_private = NULL; rq->elevator_private2 = NULL; init_request_from_bio(rq, q->orig_bar_rq->bio); @@ -453,9 +440,12 @@ static inline struct request *start_orde * Queue ordered sequence. As we stack them at the head, we * need to queue in reverse order. Note that we rely on that * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs - * request gets inbetween ordered sequence. + * request gets inbetween ordered sequence. If this request is + * an empty barrier, we don't need to do a postflush ever since + * there will be no data written between the pre and post flush. + * Hence a single flush will suffice. */ - if (q->ordered & QUEUE_ORDERED_POSTFLUSH) + if ((q->ordered & QUEUE_ORDERED_POSTFLUSH) && !blk_empty_barrier(rq)) queue_flush(q, QUEUE_ORDERED_POSTFLUSH); else q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH; @@ -479,7 +469,7 @@ static inline struct request *start_orde int blk_do_ordered(struct request_queue *q, struct request **rqp) { struct request *rq = *rqp; - int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq); + const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq); if (!q->ordseq) { if (!is_barrier) @@ -538,6 +528,8 @@ static int flush_dry_bio_endio(struct bi if (bio->bi_size) return 1; + if (bio_empty_barrier(bio)) + return 0; /* Reset bio */ set_bit(BIO_UPTODATE, &bio->bi_flags); @@ -561,8 +553,17 @@ static int ordered_bio_endio(struct requ /* * Okay, this is the barrier request in progress, dry finish it. */ - if (error && !q->orderr) - q->orderr = error; + if (error) { + if (!q->orderr) + q->orderr = error; + + /* + * Let the original request inherit the error location, if + * this is an empty barrier request. 
+ */ + if (bio_empty_barrier(bio)) + q->orig_bar_rq->sector = bio->bi_sector; + } endio = bio->bi_end_io; private = bio->bi_private; @@ -1311,9 +1312,10 @@ static int blk_hw_contig_segment(struct * must make sure sg can hold rq->nr_phys_segments entries */ int blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sg) + struct scatterlist *sglist) { struct bio_vec *bvec, *bvprv; + struct scatterlist *next_sg, *sg; struct bio *bio; int nsegs, i, cluster; @@ -1324,6 +1326,7 @@ int blk_rq_map_sg(struct request_queue * * for each bio in rq */ bvprv = NULL; + sg = next_sg = &sglist[0]; rq_for_each_bio(bio, rq) { /* * for each segment in bio @@ -1332,7 +1335,7 @@ int blk_rq_map_sg(struct request_queue * int nbytes = bvec->bv_len; if (bvprv && cluster) { - if (sg[nsegs - 1].length + nbytes > q->max_segment_size) + if (sg->length + nbytes > q->max_segment_size) goto new_segment; if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) @@ -1340,14 +1343,15 @@ int blk_rq_map_sg(struct request_queue * if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) goto new_segment; - sg[nsegs - 1].length += nbytes; + sg->length += nbytes; } else { new_segment: - memset(&sg[nsegs],0,sizeof(struct scatterlist)); - sg[nsegs].page = bvec->bv_page; - sg[nsegs].length = nbytes; - sg[nsegs].offset = bvec->bv_offset; + sg = next_sg; + next_sg = sg_next(sg); + sg->page = bvec->bv_page; + sg->length = nbytes; + sg->offset = bvec->bv_offset; nsegs++; } bvprv = bvec; @@ -2641,6 +2645,16 @@ int blk_execute_rq(struct request_queue EXPORT_SYMBOL(blk_execute_rq); +static int bio_end_empty_barrier(struct bio *bio, unsigned int bytes_done, + int err) +{ + if (err) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + + complete(bio->bi_private); + return 0; +} + /** * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for @@ -2653,7 +2667,10 @@ EXPORT_SYMBOL(blk_execute_rq); */ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) { + DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q; + struct bio *bio; + int ret; if (bdev->bd_disk == NULL) return -ENXIO; @@ -2661,10 +2678,32 @@ int blkdev_issue_flush(struct block_devi q = bdev_get_queue(bdev); if (!q) return -ENXIO; - if (!q->issue_flush_fn) - return -EOPNOTSUPP; - return q->issue_flush_fn(q, bdev->bd_disk, error_sector); + bio = bio_alloc(GFP_KERNEL, 0); + if (!bio) + return -ENOMEM; + + bio->bi_end_io = bio_end_empty_barrier; + bio->bi_private = &wait; + bio->bi_bdev = bdev; + submit_bio(1 << BIO_RW_BARRIER, bio); + + wait_for_completion(&wait); + + /* + * The driver must store the error location in ->bi_sector, if + * it supports it. For non-stacked drivers, this should be copied + * from rq->sector. + */ + if (error_sector) + *error_sector = bio->bi_sector; + + ret = 0; + if (!bio_flagged(bio, BIO_UPTODATE)) + ret = -EIO; + + bio_put(bio); + return ret; } EXPORT_SYMBOL(blkdev_issue_flush); @@ -3038,7 +3077,7 @@ static inline void blk_partition_remap(s { struct block_device *bdev = bio->bi_bdev; - if (bdev != bdev->bd_contains) { + if (bio_sectors(bio) && bdev != bdev->bd_contains) { struct hd_struct *p = bdev->bd_part; const int rw = bio_data_dir(bio); @@ -3104,6 +3143,35 @@ static inline int should_fail_request(st #endif /* CONFIG_FAIL_MAKE_REQUEST */ +/* + * Check whether this bio extends beyond the end of the device. + */ +static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) +{ + sector_t maxsector; + + if (!nr_sectors) + return 0; + + /* Test device or partition size, when known. 
*/ + maxsector = bio->bi_bdev->bd_inode->i_size >> 9; + if (maxsector) { + sector_t sector = bio->bi_sector; + + if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { + /* + * This may well happen - the kernel calls bread() + * without checking the size of the device, e.g., when + * mounting a device. + */ + handle_bad_sector(bio); + return 1; + } + } + + return 0; +} + /** * generic_make_request: hand a buffer to its device driver for I/O * @bio: The bio describing the location in memory and on the device. @@ -3131,27 +3199,14 @@ static inline int should_fail_request(st static inline void __generic_make_request(struct bio *bio) { struct request_queue *q; - sector_t maxsector; sector_t old_sector; int ret, nr_sectors = bio_sectors(bio); dev_t old_dev; might_sleep(); - /* Test device or partition size, when known. */ - maxsector = bio->bi_bdev->bd_inode->i_size >> 9; - if (maxsector) { - sector_t sector = bio->bi_sector; - if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { - /* - * This may well happen - the kernel calls bread() - * without checking the size of the device, e.g., when - * mounting a device. - */ - handle_bad_sector(bio); - goto end_io; - } - } + if (bio_check_eod(bio, nr_sectors)) + goto end_io; /* * Resolve the mapping until finished. (drivers are @@ -3178,7 +3233,7 @@ end_io: break; } - if (unlikely(bio_sectors(bio) > q->max_hw_sectors)) { + if (unlikely(nr_sectors > q->max_hw_sectors)) { printk("bio too big device %s (%u > %u)\n", bdevname(bio->bi_bdev, b), bio_sectors(bio), @@ -3199,7 +3254,7 @@ end_io: blk_partition_remap(bio); if (old_sector != -1) - blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, + blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, old_sector); blk_add_trace_bio(q, bio, BLK_TA_QUEUE); @@ -3207,21 +3262,8 @@ end_io: old_sector = bio->bi_sector; old_dev = bio->bi_bdev->bd_dev; - maxsector = bio->bi_bdev->bd_inode->i_size >> 9; - if (maxsector) { - sector_t sector = bio->bi_sector; - - if (maxsector < nr_sectors || - maxsector - nr_sectors < sector) { - /* - * This may well happen - partitions are not - * checked to make sure they are within the size - * of the whole device. - */ - handle_bad_sector(bio); - goto end_io; - } - } + if (bio_check_eod(bio, nr_sectors)) + goto end_io; ret = q->make_request_fn(q, bio); } while (ret); @@ -3294,23 +3336,32 @@ void submit_bio(int rw, struct bio *bio) { int count = bio_sectors(bio); - BIO_BUG_ON(!bio->bi_size); - BIO_BUG_ON(!bio->bi_io_vec); bio->bi_rw |= rw; - if (rw & WRITE) { - count_vm_events(PGPGOUT, count); - } else { - task_io_account_read(bio->bi_size); - count_vm_events(PGPGIN, count); - } - if (unlikely(block_dump)) { - char b[BDEVNAME_SIZE]; - printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", - current->comm, current->pid, - (rw & WRITE) ? "WRITE" : "READ", - (unsigned long long)bio->bi_sector, - bdevname(bio->bi_bdev,b)); + /* + * If it's a regular read/write or a barrier with data attached, + * go through the normal accounting stuff before submission. + */ + if (!bio_empty_barrier(bio)) { + + BIO_BUG_ON(!bio->bi_size); + BIO_BUG_ON(!bio->bi_io_vec); + + if (rw & WRITE) { + count_vm_events(PGPGOUT, count); + } else { + task_io_account_read(bio->bi_size); + count_vm_events(PGPGIN, count); + } + + if (unlikely(block_dump)) { + char b[BDEVNAME_SIZE]; + printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", + current->comm, current->pid, + (rw & WRITE) ? 
"WRITE" : "READ", + (unsigned long long)bio->bi_sector, + bdevname(bio->bi_bdev,b)); + } } generic_make_request(bio); @@ -3428,6 +3479,14 @@ static int __end_that_request_first(stru while ((bio = req->bio) != NULL) { int nbytes; + /* + * For an empty barrier request, the low level driver must + * store a potential error location in ->sector. We pass + * that back up in ->bi_sector. + */ + if (blk_empty_barrier(req)) + bio->bi_sector = req->sector; + if (nr_bytes >= bio->bi_size) { req->bio = bio->bi_next; nbytes = bio->bi_size; @@ -3595,7 +3654,7 @@ static struct notifier_block __devinitda * Description: * Ends all I/O on a request. It does not handle partial completions, * unless the driver actually implements this in its completion callback - * through requeueing. Theh actual completion happens out-of-order, + * through requeueing. The actual completion happens out-of-order, * through a softirq handler. The user must have registered a completion * callback through blk_queue_softirq_done(). **/ @@ -3658,15 +3717,63 @@ void end_that_request_last(struct reques EXPORT_SYMBOL(end_that_request_last); -void end_request(struct request *req, int uptodate) +static inline void __end_request(struct request *rq, int uptodate, + unsigned int nr_bytes) { - if (!end_that_request_first(req, uptodate, req->hard_cur_sectors)) { - add_disk_randomness(req->rq_disk); - blkdev_dequeue_request(req); - end_that_request_last(req, uptodate); + if (!end_that_request_chunk(rq, uptodate, nr_bytes)) { + blkdev_dequeue_request(rq); + add_disk_randomness(rq->rq_disk); + end_that_request_last(rq, uptodate); } } +/** + * end_queued_request - end all I/O on a queued request + * @rq: the request being processed + * @uptodate: error value or 0/1 uptodate flag + * + * Description: + * Ends all I/O on a request, and removes it from the block layer queues. + * Not suitable for normal IO completion, unless the driver still has + * the request attached to the block layer. + * + **/ +void end_queued_request(struct request *rq, int uptodate) +{ + unsigned int nr_bytes; + + if (blk_fs_request(rq)) + nr_bytes = rq->hard_nr_sectors << 9; + else + nr_bytes = rq->data_len; + + __end_request(rq, uptodate, nr_bytes); +} +EXPORT_SYMBOL(end_queued_request); + +/** + * end_request - end I/O on the current segment of the request + * @rq: the request being processed + * @uptodate: error value or 0/1 uptodate flag + * + * Description: + * Ends I/O on the current segment of a request. If that is the only + * remaining segment, the request is also completed and freed. + * + * This is a remnant of how older block drivers handled IO completions. + * Modern drivers typically end IO on the full request in one go, unless + * they have a residual value to account for. For that case this function + * isn't really useful, unless the residual just happens to be the + * full current segment. In other words, don't use this function in new + * code. Either use end_request_completely(), or the + * end_that_request_chunk() (along with end_that_request_last()) for + * partial completions. 
+ * + **/ +void end_request(struct request *req, int uptodate) +{ + __end_request(req, uptodate, req->hard_cur_sectors << 9); +} EXPORT_SYMBOL(end_request); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, @@ -3988,7 +4095,23 @@ static ssize_t queue_max_hw_sectors_show return queue_var_show(max_hw_sectors_kb, (page)); } +static ssize_t queue_max_segments_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->max_phys_segments, page); +} + +static ssize_t queue_max_segments_store(struct request_queue *q, + const char *page, size_t count) +{ + unsigned long segments; + ssize_t ret = queue_var_store(&segments, page, count); + spin_lock_irq(q->queue_lock); + q->max_phys_segments = segments; + spin_unlock_irq(q->queue_lock); + + return ret; +} static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .show = queue_requests_show, @@ -4012,6 +4135,12 @@ static struct queue_sysfs_entry queue_ma .show = queue_max_hw_sectors_show, }; +static struct queue_sysfs_entry queue_max_segments_entry = { + .attr = {.name = "max_segments", .mode = S_IRUGO | S_IWUSR }, + .show = queue_max_segments_show, + .store = queue_max_segments_store, +}; + static struct queue_sysfs_entry queue_iosched_entry = { .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR }, .show = elv_iosched_show, @@ -4023,6 +4152,7 @@ static struct attribute *default_attrs[] &queue_ra_entry.attr, &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, + &queue_max_segments_entry.attr, &queue_iosched_entry.attr, NULL, }; diff -puN crypto/digest.c~git-block crypto/digest.c --- a/crypto/digest.c~git-block +++ a/crypto/digest.c @@ -77,7 +77,7 @@ static int update2(struct hash_desc *des if (!nbytes) break; - sg = sg_next(sg); + sg = scatterwalk_sg_next(sg); } return 0; diff -puN crypto/scatterwalk.c~git-block crypto/scatterwalk.c --- a/crypto/scatterwalk.c~git-block +++ a/crypto/scatterwalk.c @@ -70,7 +70,7 @@ static void scatterwalk_pagedone(struct walk->offset += PAGE_SIZE - 1; walk->offset &= PAGE_MASK; if (walk->offset >= walk->sg->offset + walk->sg->length) - scatterwalk_start(walk, sg_next(walk->sg)); + scatterwalk_start(walk, scatterwalk_sg_next(walk->sg)); } } diff -puN crypto/scatterwalk.h~git-block crypto/scatterwalk.h --- a/crypto/scatterwalk.h~git-block +++ a/crypto/scatterwalk.h @@ -20,7 +20,7 @@ #include "internal.h" -static inline struct scatterlist *sg_next(struct scatterlist *sg) +static inline struct scatterlist *scatterwalk_sg_next(struct scatterlist *sg) { return (++sg)->length ? 
sg : (void *)sg->page; } diff -puN drivers/ata/libata-core.c~git-block drivers/ata/libata-core.c --- a/drivers/ata/libata-core.c~git-block +++ a/drivers/ata/libata-core.c @@ -1356,7 +1356,7 @@ static void ata_qc_complete_internal(str */ unsigned ata_exec_internal_sg(struct ata_device *dev, struct ata_taskfile *tf, const u8 *cdb, - int dma_dir, struct scatterlist *sg, + int dma_dir, struct scatterlist *sgl, unsigned int n_elem) { struct ata_link *link = dev->link; @@ -1415,11 +1415,12 @@ unsigned ata_exec_internal_sg(struct ata qc->dma_dir = dma_dir; if (dma_dir != DMA_NONE) { unsigned int i, buflen = 0; + struct scatterlist *sg; - for (i = 0; i < n_elem; i++) - buflen += sg[i].length; + for_each_sg(sgl, sg, n_elem, i) + buflen += sg->length; - ata_sg_init(qc, sg, n_elem); + ata_sg_init(qc, sgl, n_elem); qc->nbytes = buflen; } @@ -4196,7 +4197,7 @@ void ata_sg_clean(struct ata_queued_cmd if (qc->n_elem) dma_unmap_sg(ap->dev, sg, qc->n_elem, dir); /* restore last sg */ - sg[qc->orig_n_elem - 1].length += qc->pad_len; + sg_last(sg, qc->orig_n_elem)->length += qc->pad_len; if (pad_buf) { struct scatterlist *psg = &qc->pad_sgent; void *addr = kmap_atomic(psg->page, KM_IRQ0); @@ -4421,6 +4422,7 @@ void ata_sg_init_one(struct ata_queued_c qc->orig_n_elem = 1; qc->buf_virt = buf; qc->nbytes = buflen; + qc->cursg = qc->__sg; sg_init_one(&qc->sgent, buf, buflen); } @@ -4446,6 +4448,7 @@ void ata_sg_init(struct ata_queued_cmd * qc->__sg = sg; qc->n_elem = n_elem; qc->orig_n_elem = n_elem; + qc->cursg = qc->__sg; } /** @@ -4535,7 +4538,7 @@ static int ata_sg_setup(struct ata_queue { struct ata_port *ap = qc->ap; struct scatterlist *sg = qc->__sg; - struct scatterlist *lsg = &sg[qc->n_elem - 1]; + struct scatterlist *lsg = sg_last(qc->__sg, qc->n_elem); int n_elem, pre_n_elem, dir, trim_sg = 0; VPRINTK("ENTER, ata%u\n", ap->print_id); @@ -4699,7 +4702,6 @@ void ata_data_xfer_noirq(struct ata_devi static void ata_pio_sector(struct ata_queued_cmd *qc) { int do_write = (qc->tf.flags & ATA_TFLAG_WRITE); - struct scatterlist *sg = qc->__sg; struct ata_port *ap = qc->ap; struct page *page; unsigned int offset; @@ -4708,8 +4710,8 @@ static void ata_pio_sector(struct ata_qu if (qc->curbytes == qc->nbytes - qc->sect_size) ap->hsm_task_state = HSM_ST_LAST; - page = sg[qc->cursg].page; - offset = sg[qc->cursg].offset + qc->cursg_ofs; + page = qc->cursg->page; + offset = qc->cursg->offset + qc->cursg_ofs; /* get the current page and offset */ page = nth_page(page, (offset >> PAGE_SHIFT)); @@ -4737,8 +4739,8 @@ static void ata_pio_sector(struct ata_qu qc->curbytes += qc->sect_size; qc->cursg_ofs += qc->sect_size; - if (qc->cursg_ofs == (&sg[qc->cursg])->length) { - qc->cursg++; + if (qc->cursg_ofs == qc->cursg->length) { + qc->cursg = sg_next(qc->cursg); qc->cursg_ofs = 0; } } @@ -4824,16 +4826,18 @@ static void __atapi_pio_bytes(struct ata { int do_write = (qc->tf.flags & ATA_TFLAG_WRITE); struct scatterlist *sg = qc->__sg; + struct scatterlist *lsg = sg_last(qc->__sg, qc->n_elem); struct ata_port *ap = qc->ap; struct page *page; unsigned char *buf; unsigned int offset, count; + int no_more_sg = 0; if (qc->curbytes + bytes >= qc->nbytes) ap->hsm_task_state = HSM_ST_LAST; next_sg: - if (unlikely(qc->cursg >= qc->n_elem)) { + if (unlikely(no_more_sg)) { /* * The end of qc->sg is reached and the device expects * more data to transfer. 
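 * (The recurring conversion in this series: code that used to step a
 * scatterlist array with sg++ or &sg[i] must use the list helpers
 * instead, since a chained sglist is no longer contiguous in memory.
 * A minimal sketch, assuming a DMA-mapped list sgl of n_elem entries:
 *
 *	struct scatterlist *s;
 *	unsigned int i, total = 0;
 *
 *	for_each_sg(sgl, s, n_elem, i)
 *		total += sg_dma_len(s);
 *
 * as in the ata_exec_internal_sg() hunk above, which sums sg->length
 * the same way.)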
In order not to overrun qc->sg @@ -4856,7 +4860,7 @@ next_sg: return; } - sg = &qc->__sg[qc->cursg]; + sg = qc->cursg; page = sg->page; offset = sg->offset + qc->cursg_ofs; @@ -4895,7 +4899,10 @@ next_sg: qc->cursg_ofs += count; if (qc->cursg_ofs == sg->length) { - qc->cursg++; + if (qc->cursg == lsg) + no_more_sg = 1; + + qc->cursg = sg_next(qc->cursg); qc->cursg_ofs = 0; } diff -puN drivers/ide/cris/ide-cris.c~git-block drivers/ide/cris/ide-cris.c --- a/drivers/ide/cris/ide-cris.c~git-block +++ a/drivers/ide/cris/ide-cris.c @@ -937,7 +937,8 @@ static int cris_ide_build_dmatable (ide_ /* group sequential buffers into one large buffer */ addr = page_to_phys(sg->page) + sg->offset; size = sg_dma_len(sg); - while (sg++, --i) { + while (--i) { + sg = sg_next(sg); if ((addr + size) != page_to_phys(sg->page) + sg->offset) break; size += sg_dma_len(sg); diff -puN drivers/ide/ide-disk.c~git-block drivers/ide/ide-disk.c --- a/drivers/ide/ide-disk.c~git-block +++ a/drivers/ide/ide-disk.c @@ -715,32 +715,6 @@ static void idedisk_prepare_flush(struct rq->buffer = rq->cmd; } -static int idedisk_issue_flush(struct request_queue *q, struct gendisk *disk, - sector_t *error_sector) -{ - ide_drive_t *drive = q->queuedata; - struct request *rq; - int ret; - - if (!drive->wcache) - return 0; - - rq = blk_get_request(q, WRITE, __GFP_WAIT); - - idedisk_prepare_flush(q, rq); - - ret = blk_execute_rq(q, disk, rq, 0); - - /* - * if we failed and caller wants error offset, get it - */ - if (ret && error_sector) - *error_sector = ide_get_error_location(drive, rq->cmd); - - blk_put_request(rq); - return ret; -} - /* * This is tightly woven into the driver->do_special can not touch. * DON'T do it again until a total personality rewrite is committed. @@ -780,7 +754,6 @@ static void update_ordered(ide_drive_t * struct hd_driveid *id = drive->id; unsigned ordered = QUEUE_ORDERED_NONE; prepare_flush_fn *prep_fn = NULL; - issue_flush_fn *issue_fn = NULL; if (drive->wcache) { unsigned long long capacity; @@ -804,13 +777,11 @@ static void update_ordered(ide_drive_t * if (barrier) { ordered = QUEUE_ORDERED_DRAIN_FLUSH; prep_fn = idedisk_prepare_flush; - issue_fn = idedisk_issue_flush; } } else ordered = QUEUE_ORDERED_DRAIN; blk_queue_ordered(drive->queue, ordered, prep_fn); - blk_queue_issue_flush_fn(drive->queue, issue_fn); } static int write_cache(ide_drive_t *drive, int arg) diff -puN drivers/ide/ide-dma.c~git-block drivers/ide/ide-dma.c --- a/drivers/ide/ide-dma.c~git-block +++ a/drivers/ide/ide-dma.c @@ -280,7 +280,7 @@ int ide_build_dmatable (ide_drive_t *dri } } - sg++; + sg = sg_next(sg); i--; } diff -puN drivers/ide/ide-io.c~git-block drivers/ide/ide-io.c --- a/drivers/ide/ide-io.c~git-block +++ a/drivers/ide/ide-io.c @@ -881,7 +881,8 @@ void ide_init_sg_cmd(ide_drive_t *drive, ide_hwif_t *hwif = drive->hwif; hwif->nsect = hwif->nleft = rq->nr_sectors; - hwif->cursg = hwif->cursg_ofs = 0; + hwif->cursg_ofs = 0; + hwif->cursg = NULL; } EXPORT_SYMBOL_GPL(ide_init_sg_cmd); diff -puN drivers/ide/ide-probe.c~git-block drivers/ide/ide-probe.c --- a/drivers/ide/ide-probe.c~git-block +++ a/drivers/ide/ide-probe.c @@ -1338,7 +1338,7 @@ static int hwif_init(ide_hwif_t *hwif) if (!hwif->sg_max_nents) hwif->sg_max_nents = PRD_ENTRIES; - hwif->sg_table = kmalloc(sizeof(struct scatterlist)*hwif->sg_max_nents, + hwif->sg_table = kzalloc(sizeof(struct scatterlist)*hwif->sg_max_nents, GFP_KERNEL); if (!hwif->sg_table) { printk(KERN_ERR "%s: unable to allocate SG table.\n", hwif->name); diff -puN drivers/ide/ide-taskfile.c~git-block 
drivers/ide/ide-taskfile.c --- a/drivers/ide/ide-taskfile.c~git-block +++ a/drivers/ide/ide-taskfile.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -263,6 +264,7 @@ static void ide_pio_sector(ide_drive_t * { ide_hwif_t *hwif = drive->hwif; struct scatterlist *sg = hwif->sg_table; + struct scatterlist *cursg = hwif->cursg; struct page *page; #ifdef CONFIG_HIGHMEM unsigned long flags; @@ -270,8 +272,14 @@ static void ide_pio_sector(ide_drive_t * unsigned int offset; u8 *buf; - page = sg[hwif->cursg].page; - offset = sg[hwif->cursg].offset + hwif->cursg_ofs * SECTOR_SIZE; + cursg = hwif->cursg; + if (!cursg) { + cursg = sg; + hwif->cursg = sg; + } + + page = cursg->page; + offset = cursg->offset + hwif->cursg_ofs * SECTOR_SIZE; /* get the current page and offset */ page = nth_page(page, (offset >> PAGE_SHIFT)); @@ -285,8 +293,8 @@ static void ide_pio_sector(ide_drive_t * hwif->nleft--; hwif->cursg_ofs++; - if ((hwif->cursg_ofs * SECTOR_SIZE) == sg[hwif->cursg].length) { - hwif->cursg++; + if ((hwif->cursg_ofs * SECTOR_SIZE) == cursg->length) { + hwif->cursg = sg_next(hwif->cursg); hwif->cursg_ofs = 0; } @@ -367,6 +375,8 @@ static ide_startstop_t task_error(ide_dr static void task_end_request(ide_drive_t *drive, struct request *rq, u8 stat) { + HWIF(drive)->cursg = NULL; + if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) { ide_task_t *task = rq->special; diff -puN drivers/ide/mips/au1xxx-ide.c~git-block drivers/ide/mips/au1xxx-ide.c --- a/drivers/ide/mips/au1xxx-ide.c~git-block +++ a/drivers/ide/mips/au1xxx-ide.c @@ -296,7 +296,7 @@ static int auide_build_dmatable(ide_driv cur_addr += tc; cur_len -= tc; } - sg++; + sg = sg_next(sg); i--; } diff -puN drivers/ide/pci/sgiioc4.c~git-block drivers/ide/pci/sgiioc4.c --- a/drivers/ide/pci/sgiioc4.c~git-block +++ a/drivers/ide/pci/sgiioc4.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -523,7 +524,7 @@ sgiioc4_build_dma_table(ide_drive_t * dr } } - sg++; + sg = sg_next(sg); i--; } diff -puN drivers/ide/ppc/pmac.c~git-block drivers/ide/ppc/pmac.c --- a/drivers/ide/ppc/pmac.c~git-block +++ a/drivers/ide/ppc/pmac.c @@ -1533,7 +1533,7 @@ pmac_ide_build_dmatable(ide_drive_t *dri cur_len -= tc; ++table; } - sg++; + sg = sg_next(sg); i--; } diff -puN drivers/infiniband/hw/ipath/ipath_dma.c~git-block drivers/infiniband/hw/ipath/ipath_dma.c --- a/drivers/infiniband/hw/ipath/ipath_dma.c~git-block +++ a/drivers/infiniband/hw/ipath/ipath_dma.c @@ -30,6 +30,7 @@ * SOFTWARE. 
*/ +#include #include #include "ipath_verbs.h" @@ -96,17 +97,18 @@ static void ipath_dma_unmap_page(struct BUG_ON(!valid_dma_direction(direction)); } -static int ipath_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction direction) +static int ipath_map_sg(struct ib_device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction direction) { + struct scatterlist *sg; u64 addr; int i; int ret = nents; BUG_ON(!valid_dma_direction(direction)); - for (i = 0; i < nents; i++) { - addr = (u64) page_address(sg[i].page); + for_each_sg(sgl, sg, nents, i) { + addr = (u64) page_address(sg->page); /* TODO: handle highmem pages */ if (!addr) { ret = 0; diff -puN drivers/infiniband/ulp/iser/iser_memory.c~git-block drivers/infiniband/ulp/iser/iser_memory.c --- a/drivers/infiniband/ulp/iser/iser_memory.c~git-block +++ a/drivers/infiniband/ulp/iser/iser_memory.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include "iscsi_iser.h" @@ -126,17 +125,19 @@ static int iser_start_rdma_unaligned_sg( if (cmd_dir == ISER_DIR_OUT) { /* copy the unaligned sg the buffer which is used for RDMA */ - struct scatterlist *sg = (struct scatterlist *)data->buf; + struct scatterlist *sgl = (struct scatterlist *)data->buf; + struct scatterlist *sg; int i; char *p, *from; - for (p = mem, i = 0; i < data->size; i++) { - from = kmap_atomic(sg[i].page, KM_USER0); + p = mem; + for_each_sg(sgl, sg, data->size, i) { + from = kmap_atomic(sg->page, KM_USER0); memcpy(p, - from + sg[i].offset, - sg[i].length); + from + sg->offset, + sg->length); kunmap_atomic(from, KM_USER0); - p += sg[i].length; + p += sg->length; } } @@ -178,7 +179,7 @@ void iser_finalize_rdma_unaligned_sg(str if (cmd_dir == ISER_DIR_IN) { char *mem; - struct scatterlist *sg; + struct scatterlist *sgl, *sg; unsigned char *p, *to; unsigned int sg_size; int i; @@ -186,16 +187,17 @@ void iser_finalize_rdma_unaligned_sg(str /* copy back read RDMA to unaligned sg */ mem = mem_copy->copy_buf; - sg = (struct scatterlist *)iser_ctask->data[ISER_DIR_IN].buf; + sgl = (struct scatterlist *)iser_ctask->data[ISER_DIR_IN].buf; sg_size = iser_ctask->data[ISER_DIR_IN].size; - for (p = mem, i = 0; i < sg_size; i++){ - to = kmap_atomic(sg[i].page, KM_SOFTIRQ0); - memcpy(to + sg[i].offset, + p = mem; + for_each_sg(sgl, sg, sg_size, i) { + to = kmap_atomic(sg->page, KM_SOFTIRQ0); + memcpy(to + sg->offset, p, - sg[i].length); + sg->length); kunmap_atomic(to, KM_SOFTIRQ0); - p += sg[i].length; + p += sg->length; } } @@ -226,7 +228,8 @@ static int iser_sg_to_page_vec(struct is struct iser_page_vec *page_vec, struct ib_device *ibdev) { - struct scatterlist *sg = (struct scatterlist *)data->buf; + struct scatterlist *sgl = (struct scatterlist *)data->buf; + struct scatterlist *sg; u64 first_addr, last_addr, page; int end_aligned; unsigned int cur_page = 0; @@ -234,24 +237,25 @@ static int iser_sg_to_page_vec(struct is int i; /* compute the offset of first element */ - page_vec->offset = (u64) sg[0].offset & ~MASK_4K; + page_vec->offset = (u64) sgl[0].offset & ~MASK_4K; - for (i = 0; i < data->dma_nents; i++) { - unsigned int dma_len = ib_sg_dma_len(ibdev, &sg[i]); + for_each_sg(sgl, sg, data->dma_nents, i) { + unsigned int dma_len = ib_sg_dma_len(ibdev, sg); total_sz += dma_len; - first_addr = ib_sg_dma_address(ibdev, &sg[i]); + first_addr = ib_sg_dma_address(ibdev, sg); last_addr = first_addr + dma_len; end_aligned = !(last_addr & ~MASK_4K); /* continue to collect page fragments till aligned or SG ends */ while (!end_aligned && 
(i + 1 < data->dma_nents)) { + sg = sg_next(sg); i++; - dma_len = ib_sg_dma_len(ibdev, &sg[i]); + dma_len = ib_sg_dma_len(ibdev, sg); total_sz += dma_len; - last_addr = ib_sg_dma_address(ibdev, &sg[i]) + dma_len; + last_addr = ib_sg_dma_address(ibdev, sg) + dma_len; end_aligned = !(last_addr & ~MASK_4K); } @@ -286,25 +290,26 @@ static int iser_sg_to_page_vec(struct is static unsigned int iser_data_buf_aligned_len(struct iser_data_buf *data, struct ib_device *ibdev) { - struct scatterlist *sg; + struct scatterlist *sgl, *sg; u64 end_addr, next_addr; int i, cnt; unsigned int ret_len = 0; - sg = (struct scatterlist *)data->buf; + sgl = (struct scatterlist *)data->buf; - for (cnt = 0, i = 0; i < data->dma_nents; i++, cnt++) { + cnt = 0; + for_each_sg(sgl, sg, data->dma_nents, i) { /* iser_dbg("Checking sg iobuf [%d]: phys=0x%08lX " "offset: %ld sz: %ld\n", i, - (unsigned long)page_to_phys(sg[i].page), - (unsigned long)sg[i].offset, - (unsigned long)sg[i].length); */ - end_addr = ib_sg_dma_address(ibdev, &sg[i]) + - ib_sg_dma_len(ibdev, &sg[i]); + (unsigned long)page_to_phys(sg->page), + (unsigned long)sg->offset, + (unsigned long)sg->length); */ + end_addr = ib_sg_dma_address(ibdev, sg) + + ib_sg_dma_len(ibdev, sg); /* iser_dbg("Checking sg iobuf end address " "0x%08lX\n", end_addr); */ if (i + 1 < data->dma_nents) { - next_addr = ib_sg_dma_address(ibdev, &sg[i+1]); + next_addr = ib_sg_dma_address(ibdev, sg_next(sg)); /* are i, i+1 fragments of the same page? */ if (end_addr == next_addr) continue; @@ -324,15 +329,16 @@ static unsigned int iser_data_buf_aligne static void iser_data_buf_dump(struct iser_data_buf *data, struct ib_device *ibdev) { - struct scatterlist *sg = (struct scatterlist *)data->buf; + struct scatterlist *sgl = (struct scatterlist *)data->buf; + struct scatterlist *sg; int i; - for (i = 0; i < data->dma_nents; i++) + for_each_sg(sgl, sg, data->dma_nents, i) iser_err("sg[%d] dma_addr:0x%lX page:0x%p " "off:0x%x sz:0x%x dma_len:0x%x\n", - i, (unsigned long)ib_sg_dma_address(ibdev, &sg[i]), - sg[i].page, sg[i].offset, - sg[i].length, ib_sg_dma_len(ibdev, &sg[i])); + i, (unsigned long)ib_sg_dma_address(ibdev, sg), + sg->page, sg->offset, + sg->length, ib_sg_dma_len(ibdev, sg)); } static void iser_dump_page_vec(struct iser_page_vec *page_vec) diff -puN drivers/md/dm-table.c~git-block drivers/md/dm-table.c --- a/drivers/md/dm-table.c~git-block +++ a/drivers/md/dm-table.c @@ -998,33 +998,6 @@ void dm_table_unplug_all(struct dm_table } } -int dm_table_flush_all(struct dm_table *t) -{ - struct list_head *d, *devices = dm_table_get_devices(t); - int ret = 0; - unsigned i; - - for (i = 0; i < t->num_targets; i++) - if (t->targets[i].type->flush) - t->targets[i].type->flush(&t->targets[i]); - - for (d = devices->next; d != devices; d = d->next) { - struct dm_dev *dd = list_entry(d, struct dm_dev, list); - struct request_queue *q = bdev_get_queue(dd->bdev); - int err; - - if (!q->issue_flush_fn) - err = -EOPNOTSUPP; - else - err = q->issue_flush_fn(q, dd->bdev->bd_disk, NULL); - - if (!ret) - ret = err; - } - - return ret; -} - struct mapped_device *dm_table_get_md(struct dm_table *t) { dm_get(t->md); @@ -1042,4 +1015,3 @@ EXPORT_SYMBOL(dm_table_get_md); EXPORT_SYMBOL(dm_table_put); EXPORT_SYMBOL(dm_table_get); EXPORT_SYMBOL(dm_table_unplug_all); -EXPORT_SYMBOL(dm_table_flush_all); diff -puN drivers/md/dm.c~git-block drivers/md/dm.c --- a/drivers/md/dm.c~git-block +++ a/drivers/md/dm.c @@ -844,21 +844,6 @@ static int dm_request(struct request_que return 0; } -static int 
dm_flush_all(struct request_queue *q, struct gendisk *disk, - sector_t *error_sector) -{ - struct mapped_device *md = q->queuedata; - struct dm_table *map = dm_get_table(md); - int ret = -ENXIO; - - if (map) { - ret = dm_table_flush_all(map); - dm_table_put(map); - } - - return ret; -} - static void dm_unplug_all(struct request_queue *q) { struct mapped_device *md = q->queuedata; @@ -1007,7 +992,6 @@ static struct mapped_device *alloc_dev(i blk_queue_make_request(md->queue, dm_request); blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); md->queue->unplug_fn = dm_unplug_all; - md->queue->issue_flush_fn = dm_flush_all; md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); if (!md->io_pool) diff -puN drivers/md/dm.h~git-block drivers/md/dm.h --- a/drivers/md/dm.h~git-block +++ a/drivers/md/dm.h @@ -111,7 +111,6 @@ void dm_table_postsuspend_targets(struct int dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); void dm_table_unplug_all(struct dm_table *t); -int dm_table_flush_all(struct dm_table *t); /*----------------------------------------------------------------- * A registry of target types. diff -puN drivers/md/linear.c~git-block drivers/md/linear.c --- a/drivers/md/linear.c~git-block +++ a/drivers/md/linear.c @@ -92,25 +92,6 @@ static void linear_unplug(struct request } } -static int linear_issue_flush(struct request_queue *q, struct gendisk *disk, - sector_t *error_sector) -{ - mddev_t *mddev = q->queuedata; - linear_conf_t *conf = mddev_to_conf(mddev); - int i, ret = 0; - - for (i=0; i < mddev->raid_disks && ret == 0; i++) { - struct block_device *bdev = conf->disks[i].rdev->bdev; - struct request_queue *r_queue = bdev_get_queue(bdev); - - if (!r_queue->issue_flush_fn) - ret = -EOPNOTSUPP; - else - ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); - } - return ret; -} - static int linear_congested(void *data, int bits) { mddev_t *mddev = data; @@ -279,7 +260,6 @@ static int linear_run (mddev_t *mddev) blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); mddev->queue->unplug_fn = linear_unplug; - mddev->queue->issue_flush_fn = linear_issue_flush; mddev->queue->backing_dev_info.congested_fn = linear_congested; mddev->queue->backing_dev_info.congested_data = mddev; return 0; diff -puN drivers/md/md.c~git-block drivers/md/md.c --- a/drivers/md/md.c~git-block +++ a/drivers/md/md.c @@ -3473,7 +3473,6 @@ static int do_md_stop(mddev_t * mddev, i mddev->pers->stop(mddev); mddev->queue->merge_bvec_fn = NULL; mddev->queue->unplug_fn = NULL; - mddev->queue->issue_flush_fn = NULL; mddev->queue->backing_dev_info.congested_fn = NULL; if (mddev->pers->sync_request) sysfs_remove_group(&mddev->kobj, &md_redundancy_group); diff -puN drivers/md/multipath.c~git-block drivers/md/multipath.c --- a/drivers/md/multipath.c~git-block +++ a/drivers/md/multipath.c @@ -199,35 +199,6 @@ static void multipath_status (struct seq seq_printf (seq, "]"); } -static int multipath_issue_flush(struct request_queue *q, struct gendisk *disk, - sector_t *error_sector) -{ - mddev_t *mddev = q->queuedata; - multipath_conf_t *conf = mddev_to_conf(mddev); - int i, ret = 0; - - rcu_read_lock(); - for (i=0; i<mddev->raid_disks && ret == 0; i++) { - mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags)) { - struct block_device *bdev = rdev->bdev; - struct request_queue *r_queue = bdev_get_queue(bdev); - - if (!r_queue->issue_flush_fn) - ret = -EOPNOTSUPP; - else { - atomic_inc(&rdev->nr_pending); -
rcu_read_unlock(); - ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, - error_sector); - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } - } - } - rcu_read_unlock(); - return ret; -} static int multipath_congested(void *data, int bits) { mddev_t *mddev = data; @@ -532,7 +503,6 @@ static int multipath_run (mddev_t *mddev mddev->array_size = mddev->size; mddev->queue->unplug_fn = multipath_unplug; - mddev->queue->issue_flush_fn = multipath_issue_flush; mddev->queue->backing_dev_info.congested_fn = multipath_congested; mddev->queue->backing_dev_info.congested_data = mddev; diff -puN drivers/md/raid0.c~git-block drivers/md/raid0.c --- a/drivers/md/raid0.c~git-block +++ a/drivers/md/raid0.c @@ -40,26 +40,6 @@ static void raid0_unplug(struct request_ } } -static int raid0_issue_flush(struct request_queue *q, struct gendisk *disk, - sector_t *error_sector) -{ - mddev_t *mddev = q->queuedata; - raid0_conf_t *conf = mddev_to_conf(mddev); - mdk_rdev_t **devlist = conf->strip_zone[0].dev; - int i, ret = 0; - - for (i=0; i<mddev->raid_disks && ret == 0; i++) { - struct block_device *bdev = devlist[i]->bdev; - struct request_queue *r_queue = bdev_get_queue(bdev); - - if (!r_queue->issue_flush_fn) - ret = -EOPNOTSUPP; - else - ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); - } - return ret; -} - static int raid0_congested(void *data, int bits) { mddev_t *mddev = data; @@ -250,7 +230,6 @@ static int create_strip_zones (mddev_t * mddev->queue->unplug_fn = raid0_unplug; - mddev->queue->issue_flush_fn = raid0_issue_flush; mddev->queue->backing_dev_info.congested_fn = raid0_congested; mddev->queue->backing_dev_info.congested_data = mddev; diff -puN drivers/md/raid1.c~git-block drivers/md/raid1.c --- a/drivers/md/raid1.c~git-block +++ a/drivers/md/raid1.c @@ -575,36 +575,6 @@ static void raid1_unplug(struct request_ md_wakeup_thread(mddev->thread); } -static int raid1_issue_flush(struct request_queue *q, struct gendisk *disk, - sector_t *error_sector) -{ - mddev_t *mddev = q->queuedata; - conf_t *conf = mddev_to_conf(mddev); - int i, ret = 0; - - rcu_read_lock(); - for (i=0; i<mddev->raid_disks && ret == 0; i++) { - mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags)) { - struct block_device *bdev = rdev->bdev; - struct request_queue *r_queue = bdev_get_queue(bdev); - - if (!r_queue->issue_flush_fn) - ret = -EOPNOTSUPP; - else { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, - error_sector); - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } - } - } - rcu_read_unlock(); - return ret; -} - static int raid1_congested(void *data, int bits) { mddev_t *mddev = data; @@ -2013,7 +1983,6 @@ static int run(mddev_t *mddev) mddev->array_size = mddev->size; mddev->queue->unplug_fn = raid1_unplug; - mddev->queue->issue_flush_fn = raid1_issue_flush; mddev->queue->backing_dev_info.congested_fn = raid1_congested; mddev->queue->backing_dev_info.congested_data = mddev; diff -puN drivers/md/raid10.c~git-block drivers/md/raid10.c --- a/drivers/md/raid10.c~git-block +++ a/drivers/md/raid10.c @@ -618,36 +618,6 @@ static void raid10_unplug(struct request md_wakeup_thread(mddev->thread); } -static int raid10_issue_flush(struct request_queue *q, struct gendisk *disk, - sector_t *error_sector) -{ - mddev_t *mddev = q->queuedata; - conf_t *conf = mddev_to_conf(mddev); - int i, ret = 0; - - rcu_read_lock(); - for (i=0; i<mddev->raid_disks && ret == 0; i++) { - mdk_rdev_t *rdev =
rcu_dereference(conf->mirrors[i].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags)) { - struct block_device *bdev = rdev->bdev; - struct request_queue *r_queue = bdev_get_queue(bdev); - - if (!r_queue->issue_flush_fn) - ret = -EOPNOTSUPP; - else { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, - error_sector); - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } - } - } - rcu_read_unlock(); - return ret; -} - static int raid10_congested(void *data, int bits) { mddev_t *mddev = data; @@ -2133,7 +2103,6 @@ static int run(mddev_t *mddev) mddev->resync_max_sectors = size << conf->chunk_shift; mddev->queue->unplug_fn = raid10_unplug; - mddev->queue->issue_flush_fn = raid10_issue_flush; mddev->queue->backing_dev_info.congested_fn = raid10_congested; mddev->queue->backing_dev_info.congested_data = mddev; diff -puN drivers/md/raid5.c~git-block drivers/md/raid5.c --- a/drivers/md/raid5.c~git-block +++ a/drivers/md/raid5.c @@ -3767,36 +3767,6 @@ static void raid5_unplug_device(struct r unplug_slaves(mddev); } -static int raid5_issue_flush(struct request_queue *q, struct gendisk *disk, - sector_t *error_sector) -{ - mddev_t *mddev = q->queuedata; - raid5_conf_t *conf = mddev_to_conf(mddev); - int i, ret = 0; - - rcu_read_lock(); - for (i=0; i<mddev->raid_disks && ret == 0; i++) { - mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags)) { - struct block_device *bdev = rdev->bdev; - struct request_queue *r_queue = bdev_get_queue(bdev); - - if (!r_queue->issue_flush_fn) - ret = -EOPNOTSUPP; - else { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, - error_sector); - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } - } - } - rcu_read_unlock(); - return ret; -} - static int raid5_congested(void *data, int bits) { mddev_t *mddev = data; @@ -4971,7 +4941,6 @@ static int run(mddev_t *mddev) mdname(mddev)); mddev->queue->unplug_fn = raid5_unplug_device; - mddev->queue->issue_flush_fn = raid5_issue_flush; mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_fn = raid5_congested; diff -puN drivers/message/fusion/mptscsih.c~git-block drivers/message/fusion/mptscsih.c --- a/drivers/message/fusion/mptscsih.c~git-block +++ a/drivers/message/fusion/mptscsih.c @@ -289,7 +289,7 @@ nextSGEset: for (ii=0; ii < (numSgeThisFrame-1); ii++) { thisxfer = sg_dma_len(sg); if (thisxfer == 0) { - sg ++; /* Get next SG element from the OS */ + sg = sg_next(sg); /* Get next SG element from the OS */ sg_done++; continue; } @@ -297,7 +297,7 @@ nextSGEset: v2 = sg_dma_address(sg); mptscsih_add_sge(psge, sgflags | thisxfer, v2); - sg++; /* Get next SG element from the OS */ + sg = sg_next(sg); /* Get next SG element from the OS */ psge += (sizeof(u32) + sizeof(dma_addr_t)); sgeOffset += (sizeof(u32) + sizeof(dma_addr_t)); sg_done++; @@ -318,7 +318,7 @@ nextSGEset: v2 = sg_dma_address(sg); mptscsih_add_sge(psge, sgflags | thisxfer, v2); /* - sg++; + sg = sg_next(sg); psge += (sizeof(u32) + sizeof(dma_addr_t)); */ sgeOffset += (sizeof(u32) + sizeof(dma_addr_t)); diff -puN drivers/message/i2o/i2o_block.c~git-block drivers/message/i2o/i2o_block.c --- a/drivers/message/i2o/i2o_block.c~git-block +++ a/drivers/message/i2o/i2o_block.c @@ -149,29 +149,6 @@ static int i2o_block_device_flush(struct }; /** - * i2o_block_issue_flush - device-flush interface for block-layer - * @queue: the request queue of the device which should be
flushed - * @disk: gendisk - * @error_sector: error offset - * - * Helper function to provide flush functionality to block-layer. - * - * Returns 0 on success or negative error code on failure. - */ - -static int i2o_block_issue_flush(struct request_queue * queue, struct gendisk *disk, - sector_t * error_sector) -{ - struct i2o_block_device *i2o_blk_dev = queue->queuedata; - int rc = -ENODEV; - - if (likely(i2o_blk_dev)) - rc = i2o_block_device_flush(i2o_blk_dev->i2o_dev); - - return rc; -} - -/** * i2o_block_device_mount - Mount (load) the media of device dev * @dev: I2O device which should receive the mount request * @media_id: Media Identifier @@ -1009,7 +986,6 @@ static struct i2o_block_device *i2o_bloc } blk_queue_prep_rq(queue, i2o_block_prep_req_fn); - blk_queue_issue_flush_fn(queue, i2o_block_issue_flush); gd->major = I2O_MAJOR; gd->queue = queue; diff -puN drivers/s390/scsi/zfcp_def.h~git-block drivers/s390/scsi/zfcp_def.h --- a/drivers/s390/scsi/zfcp_def.h~git-block +++ a/drivers/s390/scsi/zfcp_def.h @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include diff -puN drivers/s390/scsi/zfcp_qdio.c~git-block drivers/s390/scsi/zfcp_qdio.c --- a/drivers/s390/scsi/zfcp_qdio.c~git-block +++ a/drivers/s390/scsi/zfcp_qdio.c @@ -591,7 +591,7 @@ zfcp_qdio_sbals_from_segment(struct zfcp */ int zfcp_qdio_sbals_from_sg(struct zfcp_fsf_req *fsf_req, unsigned long sbtype, - struct scatterlist *sg, int sg_count, int max_sbals) + struct scatterlist *sgl, int sg_count, int max_sbals) { int sg_index; struct scatterlist *sg_segment; @@ -607,9 +607,7 @@ zfcp_qdio_sbals_from_sg(struct zfcp_fsf_ sbale->flags |= sbtype; /* process all segements of scatter-gather list */ - for (sg_index = 0, sg_segment = sg, bytes = 0; - sg_index < sg_count; - sg_index++, sg_segment++) { + for_each_sg(sgl, sg_segment, sg_count, sg_index) { retval = zfcp_qdio_sbals_from_segment( fsf_req, sbtype, diff -puN drivers/scsi/advansys.c~git-block drivers/scsi/advansys.c --- a/drivers/scsi/advansys.c~git-block +++ a/drivers/scsi/advansys.c @@ -4245,7 +4245,7 @@ static int asc_build_req(asc_board_t *bo */ int sgcnt; int use_sg; - struct scatterlist *slp; + struct scatterlist *slp, *sg; slp = (struct scatterlist *)scp->request_buffer; use_sg = dma_map_sg(boardp->dev, slp, scp->use_sg, @@ -4282,13 +4282,13 @@ static int asc_build_req(asc_board_t *bo /* * Convert scatter-gather list into ASC_SG_HEAD list. */ - for (sgcnt = 0; sgcnt < use_sg; sgcnt++, slp++) { + scsi_for_each_sg(scp, sg, use_sg, sgcnt) { asc_sg_head.sg_list[sgcnt].addr = - cpu_to_le32(sg_dma_address(slp)); + cpu_to_le32(sg_dma_address(sg)); asc_sg_head.sg_list[sgcnt].bytes = - cpu_to_le32(sg_dma_len(slp)); + cpu_to_le32(sg_dma_len(sg)); ASC_STATS_ADD(scp->device->host, sg_xfer, - ASC_CEILING(sg_dma_len(slp), 512)); + ASC_CEILING(sg_dma_len(sg), 512)); } } @@ -4577,7 +4577,7 @@ adv_get_sglist(asc_board_t *boardp, adv_ sg_block->sg_ptr = 0L; /* Last ADV_SG_BLOCK in list. 
*/ return ADV_SUCCESS; } - slp++; + slp = sg_next(slp); } sg_block->sg_cnt = NO_OF_SG_PER_BLOCK; prev_sg_block = sg_block; diff -puN drivers/scsi/aha1542.c~git-block drivers/scsi/aha1542.c --- a/drivers/scsi/aha1542.c~git-block +++ a/drivers/scsi/aha1542.c @@ -61,15 +61,15 @@ static void BAD_DMA(void *address, unsig } static void BAD_SG_DMA(Scsi_Cmnd * SCpnt, - struct scatterlist *sgpnt, + struct scatterlist *sgp, int nseg, int badseg) { printk(KERN_CRIT "sgpnt[%d:%d] page %p/0x%llx length %u\n", badseg, nseg, - page_address(sgpnt[badseg].page) + sgpnt[badseg].offset, - (unsigned long long)SCSI_SG_PA(&sgpnt[badseg]), - sgpnt[badseg].length); + page_address(sgp->page) + sgp->offset, + (unsigned long long)SCSI_SG_PA(sgp), + sgp->length); /* * Not safe to continue. @@ -691,7 +691,7 @@ static int aha1542_queuecommand(Scsi_Cmn memcpy(ccb[mbo].cdb, cmd, ccb[mbo].cdblen); if (SCpnt->use_sg) { - struct scatterlist *sgpnt; + struct scatterlist *sg; struct chain *cptr; #ifdef DEBUG unsigned char *ptr; @@ -699,23 +699,21 @@ static int aha1542_queuecommand(Scsi_Cmn int i; ccb[mbo].op = 2; /* SCSI Initiator Command w/scatter-gather */ SCpnt->host_scribble = kmalloc(512, GFP_KERNEL | GFP_DMA); - sgpnt = (struct scatterlist *) SCpnt->request_buffer; cptr = (struct chain *) SCpnt->host_scribble; if (cptr == NULL) { /* free the claimed mailbox slot */ HOSTDATA(SCpnt->device->host)->SCint[mbo] = NULL; return SCSI_MLQUEUE_HOST_BUSY; } - for (i = 0; i < SCpnt->use_sg; i++) { - if (sgpnt[i].length == 0 || SCpnt->use_sg > 16 || - (((int) sgpnt[i].offset) & 1) || (sgpnt[i].length & 1)) { + scsi_for_each_sg(SCpnt, sg, SCpnt->use_sg, i) { + if (sg->length == 0 || SCpnt->use_sg > 16 || + (((int) sg->offset) & 1) || (sg->length & 1)) { unsigned char *ptr; printk(KERN_CRIT "Bad segment list supplied to aha1542.c (%d, %d)\n", SCpnt->use_sg, i); - for (i = 0; i < SCpnt->use_sg; i++) { + scsi_for_each_sg(SCpnt, sg, SCpnt->use_sg, i) { printk(KERN_CRIT "%d: %p %d\n", i, - (page_address(sgpnt[i].page) + - sgpnt[i].offset), - sgpnt[i].length); + (page_address(sg->page) + + sg->offset), sg->length); }; printk(KERN_CRIT "cptr %x: ", (unsigned int) cptr); ptr = (unsigned char *) &cptr[i]; @@ -723,10 +721,10 @@ static int aha1542_queuecommand(Scsi_Cmn printk("%02x ", ptr[i]); panic("Foooooooood fight!"); }; - any2scsi(cptr[i].dataptr, SCSI_SG_PA(&sgpnt[i])); - if (SCSI_SG_PA(&sgpnt[i]) + sgpnt[i].length - 1 > ISA_DMA_THRESHOLD) - BAD_SG_DMA(SCpnt, sgpnt, SCpnt->use_sg, i); - any2scsi(cptr[i].datalen, sgpnt[i].length); + any2scsi(cptr[i].dataptr, SCSI_SG_PA(sg)); + if (SCSI_SG_PA(sg) + sg->length - 1 > ISA_DMA_THRESHOLD) + BAD_SG_DMA(SCpnt, sg, SCpnt->use_sg, i); + any2scsi(cptr[i].datalen, sg->length); }; any2scsi(ccb[mbo].datalen, SCpnt->use_sg * sizeof(struct chain)); any2scsi(ccb[mbo].dataptr, SCSI_BUF_PA(cptr)); diff -puN drivers/scsi/aic94xx/aic94xx_task.c~git-block drivers/scsi/aic94xx/aic94xx_task.c --- a/drivers/scsi/aic94xx/aic94xx_task.c~git-block +++ a/drivers/scsi/aic94xx/aic94xx_task.c @@ -94,7 +94,7 @@ static inline int asd_map_scatterlist(st res = -ENOMEM; goto err_unmap; } - for (sc = task->scatter, i = 0; i < num_sg; i++, sc++) { + for_each_sg(task->scatter, sc, num_sg, i) { struct sg_el *sg = &((struct sg_el *)ascb->sg_arr->vaddr)[i]; sg->bus_addr = cpu_to_le64((u64)sg_dma_address(sc)); @@ -103,7 +103,7 @@ static inline int asd_map_scatterlist(st sg->flags |= ASD_SG_EL_LIST_EOL; } - for (sc = task->scatter, i = 0; i < 2; i++, sc++) { + for_each_sg(task->scatter, sc, 2, i) { sg_arr[i].bus_addr = 
cpu_to_le64((u64)sg_dma_address(sc)); sg_arr[i].size = cpu_to_le32((u32)sg_dma_len(sc)); @@ -115,7 +115,7 @@ static inline int asd_map_scatterlist(st sg_arr[2].bus_addr=cpu_to_le64((u64)ascb->sg_arr->dma_handle); } else { int i; - for (sc = task->scatter, i = 0; i < num_sg; i++, sc++) { + for_each_sg(task->scatter, sc, num_sg, i) { sg_arr[i].bus_addr = cpu_to_le64((u64)sg_dma_address(sc)); sg_arr[i].size = cpu_to_le32((u32)sg_dma_len(sc)); diff -puN drivers/scsi/gdth.c~git-block drivers/scsi/gdth.c --- a/drivers/scsi/gdth.c~git-block +++ a/drivers/scsi/gdth.c @@ -2651,7 +2651,7 @@ static void gdth_copy_internal_data(int { ushort cpcount,i; ushort cpsum,cpnow; - struct scatterlist *sl; + struct scatterlist *sl, *sg; gdth_ha_str *ha; char *address; @@ -2660,29 +2660,30 @@ static void gdth_copy_internal_data(int if (scp->use_sg) { sl = (struct scatterlist *)scp->request_buffer; - for (i=0,cpsum=0; iuse_sg; ++i,++sl) { + cpsum = 0; + for_each_sg(sl, sg, scp->use_sg, i) { unsigned long flags; - cpnow = (ushort)sl->length; + cpnow = (ushort)sg->length; TRACE(("copy_internal() now %d sum %d count %d %d\n", cpnow,cpsum,cpcount,(ushort)scp->bufflen)); if (cpsum+cpnow > cpcount) cpnow = cpcount - cpsum; cpsum += cpnow; - if (!sl->page) { + if (!sg->page) { printk("GDT-HA %d: invalid sc/gt element in gdth_copy_internal_data()\n", hanum); return; } local_irq_save(flags); #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - address = kmap_atomic(sl->page, KM_BIO_SRC_IRQ) + sl->offset; + address = kmap_atomic(sg->page, KM_BIO_SRC_IRQ) + sg->offset; memcpy(address,buffer,cpnow); - flush_dcache_page(sl->page); + flush_dcache_page(sg->page); kunmap_atomic(address, KM_BIO_SRC_IRQ); #else - address = kmap_atomic(sl->page, KM_BH_IRQ) + sl->offset; + address = kmap_atomic(sg->page, KM_BH_IRQ) + sg->offset; memcpy(address,buffer,cpnow); - flush_dcache_page(sl->page); + flush_dcache_page(sg->page); kunmap_atomic(address, KM_BH_IRQ); #endif local_irq_restore(flags); @@ -2802,7 +2803,7 @@ static int gdth_fill_cache_cmd(int hanum { register gdth_ha_str *ha; register gdth_cmd_str *cmdp; - struct scatterlist *sl; + struct scatterlist *sl, *sg; ulong32 cnt, blockcnt; ulong64 no, blockno; dma_addr_t phys_addr; @@ -2908,25 +2909,25 @@ static int gdth_fill_cache_cmd(int hanum if (mode64) { cmdp->u.cache64.DestAddr= (ulong64)-1; cmdp->u.cache64.sg_canz = sgcnt; - for (i=0; iu.cache64.sg_lst[i].sg_ptr = sg_dma_address(sl); + for_each_sg(sl, sg, sgcnt, i) { + cmdp->u.cache64.sg_lst[i].sg_ptr = sg_dma_address(sg); #ifdef GDTH_DMA_STATISTICS if (cmdp->u.cache64.sg_lst[i].sg_ptr > (ulong64)0xffffffff) ha->dma64_cnt++; else ha->dma32_cnt++; #endif - cmdp->u.cache64.sg_lst[i].sg_len = sg_dma_len(sl); + cmdp->u.cache64.sg_lst[i].sg_len = sg_dma_len(sg); } } else { cmdp->u.cache.DestAddr= 0xffffffff; cmdp->u.cache.sg_canz = sgcnt; - for (i=0; iu.cache.sg_lst[i].sg_ptr = sg_dma_address(sl); + for_each_sg(sl, sg, sgcnt, i) { + cmdp->u.cache.sg_lst[i].sg_ptr = sg_dma_address(sg); #ifdef GDTH_DMA_STATISTICS ha->dma32_cnt++; #endif - cmdp->u.cache.sg_lst[i].sg_len = sg_dma_len(sl); + cmdp->u.cache.sg_lst[i].sg_len = sg_dma_len(sg); } } @@ -3012,7 +3013,7 @@ static int gdth_fill_raw_cmd(int hanum,S { register gdth_ha_str *ha; register gdth_cmd_str *cmdp; - struct scatterlist *sl; + struct scatterlist *sl, *sg; ushort i; dma_addr_t phys_addr, sense_paddr; int cmd_index, sgcnt, mode64; @@ -3115,25 +3116,25 @@ static int gdth_fill_raw_cmd(int hanum,S if (mode64) { cmdp->u.raw64.sdata = (ulong64)-1; cmdp->u.raw64.sg_ranz = sgcnt; - for (i=0; 
i<sgcnt; ++i,++sl) { - cmdp->u.raw64.sg_lst[i].sg_ptr = sg_dma_address(sl); + for_each_sg(sl, sg, sgcnt, i) { + cmdp->u.raw64.sg_lst[i].sg_ptr = sg_dma_address(sg); #ifdef GDTH_DMA_STATISTICS if (cmdp->u.raw64.sg_lst[i].sg_ptr > (ulong64)0xffffffff) ha->dma64_cnt++; else ha->dma32_cnt++; #endif - cmdp->u.raw64.sg_lst[i].sg_len = sg_dma_len(sl); + cmdp->u.raw64.sg_lst[i].sg_len = sg_dma_len(sg); } } else { cmdp->u.raw.sdata = 0xffffffff; cmdp->u.raw.sg_ranz = sgcnt; - for (i=0; i<sgcnt; ++i,++sl) { - cmdp->u.raw.sg_lst[i].sg_ptr = sg_dma_address(sl); + for_each_sg(sl, sg, sgcnt, i) { + cmdp->u.raw.sg_lst[i].sg_ptr = sg_dma_address(sg); #ifdef GDTH_DMA_STATISTICS ha->dma32_cnt++; #endif - cmdp->u.raw.sg_lst[i].sg_len = sg_dma_len(sl); + cmdp->u.raw.sg_lst[i].sg_len = sg_dma_len(sg); } } diff -puN drivers/scsi/ide-scsi.c~git-block drivers/scsi/ide-scsi.c --- a/drivers/scsi/ide-scsi.c~git-block +++ a/drivers/scsi/ide-scsi.c @@ -70,6 +70,7 @@ typedef struct idescsi_pc_s { u8 *buffer; /* Data buffer */ u8 *current_position; /* Pointer into the above buffer */ struct scatterlist *sg; /* Scatter gather table */ + struct scatterlist *last_sg; /* Last sg element */ int b_count; /* Bytes transferred from current entry */ struct scsi_cmnd *scsi_cmd; /* SCSI command */ void (*done)(struct scsi_cmnd *); /* Scsi completion routine */ @@ -175,11 +176,6 @@ static void idescsi_input_buffers (ide_d char *buf; while (bcount) { - if (pc->sg - (struct scatterlist *) pc->scsi_cmd->request_buffer > pc->scsi_cmd->use_sg) { - printk (KERN_ERR "ide-scsi: scatter gather table too small, discarding data\n"); - idescsi_discard_data (drive, bcount); - return; - } count = min(pc->sg->length - pc->b_count, bcount); if (PageHighMem(pc->sg->page)) { unsigned long flags; @@ -198,10 +194,17 @@ static void idescsi_input_buffers (ide_d } bcount -= count; pc->b_count += count; if (pc->b_count == pc->sg->length) { - pc->sg++; + if (pc->sg == pc->last_sg) + break; + pc->sg = sg_next(pc->sg); pc->b_count = 0; } } + + if (bcount) { + printk (KERN_ERR "ide-scsi: scatter gather table too small, discarding data\n"); + idescsi_discard_data (drive, bcount); + } } static void idescsi_output_buffers (ide_drive_t *drive, idescsi_pc_t *pc, unsigned int bcount) @@ -210,11 +213,6 @@ static void idescsi_output_buffers (ide_ char *buf; while (bcount) { - if (pc->sg - (struct scatterlist *) pc->scsi_cmd->request_buffer > pc->scsi_cmd->use_sg) { - printk (KERN_ERR "ide-scsi: scatter gather table too small, padding with zeros\n"); - idescsi_output_zeros (drive, bcount); - return; - } count = min(pc->sg->length - pc->b_count, bcount); if (PageHighMem(pc->sg->page)) { unsigned long flags; @@ -233,10 +231,17 @@ static void idescsi_output_buffers (ide_ } bcount -= count; pc->b_count += count; if (pc->b_count == pc->sg->length) { - pc->sg++; + if (pc->sg == pc->last_sg) + break; + pc->sg = sg_next(pc->sg); pc->b_count = 0; } } + + if (bcount) { + printk (KERN_ERR "ide-scsi: scatter gather table too small, padding with zeros\n"); + idescsi_output_zeros (drive, bcount); + } } /* @@ -908,9 +913,11 @@ static int idescsi_queue (struct scsi_cm if (cmd->use_sg) { pc->buffer = NULL; pc->sg = cmd->request_buffer; + pc->last_sg = sg_last(pc->sg, cmd->use_sg); } else { pc->buffer = cmd->request_buffer; pc->sg = NULL; + pc->last_sg = NULL; } pc->b_count = 0; pc->request_transfer = pc->buffer_size = cmd->request_bufflen; diff -puN drivers/scsi/ips.c~git-block drivers/scsi/ips.c --- a/drivers/scsi/ips.c~git-block +++ a/drivers/scsi/ips.c @@ -3251,7 +3251,7 @@ ips_done(ips_ha_t * ha, ips_scb_t * scb) */ if ((scb->breakup)
|| (scb->sg_break)) { struct scatterlist *sg; - int sg_dma_index, ips_sg_index = 0; + int i, sg_dma_index, ips_sg_index = 0; /* we had a data breakup */ scb->data_len = 0; @@ -3260,20 +3260,22 @@ ips_done(ips_ha_t * ha, ips_scb_t * scb) /* Spin forward to last dma chunk */ sg_dma_index = scb->breakup; + for (i = 0; i < scb->breakup; i++) + sg = sg_next(sg); /* Take care of possible partial on last chunk */ ips_fill_scb_sg_single(ha, - sg_dma_address(&sg[sg_dma_index]), + sg_dma_address(sg), scb, ips_sg_index++, - sg_dma_len(&sg[sg_dma_index])); + sg_dma_len(sg)); for (; sg_dma_index < scsi_sg_count(scb->scsi_cmd); - sg_dma_index++) { + sg_dma_index++, sg = sg_next(sg)) { if (ips_fill_scb_sg_single (ha, - sg_dma_address(&sg[sg_dma_index]), + sg_dma_address(sg), scb, ips_sg_index++, - sg_dma_len(&sg[sg_dma_index])) < 0) + sg_dma_len(sg)) < 0) break; } diff -puN drivers/scsi/iscsi_tcp.c~git-block drivers/scsi/iscsi_tcp.c --- a/drivers/scsi/iscsi_tcp.c~git-block +++ a/drivers/scsi/iscsi_tcp.c @@ -316,7 +316,7 @@ iscsi_solicit_data_init(struct iscsi_con sg = scsi_sglist(sc); r2t->sg = NULL; - for (i = 0; i < scsi_sg_count(sc); i++, sg += 1) { + for (i = 0; i < scsi_sg_count(sc); i++, sg = sg_next(sg)) { /* FIXME: prefetch ? */ if (sg_count + sg->length > r2t->data_offset) { int page_offset; @@ -332,7 +332,7 @@ iscsi_solicit_data_init(struct iscsi_con r2t->sendbuf.sg.length -= page_offset; /* xmit logic will continue with next one */ - r2t->sg = sg + 1; + r2t->sg = sg_next(sg);; break; } sg_count += sg->length; @@ -707,18 +707,21 @@ static int iscsi_scsi_data_in(struct isc sg = scsi_sglist(sc); if (tcp_ctask->data_offset) - for (i = 0; i < tcp_ctask->sg_count; i++) - offset -= sg[i].length; + for (i = 0; i < tcp_ctask->sg_count; i++, sg = sg_next(sg)) + offset -= sg->length; /* we've passed through partial sg*/ if (offset < 0) offset = 0; - for (i = tcp_ctask->sg_count; i < scsi_sg_count(sc); i++) { + for (i = 0, sg = scsi_sglist(sc); i < tcp_ctask->sg_count; i++) + sg = sg_next(sg); + + for (i = tcp_ctask->sg_count; i < scsi_sg_count(sc); i++, sg = sg_next(sg)) { char *dest; - dest = kmap_atomic(sg[i].page, KM_SOFTIRQ0); - rc = iscsi_ctask_copy(tcp_conn, ctask, dest + sg[i].offset, - sg[i].length, offset); + dest = kmap_atomic(sg->page, KM_SOFTIRQ0); + rc = iscsi_ctask_copy(tcp_conn, ctask, dest + sg->offset, + sg->length, offset); kunmap_atomic(dest, KM_SOFTIRQ0); if (rc == -EAGAIN) /* continue with the next SKB/PDU */ @@ -728,13 +731,13 @@ static int iscsi_scsi_data_in(struct isc if (!offset) crypto_hash_update( &tcp_conn->rx_hash, - &sg[i], sg[i].length); + sg, sg->length); else partial_sg_digest_update( &tcp_conn->rx_hash, - &sg[i], - sg[i].offset + offset, - sg[i].length - offset); + sg, + sg->offset + offset, + sg->length - offset); } offset = 0; tcp_ctask->sg_count++; @@ -746,9 +749,9 @@ static int iscsi_scsi_data_in(struct isc * data-in is complete, but buffer not... 
*/ partial_sg_digest_update(&tcp_conn->rx_hash, - &sg[i], - sg[i].offset, - sg[i].length-rc); + sg, + sg->offset, + sg->length - rc); rc = 0; break; } @@ -1246,7 +1249,7 @@ iscsi_solicit_data_cont(struct iscsi_con return; iscsi_buf_init_sg(&r2t->sendbuf, r2t->sg); - r2t->sg += 1; + r2t->sg = sg_next(r2t->sg); } static void iscsi_set_padding(struct iscsi_tcp_cmd_task *tcp_ctask, @@ -1378,8 +1381,8 @@ iscsi_send_cmd_hdr(struct iscsi_conn *co struct scatterlist *sg = scsi_sglist(sc); iscsi_buf_init_sg(&tcp_ctask->sendbuf, sg); - tcp_ctask->sg = sg + 1; - tcp_ctask->bad_sg = sg + scsi_sg_count(sc); + tcp_ctask->sg = sg_next(sg); + tcp_ctask->bad_sg = sg_last(sg, scsi_sg_count(sc)); debug_scsi("cmd [itt 0x%x total %d imm_data %d " "unsol count %d, unsol offset %d]\n", @@ -1509,7 +1512,7 @@ iscsi_send_data(struct iscsi_cmd_task *c buf_sent); if (!iscsi_buf_left(sendbuf) && *sg != tcp_ctask->bad_sg) { iscsi_buf_init_sg(sendbuf, *sg); - *sg = *sg + 1; + *sg = sg_next(*sg); } if (rc) diff -puN drivers/scsi/qla1280.c~git-block drivers/scsi/qla1280.c --- a/drivers/scsi/qla1280.c~git-block +++ a/drivers/scsi/qla1280.c @@ -2775,7 +2775,7 @@ qla1280_64bit_start_scsi(struct scsi_qla struct device_reg __iomem *reg = ha->iobase; struct scsi_cmnd *cmd = sp->cmd; cmd_a64_entry_t *pkt; - struct scatterlist *sg = NULL; + struct scatterlist *sg = NULL, *s; __le32 *dword_ptr; dma_addr_t dma_handle; int status = 0; @@ -2889,13 +2889,16 @@ qla1280_64bit_start_scsi(struct scsi_qla * Load data segments. */ if (seg_cnt) { /* If data transfer. */ + int remseg = seg_cnt; /* Setup packet address segment pointer. */ dword_ptr = (u32 *)&pkt->dseg_0_address; if (cmd->use_sg) { /* If scatter gather */ /* Load command entry data segments. */ - for (cnt = 0; cnt < 2 && seg_cnt; cnt++, seg_cnt--) { - dma_handle = sg_dma_address(sg); + for_each_sg(sg, s, seg_cnt, cnt) { + if (cnt == 2) + break; + dma_handle = sg_dma_address(s); #if defined(CONFIG_IA64_GENERIC) || defined(CONFIG_IA64_SGI_SN2) if (ha->flags.use_pci_vchannel) sn_pci_set_vchan(ha->pdev, @@ -2906,12 +2909,12 @@ qla1280_64bit_start_scsi(struct scsi_qla cpu_to_le32(pci_dma_lo32(dma_handle)); *dword_ptr++ = cpu_to_le32(pci_dma_hi32(dma_handle)); - *dword_ptr++ = cpu_to_le32(sg_dma_len(sg)); - sg++; + *dword_ptr++ = cpu_to_le32(sg_dma_len(s)); dprintk(3, "S/G Segment phys_addr=%x %x, len=0x%x\n", cpu_to_le32(pci_dma_hi32(dma_handle)), cpu_to_le32(pci_dma_lo32(dma_handle)), - cpu_to_le32(sg_dma_len(sg))); + cpu_to_le32(sg_dma_len(sg_next(s)))); + remseg--; } dprintk(5, "qla1280_64bit_start_scsi: Scatter/gather " "command packet data - b %i, t %i, l %i \n", @@ -2926,7 +2929,9 @@ qla1280_64bit_start_scsi(struct scsi_qla dprintk(3, "S/G Building Continuation...seg_cnt=0x%x " "remains\n", seg_cnt); - while (seg_cnt > 0) { + while (remseg > 0) { + /* Update sg start */ + sg = s; /* Adjust ring index. */ ha->req_ring_index++; if (ha->req_ring_index == REQUEST_ENTRY_CNT) { @@ -2952,9 +2957,10 @@ qla1280_64bit_start_scsi(struct scsi_qla (u32 *)&((struct cont_a64_entry *) pkt)->dseg_0_address; /* Load continuation entry data segments. 
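 * (Worked example of the remseg accounting introduced here: with
 * seg_cnt == 12 mapped segments, the command entry takes the first
 * two (its for_each_sg() breaks out at cnt == 2), and each
 * continuation entry takes up to five more (break at cnt == 5),
 * so remseg runs 12 - 2 = 10, then 10 - 5 = 5, then 5 - 5 = 0
 * across two continuation packets.)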
*/ - for (cnt = 0; cnt < 5 && seg_cnt; - cnt++, seg_cnt--) { - dma_handle = sg_dma_address(sg); + for_each_sg(sg, s, remseg, cnt) { + if (cnt == 5) + break; + dma_handle = sg_dma_address(s); #if defined(CONFIG_IA64_GENERIC) || defined(CONFIG_IA64_SGI_SN2) if (ha->flags.use_pci_vchannel) sn_pci_set_vchan(ha->pdev, @@ -2966,12 +2972,12 @@ qla1280_64bit_start_scsi(struct scsi_qla *dword_ptr++ = cpu_to_le32(pci_dma_hi32(dma_handle)); *dword_ptr++ = - cpu_to_le32(sg_dma_len(sg)); + cpu_to_le32(sg_dma_len(s)); dprintk(3, "S/G Segment Cont. phys_addr=%x %x, len=0x%x\n", cpu_to_le32(pci_dma_hi32(dma_handle)), cpu_to_le32(pci_dma_lo32(dma_handle)), - cpu_to_le32(sg_dma_len(sg))); - sg++; + cpu_to_le32(sg_dma_len(s))); + remseg--; } dprintk(5, "qla1280_64bit_start_scsi: " "continuation packet data - b %i, t " @@ -3062,7 +3068,7 @@ qla1280_32bit_start_scsi(struct scsi_qla struct device_reg __iomem *reg = ha->iobase; struct scsi_cmnd *cmd = sp->cmd; struct cmd_entry *pkt; - struct scatterlist *sg = NULL; + struct scatterlist *sg = NULL, *s; __le32 *dword_ptr; int status = 0; int cnt; @@ -3188,6 +3194,7 @@ qla1280_32bit_start_scsi(struct scsi_qla * Load data segments. */ if (seg_cnt) { + int remseg = seg_cnt; /* Setup packet address segment pointer. */ dword_ptr = &pkt->dseg_0_address; @@ -3196,22 +3203,25 @@ qla1280_32bit_start_scsi(struct scsi_qla qla1280_dump_buffer(1, (char *)sg, 4 * 16); /* Load command entry data segments. */ - for (cnt = 0; cnt < 4 && seg_cnt; cnt++, seg_cnt--) { + for_each_sg(sg, s, seg_cnt, cnt) { + if (cnt == 4) + break; *dword_ptr++ = - cpu_to_le32(pci_dma_lo32(sg_dma_address(sg))); - *dword_ptr++ = - cpu_to_le32(sg_dma_len(sg)); + cpu_to_le32(pci_dma_lo32(sg_dma_address(s))); + *dword_ptr++ = cpu_to_le32(sg_dma_len(s)); dprintk(3, "S/G Segment phys_addr=0x%lx, len=0x%x\n", - (pci_dma_lo32(sg_dma_address(sg))), - (sg_dma_len(sg))); - sg++; + (pci_dma_lo32(sg_dma_address(s))), + (sg_dma_len(s))); + remseg--; } /* * Build continuation packets. */ dprintk(3, "S/G Building Continuation" "...seg_cnt=0x%x remains\n", seg_cnt); - while (seg_cnt > 0) { + while (remseg > 0) { + /* Continue from end point */ + sg = s; /* Adjust ring index. */ ha->req_ring_index++; if (ha->req_ring_index == REQUEST_ENTRY_CNT) { @@ -3239,18 +3249,16 @@ qla1280_32bit_start_scsi(struct scsi_qla &((struct cont_entry *) pkt)->dseg_0_address; /* Load continuation entry data segments. */ - for (cnt = 0; cnt < 7 && seg_cnt; - cnt++, seg_cnt--) { + for_each_sg(sg, s, remseg, cnt) { *dword_ptr++ = - cpu_to_le32(pci_dma_lo32(sg_dma_address(sg))); + cpu_to_le32(pci_dma_lo32(sg_dma_address(s))); *dword_ptr++ = - cpu_to_le32(sg_dma_len(sg)); + cpu_to_le32(sg_dma_len(s)); dprintk(1, "S/G Segment Cont. 
phys_addr=0x%x, " "len=0x%x\n", - cpu_to_le32(pci_dma_lo32(sg_dma_address(sg))), - cpu_to_le32(sg_dma_len(sg))); - sg++; + cpu_to_le32(pci_dma_lo32(sg_dma_address(s))), + cpu_to_le32(sg_dma_len(s))); } dprintk(5, "qla1280_32bit_start_scsi: " "continuation packet data - " diff -puN drivers/scsi/qlogicpti.c~git-block drivers/scsi/qlogicpti.c --- a/drivers/scsi/qlogicpti.c~git-block +++ a/drivers/scsi/qlogicpti.c @@ -870,7 +870,7 @@ static inline int load_cmd(struct scsi_c struct qlogicpti *qpti, u_int in_ptr, u_int out_ptr) { struct dataseg *ds; - struct scatterlist *sg; + struct scatterlist *sg, *s; int i, n; if (Cmnd->use_sg) { @@ -886,11 +886,12 @@ static inline int load_cmd(struct scsi_c n = sg_count; if (n > 4) n = 4; - for (i = 0; i < n; i++, sg++) { - ds[i].d_base = sg_dma_address(sg); - ds[i].d_count = sg_dma_len(sg); + for_each_sg(sg, s, n, i) { + ds[i].d_base = sg_dma_address(s); + ds[i].d_count = sg_dma_len(g); } sg_count -= 4; + sg = s; while (sg_count > 0) { struct Continuation_Entry *cont; @@ -909,9 +910,9 @@ static inline int load_cmd(struct scsi_c n = sg_count; if (n > 7) n = 7; - for (i = 0; i < n; i++, sg++) { - ds[i].d_base = sg_dma_address(sg); - ds[i].d_count = sg_dma_len(sg); + for_each_sg(sg, s, n, i) { + ds[i].d_base = sg_dma_address(s); + ds[i].d_count = sg_dma_len(s); } sg_count -= n; } diff -puN drivers/scsi/scsi_debug.c~git-block drivers/scsi/scsi_debug.c --- a/drivers/scsi/scsi_debug.c~git-block +++ a/drivers/scsi/scsi_debug.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include "scsi.h" @@ -600,7 +601,7 @@ static int fill_from_dev_buffer(struct s int k, req_len, act_len, len, active; void * kaddr; void * kaddr_off; - struct scatterlist * sgpnt; + struct scatterlist * sg; if (0 == scp->request_bufflen) return 0; @@ -619,16 +620,16 @@ static int fill_from_dev_buffer(struct s scp->resid = req_len - act_len; return 0; } - sgpnt = (struct scatterlist *)scp->request_buffer; active = 1; - for (k = 0, req_len = 0, act_len = 0; k < scp->use_sg; ++k, ++sgpnt) { + req_len = act_len = 0; + scsi_for_each_sg(scp, sg, scp->use_sg, k) { if (active) { kaddr = (unsigned char *) - kmap_atomic(sgpnt->page, KM_USER0); + kmap_atomic(sg->page, KM_USER0); if (NULL == kaddr) return (DID_ERROR << 16); - kaddr_off = (unsigned char *)kaddr + sgpnt->offset; - len = sgpnt->length; + kaddr_off = (unsigned char *)kaddr + sg->offset; + len = sg->length; if ((req_len + len) > arr_len) { active = 0; len = arr_len - req_len; @@ -637,7 +638,7 @@ static int fill_from_dev_buffer(struct s kunmap_atomic(kaddr, KM_USER0); act_len += len; } - req_len += sgpnt->length; + req_len += sg->length; } if (scp->resid) scp->resid -= act_len; @@ -653,7 +654,7 @@ static int fetch_to_dev_buffer(struct sc int k, req_len, len, fin; void * kaddr; void * kaddr_off; - struct scatterlist * sgpnt; + struct scatterlist * sg; if (0 == scp->request_bufflen) return 0; @@ -668,13 +669,14 @@ static int fetch_to_dev_buffer(struct sc memcpy(arr, scp->request_buffer, len); return len; } - sgpnt = (struct scatterlist *)scp->request_buffer; - for (k = 0, req_len = 0, fin = 0; k < scp->use_sg; ++k, ++sgpnt) { - kaddr = (unsigned char *)kmap_atomic(sgpnt->page, KM_USER0); + sg = scsi_sglist(scp); + req_len = fin = 0; + for (k = 0; k < scp->use_sg; ++k, sg = sg_next(sg)) { + kaddr = (unsigned char *)kmap_atomic(sg->page, KM_USER0); if (NULL == kaddr) return -1; - kaddr_off = (unsigned char *)kaddr + sgpnt->offset; - len = sgpnt->length; + kaddr_off = (unsigned char *)kaddr + sg->offset; + len = sg->length; if 
((req_len + len) > max_arr_len) { len = max_arr_len - req_len; fin = 1; @@ -683,7 +685,7 @@ static int fetch_to_dev_buffer(struct sc kunmap_atomic(kaddr, KM_USER0); if (fin) return req_len + len; - req_len += sgpnt->length; + req_len += sg->length; } return req_len; } diff -puN drivers/scsi/scsi_lib.c~git-block drivers/scsi/scsi_lib.c --- a/drivers/scsi/scsi_lib.c~git-block +++ a/drivers/scsi/scsi_lib.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -35,33 +36,19 @@ struct scsi_host_sg_pool { size_t size; - char *name; + char *name; struct kmem_cache *slab; mempool_t *pool; }; -#if (SCSI_MAX_PHYS_SEGMENTS < 32) -#error SCSI_MAX_PHYS_SEGMENTS is too small -#endif - -#define SP(x) { x, "sgpool-" #x } +#define SP(x) { x, "sgpool-" #x } static struct scsi_host_sg_pool scsi_sg_pools[] = { SP(8), SP(16), SP(32), -#if (SCSI_MAX_PHYS_SEGMENTS > 32) SP(64), -#if (SCSI_MAX_PHYS_SEGMENTS > 64) SP(128), -#if (SCSI_MAX_PHYS_SEGMENTS > 128) - SP(256), -#if (SCSI_MAX_PHYS_SEGMENTS > 256) -#error SCSI_MAX_PHYS_SEGMENTS is too large -#endif -#endif -#endif -#endif -}; +}; #undef SP /* must match scsi_device_event enum in scsi_device.h */ @@ -712,56 +699,174 @@ static struct scsi_cmnd *scsi_end_reques return NULL; } -struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask) -{ - struct scsi_host_sg_pool *sgp; - struct scatterlist *sgl; +/* + * The maximum number of SG segments that we will put inside a scatterlist + * (unless chaining is used). Should ideally fit inside a single page, to + * avoid a higher order allocation. + */ +#define SCSI_MAX_SG_SEGMENTS 128 - BUG_ON(!cmd->use_sg); +/* + * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit + * is totally arbitrary, a setting of 2048 will get you at least 8mb ios. + */ +#define SCSI_MAX_SG_CHAIN_SEGMENTS 2048 - switch (cmd->use_sg) { +static inline unsigned int scsi_sgtable_index(unsigned short nents) +{ + unsigned int index; + + switch (nents) { case 1 ... 8: - cmd->sglist_len = 0; + index = 0; break; case 9 ... 16: - cmd->sglist_len = 1; + index = 1; break; case 17 ... 32: - cmd->sglist_len = 2; + index = 2; break; -#if (SCSI_MAX_PHYS_SEGMENTS > 32) case 33 ... 64: - cmd->sglist_len = 3; - break; -#if (SCSI_MAX_PHYS_SEGMENTS > 64) - case 65 ... 128: - cmd->sglist_len = 4; + index = 3; break; -#if (SCSI_MAX_PHYS_SEGMENTS > 128) - case 129 ... 256: - cmd->sglist_len = 5; + case 65 ... SCSI_MAX_SG_SEGMENTS: + index = 4; break; -#endif -#endif -#endif default: - return NULL; + printk(KERN_ERR "scsi: bad segment count=%d\n", nents); + BUG(); } - sgp = scsi_sg_pools + cmd->sglist_len; - sgl = mempool_alloc(sgp->pool, gfp_mask); - return sgl; + return index; +} + +struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask) +{ + struct scsi_host_sg_pool *sgp; + struct scatterlist *sgl, *prev, *ret; + unsigned int index; + int this, left; + + BUG_ON(!cmd->use_sg); + + left = cmd->use_sg; + ret = prev = NULL; + do { + this = left; + if (this > SCSI_MAX_SG_SEGMENTS) { + this = SCSI_MAX_SG_SEGMENTS - 1; + index = SG_MEMPOOL_NR - 1; + } else + index = scsi_sgtable_index(this); + + left -= this; + + sgp = scsi_sg_pools + index; + + sgl = mempool_alloc(sgp->pool, gfp_mask); + if (unlikely(!sgl)) + goto enomem; + + memset(sgl, 0, sizeof(*sgl) * sgp->size); + + /* + * first loop through, set initial index and return value + */ + if (!ret) { + cmd->sglist_len = index; + ret = sgl; + } + + /* + * chain previous sglist, if any. 
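+ * (A minimal sketch of the chaining primitive itself, assuming a
+ * full first table a[SCSI_MAX_SG_SEGMENTS] and a follow-on table b[]:
+ *
+ *	sg_chain(a, SCSI_MAX_SG_SEGMENTS, b);
+ *
+ * this turns the last slot of a[] into a link to b[] rather than a
+ * data segment, which is why 'this' is capped at
+ * SCSI_MAX_SG_SEGMENTS - 1 above whenever another allocation will
+ * follow.)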
we know the previous + * sglist must be the biggest one, or we would not have + * ended up doing another loop. + */ + if (prev) + sg_chain(prev, SCSI_MAX_SG_SEGMENTS, sgl); + + /* + * don't allow subsequent mempool allocs to sleep, it would + * violate the mempool principle. + */ + gfp_mask &= ~__GFP_WAIT; + gfp_mask |= __GFP_HIGH; + prev = sgl; + } while (left); + + /* + * ->use_sg may get modified after dma mapping has potentially + * shrunk the number of segments, so keep a copy of it for free. + */ + cmd->__use_sg = cmd->use_sg; + return ret; +enomem: + if (ret) { + /* + * Free entries chained off ret. Since we were trying to + * allocate another sglist, we know that all entries are of + * the max size. + */ + sgp = scsi_sg_pools + SG_MEMPOOL_NR - 1; + prev = ret; + ret = &ret[SCSI_MAX_SG_SEGMENTS - 1]; + + while ((sgl = sg_chain_ptr(ret)) != NULL) { + ret = &sgl[SCSI_MAX_SG_SEGMENTS - 1]; + mempool_free(sgl, sgp->pool); + } + + mempool_free(prev, sgp->pool); + } + return NULL; } EXPORT_SYMBOL(scsi_alloc_sgtable); -void scsi_free_sgtable(struct scatterlist *sgl, int index) +void scsi_free_sgtable(struct scsi_cmnd *cmd) { + struct scatterlist *sgl = cmd->request_buffer; struct scsi_host_sg_pool *sgp; - BUG_ON(index >= SG_MEMPOOL_NR); + BUG_ON(cmd->sglist_len >= SG_MEMPOOL_NR); - sgp = scsi_sg_pools + index; + /* + * if this is the biggest size sglist, check if we have + * chained parts we need to free + */ + if (cmd->__use_sg > SCSI_MAX_SG_SEGMENTS) { + unsigned short this, left; + struct scatterlist *next; + unsigned int index; + + left = cmd->__use_sg - (SCSI_MAX_SG_SEGMENTS - 1); + next = sg_chain_ptr(&sgl[SCSI_MAX_SG_SEGMENTS - 1]); + while (left && next) { + sgl = next; + this = left; + if (this > SCSI_MAX_SG_SEGMENTS) { + this = SCSI_MAX_SG_SEGMENTS - 1; + index = SG_MEMPOOL_NR - 1; + } else + index = scsi_sgtable_index(this); + + left -= this; + + sgp = scsi_sg_pools + index; + + if (left) + next = sg_chain_ptr(&sgl[sgp->size - 1]); + + mempool_free(sgl, sgp->pool); + } + + /* + * Restore original, will be freed below + */ + sgl = cmd->request_buffer; + } + + sgp = scsi_sg_pools + cmd->sglist_len; mempool_free(sgl, sgp->pool); } @@ -787,7 +892,7 @@ EXPORT_SYMBOL(scsi_free_sgtable); static void scsi_release_buffers(struct scsi_cmnd *cmd) { if (cmd->use_sg) - scsi_free_sgtable(cmd->request_buffer, cmd->sglist_len); + scsi_free_sgtable(cmd); /* * Zero these out. They now point to freed memory, and it is @@ -1002,7 +1107,6 @@ EXPORT_SYMBOL(scsi_io_completion); static int scsi_init_io(struct scsi_cmnd *cmd) { struct request *req = cmd->request; - struct scatterlist *sgpnt; int count; /* @@ -1015,14 +1119,13 @@ static int scsi_init_io(struct scsi_cmnd /* * If sg table allocation fails, requeue request later. */ - sgpnt = scsi_alloc_sgtable(cmd, GFP_ATOMIC); - if (unlikely(!sgpnt)) { + cmd->request_buffer = scsi_alloc_sgtable(cmd, GFP_ATOMIC); + if (unlikely(!cmd->request_buffer)) { scsi_unprep_request(req); return BLKPREP_DEFER; } req->buffer = NULL; - cmd->request_buffer = (char *) sgpnt; if (blk_pc_request(req)) cmd->request_bufflen = req->data_len; else @@ -1570,8 +1673,22 @@ struct request_queue *__scsi_alloc_queue if (!q) return NULL; + /* + * this limit is imposed by hardware restrictions + */ blk_queue_max_hw_segments(q, shost->sg_tablesize); - blk_queue_max_phys_segments(q, SCSI_MAX_PHYS_SEGMENTS); + + /* + * In the future, sg chaining support will be mandatory and this + * ifdef can then go away. Right now we don't have all archs + * converted, so better keep it safe. 
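Aside: in scsi_alloc_sgtable() above, every chunk except the last gives up one slot to the sg_chain() link, so the payload per full chunk is SCSI_MAX_SG_SEGMENTS - 1. A stand-alone model of that arithmetic (plain C, illustrative only, not kernel code):

#include <stdio.h>

#define SCSI_MAX_SG_SEGMENTS 128

/* how many mempool chunks does an 'nents'-entry table need? every chunk
 * except the final one sacrifices its last slot to the chain pointer,
 * exactly as in the allocation loop above */
static unsigned int chunks_needed(unsigned int nents)
{
    unsigned int chunks = 1;

    while (nents > SCSI_MAX_SG_SEGMENTS) {
        nents -= SCSI_MAX_SG_SEGMENTS - 1;
        chunks++;
    }
    return chunks;
}

int main(void)
{
    /* 300 entries -> 127 + 127 + 46, i.e. three chunks */
    printf("%u\n", chunks_needed(300));
    return 0;
}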
+ */ +#ifdef ARCH_HAS_SG_CHAIN + blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS); +#else + blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS); +#endif + blk_queue_max_sectors(q, shost->max_sectors); blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost)); blk_queue_segment_boundary(q, shost->dma_boundary); @@ -2312,18 +2429,19 @@ EXPORT_SYMBOL_GPL(scsi_target_unblock); * * Returns virtual address of the start of the mapped page */ -void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count, +void *scsi_kmap_atomic_sg(struct scatterlist *sgl, int sg_count, size_t *offset, size_t *len) { int i; size_t sg_len = 0, len_complete = 0; + struct scatterlist *sg; struct page *page; WARN_ON(!irqs_disabled()); - for (i = 0; i < sg_count; i++) { + for_each_sg(sgl, sg, sg_count, i) { len_complete = sg_len; /* Complete sg-entries */ - sg_len += sg[i].length; + sg_len += sg->length; if (sg_len > *offset) break; } @@ -2337,10 +2455,10 @@ void *scsi_kmap_atomic_sg(struct scatter } /* Offset starting from the beginning of first page in this sg-entry */ - *offset = *offset - len_complete + sg[i].offset; + *offset = *offset - len_complete + sg->offset; /* Assumption: contiguous pages can be accessed as "page + i" */ - page = nth_page(sg[i].page, (*offset >> PAGE_SHIFT)); + page = nth_page(sg->page, (*offset >> PAGE_SHIFT)); *offset &= ~PAGE_MASK; /* Bytes in this sg-entry from *offset to the end of the page */ diff -puN drivers/scsi/scsi_tgt_lib.c~git-block drivers/scsi/scsi_tgt_lib.c --- a/drivers/scsi/scsi_tgt_lib.c~git-block +++ a/drivers/scsi/scsi_tgt_lib.c @@ -332,7 +332,7 @@ static void scsi_tgt_cmd_done(struct scs scsi_tgt_uspace_send_status(cmd, tcmd->itn_id, tcmd->tag); if (cmd->request_buffer) - scsi_free_sgtable(cmd->request_buffer, cmd->sglist_len); + scsi_free_sgtable(cmd); queue_work(scsi_tgtd, &tcmd->work); } @@ -373,7 +373,7 @@ static int scsi_tgt_init_cmd(struct scsi } eprintk("cmd %p cnt %d\n", cmd, cmd->use_sg); - scsi_free_sgtable(cmd->request_buffer, cmd->sglist_len); + scsi_free_sgtable(cmd); return -EINVAL; } diff -puN drivers/scsi/sd.c~git-block drivers/scsi/sd.c --- a/drivers/scsi/sd.c~git-block +++ a/drivers/scsi/sd.c @@ -805,27 +805,6 @@ static int sd_sync_cache(struct scsi_dis return 0; } -static int sd_issue_flush(struct request_queue *q, struct gendisk *disk, - sector_t *error_sector) -{ - int ret = 0; - struct scsi_device *sdp = q->queuedata; - struct scsi_disk *sdkp; - - if (sdp->sdev_state != SDEV_RUNNING) - return -ENXIO; - - sdkp = scsi_disk_get_from_dev(&sdp->sdev_gendev); - - if (!sdkp) - return -ENODEV; - - if (sdkp->WCE) - ret = sd_sync_cache(sdkp); - scsi_disk_put(sdkp); - return ret; -} - static void sd_prepare_flush(struct request_queue *q, struct request *rq) { memset(rq->cmd, 0, sizeof(rq->cmd)); @@ -1675,8 +1654,6 @@ static int sd_probe(struct device *dev) sd_revalidate_disk(gd); - blk_queue_issue_flush_fn(sdp->request_queue, sd_issue_flush); - gd->driverfs_dev = &sdp->sdev_gendev; gd->flags = GENHD_FL_DRIVERFS; if (sdp->removable) diff -puN drivers/scsi/sg.c~git-block drivers/scsi/sg.c --- a/drivers/scsi/sg.c~git-block +++ a/drivers/scsi/sg.c @@ -1168,7 +1168,7 @@ sg_vma_nopage(struct vm_area_struct *vma sg = rsv_schp->buffer; sa = vma->vm_start; for (k = 0; (k < rsv_schp->k_use_sg) && (sa < vma->vm_end); - ++k, ++sg) { + ++k, sg = sg_next(sg)) { len = vma->vm_end - sa; len = (len < sg->length) ? 
len : sg->length; if (offset < len) { @@ -1212,7 +1212,7 @@ sg_mmap(struct file *filp, struct vm_are sa = vma->vm_start; sg = rsv_schp->buffer; for (k = 0; (k < rsv_schp->k_use_sg) && (sa < vma->vm_end); - ++k, ++sg) { + ++k, sg = sg_next(sg)) { len = vma->vm_end - sa; len = (len < sg->length) ? len : sg->length; sa += len; @@ -1866,7 +1866,7 @@ sg_build_indirect(Sg_scatter_hold * schp } for (k = 0, sg = schp->buffer, rem_sz = blk_size; (rem_sz > 0) && (k < mx_sc_elems); - ++k, rem_sz -= ret_sz, ++sg) { + ++k, rem_sz -= ret_sz, sg = sg_next(sg)) { num = (rem_sz > scatter_elem_sz_prev) ? scatter_elem_sz_prev : rem_sz; @@ -1939,7 +1939,7 @@ sg_write_xfer(Sg_request * srp) if (res) return res; - for (; p; ++sg, ksglen = sg->length, + for (; p; sg = sg_next(sg), ksglen = sg->length, p = page_address(sg->page)) { if (usglen <= 0) break; @@ -2018,7 +2018,7 @@ sg_remove_scat(Sg_scatter_hold * schp) int k; for (k = 0; (k < schp->k_use_sg) && sg->page; - ++k, ++sg) { + ++k, sg = sg_next(sg)) { SCSI_LOG_TIMEOUT(5, printk( "sg_remove_scat: k=%d, pg=0x%p, len=%d\n", k, sg->page, sg->length)); @@ -2071,7 +2071,7 @@ sg_read_xfer(Sg_request * srp) if (res) return res; - for (; p; ++sg, ksglen = sg->length, + for (; p; sg = sg_next(sg), ksglen = sg->length, p = page_address(sg->page)) { if (usglen <= 0) break; @@ -2118,7 +2118,7 @@ sg_read_oxfer(Sg_request * srp, char __u if ((!outp) || (num_read_xfer <= 0)) return 0; - for (k = 0; (k < schp->k_use_sg) && sg->page; ++k, ++sg) { + for (k = 0; (k < schp->k_use_sg) && sg->page; ++k, sg = sg_next(sg)) { num = sg->length; if (num > num_read_xfer) { if (__copy_to_user(outp, page_address(sg->page), @@ -2168,7 +2168,7 @@ sg_link_reserve(Sg_fd * sfp, Sg_request SCSI_LOG_TIMEOUT(4, printk("sg_link_reserve: size=%d\n", size)); rem = size; - for (k = 0; k < rsv_schp->k_use_sg; ++k, ++sg) { + for (k = 0; k < rsv_schp->k_use_sg; ++k, sg = sg_next(sg)) { num = sg->length; if (rem <= num) { sfp->save_scat_len = num; diff -puN drivers/usb/storage/alauda.c~git-block drivers/usb/storage/alauda.c --- a/drivers/usb/storage/alauda.c~git-block +++ a/drivers/usb/storage/alauda.c @@ -798,12 +798,13 @@ static int alauda_read_data(struct us_da { unsigned char *buffer; u16 lba, max_lba; - unsigned int page, len, index, offset; + unsigned int page, len, offset; unsigned int blockshift = MEDIA_INFO(us).blockshift; unsigned int pageshift = MEDIA_INFO(us).pageshift; unsigned int blocksize = MEDIA_INFO(us).blocksize; unsigned int pagesize = MEDIA_INFO(us).pagesize; unsigned int uzonesize = MEDIA_INFO(us).uzonesize; + struct scatterlist *sg; int result; /* @@ -827,7 +828,8 @@ static int alauda_read_data(struct us_da max_lba = MEDIA_INFO(us).capacity >> (blockshift + pageshift); result = USB_STOR_TRANSPORT_GOOD; - index = offset = 0; + offset = 0; + sg = NULL; while (sectors > 0) { unsigned int zone = lba / uzonesize; /* integer division */ @@ -873,7 +875,7 @@ static int alauda_read_data(struct us_da /* Store the data in the transfer buffer */ usb_stor_access_xfer_buf(buffer, len, us->srb, - &index, &offset, TO_XFER_BUF); + &sg, &offset, TO_XFER_BUF); page = 0; lba++; @@ -891,11 +893,12 @@ static int alauda_write_data(struct us_d unsigned int sectors) { unsigned char *buffer, *blockbuffer; - unsigned int page, len, index, offset; + unsigned int page, len, offset; unsigned int blockshift = MEDIA_INFO(us).blockshift; unsigned int pageshift = MEDIA_INFO(us).pageshift; unsigned int blocksize = MEDIA_INFO(us).blocksize; unsigned int pagesize = MEDIA_INFO(us).pagesize; + struct scatterlist *sg; 
u16 lba, max_lba; int result; @@ -929,7 +932,8 @@ static int alauda_write_data(struct us_d max_lba = MEDIA_INFO(us).capacity >> (pageshift + blockshift); result = USB_STOR_TRANSPORT_GOOD; - index = offset = 0; + offset = 0; + sg = NULL; while (sectors > 0) { /* Write as many sectors as possible in this block */ @@ -946,7 +950,7 @@ static int alauda_write_data(struct us_d /* Get the data from the transfer buffer */ usb_stor_access_xfer_buf(buffer, len, us->srb, - &index, &offset, FROM_XFER_BUF); + &sg, &offset, FROM_XFER_BUF); result = alauda_write_lba(us, lba, page, pages, buffer, blockbuffer); diff -puN drivers/usb/storage/datafab.c~git-block drivers/usb/storage/datafab.c --- a/drivers/usb/storage/datafab.c~git-block +++ a/drivers/usb/storage/datafab.c @@ -98,7 +98,8 @@ static int datafab_read_data(struct us_d unsigned char thistime; unsigned int totallen, alloclen; int len, result; - unsigned int sg_idx = 0, sg_offset = 0; + unsigned int sg_offset = 0; + struct scatterlist *sg = NULL; // we're working in LBA mode. according to the ATA spec, // we can support up to 28-bit addressing. I don't know if Datafab @@ -155,7 +156,7 @@ static int datafab_read_data(struct us_d // Store the data in the transfer buffer usb_stor_access_xfer_buf(buffer, len, us->srb, - &sg_idx, &sg_offset, TO_XFER_BUF); + &sg, &sg_offset, TO_XFER_BUF); sector += thistime; totallen -= len; @@ -181,7 +182,8 @@ static int datafab_write_data(struct us_ unsigned char thistime; unsigned int totallen, alloclen; int len, result; - unsigned int sg_idx = 0, sg_offset = 0; + unsigned int sg_offset = 0; + struct scatterlist *sg = NULL; // we're working in LBA mode. according to the ATA spec, // we can support up to 28-bit addressing. I don't know if Datafab @@ -217,7 +219,7 @@ static int datafab_write_data(struct us_ // Get the data from the transfer buffer usb_stor_access_xfer_buf(buffer, len, us->srb, - &sg_idx, &sg_offset, FROM_XFER_BUF); + &sg, &sg_offset, FROM_XFER_BUF); command[0] = 0; command[1] = thistime; diff -puN drivers/usb/storage/jumpshot.c~git-block drivers/usb/storage/jumpshot.c --- a/drivers/usb/storage/jumpshot.c~git-block +++ a/drivers/usb/storage/jumpshot.c @@ -119,7 +119,8 @@ static int jumpshot_read_data(struct us_ unsigned char thistime; unsigned int totallen, alloclen; int len, result; - unsigned int sg_idx = 0, sg_offset = 0; + unsigned int sg_offset = 0; + struct scatterlist *sg = NULL; // we're working in LBA mode. according to the ATA spec, // we can support up to 28-bit addressing. I don't know if Jumpshot @@ -170,7 +171,7 @@ static int jumpshot_read_data(struct us_ // Store the data in the transfer buffer usb_stor_access_xfer_buf(buffer, len, us->srb, - &sg_idx, &sg_offset, TO_XFER_BUF); + &sg, &sg_offset, TO_XFER_BUF); sector += thistime; totallen -= len; @@ -195,7 +196,8 @@ static int jumpshot_write_data(struct us unsigned char thistime; unsigned int totallen, alloclen; int len, result, waitcount; - unsigned int sg_idx = 0, sg_offset = 0; + unsigned int sg_offset = 0; + struct scatterlist *sg = NULL; // we're working in LBA mode. according to the ATA spec, // we can support up to 28-bit addressing. 
I don't know if Jumpshot @@ -225,7 +227,7 @@ static int jumpshot_write_data(struct us // Get the data from the transfer buffer usb_stor_access_xfer_buf(buffer, len, us->srb, - &sg_idx, &sg_offset, FROM_XFER_BUF); + &sg, &sg_offset, FROM_XFER_BUF); command[0] = 0; command[1] = thistime; diff -puN drivers/usb/storage/protocol.c~git-block drivers/usb/storage/protocol.c --- a/drivers/usb/storage/protocol.c~git-block +++ a/drivers/usb/storage/protocol.c @@ -157,7 +157,7 @@ void usb_stor_transparent_scsi_command(s * pick up from where this one left off. */ unsigned int usb_stor_access_xfer_buf(unsigned char *buffer, - unsigned int buflen, struct scsi_cmnd *srb, unsigned int *index, + unsigned int buflen, struct scsi_cmnd *srb, struct scatterlist **sgptr, unsigned int *offset, enum xfer_buf_dir dir) { unsigned int cnt; @@ -184,16 +184,17 @@ unsigned int usb_stor_access_xfer_buf(un * located in high memory -- then kmap() will map it to a temporary * position in the kernel's virtual address space. */ } else { - struct scatterlist *sg = - (struct scatterlist *) srb->request_buffer - + *index; + struct scatterlist *sg = *sgptr; + + if (!sg) + sg = (struct scatterlist *) srb->request_buffer; /* This loop handles a single s-g list entry, which may * include multiple pages. Find the initial page structure * and the starting offset within the page, and update * the *offset and *index values for the next loop. */ cnt = 0; - while (cnt < buflen && *index < srb->use_sg) { + while (cnt < buflen) { struct page *page = sg->page + ((sg->offset + *offset) >> PAGE_SHIFT); unsigned int poff = @@ -209,8 +210,7 @@ unsigned int usb_stor_access_xfer_buf(un /* Transfer continues to next s-g entry */ *offset = 0; - ++*index; - ++sg; + sg = sg_next(sg); } /* Transfer the data for all the pages in this @@ -234,6 +234,7 @@ unsigned int usb_stor_access_xfer_buf(un sglen -= plen; } } + *sgptr = sg; } /* Return the amount actually transferred */ @@ -245,9 +246,10 @@ unsigned int usb_stor_access_xfer_buf(un void usb_stor_set_xfer_buf(unsigned char *buffer, unsigned int buflen, struct scsi_cmnd *srb) { - unsigned int index = 0, offset = 0; + unsigned int offset = 0; + struct scatterlist *sg = NULL; - usb_stor_access_xfer_buf(buffer, buflen, srb, &index, &offset, + usb_stor_access_xfer_buf(buffer, buflen, srb, &sg, &offset, TO_XFER_BUF); if (buflen < srb->request_bufflen) srb->resid = srb->request_bufflen - buflen; diff -puN drivers/usb/storage/protocol.h~git-block drivers/usb/storage/protocol.h --- a/drivers/usb/storage/protocol.h~git-block +++ a/drivers/usb/storage/protocol.h @@ -52,7 +52,7 @@ extern void usb_stor_transparent_scsi_co enum xfer_buf_dir {TO_XFER_BUF, FROM_XFER_BUF}; extern unsigned int usb_stor_access_xfer_buf(unsigned char *buffer, - unsigned int buflen, struct scsi_cmnd *srb, unsigned int *index, + unsigned int buflen, struct scsi_cmnd *srb, struct scatterlist **, unsigned int *offset, enum xfer_buf_dir dir); extern void usb_stor_set_xfer_buf(unsigned char *buffer, diff -puN drivers/usb/storage/sddr09.c~git-block drivers/usb/storage/sddr09.c --- a/drivers/usb/storage/sddr09.c~git-block +++ a/drivers/usb/storage/sddr09.c @@ -705,7 +705,8 @@ sddr09_read_data(struct us_data *us, unsigned char *buffer; unsigned int lba, maxlba, pba; unsigned int page, pages; - unsigned int len, index, offset; + unsigned int len, offset; + struct scatterlist *sg; int result; // Figure out the initial LBA and page @@ -730,7 +731,8 @@ sddr09_read_data(struct us_data *us, // contiguous LBA's. Another exercise left to the student. 
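Aside: the usb-storage conversions above all follow one pattern: the integer index into a flat scatterlist array becomes a struct scatterlist pointer that the helper advances with sg_next() and hands back through a pointer-to-pointer, so the position survives across calls even when the list is chained. A toy model of that calling convention (plain C, hypothetical names, not driver code):

#include <stddef.h>
#include <stdio.h>

struct sg { const char *buf; size_t len; struct sg *next; };

static struct sg *sg_next_toy(struct sg *s) { return s->next; }

/* old style: integer index, assumes one flat array (breaks with chaining);
 * new style: the caller passes &sg and we resume wherever we stopped */
static size_t consume(struct sg **sgp, size_t *offset, size_t want)
{
    struct sg *s = *sgp;
    size_t done = 0;

    while (s && done < want) {
        size_t avail = s->len - *offset;
        size_t n = avail < want - done ? avail : want - done;

        done += n;
        *offset += n;
        if (*offset == s->len) {    /* entry exhausted, move on */
            s = sg_next_toy(s);
            *offset = 0;
        }
    }
    *sgp = s;    /* persist the position for the next call */
    return done;
}

int main(void)
{
    struct sg b = { "world", 5, NULL };
    struct sg a = { "hello", 5, &b };
    struct sg *pos = &a;
    size_t off = 0;

    printf("%zu ", consume(&pos, &off, 7));    /* spans both entries */
    printf("%zu\n", consume(&pos, &off, 3));   /* resumes inside 'b' */
    return 0;
}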
result = 0; - index = offset = 0; + offset = 0; + sg = NULL; while (sectors > 0) { @@ -777,7 +779,7 @@ sddr09_read_data(struct us_data *us, // Store the data in the transfer buffer usb_stor_access_xfer_buf(buffer, len, us->srb, - &index, &offset, TO_XFER_BUF); + &sg, &offset, TO_XFER_BUF); page = 0; lba++; @@ -931,7 +933,8 @@ sddr09_write_data(struct us_data *us, unsigned int pagelen, blocklen; unsigned char *blockbuffer; unsigned char *buffer; - unsigned int len, index, offset; + unsigned int len, offset; + struct scatterlist *sg; int result; // Figure out the initial LBA and page @@ -968,7 +971,8 @@ sddr09_write_data(struct us_data *us, } result = 0; - index = offset = 0; + offset = 0; + sg = NULL; while (sectors > 0) { @@ -987,7 +991,7 @@ sddr09_write_data(struct us_data *us, // Get the data from the transfer buffer usb_stor_access_xfer_buf(buffer, len, us->srb, - &index, &offset, FROM_XFER_BUF); + &sg, &offset, FROM_XFER_BUF); result = sddr09_write_lba(us, lba, page, pages, buffer, blockbuffer); diff -puN drivers/usb/storage/sddr55.c~git-block drivers/usb/storage/sddr55.c --- a/drivers/usb/storage/sddr55.c~git-block +++ a/drivers/usb/storage/sddr55.c @@ -167,7 +167,8 @@ static int sddr55_read_data(struct us_da unsigned long address; unsigned short pages; - unsigned int len, index, offset; + unsigned int len, offset; + struct scatterlist *sg; // Since we only read in one block at a time, we have to create // a bounce buffer and move the data a piece at a time between the @@ -178,7 +179,8 @@ static int sddr55_read_data(struct us_da buffer = kmalloc(len, GFP_NOIO); if (buffer == NULL) return USB_STOR_TRANSPORT_ERROR; /* out of memory */ - index = offset = 0; + offset = 0; + sg = NULL; while (sectors>0) { @@ -255,7 +257,7 @@ static int sddr55_read_data(struct us_da // Store the data in the transfer buffer usb_stor_access_xfer_buf(buffer, len, us->srb, - &index, &offset, TO_XFER_BUF); + &sg, &offset, TO_XFER_BUF); page = 0; lba++; @@ -287,7 +289,8 @@ static int sddr55_write_data(struct us_d unsigned short pages; int i; - unsigned int len, index, offset; + unsigned int len, offset; + struct scatterlist *sg; /* check if we are allowed to write */ if (info->read_only || info->force_read_only) { @@ -304,7 +307,8 @@ static int sddr55_write_data(struct us_d buffer = kmalloc(len, GFP_NOIO); if (buffer == NULL) return USB_STOR_TRANSPORT_ERROR; - index = offset = 0; + offset = 0; + sg = NULL; while (sectors > 0) { @@ -322,7 +326,7 @@ static int sddr55_write_data(struct us_d // Get the data from the transfer buffer usb_stor_access_xfer_buf(buffer, len, us->srb, - &index, &offset, FROM_XFER_BUF); + &sg, &offset, FROM_XFER_BUF); US_DEBUGP("Write %02X pages, to PBA %04X" " (LBA %04X) page %02X\n", diff -puN drivers/usb/storage/shuttle_usbat.c~git-block drivers/usb/storage/shuttle_usbat.c --- a/drivers/usb/storage/shuttle_usbat.c~git-block +++ a/drivers/usb/storage/shuttle_usbat.c @@ -996,7 +996,8 @@ static int usbat_flash_read_data(struct unsigned char thistime; unsigned int totallen, alloclen; int len, result; - unsigned int sg_idx = 0, sg_offset = 0; + unsigned int sg_offset = 0; + struct scatterlist *sg = NULL; result = usbat_flash_check_media(us, info); if (result != USB_STOR_TRANSPORT_GOOD) @@ -1050,7 +1051,7 @@ static int usbat_flash_read_data(struct /* Store the data in the transfer buffer */ usb_stor_access_xfer_buf(buffer, len, us->srb, - &sg_idx, &sg_offset, TO_XFER_BUF); + &sg, &sg_offset, TO_XFER_BUF); sector += thistime; totallen -= len; @@ -1086,7 +1087,8 @@ static int 
usbat_flash_write_data(struct unsigned char thistime; unsigned int totallen, alloclen; int len, result; - unsigned int sg_idx = 0, sg_offset = 0; + unsigned int sg_offset = 0; + struct scatterlist *sg = NULL; result = usbat_flash_check_media(us, info); if (result != USB_STOR_TRANSPORT_GOOD) @@ -1125,7 +1127,7 @@ static int usbat_flash_write_data(struct /* Get the data from the transfer buffer */ usb_stor_access_xfer_buf(buffer, len, us->srb, - &sg_idx, &sg_offset, FROM_XFER_BUF); + &sg, &sg_offset, FROM_XFER_BUF); /* ATA command 0x30 (WRITE SECTORS) */ usbat_pack_ata_sector_cmd(command, thistime, sector, 0x30); @@ -1165,8 +1167,8 @@ static int usbat_hp8200e_handle_read10(s unsigned char *buffer; unsigned int len; unsigned int sector; - unsigned int sg_segment = 0; unsigned int sg_offset = 0; + struct scatterlist *sg = NULL; US_DEBUGP("handle_read10: transfersize %d\n", srb->transfersize); @@ -1223,9 +1225,6 @@ static int usbat_hp8200e_handle_read10(s sector |= short_pack(data[7+5], data[7+4]); transferred = 0; - sg_segment = 0; /* for keeping track of where we are in */ - sg_offset = 0; /* the scatter/gather list */ - while (transferred != srb->request_bufflen) { if (len > srb->request_bufflen - transferred) @@ -1258,7 +1257,7 @@ static int usbat_hp8200e_handle_read10(s /* Store the data in the transfer buffer */ usb_stor_access_xfer_buf(buffer, len, srb, - &sg_segment, &sg_offset, TO_XFER_BUF); + &sg, &sg_offset, TO_XFER_BUF); /* Update the amount transferred and the sector number */ diff -puN fs/bio.c~git-block fs/bio.c --- a/fs/bio.c~git-block +++ a/fs/bio.c @@ -109,11 +109,14 @@ static inline struct bio_vec *bvec_alloc void bio_free(struct bio *bio, struct bio_set *bio_set) { - const int pool_idx = BIO_POOL_IDX(bio); + if (bio->bi_io_vec) { + const int pool_idx = BIO_POOL_IDX(bio); - BIO_BUG_ON(pool_idx >= BIOVEC_NR_POOLS); + BIO_BUG_ON(pool_idx >= BIOVEC_NR_POOLS); + + mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]); + } - mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]); mempool_free(bio, bio_set->bio_pool); } @@ -127,21 +130,9 @@ static void bio_fs_destructor(struct bio void bio_init(struct bio *bio) { - bio->bi_next = NULL; - bio->bi_bdev = NULL; + memset(bio, 0, sizeof(*bio)); bio->bi_flags = 1 << BIO_UPTODATE; - bio->bi_rw = 0; - bio->bi_vcnt = 0; - bio->bi_idx = 0; - bio->bi_phys_segments = 0; - bio->bi_hw_segments = 0; - bio->bi_hw_front_size = 0; - bio->bi_hw_back_size = 0; - bio->bi_size = 0; - bio->bi_max_vecs = 0; - bio->bi_end_io = NULL; atomic_set(&bio->bi_cnt, 1); - bio->bi_private = NULL; } /** diff -puN fs/splice.c~git-block fs/splice.c --- a/fs/splice.c~git-block +++ a/fs/splice.c @@ -29,6 +29,7 @@ #include #include #include +#include /* * Attempt to steal a page from a pipe buffer. 
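Aside: with bio_free() above tolerating a missing ->bi_io_vec and bio_init() now memset()ing the whole bio, a barrier bio can carry no data at all. A hedged sketch of such a submission (bio_alloc() and submit_bio() are real interfaces of this era; the exact flag plumbing and the omitted completion handling are assumptions based on the empty-barrier commits in this series):

static void issue_empty_barrier_sketch(struct block_device *bdev)
{
    /* zero io_vecs: bi_io_vec stays NULL and bi_size stays 0, so
     * bio_empty_barrier() is true once the barrier bit is set */
    struct bio *bio = bio_alloc(GFP_KERNEL, 0);

    bio->bi_bdev = bdev;
    /* completion handling (bi_end_io/bi_private) omitted for brevity */
    submit_bio(1 << BIO_RW_BARRIER, bio);
}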
This should perhaps go into @@ -254,11 +255,16 @@ ssize_t splice_to_pipe(struct pipe_inode } while (page_nr < spd_pages) - page_cache_release(spd->pages[page_nr++]); + spd->spd_release(spd, page_nr++); return ret; } +static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) +{ + page_cache_release(spd->pages[i]); +} + static int __generic_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, @@ -277,6 +283,7 @@ __generic_file_splice_read(struct file * .partial = partial, .flags = flags, .ops = &page_cache_pipe_buf_ops, + .spd_release = spd_release_page, }; index = *ppos >> PAGE_CACHE_SHIFT; @@ -1224,29 +1231,25 @@ static long do_splice(struct file *in, l } /* - * Map an iov into an array of pages and offset/length tupples. With the - * partial_page structure, we can map several non-contiguous ranges into - * our ones pages[] map instead of splitting that operation into pieces. - * Could easily be exported as a generic helper for other users, in which - * case one would probably want to add a 'max_nr_pages' parameter as well. + * Undo vmsplice_to_user map. Basically just look up the vmas and + * detach/unmap/remove them. */ -static int get_iovec_page_array(const struct iovec __user *iov, - unsigned int nr_vecs, struct page **pages, - struct partial_page *partial, int aligned) +static long vmsplice_unmap(const struct iovec __user *iov, + unsigned long nr_segs) { - int buffers = 0, error = 0; + struct mm_struct *mm = current->mm; + int error; /* - * It's ok to take the mmap_sem for reading, even - * across a "get_user()". + * Given the finite number of segments we can unmap, grab the + * mmap sem here once instead of per-iov */ - down_read(&current->mm->mmap_sem); + down_write(&mm->mmap_sem); - while (nr_vecs) { - unsigned long off, npages; + error = 0; + while (nr_segs) { void __user *base; size_t len; - int i; /* * Get user address base and length for this iovec. @@ -1258,84 +1261,74 @@ static int get_iovec_page_array(const st if (unlikely(error)) break; - /* - * Sanity check this iovec. 0 read succeeds. - */ - if (unlikely(!len)) + error = do_munmap(mm, (unsigned long) base, len); + if (error) break; - error = -EFAULT; - if (unlikely(!base)) - break; - - /* - * Get this base offset and number of pages, then map - * in the user pages. - */ - off = (unsigned long) base & ~PAGE_MASK; - /* - * If asked for alignment, the offset must be zero and the - * length a multiple of the PAGE_SIZE. - */ - error = -EINVAL; - if (aligned && (off || len & ~PAGE_MASK)) - break; - - npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (npages > PIPE_BUFFERS - buffers) - npages = PIPE_BUFFERS - buffers; - - error = get_user_pages(current, current->mm, - (unsigned long) base, npages, 0, 0, - &pages[buffers], NULL); - - if (unlikely(error <= 0)) - break; - - /* - * Fill this contiguous range into the partial page map. - */ - for (i = 0; i < error; i++) { - const int plen = min_t(size_t, len, PAGE_SIZE - off); - - partial[buffers].offset = off; - partial[buffers].len = plen; - - off = 0; - len -= plen; - buffers++; - } - - /* - * We didn't complete this iov, stop here since it probably - * means we have to move some of this into a pipe to - * be able to continue. - */ - if (len) - break; - - /* - * Don't continue if we mapped fewer pages than we asked for, - * or if we mapped the max number of pages that we have - * room for. 
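Aside: note the shape of the hook introduced at the top of this splice.c hunk: splice_to_pipe() no longer assumes leftover pages are page-cache pages, it calls back into the producer. A producer wrapping non-page-cache memory would supply something like the following (sketch modeled on spd_release_page() above; my_spd_release is a hypothetical name):

static void my_spd_release(struct splice_pipe_desc *spd, unsigned int i)
{
    /* drop whatever reference was taken when pages[i] was filled in;
     * the skb case later in this series drops a clone reference here */
    put_page(spd->pages[i]);
}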
- */ - if (error < npages || buffers == PIPE_BUFFERS) - break; - - nr_vecs--; + nr_segs--; iov++; } - up_read(&current->mm->mmap_sem); + up_write(&mm->mmap_sem); + return error; +} - if (buffers) - return buffers; - return error; +/* + * Having a ->close() operation prevents vma merging + */ +static void splice_mapping_close(struct vm_area_struct *vma) +{ } -static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, - struct splice_desc *sd) +static struct vm_operations_struct splice_mapping_vmops = { + .close = splice_mapping_close, +}; + +static struct vm_area_struct *splice_setup_vma(struct splice_desc *sd, + struct mm_struct *mm) +{ + struct vm_area_struct *vma; + unsigned int vm_flags; + unsigned long addr; + + addr = (unsigned long) sd->u.userptr; + addr = get_unmapped_area(NULL, addr, sd->total_len, 0, MAP_PRIVATE); + if (addr & ~PAGE_MASK) + return ERR_PTR(addr); + + if (!may_expand_vm(mm, sd->total_len >> PAGE_SHIFT)) + goto enomem; + + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + if (unlikely(!vma)) + goto enomem; + + vm_flags = calc_vm_prot_bits(PROT_READ) | calc_vm_flag_bits(MAP_PRIVATE) + | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + + vma->vm_mm = mm; + vma->vm_start = addr; + vma->vm_end = addr; + vma->vm_flags = vm_flags; + vma->vm_page_prot = protection_map[vm_flags & 7]; + vma->vm_ops = &splice_mapping_vmops; + + if (!insert_vm_struct(mm, vma)) { + vm_stat_account(mm, vm_flags, NULL, sd->total_len >> PAGE_SHIFT); + return vma; + } + + kmem_cache_free(vm_area_cachep, vma); +enomem: + return ERR_PTR(-ENOMEM); +} + +/* + * Just copy the data to user space + */ +static int pipe_to_user_copy(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, struct splice_desc *sd) { char *src; int ret; @@ -1361,32 +1354,106 @@ static int pipe_to_user(struct pipe_inod /* * No dice, use slow non-atomic map and copy - */ + */ src = buf->ops->map(pipe, buf, 0); ret = sd->len; if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len)) ret = -EFAULT; + buf->ops->unmap(pipe, buf, src); out: if (ret > 0) sd->u.userptr += ret; - buf->ops->unmap(pipe, buf, src); return ret; } /* - * For lack of a better implementation, implement vmsplice() to userspace - * as a simple copy of the pipes pages to the user iov. + * Attempt to map the pipe buffer page into the application address space. + * Called with the ->mmap_sem held for writing + */ +static int pipe_to_user_map(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, struct splice_desc *sd) +{ + struct vm_area_struct *vma = sd->u.data; + struct mm_struct *mm = current->mm; + unsigned long addr = 0; + int error; + + if (buf->len & ~PAGE_MASK) + return -EINVAL; + + error = buf->ops->confirm(pipe, buf); + if (unlikely(error)) + return error; + + down_write(&mm->mmap_sem); + + addr = vma->vm_start + sd->pos; + vma->vm_end += PAGE_SIZE; + error = vm_insert_page(vma, addr, buf->page); + if (!error) + mm->total_vm++; + + up_write(&mm->mmap_sem); + + if (error) + return error; + + return buf->len; +} + +/* + * Set up a vma for this address range, and let pipe_to_user_map() insert + * pages into that. 
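Aside: from userspace, the mapping path above would be driven roughly as follows. This is a guess at the (explicitly experimental) ABI from reading the patch, not documentation: with SPLICE_F_MOVE set, vmsplice() is expected to write the chosen mapping address back into iov_base, which is why the iovec loses its const:

#include <stdio.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef SPLICE_F_MOVE
#define SPLICE_F_MOVE (0x01)
#endif

/* len must be page aligned; a NULL base lets the kernel pick the address */
static long vmsplice_map(int pipefd, size_t len)
{
    struct iovec iov = { .iov_base = NULL, .iov_len = len };
    long n = syscall(__NR_vmsplice, pipefd, &iov, 1UL, SPLICE_F_MOVE);

    if (n > 0)    /* on success the kernel stored the address in iov_base */
        printf("%ld bytes mapped at %p\n", n, iov.iov_base);
    return n;
}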
*/ -static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, +static int vmsplice_pipe_map(struct pipe_inode_info *pipe, + struct splice_desc *sd) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + + if (!sd->total_len || sd->total_len > TASK_SIZE) + return -ENOMEM; + if (sd->total_len & ~PAGE_MASK) + return -EINVAL; + if (mm->map_count > sysctl_max_map_count) + return -ENOMEM; + + /* + * Speculative first check for current pipe buffer being of the + * desired size/alignment + */ + if (pipe->nrbufs) { + struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; + + if (buf->offset || (buf->len & ~PAGE_MASK)) + return -EINVAL; + } + + down_write(&mm->mmap_sem); + vma = splice_setup_vma(sd, mm); + up_write(&mm->mmap_sem); + + if (IS_ERR(vma)) + return PTR_ERR(vma); + + sd->pos = 0; + sd->u.data = vma; + return __splice_from_pipe(pipe, sd, pipe_to_user_map); +} + +/* + * vmsplice a pipe to user memory. If SPLICE_F_MOVE is set, we will attempt + * to move the pipe pages to the user address space. Otherwise a simple + * copy is done. + */ +static long vmsplice_to_user(struct file *file, struct iovec __user *iov, unsigned long nr_segs, unsigned int flags) { struct pipe_inode_info *pipe; struct splice_desc sd; - ssize_t size; - int error; - long ret; + long spliced, ret; pipe = pipe_info(file->f_path.dentry->d_inode); if (!pipe) @@ -1395,7 +1462,7 @@ static long vmsplice_to_user(struct file if (pipe->inode) mutex_lock(&pipe->inode->i_mutex); - error = ret = 0; + spliced = ret = 0; while (nr_segs) { void __user *base; size_t len; @@ -1403,11 +1470,11 @@ static long vmsplice_to_user(struct file /* * Get user address base and length for this iovec. */ - error = get_user(base, &iov->iov_base); - if (unlikely(error)) + ret = get_user(base, &iov->iov_base); + if (unlikely(ret)) break; - error = get_user(len, &iov->iov_len); - if (unlikely(error)) + ret = get_user(len, &iov->iov_len); + if (unlikely(ret)) break; /* @@ -1415,8 +1482,12 @@ static long vmsplice_to_user(struct file */ if (unlikely(!len)) break; - if (unlikely(!base)) { - error = -EFAULT; + + /* + * NULL base is only allowed for a remapping vmsplice + */ + if (unlikely(!base && !(flags & SPLICE_F_MOVE))) { + ret = -EFAULT; break; } @@ -1426,17 +1497,42 @@ static long vmsplice_to_user(struct file sd.u.userptr = base; sd.pos = 0; - size = __splice_from_pipe(pipe, &sd, pipe_to_user); - if (size < 0) { - if (!ret) - ret = size; + /* + * SPLICE_F_MOVE is set, don't copy the data but attempt + * to map it into the app address space. + */ + ret = -EINVAL; + if (flags & SPLICE_F_MOVE) { + ret = vmsplice_pipe_map(pipe, &sd); + if (ret > 0) { + struct vm_area_struct *vma = sd.u.data; + + base = (void *) vma->vm_start; + if (put_user(base, &iov->iov_base)) + ret = -EFAULT; + } + } - break; + /* + * Retry normal copy, if mapping failed with EINVAL. We can't + * do a copy, if a zero-map was passed in. In that case just + * bail, the app will have to retry without SPLICE_F_MOVE. + */ + if (ret == -EINVAL && base) { + sd.u.userptr = base; + ret = __splice_from_pipe(pipe, &sd, pipe_to_user_copy); } - ret += size; + if (ret < 0) + break; - if (size < len) + spliced += ret; + + /* + * If we transferred less than a pipe buffer length, break + * out of the loop and let the caller retry. 
+ */ + if (ret < len) break; nr_segs--; @@ -1446,10 +1542,121 @@ static long vmsplice_to_user(struct file if (pipe->inode) mutex_unlock(&pipe->inode->i_mutex); - if (!ret) - ret = error; + if (!spliced) + spliced = ret; - return ret; + return spliced; +} + +/* + * Map an iov into an array of pages and offset/length tuples. With the + * partial_page structure, we can map several non-contiguous ranges into + * our one pages[] map instead of splitting that operation into pieces. + * Could easily be exported as a generic helper for other users, in which + * case one would probably want to add a 'max_nr_pages' parameter as well. + */ +static int get_iovec_page_array(const struct iovec __user *iov, + unsigned int nr_vecs, struct page **pages, + struct partial_page *partial, int aligned) +{ + int buffers = 0, error = 0; + + /* + * It's ok to take the mmap_sem for reading, even + * across a "get_user()". + */ + down_read(&current->mm->mmap_sem); + + while (nr_vecs) { + unsigned long off, npages; + void __user *base; + size_t len; + int i; + + /* + * Get user address base and length for this iovec. + */ + error = get_user(base, &iov->iov_base); + if (unlikely(error)) + break; + error = get_user(len, &iov->iov_len); + if (unlikely(error)) + break; + + /* + * Sanity check this iovec. 0 read succeeds. + */ + if (unlikely(!len)) + break; + error = -EFAULT; + if (unlikely(!base)) + break; + + /* + * Get this base offset and number of pages, then map + * in the user pages. + */ + off = (unsigned long) base & ~PAGE_MASK; + + /* + * If asked for alignment, the offset must be zero and the + * length a multiple of the PAGE_SIZE. + */ + error = -EINVAL; + if (aligned && (off || len & ~PAGE_MASK)) + break; + + npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (npages > PIPE_BUFFERS - buffers) + npages = PIPE_BUFFERS - buffers; + + error = get_user_pages(current, current->mm, + (unsigned long) base, npages, 0, 0, + &pages[buffers], NULL); + + if (unlikely(error <= 0)) + break; + + /* + * Fill this contiguous range into the partial page map. + */ + for (i = 0; i < error; i++) { + const int plen = min_t(size_t, len, PAGE_SIZE - off); + + partial[buffers].offset = off; + partial[buffers].len = plen; + + off = 0; + len -= plen; + buffers++; + } + + /* + * We didn't complete this iov, stop here since it probably + * means we have to move some of this into a pipe to + * be able to continue. + */ + if (len) + break; + + /* + * Don't continue if we mapped fewer pages than we asked for, + * or if we mapped the max number of pages that we have + * room for. + */ + if (error < npages || buffers == PIPE_BUFFERS) + break; + + nr_vecs--; + iov++; + } + + up_read(&current->mm->mmap_sem); + + if (buffers) + return buffers; + + return error; } /* @@ -1468,6 +1675,7 @@ static long vmsplice_to_pipe(struct file .partial = partial, .flags = flags, .ops = &user_page_pipe_buf_ops, + .spd_release = spd_release_page, }; pipe = pipe_info(file->f_path.dentry->d_inode); @@ -1498,7 +1706,7 @@ static long vmsplice_to_pipe(struct file * Currently we punt and implement it as a normal copy, see pipe_to_user(). 
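Aside: to complete the lifecycle, the SPLICE_F_UNMAP branch wired into sys_vmsplice() just below tears the mapping down again. A companion sketch to the one above (flag value taken from the splice.h hunk later in this patch, same experimental-ABI caveat):

#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

#define SPLICE_F_UNMAP (0x10)

static long vmsplice_unmap_user(int pipefd, void *addr, size_t len)
{
    struct iovec iov = { .iov_base = addr, .iov_len = len };

    /* the kernel side simply do_munmap()s each iovec under mmap_sem */
    return syscall(__NR_vmsplice, pipefd, &iov, 1UL, SPLICE_F_UNMAP);
}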
* */ -asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, +asmlinkage long sys_vmsplice(int fd, struct iovec __user *iov, unsigned long nr_segs, unsigned int flags) { struct file *file; @@ -1515,8 +1723,13 @@ asmlinkage long sys_vmsplice(int fd, con if (file) { if (file->f_mode & FMODE_WRITE) error = vmsplice_to_pipe(file, iov, nr_segs, flags); - else if (file->f_mode & FMODE_READ) - error = vmsplice_to_user(file, iov, nr_segs, flags); + else if (file->f_mode & FMODE_READ) { + if (flags & SPLICE_F_UNMAP) + error = vmsplice_unmap(iov, nr_segs); + else + error = vmsplice_to_user(file, iov, nr_segs, + flags); + } fput_light(file, fput); } diff -puN include/asm-i386/dma-mapping.h~git-block include/asm-i386/dma-mapping.h --- a/include/asm-i386/dma-mapping.h~git-block +++ a/include/asm-i386/dma-mapping.h @@ -2,10 +2,10 @@ #define _ASM_I386_DMA_MAPPING_H #include +#include #include #include -#include #include #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) @@ -35,18 +35,19 @@ dma_unmap_single(struct device *dev, dma } static inline int -dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, +dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) { + struct scatterlist *sg; int i; BUG_ON(!valid_dma_direction(direction)); - WARN_ON(nents == 0 || sg[0].length == 0); + WARN_ON(nents == 0 || sglist[0].length == 0); - for (i = 0; i < nents; i++ ) { - BUG_ON(!sg[i].page); + for_each_sg(sglist, sg, nents, i) { + BUG_ON(!sg->page); - sg[i].dma_address = page_to_phys(sg[i].page) + sg[i].offset; + sg->dma_address = page_to_phys(sg->page) + sg->offset; } flush_write_buffers(); diff -puN include/asm-i386/scatterlist.h~git-block include/asm-i386/scatterlist.h --- a/include/asm-i386/scatterlist.h~git-block +++ a/include/asm-i386/scatterlist.h @@ -10,6 +10,8 @@ struct scatterlist { unsigned int length; }; +#define ARCH_HAS_SG_CHAIN + /* These macros should be used after a pci_map_sg call has been done * to get bus addresses of each of the SG entries and their lengths. 
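Aside: this dma_map_sg() hunk is the template for every architecture conversion in the series: indexing (sg[i]) and pointer increments (sg++) assume one physically contiguous array and walk off the end of a chained list, so they are replaced with for_each_sg(). In sketch form (kernel-style, illustrative only):

static void map_all_sketch(struct scatterlist *sglist, int nents)
{
    struct scatterlist *sg;
    int i;

    /* old, flat-array style -- breaks at a chain boundary:
     *    for (i = 0; i < nents; i++)
     *        setup(&sglist[i]);
     */
    for_each_sg(sglist, sg, nents, i)
        sg->dma_address = page_to_phys(sg->page) + sg->offset;
}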
* You should only work with the number of sg entries pci_map_sg diff -puN include/asm-ia64/dma-mapping.h~git-block include/asm-ia64/dma-mapping.h --- a/include/asm-ia64/dma-mapping.h~git-block +++ a/include/asm-ia64/dma-mapping.h @@ -5,6 +5,7 @@ * Copyright (C) 2003-2004 Hewlett-Packard Co * David Mosberger-Tang */ +#include #include #define dma_alloc_coherent platform_dma_alloc_coherent diff -puN include/asm-ia64/scatterlist.h~git-block include/asm-ia64/scatterlist.h --- a/include/asm-ia64/scatterlist.h~git-block +++ a/include/asm-ia64/scatterlist.h @@ -30,4 +30,6 @@ struct scatterlist { #define sg_dma_len(sg) ((sg)->dma_length) #define sg_dma_address(sg) ((sg)->dma_address) +#define ARCH_HAS_SG_CHAIN + #endif /* _ASM_IA64_SCATTERLIST_H */ diff -puN include/asm-powerpc/dma-mapping.h~git-block include/asm-powerpc/dma-mapping.h --- a/include/asm-powerpc/dma-mapping.h~git-block +++ a/include/asm-powerpc/dma-mapping.h @@ -12,7 +12,7 @@ #include /* need struct page definitions */ #include -#include +#include #include #define DMA_ERROR_CODE (~(dma_addr_t)0x0) @@ -276,14 +276,15 @@ static inline void dma_unmap_page(struct } static inline int -dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, +dma_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction direction) { + struct scatterlist *sg; int i; BUG_ON(direction == DMA_NONE); - for (i = 0; i < nents; i++, sg++) { + for_each_sg(sgl, sg, nents, i) { BUG_ON(!sg->page); __dma_sync_page(sg->page, sg->offset, sg->length, direction); sg->dma_address = page_to_bus(sg->page) + sg->offset; @@ -318,26 +319,28 @@ static inline void dma_sync_single_for_d } static inline void dma_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sg, int nents, + struct scatterlist *sgl, int nents, enum dma_data_direction direction) { + struct scatterlist *sg; int i; BUG_ON(direction == DMA_NONE); - for (i = 0; i < nents; i++, sg++) + for_each_sg(sgl, sg, nents, i) __dma_sync_page(sg->page, sg->offset, sg->length, direction); } static inline void dma_sync_sg_for_device(struct device *dev, - struct scatterlist *sg, int nents, + struct scatterlist *sgl, int nents, enum dma_data_direction direction) { + struct scatterlist *sg; int i; BUG_ON(direction == DMA_NONE); - for (i = 0; i < nents; i++, sg++) + for_each_sg(sgl, sg, nents, i) __dma_sync_page(sg->page, sg->offset, sg->length, direction); } diff -puN include/asm-powerpc/scatterlist.h~git-block include/asm-powerpc/scatterlist.h --- a/include/asm-powerpc/scatterlist.h~git-block +++ a/include/asm-powerpc/scatterlist.h @@ -41,5 +41,7 @@ struct scatterlist { #define ISA_DMA_THRESHOLD (~0UL) #endif +#define ARCH_HAS_SG_CHAIN + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_SCATTERLIST_H */ diff -puN include/asm-sparc/scatterlist.h~git-block include/asm-sparc/scatterlist.h --- a/include/asm-sparc/scatterlist.h~git-block +++ a/include/asm-sparc/scatterlist.h @@ -19,4 +19,6 @@ struct scatterlist { #define ISA_DMA_THRESHOLD (~0UL) +#define ARCH_HAS_SG_CHAIN + #endif /* !(_SPARC_SCATTERLIST_H) */ diff -puN include/asm-sparc64/scatterlist.h~git-block include/asm-sparc64/scatterlist.h --- a/include/asm-sparc64/scatterlist.h~git-block +++ a/include/asm-sparc64/scatterlist.h @@ -20,4 +20,6 @@ struct scatterlist { #define ISA_DMA_THRESHOLD (~0UL) +#define ARCH_HAS_SG_CHAIN + #endif /* !(_SPARC64_SCATTERLIST_H) */ diff -puN include/asm-x86_64/dma-mapping.h~git-block include/asm-x86_64/dma-mapping.h --- a/include/asm-x86_64/dma-mapping.h~git-block +++ a/include/asm-x86_64/dma-mapping.h @@ 
-6,8 +6,7 @@ * documentation. */ - -#include +#include #include #include diff -puN include/asm-x86_64/scatterlist.h~git-block include/asm-x86_64/scatterlist.h --- a/include/asm-x86_64/scatterlist.h~git-block +++ a/include/asm-x86_64/scatterlist.h @@ -11,6 +11,8 @@ struct scatterlist { unsigned int dma_length; }; +#define ARCH_HAS_SG_CHAIN + #define ISA_DMA_THRESHOLD (0x00ffffff) /* These macros should be used after a pci_map_sg call has been done diff -puN include/linux/bio.h~git-block include/linux/bio.h --- a/include/linux/bio.h~git-block +++ a/include/linux/bio.h @@ -176,13 +176,28 @@ struct bio { #define bio_offset(bio) bio_iovec((bio))->bv_offset #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx) #define bio_sectors(bio) ((bio)->bi_size >> 9) -#define bio_cur_sectors(bio) (bio_iovec(bio)->bv_len >> 9) -#define bio_data(bio) (page_address(bio_page((bio))) + bio_offset((bio))) #define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER)) #define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNC)) #define bio_failfast(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST)) #define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD)) #define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META)) +#define bio_empty_barrier(bio) (bio_barrier(bio) && !(bio)->bi_size) + +static inline unsigned int bio_cur_sectors(struct bio *bio) +{ + if (bio->bi_vcnt) + return bio_iovec(bio)->bv_len >> 9; + + return 0; +} + +static inline void *bio_data(struct bio *bio) +{ + if (bio->bi_vcnt) + return page_address(bio_page(bio)) + bio_offset(bio); + + return NULL; +} /* * will die diff -puN include/linux/blkdev.h~git-block include/linux/blkdev.h --- a/include/linux/blkdev.h~git-block +++ a/include/linux/blkdev.h @@ -344,7 +344,6 @@ typedef void (unplug_fn) (struct request struct bio_vec; typedef int (merge_bvec_fn) (struct request_queue *, struct bio *, struct bio_vec *); -typedef int (issue_flush_fn) (struct request_queue *, struct gendisk *, sector_t *); typedef void (prepare_flush_fn) (struct request_queue *, struct request *); typedef void (softirq_done_fn)(struct request *); @@ -382,7 +381,6 @@ struct request_queue prep_rq_fn *prep_rq_fn; unplug_fn *unplug_fn; merge_bvec_fn *merge_bvec_fn; - issue_flush_fn *issue_flush_fn; prepare_flush_fn *prepare_flush_fn; softirq_done_fn *softirq_done_fn; @@ -555,6 +553,7 @@ enum { #define blk_barrier_rq(rq) ((rq)->cmd_flags & REQ_HARDBARRIER) #define blk_fua_rq(rq) ((rq)->cmd_flags & REQ_FUA) #define blk_bidi_rq(rq) ((rq)->next_rq != NULL) +#define blk_empty_barrier(rq) (blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors) #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) @@ -731,7 +730,8 @@ static inline void blk_run_address_space extern int end_that_request_first(struct request *, int, int); extern int end_that_request_chunk(struct request *, int, int); extern void end_that_request_last(struct request *, int); -extern void end_request(struct request *req, int uptodate); +extern void end_request(struct request *, int); +extern void end_queued_request(struct request *, int); extern void blk_complete_request(struct request *); /* @@ -769,7 +769,6 @@ extern void blk_queue_dma_alignment(stru extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *); -extern void blk_queue_issue_flush_fn(struct request_queue *, issue_flush_fn *); extern int 
blk_do_ordered(struct request_queue *, struct request **); extern unsigned blk_ordered_cur_seq(struct request_queue *); extern unsigned blk_ordered_req_seq(struct request *); diff -puN include/linux/i2o.h~git-block include/linux/i2o.h --- a/include/linux/i2o.h~git-block +++ a/include/linux/i2o.h @@ -32,6 +32,7 @@ #include /* work_struct */ #include #include +#include #include #include /* Needed for MUTEX init macros */ @@ -837,7 +838,7 @@ static inline int i2o_dma_map_sg(struct if ((sizeof(dma_addr_t) > 4) && c->pae_support) *mptr++ = cpu_to_le32(i2o_dma_high(sg_dma_address(sg))); #endif - sg++; + sg = sg_next(sg); } *sg_ptr = mptr; diff -puN include/linux/ide.h~git-block include/linux/ide.h --- a/include/linux/ide.h~git-block +++ a/include/linux/ide.h @@ -770,7 +770,7 @@ typedef struct hwif_s { unsigned int nsect; unsigned int nleft; - unsigned int cursg; + struct scatterlist *cursg; unsigned int cursg_ofs; int rqsize; /* max sectors per request */ diff -puN include/linux/libata.h~git-block include/linux/libata.h --- a/include/linux/libata.h~git-block +++ a/include/linux/libata.h @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include @@ -407,6 +407,7 @@ struct ata_queued_cmd { unsigned long flags; /* ATA_QCFLAG_xxx */ unsigned int tag; unsigned int n_elem; + unsigned int n_iter; unsigned int orig_n_elem; int dma_dir; @@ -417,7 +418,7 @@ struct ata_queued_cmd { unsigned int nbytes; unsigned int curbytes; - unsigned int cursg; + struct scatterlist *cursg; unsigned int cursg_ofs; struct scatterlist sgent; @@ -1002,7 +1003,7 @@ ata_sg_is_last(struct scatterlist *sg, s return 1; if (qc->pad_len) return 0; - if (((sg - qc->__sg) + 1) == qc->n_elem) + if (qc->n_iter == qc->n_elem) return 1; return 0; } @@ -1010,6 +1011,7 @@ ata_sg_is_last(struct scatterlist *sg, s static inline struct scatterlist * ata_qc_first_sg(struct ata_queued_cmd *qc) { + qc->n_iter = 0; if (qc->n_elem) return qc->__sg; if (qc->pad_len) @@ -1022,8 +1024,8 @@ ata_qc_next_sg(struct scatterlist *sg, s { if (sg == &qc->pad_sgent) return NULL; - if (++sg - qc->__sg < qc->n_elem) - return sg; + if (++qc->n_iter < qc->n_elem) + return sg_next(sg); if (qc->pad_len) return &qc->pad_sgent; return NULL; @@ -1261,9 +1263,11 @@ static inline void ata_qc_reinit(struct qc->dma_dir = DMA_NONE; qc->__sg = NULL; qc->flags = 0; - qc->cursg = qc->cursg_ofs = 0; + qc->cursg = NULL; + qc->cursg_ofs = 0; qc->nbytes = qc->curbytes = 0; qc->n_elem = 0; + qc->n_iter = 0; qc->err_mask = 0; qc->pad_len = 0; qc->sect_size = ATA_SECT_SIZE; diff -puN include/linux/net.h~git-block include/linux/net.h --- a/include/linux/net.h~git-block +++ a/include/linux/net.h @@ -19,6 +19,7 @@ #define _LINUX_NET_H #include +#include #include struct poll_table_struct; @@ -165,6 +166,8 @@ struct proto_ops { struct vm_area_struct * vma); ssize_t (*sendpage) (struct socket *sock, struct page *page, int offset, size_t size, int flags); + ssize_t (*splice_read)(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, unsigned int flags); }; struct net_proto_family { diff -puN include/linux/scatterlist.h~git-block include/linux/scatterlist.h --- a/include/linux/scatterlist.h~git-block +++ a/include/linux/scatterlist.h @@ -20,4 +20,88 @@ static inline void sg_init_one(struct sc sg_set_buf(sg, buf, buflen); } +/* + * We overload the LSB of the page pointer to indicate whether it's + * a valid sg entry, or whether it points to the start of a new scatterlist. + * Those low bits are there for everyone! 
(thanks mason :-) */ +#define sg_is_chain(sg) ((unsigned long) (sg)->page & 0x01) +#define sg_chain_ptr(sg) \ + ((struct scatterlist *) ((unsigned long) (sg)->page & ~0x01)) + +/** + * sg_next - return the next scatterlist entry in a list + * @sg: The current sg entry + * + * Usually the next entry will be @sg@ + 1, but if this sg element is part + * of a chained scatterlist, it could jump to the start of a new + * scatterlist array. + * + * Note that the caller must ensure that there are further entries after + * the current entry; this function will NOT return NULL for an end-of-list. + * + */ +static inline struct scatterlist *sg_next(struct scatterlist *sg) +{ + sg++; + + if (unlikely(sg_is_chain(sg))) + sg = sg_chain_ptr(sg); + + return sg; +} + +/* + * Loop over each sg element, following the pointer to a new list if necessary + */ +#define for_each_sg(sglist, sg, nr, __i) \ + for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg)) + +/** + * sg_last - return the last scatterlist entry in a list + * @sgl: First entry in the scatterlist + * @nents: Number of entries in the scatterlist + * + * Should only be used casually; it (currently) scans the entire list + * to get the last entry. + * + * Note that the @sgl@ pointer passed in need not be the first one, + * the important bit is that @nents@ denotes the number of entries that + * exist from @sgl@. + * + */ +static inline struct scatterlist *sg_last(struct scatterlist *sgl, + unsigned int nents) +{ +#ifndef ARCH_HAS_SG_CHAIN + struct scatterlist *ret = &sgl[nents - 1]; +#else + struct scatterlist *sg, *ret = NULL; + int i; + + for_each_sg(sgl, sg, nents, i) + ret = sg; + +#endif + return ret; +} + +/** + * sg_chain - Chain two sglists together + * @prv: First scatterlist + * @prv_nents: Number of entries in prv + * @sgl: Second scatterlist + * + * Links @prv@ and @sgl@ together, to form a longer scatterlist. 
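Aside: putting the three primitives together, chaining costs one entry of the first list, which becomes the LSB-tagged link that sg_next() follows transparently. A short kernel-context sketch (assumes ARCH_HAS_SG_CHAIN; some_buffer() is a made-up helper):

static void sg_chain_demo(void)
{
    struct scatterlist first[8], second[4];
    struct scatterlist *sg;
    int i;

    /* first[7] becomes the chain link, leaving 7 + 4 payload entries */
    sg_chain(first, 8, second);

    for_each_sg(first, sg, 11, i)
        /* sg_next() hops from first[6] straight to second[0] */
        sg_set_buf(sg, some_buffer(i), PAGE_SIZE);
}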
+ * + */ +static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents, + struct scatterlist *sgl) +{ +#ifndef ARCH_HAS_SG_CHAIN + BUG(); +#endif + prv[prv_nents - 1].page = (struct page *) ((unsigned long) sgl | 0x01); +} + #endif /* _LINUX_SCATTERLIST_H */ diff -puN include/linux/skbuff.h~git-block include/linux/skbuff.h --- a/include/linux/skbuff.h~git-block +++ a/include/linux/skbuff.h @@ -1535,6 +1535,11 @@ extern int skb_store_bits(struct extern __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, int len, __wsum csum); +extern int skb_splice_bits(struct sk_buff *skb, + unsigned int offset, + struct pipe_inode_info *pipe, + unsigned int len, + unsigned int flags); extern void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); extern void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); diff -puN include/linux/splice.h~git-block include/linux/splice.h --- a/include/linux/splice.h~git-block +++ a/include/linux/splice.h @@ -20,6 +20,7 @@ /* from/to, of course */ #define SPLICE_F_MORE (0x04) /* expect more data */ #define SPLICE_F_GIFT (0x08) /* pages passed in are a gift */ +#define SPLICE_F_UNMAP (0x10) /* undo previous vmsplice map */ /* * Passed to the actors @@ -53,6 +54,7 @@ struct splice_pipe_desc { int nr_pages; /* number of pages in map */ unsigned int flags; /* splice flags */ const struct pipe_buf_operations *ops;/* ops associated with output pipe */ + void (*spd_release)(struct splice_pipe_desc *, unsigned int); }; typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, diff -puN include/linux/syscalls.h~git-block include/linux/syscalls.h --- a/include/linux/syscalls.h~git-block +++ a/include/linux/syscalls.h @@ -591,7 +591,7 @@ asmlinkage long sys_splice(int fd_in, lo int fd_out, loff_t __user *off_out, size_t len, unsigned int flags); -asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, +asmlinkage long sys_vmsplice(int fd, struct iovec __user *iov, unsigned long nr_segs, unsigned int flags); asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags); diff -puN include/net/tcp.h~git-block include/net/tcp.h --- a/include/net/tcp.h~git-block +++ a/include/net/tcp.h @@ -309,6 +309,9 @@ extern int tcp_twsk_unique(struct sock extern void tcp_twsk_destructor(struct sock *sk); +extern ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, unsigned int flags); + static inline void tcp_dec_quickack_mode(struct sock *sk, const unsigned int pkts) { diff -puN include/scsi/scsi.h~git-block include/scsi/scsi.h --- a/include/scsi/scsi.h~git-block +++ a/include/scsi/scsi.h @@ -11,13 +11,6 @@ #include /* - * The maximum sg list length SCSI can cope with - * (currently must be a power of 2 between 32 and 256) - */ -#define SCSI_MAX_PHYS_SEGMENTS MAX_PHYS_SEGMENTS - - -/* * SCSI command lengths */ diff -puN include/scsi/scsi_cmnd.h~git-block include/scsi/scsi_cmnd.h --- a/include/scsi/scsi_cmnd.h~git-block +++ a/include/scsi/scsi_cmnd.h @@ -5,6 +5,7 @@ #include #include #include +#include struct request; struct scatterlist; @@ -72,6 +73,7 @@ struct scsi_cmnd { /* These elements define the operation we ultimately want to perform */ unsigned short use_sg; /* Number of pieces of scatter-gather */ unsigned short sglist_len; /* size of malloc'd scatter-gather list */ + unsigned short __use_sg; unsigned underflow; /* Return error if less than this amount is transferred */ @@ -133,7 +135,7 @@ extern void *scsi_kmap_atomic_sg(struct 
extern void scsi_kunmap_atomic_sg(void *virt); extern struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *, gfp_t); -extern void scsi_free_sgtable(struct scatterlist *, int); +extern void scsi_free_sgtable(struct scsi_cmnd *); extern int scsi_dma_map(struct scsi_cmnd *cmd); extern void scsi_dma_unmap(struct scsi_cmnd *cmd); @@ -153,6 +155,6 @@ static inline int scsi_get_resid(struct } #define scsi_for_each_sg(cmd, sg, nseg, __i) \ - for (__i = 0, sg = scsi_sglist(cmd); __i < (nseg); __i++, (sg)++) + for_each_sg(scsi_sglist(cmd), sg, nseg, __i) #endif /* _SCSI_SCSI_CMND_H */ diff -puN lib/swiotlb.c~git-block lib/swiotlb.c --- a/lib/swiotlb.c~git-block +++ a/lib/swiotlb.c @@ -677,16 +677,17 @@ swiotlb_sync_single_range_for_device(str * same here. */ int -swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nelems, +swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, int dir) { + struct scatterlist *sg; void *addr; dma_addr_t dev_addr; int i; BUG_ON(dir == DMA_NONE); - for (i = 0; i < nelems; i++, sg++) { + for_each_sg(sgl, sg, nelems, i) { addr = SG_ENT_VIRT_ADDRESS(sg); dev_addr = virt_to_bus(addr); if (swiotlb_force || address_needs_mapping(hwdev, dev_addr)) { @@ -696,7 +697,7 @@ swiotlb_map_sg(struct device *hwdev, str to do proper error handling. */ swiotlb_full(hwdev, sg->length, dir, 0); swiotlb_unmap_sg(hwdev, sg - i, i, dir); - sg[0].dma_length = 0; + sgl[0].dma_length = 0; return 0; } sg->dma_address = virt_to_bus(map); @@ -712,19 +713,21 @@ swiotlb_map_sg(struct device *hwdev, str * concerning calls here are the same as for swiotlb_unmap_single() above. */ void -swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nelems, +swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, int dir) { + struct scatterlist *sg; int i; BUG_ON(dir == DMA_NONE); - for (i = 0; i < nelems; i++, sg++) + for_each_sg(sgl, sg, nelems, i) { if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg)) unmap_single(hwdev, bus_to_virt(sg->dma_address), sg->dma_length, dir); else if (dir == DMA_FROM_DEVICE) dma_mark_clean(SG_ENT_VIRT_ADDRESS(sg), sg->dma_length); + } } /* @@ -735,19 +738,21 @@ swiotlb_unmap_sg(struct device *hwdev, s * and usage. 
 */
 static void
-swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sg,
+swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
 		int nelems, int dir, int target)
 {
+	struct scatterlist *sg;
 	int i;
 
 	BUG_ON(dir == DMA_NONE);
 
-	for (i = 0; i < nelems; i++, sg++)
+	for_each_sg(sgl, sg, nelems, i) {
 		if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
 			sync_single(hwdev, bus_to_virt(sg->dma_address),
 				    sg->dma_length, dir, target);
 		else if (dir == DMA_FROM_DEVICE)
 			dma_mark_clean(SG_ENT_VIRT_ADDRESS(sg), sg->dma_length);
+	}
 }
 
 void
diff -puN mm/bounce.c~git-block mm/bounce.c
--- a/mm/bounce.c~git-block
+++ a/mm/bounce.c
@@ -280,6 +280,12 @@ void blk_queue_bounce(struct request_que
 	mempool_t *pool;
 
 	/*
+	 * Data-less bio, nothing to bounce
+	 */
+	if (bio_empty_barrier(*bio_orig))
+		return;
+
+	/*
 	 * for non-isa bounce case, just check if the bounce pfn is equal
 	 * to or bigger than the highest pfn in the system -- in that case,
 	 * don't waste time iterating over bio segments
diff -puN net/core/skbuff.c~git-block net/core/skbuff.c
--- a/net/core/skbuff.c~git-block
+++ a/net/core/skbuff.c
@@ -52,6 +52,7 @@
 #endif
 #include
 #include
+#include
 #include
 #include
 #include
@@ -71,6 +72,40 @@
 static struct kmem_cache *skbuff_head_cache __read_mostly;
 static struct kmem_cache *skbuff_fclone_cache __read_mostly;
 
+static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
+				  struct pipe_buffer *buf)
+{
+	struct sk_buff *skb = (struct sk_buff *) buf->private;
+
+	kfree_skb(skb);
+}
+
+static void sock_pipe_buf_get(struct pipe_inode_info *pipe,
+			      struct pipe_buffer *buf)
+{
+	struct sk_buff *skb = (struct sk_buff *) buf->private;
+
+	skb_get(skb);
+}
+
+static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
+			       struct pipe_buffer *buf)
+{
+	return 1;
+}
+
+
+/* Pipe buffer operations for a socket. */
+static struct pipe_buf_operations sock_pipe_buf_ops = {
+	.can_merge = 0,
+	.map = generic_pipe_buf_map,
+	.unmap = generic_pipe_buf_unmap,
+	.confirm = generic_pipe_buf_confirm,
+	.release = sock_pipe_buf_release,
+	.steal = sock_pipe_buf_steal,
+	.get = sock_pipe_buf_get,
+};
+
 /*
  *	Keep out-of-line to prevent kernel bloat.
  *	__builtin_return_address is not used because it is not always
@@ -1128,6 +1163,217 @@ fault:
 	return -EFAULT;
 }
 
+/*
+ * Callback from splice_to_pipe(), if we need to release some pages
+ * at the end of the spd in case we error'ed out in filling the pipe.
+ */
+static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
+{
+	struct sk_buff *skb = (struct sk_buff *) spd->partial[i].private;
+
+	kfree_skb(skb);
+}
+
+/*
+ * Fill page/offset/length into spd, if it can hold more pages.
+ */
+static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page,
+				unsigned int len, unsigned int offset,
+				struct sk_buff *skb)
+{
+	if (unlikely(spd->nr_pages == PIPE_BUFFERS))
+		return 1;
+
+	spd->pages[spd->nr_pages] = page;
+	spd->partial[spd->nr_pages].len = len;
+	spd->partial[spd->nr_pages].offset = offset;
+	spd->partial[spd->nr_pages].private = (unsigned long) skb_get(skb);
+	spd->nr_pages++;
+	return 0;
+}
+
+/*
+ * Map linear and fragment data from the skb to spd. Returns number of
+ * pages mapped.
+ */
+static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
+			     unsigned int *total_len,
+			     struct splice_pipe_desc *spd)
+{
+	unsigned int nr_pages = spd->nr_pages;
+	unsigned int poff, plen, len, toff, tlen;
+	int headlen, seg;
+
+	toff = *offset;
+	tlen = *total_len;
+	if (!tlen)
+		goto err;
+
+	/*
+	 * if the offset is greater than the linear part, go directly to
+	 * the fragments.
+	 */
+	headlen = skb_headlen(skb);
+	if (toff >= headlen) {
+		toff -= headlen;
+		goto map_frag;
+	}
+
+	/*
+	 * first map the linear region into the pages/partial map, skipping
+	 * any potential initial offset.
+	 */
+	len = 0;
+	while (len < headlen) {
+		void *p = skb->data + len;
+
+		poff = (unsigned long) p & (PAGE_SIZE - 1);
+		plen = min_t(unsigned int, headlen - len, PAGE_SIZE - poff);
+		len += plen;
+
+		if (toff) {
+			if (plen <= toff) {
+				toff -= plen;
+				continue;
+			}
+			plen -= toff;
+			poff += toff;
+			toff = 0;
+		}
+
+		plen = min(plen, tlen);
+		if (!plen)
+			break;
+
+		/*
+		 * just jump directly to update and return, no point
+		 * in going over fragments when the output is full.
+		 */
+		if (spd_fill_page(spd, virt_to_page(p), plen, poff, skb))
+			goto done;
+
+		tlen -= plen;
+	}
+
+	/*
+	 * then map the fragments
+	 */
+map_frag:
+	for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
+		const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
+
+		plen = f->size;
+		poff = f->page_offset;
+
+		if (toff) {
+			if (plen <= toff) {
+				toff -= plen;
+				continue;
+			}
+			plen -= toff;
+			poff += toff;
+			toff = 0;
+		}
+
+		plen = min(plen, tlen);
+		if (!plen)
+			break;
+
+		if (spd_fill_page(spd, f->page, plen, poff, skb))
+			break;
+
+		tlen -= plen;
+	}
+
+done:
+	if (spd->nr_pages - nr_pages) {
+		*offset = 0;
+		*total_len = tlen;
+		return 0;
+	}
+err:
+	return 1;
+}
+
+/*
+ * Map data from the skb to a pipe. Should handle both the linear part,
+ * the fragments, and the frag list. It does NOT handle frag lists within
+ * the frag list, if such a thing exists. We'd probably need to recurse to
+ * handle that cleanly.
+ */
+int skb_splice_bits(struct sk_buff *__skb, unsigned int offset,
+		    struct pipe_inode_info *pipe, unsigned int tlen,
+		    unsigned int flags)
+{
+	struct partial_page partial[PIPE_BUFFERS];
+	struct page *pages[PIPE_BUFFERS];
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.partial = partial,
+		.flags = flags,
+		.ops = &sock_pipe_buf_ops,
+		.spd_release = sock_spd_release,
+	};
+	struct sk_buff *skb;
+
+	/*
+	 * I'd love to avoid the clone here, but tcp_read_sock()
+	 * ignores reference counts and unconditionally kills the sk_buff
+	 * on return from the actor.
+	 */
+	skb = skb_clone(__skb, GFP_KERNEL);
+	if (unlikely(!skb))
+		return -ENOMEM;
+
+	/*
+	 * __skb_splice_bits() only fails if the output has no room left,
+	 * so no point in going over the frag_list for the error case.
+	 */
+	if (__skb_splice_bits(skb, &offset, &tlen, &spd))
+		goto done;
+	else if (!tlen)
+		goto done;
+
+	/*
+	 * now see if we have a frag_list to map
+	 */
+	if (skb_shinfo(skb)->frag_list) {
+		struct sk_buff *list = skb_shinfo(skb)->frag_list;
+
+		for (; list && tlen; list = list->next) {
+			if (__skb_splice_bits(list, &offset, &tlen, &spd))
+				break;
+		}
+	}
+
+done:
+	/*
+	 * drop our reference to the clone, the pipe consumption will
+	 * drop the rest.
+	 */
+	kfree_skb(skb);
+
+	if (spd.nr_pages) {
+		int ret;
+
+		/*
+		 * Drop the socket lock, otherwise we have reverse
+		 * locking dependencies between sk_lock and i_mutex
+		 * here as compared to sendfile(). We enter here
+		 * with the socket lock held, and splice_to_pipe() will
+		 * grab the pipe inode lock. For sendfile() emulation,
+		 * we call into ->sendpage() with the i_mutex lock held
+		 * and networking will grab the socket lock.
+		 */
+		release_sock(__skb->sk);
+		ret = splice_to_pipe(pipe, &spd);
+		lock_sock(__skb->sk);
+		return ret;
+	}
+
+	return 0;
+}
+
 /**
  *	skb_store_bits - store bits from kernel buffer to skb
  *	@skb: destination buffer
diff -puN net/ipv4/af_inet.c~git-block net/ipv4/af_inet.c
--- a/net/ipv4/af_inet.c~git-block
+++ a/net/ipv4/af_inet.c
@@ -835,6 +835,7 @@ const struct proto_ops inet_stream_ops =
 	.recvmsg	   = sock_common_recvmsg,
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = tcp_sendpage,
+	.splice_read	   = tcp_splice_read,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
 	.compat_getsockopt = compat_sock_common_getsockopt,
diff -puN net/ipv4/tcp.c~git-block net/ipv4/tcp.c
--- a/net/ipv4/tcp.c~git-block
+++ a/net/ipv4/tcp.c
@@ -254,6 +254,10 @@
 #include
 #include
 #include
+#include
+#include
+#include
+#include
 #include
 #include
 #include
@@ -265,6 +269,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -292,6 +297,15 @@ EXPORT_SYMBOL(tcp_memory_allocated);
 EXPORT_SYMBOL(tcp_sockets_allocated);
 
 /*
+ * TCP splice context
+ */
+struct tcp_splice_state {
+	struct pipe_inode_info *pipe;
+	size_t len;
+	unsigned int flags;
+};
+
+/*
  * Pressure flag: try to collapse.
  * Technical note: it is used by multiple contexts non atomically.
  * All the sk_stream_mem_schedule() is of this nature: accounting
@@ -501,6 +515,120 @@ static inline void tcp_push(struct sock
 	}
 }
 
+int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
+			 unsigned int offset, size_t len)
+{
+	struct tcp_splice_state *tss = rd_desc->arg.data;
+
+	return skb_splice_bits(skb, offset, tss->pipe, tss->len, tss->flags);
+}
+
+static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
+{
+	/* Store TCP splice context information in read_descriptor_t. */
+	read_descriptor_t rd_desc = {
+		.arg.data = tss,
+	};
+
+	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
+}
+
+/**
+ *  tcp_splice_read - splice data from TCP socket to a pipe
+ *  @sock:	socket to splice from
+ *  @ppos:	position (not valid)
+ *  @pipe:	pipe to splice to
+ *  @len:	number of bytes to splice
+ *  @flags:	splice modifier flags
+ *
+ * Description:
+ *    Will read pages from given socket and fill them into a pipe.
+ *
+ **/
+ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
+			struct pipe_inode_info *pipe, size_t len,
+			unsigned int flags)
+{
+	struct sock *sk = sock->sk;
+	struct tcp_splice_state tss = {
+		.pipe = pipe,
+		.len = len,
+		.flags = flags,
+	};
+	long timeo;
+	ssize_t spliced;
+	int ret;
+
+	/*
+	 * We can't seek on a socket input
+	 */
+	if (unlikely(*ppos))
+		return -ESPIPE;
+
+	ret = spliced = 0;
+
+	lock_sock(sk);
+
+	timeo = sock_rcvtimeo(sk, flags & SPLICE_F_NONBLOCK);
+	while (tss.len) {
+		ret = __tcp_splice_read(sk, &tss);
+		if (ret < 0)
+			break;
+		else if (!ret) {
+			if (spliced)
+				break;
+			if (flags & SPLICE_F_NONBLOCK) {
+				ret = -EAGAIN;
+				break;
+			}
+			if (sock_flag(sk, SOCK_DONE))
+				break;
+			if (sk->sk_err) {
+				ret = sock_error(sk);
+				break;
+			}
+			if (sk->sk_shutdown & RCV_SHUTDOWN)
+				break;
+			if (sk->sk_state == TCP_CLOSE) {
+				/*
+				 * This occurs when user tries to read
+				 * from never connected socket.
+				 */
+				if (!sock_flag(sk, SOCK_DONE))
+					ret = -ENOTCONN;
+				break;
+			}
+			if (!timeo) {
+				ret = -EAGAIN;
+				break;
+			}
+			sk_wait_data(sk, &timeo);
+			if (signal_pending(current)) {
+				ret = sock_intr_errno(timeo);
+				break;
+			}
+			continue;
+		}
+		tss.len -= ret;
+		spliced += ret;
+
+		release_sock(sk);
+		lock_sock(sk);
+
+		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
+		    (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo ||
+		    signal_pending(current))
+			break;
+	}
+
+	release_sock(sk);
+
+	if (spliced)
+		return spliced;
+
+	return ret;
+}
+
 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
 			 size_t psize, int flags)
 {
@@ -2527,6 +2655,7 @@ EXPORT_SYMBOL(tcp_poll);
 EXPORT_SYMBOL(tcp_read_sock);
 EXPORT_SYMBOL(tcp_recvmsg);
 EXPORT_SYMBOL(tcp_sendmsg);
+EXPORT_SYMBOL(tcp_splice_read);
 EXPORT_SYMBOL(tcp_sendpage);
 EXPORT_SYMBOL(tcp_setsockopt);
 EXPORT_SYMBOL(tcp_shutdown);
diff -puN net/ipv6/af_inet6.c~git-block net/ipv6/af_inet6.c
--- a/net/ipv6/af_inet6.c~git-block
+++ a/net/ipv6/af_inet6.c
@@ -488,6 +488,7 @@ const struct proto_ops inet6_stream_ops
 	.recvmsg	   = sock_common_recvmsg,	/* ok		*/
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = tcp_sendpage,
+	.splice_read	   = tcp_splice_read,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
 	.compat_getsockopt = compat_sock_common_getsockopt,
diff -puN net/socket.c~git-block net/socket.c
--- a/net/socket.c~git-block
+++ a/net/socket.c
@@ -111,6 +111,9 @@ static long compat_sock_ioctl(struct fil
 static int sock_fasync(int fd, struct file *filp, int on);
 static ssize_t sock_sendpage(struct file *file, struct page *page,
 			     int offset, size_t size, loff_t *ppos, int more);
+static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
+				struct pipe_inode_info *pipe, size_t len,
+				unsigned int flags);
 
 /*
  *	Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
@@ -133,6 +136,7 @@ static const struct file_operations soc
 	.fasync =	sock_fasync,
 	.sendpage =	sock_sendpage,
 	.splice_write = generic_splice_sendpage,
+	.splice_read =	sock_splice_read,
 };
 
 /*
@@ -690,6 +694,15 @@ static ssize_t sock_sendpage(struct file
 	return sock->ops->sendpage(sock, page, offset, size, flags);
 }
 
+static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
+				struct pipe_inode_info *pipe, size_t len,
+				unsigned int flags)
+{
+	struct socket *sock = file->private_data;
+
+	return sock->ops->splice_read(sock, ppos, pipe, len, flags);
+}
+
 static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb,
 					 struct sock_iocb *siocb)
 {
_
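
Editor's note: sg_chain(), shown at the top of this patch, encodes the link by
setting bit 0 of the ->page pointer in the last entry of the previous table,
which for_each_sg()/sg_next() then recognize as a chain entry rather than a
data segment. A hypothetical caller chaining two tables might look like the
sketch below; this is not from the patch, it assumes ARCH_HAS_SG_CHAIN is set
for the architecture, and the table sizes are arbitrary.

#include <linux/scatterlist.h>

#define FIRST_NENTS	4	/* arbitrary illustration sizes */
#define SECOND_NENTS	4

static struct scatterlist first[FIRST_NENTS];
static struct scatterlist second[SECOND_NENTS];

static void build_chained_list(void)
{
	/*
	 * Entries first[0..FIRST_NENTS-2] carry data; first[FIRST_NENTS-1]
	 * is given up to hold the tagged pointer to the second table.
	 */
	sg_chain(first, FIRST_NENTS, second);
}

Sacrificing one entry per table keeps the scatterlist ABI unchanged for code
that has been converted to the iterator helpers, which is why the arch and
driver patches in this series are mostly mechanical.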
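The swiotlb, SCSI, and iommu conversions above all follow one pattern: replace
pointer-increment iteration with for_each_sg(), because chained scatterlists
are no longer one contiguous array and "sg++" walks off the end of a segment.
A minimal sketch of that conversion follows; do_map_one() is a hypothetical
stand-in for whatever per-segment work a real driver does.

#include <linux/scatterlist.h>

/* Stand-in for a driver's real per-segment mapping work. */
static void do_map_one(struct scatterlist *sg)
{
	/* e.g. program a mapping for this segment's page/offset/length */
}

static void map_all_segments(struct scatterlist *sgl, int nents)
{
	struct scatterlist *sg;
	int i;

	/*
	 * The old open-coded loop would have been:
	 *	for (i = 0; i < nents; i++, sg++)
	 * which is only valid when all entries sit in one array.
	 * for_each_sg() advances via sg_next() and so transparently
	 * follows chain entries into the next table.
	 */
	for_each_sg(sgl, sg, nents, i)
		do_map_one(sg);
}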
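For the TCP splice receive side, here is a rough userspace sketch of what the
tcp_splice_read()/sock_splice_read() wiring enables: splicing from a connected
socket into a pipe and on to a file without copying payload through userspace.
This is illustrative only, not part of the patch; it assumes a kernel carrying
these changes and a libc exposing the splice(2) wrapper, and sock_to_file()
with its abbreviated error handling is a made-up helper.

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

/* Move up to "len" bytes from a connected TCP socket to a file. */
static ssize_t sock_to_file(int sock_fd, int file_fd, size_t len)
{
	int pfd[2];
	ssize_t in, out, total = 0;

	if (pipe(pfd) < 0)
		return -1;

	while (len) {
		/* socket -> pipe: this leg exercises tcp_splice_read() */
		in = splice(sock_fd, NULL, pfd[1], NULL, len, SPLICE_F_MOVE);
		if (in <= 0)
			break;

		/*
		 * pipe -> file: a short write here leaves data in the
		 * pipe, which a production caller would have to loop on.
		 */
		out = splice(pfd[0], NULL, file_fd, NULL, in, SPLICE_F_MOVE);
		if (out <= 0)
			break;

		len -= out;
		total += out;
	}

	close(pfd[0]);
	close(pfd[1]);
	return total;
}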