GIT f43bf0bebed7c33b698a8a25f95812f9e87c3843 git://git.linux-nfs.org/pub/linux/nfs-2.6.git

commit f43bf0bebed7c33b698a8a25f95812f9e87c3843
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Tue Oct 9 12:01:04 2007 -0400

    NFS: Add a boot parameter to disable 64 bit inode numbers
    
    This boot parameter will allow legacy 32-bit applications which call stat()
    to continue to function even if the NFSv3/v4 server uses 64-bit inode
    numbers.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 2a3f5fd45938bd86ce8faf4cb26be4f7e9ae2941
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Mon Oct 8 14:26:13 2007 -0400

    NFS: nfs_refresh_inode should clear cache_validity flags on success
    
    If the cached attributes match the ones supplied in the fattr, then assume
    we've revalidated the inode.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 40d24704091c8a29a4c99d25670f1996749aea6f
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Mon Oct 8 09:24:22 2007 -0400

    NFS: Fix a connectathon regression in NFSv3 and NFSv4
    
    We're failing basic test6 against Linux servers because they lack a correct
    change attribute. The fix is to assume that we always want to invalidate
    the readdir caches when we call update_changeattr and/or
    nfs_post_op_update_inode on a directory.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 9e08a3c5aec5b745e844328bcbc16458b6118faf
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Mon Oct 8 14:10:31 2007 -0400

    NFS: Use nfs_refresh_inode() in ops that aren't expected to change the inode
    
    nfs_post_op_update_inode() is really only meant to be used if we expect the
    inode and its attributes to have changed in some way.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 220bcc2afd7011b3e0569fc178331fa983c92c1b
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Mon Oct 1 12:06:48 2007 -0400

    SUNRPC: Don't call xprt_release in call refresh
    
    Call it from call_verify() instead...
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit b6e9c713f5c526a85893c6e0ab1d5d6c6f1ab479
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Mon Oct 1 12:06:44 2007 -0400

    SUNRPC: Don't call xprt_release() if call_allocate fails
    
    It completely fouls up the RPC call statistics, and serves no useful
    purpose.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 2199700f1d660494d2552278354422c51becb686
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Mon Oct 1 11:43:37 2007 -0400

    SUNRPC: Fix buggy UDP transmission
    
    xs_sendpages() may return a negative result. We sure as hell don't want to
    add that to the 'tk_bytes_sent' tally...
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 58eaab93376cb524fd0f1531a56902d2b3eaa619
Author: Jesper Juhl <jesper.juhl@gmail.com>
Date:   Sat Jul 21 17:03:28 2007 +0200

    [23/37] Clean up duplicate includes in
    
    Hi,
    
    This patch cleans up duplicate includes in
    	include/linux/nfs_fs.h
    
    Signed-off-by: Jesper Juhl <jesper.juhl@gmail.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 7f4adeff6fa32ff46f148110c4f08130175c8da8
Author: Adrian Bunk <bunk@stusta.de>
Date:   Wed Jun 13 01:03:13 2007 +0200

    [2.6 patch] net/sunrpc/rpcb_clnt.c: make struct rpcb_program static
    
    This patch makes the needlessly global struct rpcb_program static.
    
    Signed-off-by: Adrian Bunk <bunk@stusta.de>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 67f97d83bfcca9d9f8fbeeb14e7c644a82b24e12
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Wed Sep 26 14:38:10 2007 -0400

    SUNRPC: Use correct type in buffer length calculations
    
    Use correct type signage in gss_krb5_remove_padding() when doing length
    calculations.  Both xdr_buf.len and iov.iov_len are size_t, which is
    unsigned; so use an unsigned type for our temporary length variable to
    ensure we don't overflow it..
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: "J. Bruce Fields" <bfields@citi.umich.edu>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit afde94f398b62c8596a8d0cbfc25798f0b52a371
Author: J. Bruce Fields <bfields@citi.umich.edu>
Date:   Wed Sep 26 14:38:08 2007 -0400

    SUNRPC: Fix default hostname created in rpc_create()
    
    Since 43780b87fa7..., rpc_create() fills in a default hostname based on
    the ip address if the servername passed in is null.  A small typo made
    that default incorrect.  (But this information appears to be used only
    for debugging right now, so I don't believe the typo causes any bugs in
    the current kernel.)
    
    Thanks to Olga Kornievskaia for bug report and testing.
    
    Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
    Cc: Olga Kornievskaia <aglo@citi.umich.edu>
    Cc: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit bf19aacecbeebccb2c3d150a8bd9416b7dba81fe
Author: J. Bruce Fields <bfields@citi.umich.edu>
Date:   Wed Sep 26 14:38:09 2007 -0400

    nfs: add server port to rpc_pipe info file
    
    On the client, when an alternate server port is specified on the mount
    commandline, we need to make sure gssd knows about it.
    
    Also, on the server side, when we're sending krb5 callbacks to the
    client, we'll use the same mechanism to let gssd know about the callback
    port.
    
    Thanks to Olga Kornievskaia for testing and for an earlier
    implementation.
    
    Signed-off-by: "J. Bruce Fields" <bfields@citi.umich.edu>
    Cc: Olga Kornievskaia <aglo@citi.umich.edu>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit c7c209730d635226b81e9aeae63b6dc8f445569f
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Fri Sep 28 19:22:40 2007 -0400

    NFS: Get rid of some obsolete macros
    
    - NFS_READTIME, NFS_CHANGE_ATTR are completely unused.
    - Inline the few remaining uses of NFS_ATTRTIMEO, and remove.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 4f48af45842c6e78ab958c90344d3c940db4da15
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Tue Oct 2 23:13:32 2007 -0400

    NFS: Simplify filehandle revalidation
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 9697d2342e1a480bc14921c52f185c2fe01016e7
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Tue Oct 2 21:58:05 2007 -0400

    NFS: Ensure that nfs_link() returns a hashed dentry
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit a12802cab8520f86c9a80bbf87d10ee6171687d1
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Tue Oct 2 19:13:04 2007 -0400

    NFS: Be strict about dentry revalidation when doing exclusive create
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit b050aa791fad6b060d6ff59305f01289e18b058c
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Tue Oct 2 19:02:07 2007 -0400

    NFS: Don't zap the readdir caches upon error
    
    If necessary, the caches will get zapped under normal revalidation.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit efbb06b7f98a154ef51ad41674548af5cc1fd005
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Tue Oct 2 17:11:54 2007 -0400

    NFS: Remove the redundant nfs_reval_fsid()
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 81c768808c78283e1b4ed4cd2cad2571294b2090
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Tue Oct 2 10:30:00 2007 -0400

    NFSv3: Always use directory post-op attributes in nfs3_proc_lookup
    
    LOOKUP returns the directory post-op attributes whether or not the
    operation was successful.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit d75340cc4de5c187fbf0bba234309ca86cf0a2fb
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Mon Oct 1 21:42:01 2007 -0400

    NFSv4: Fix nfs_atomic_open() to set the verifier on negative dentries too
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 216d5d06883edfaf992ada0d72a2a22fdfdbd296
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Mon Oct 1 20:10:12 2007 -0400

    NFSv4: Use NFSv2/v3 rules for negative dentries in nfs_open_revalidate
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 0a5ebc148879be68acdb12fbe72b65cb88c410d9
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Tue Oct 2 12:57:24 2007 -0400

    NFSv4: Don't revalidate the directory in nfs_atomic_lookup()
    
    Why bother, since the call to nfs4_atomic_open() will do it for us.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit f2c77f4e62a2290ae46b5b0449eb72d72afe691e
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Tue Oct 2 12:54:39 2007 -0400

    NFS: Optimise nfs_lookup_revalidate()
    
    We don't need to call nfs_revalidate_inode() on the directory if we already
    know that the verifiers don't match.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 6d2b2966869142660f46d1e06cf9d15c3debcf77
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Mon Oct 1 18:57:50 2007 -0400

    NFS: Reset nfsi->last_updated only if the attribute changed
    
    Otherwise set it to nfsi->read_cache_jiffies in order to prevent jiffy
    wraparound issues.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 60ccd4ec4170c9487e3792322626acd160197bce
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Sat Sep 29 17:48:19 2007 -0400

    NFS: Remove nfs_begin_data_update/nfs_end_data_update
    
    The lower level routines in fs/nfs/proc.c, fs/nfs/nfs3proc.c and
    fs/nfs/nfs4proc.c should already be dealing with the revalidation issues.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 80eb209def76d375677840800eb838abce1e6639
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Sat Sep 29 17:34:46 2007 -0400

    NFS: Remove NFS_I(inode)->data_updates
    
    We have no more users...
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit a1643a92f6de92074116922a2d2906dd33499ff4
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Sat Sep 29 17:25:43 2007 -0400

    NFS: NFS_CACHEINV() should not test for nfs_caches_unstable()
    
    The fact that we're in the process of modifying the inode does not mean
    that we should not invalidate the attribute and data caches. The defensive
    thing is to always invalidate when we're confronted with inode
    mtime/ctime or change_attribute updates that we do not immediately
    recognise.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 3258b4fa552c4f994b5e6490a8ad88f5d7e0e648
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Mon Oct 1 13:54:51 2007 -0400

    NFS: Remove bogus nfs_mark_for_revalidate() in nfs_lookup
    
    The parent of the newly materialised dentry has just been revalidated...
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit cf8ba45e0554f1c8838fcfe43a93114f177af839
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Mon Oct 1 13:46:53 2007 -0400

    NFS: don't cache the verifer across ->lookup() calls
    
    If the ->lookup() call causes the directory verifier to change, then there
    is still no need to use the old verifier, since our dentry has been
    verified.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit f38211100d4823be530577dc3452f838861222ec
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Mon Oct 1 10:00:23 2007 -0400

    NFS: nfs_mark_for_revalidate don't update cache_change_attribute
    
    Just let the subsequent inode revalidation do the update...
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 7668fdbe9aaeab705d1169ac86d0d18a12906d06
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Mon Oct 1 09:59:15 2007 -0400

    NFS: nfs_post_op_update_inode don't update cache_change_attribute
    
    If nfs_post_op_update_inode fails because the server didn't return any
    attributes, then we let the subsequent inode revalidation update
    cache_change_attribute.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 12b373ebf05485d4937dd63a00c16f8efeaa79ba
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Mon Oct 1 09:56:59 2007 -0400

    NFS: Don't revalidate dentries on directory size or ctime changes
    
    We only need to look at the mtime changes...
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 2f78e4313afd34a4ded591ec5687843113fbaa01
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Sun Sep 30 15:31:19 2007 -0400

    NFS: Don't set cache_change_attribute in nfs_revalidate_mapping
    
    The attribute revalidation code will already have taken care of resetting
    nfsi->cache_change_attribute.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 446e534985bada0ad7451c08cf213c06695f9b67
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Wed Oct 3 15:58:38 2007 -0400

    NFS: Fix a bug in nfs_open_revalidate()
    
    We want to set the verifier when the call to nfs4_open_revalidate()
    _succeeds_.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit d4d9cdcb470784df76304f75d0ce88f20f15fa6a
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Tue Oct 2 18:38:53 2007 -0400

    NFS: Don't hash the negative dentry when optimising for an O_EXCL open
    
    We don't want to leave an unverified hashed negative dentry if the
    exclusive create fails to complete.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 5724ab37872042176916441930e78fd353be1e5e
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Mon Oct 1 21:51:38 2007 -0400

    NFS: nfs_instantiate() should set the dentry verifier
    
    That will also allow us to remove the calls in mknod and mkdir.
    In addition it will ensure that symlinks set it correctly.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit fab728e156b3cbfe31f05d6e7cdebe3d5eaff878
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Sat Sep 29 17:41:33 2007 -0400

    NFS: Ensure nfs_instantiate() invalidates the parent dir on error
    
    Also ensure that it drops the dentry in this case.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 4b841736bc16b320bcdb1e8ece585b3ced9a8811
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Sat Sep 29 17:15:01 2007 -0400

    NFS: Fix nfs_verify_change_attribute()
    
    We don't care about whether or not some other process on our client is
    changing the directory while we're in nfs_lookup_revalidate(), because the
    dcache will take care of ensuring local atomicity.
    We can therefore remove the test for nfs_caches_unstable().
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 8edb01828837302055a8f0afddb2256659480bc5
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Sat Sep 29 17:14:03 2007 -0400

    NFS: Fix the sign of the return value of nfs_save_change_attribute()
    
    Also fix up the comments.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 70ca88521fc7bee8ef0fc22033a439d4b9a2c70d
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Sun Sep 30 15:21:24 2007 -0400

    NFS: Fake up 'wcc' attributes to prevent cache invalidation after write
    
    NFSv2 and v4 don't offer weak cache consistency attributes on WRITE calls.
    In NFSv3, returning wcc data is optional. In all cases, we want to prevent
    the client from invalidating our cached data whenever ->write_done()
    attempts to update the inode attributes.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit b64e8a5ef758888cb42b7c105dcfaaf51aab1baf
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Sun Sep 30 15:13:17 2007 -0400

    NFS: Remove bogus check of cache_change_attribute in nfs_update_inode
    
    Remove the bogus 'data_stable' check in nfs_update_inode. The
    cache_change_attribute tells you if the directory changed on the server,
    and should have nothing to do with the file length.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 7fdc49c4e49ba926348f71844cda7f5e12709738
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Fri Sep 28 19:11:33 2007 -0400

    NFS: Fix the ESTALE "revalidation" in _nfs_revalidate_inode()
    
    For one thing, the test NFS_ATTRTIMEO() == 0 makes no sense: we're
    testing whether or not the cache timeout length is zero, which is totally
    unrelated to the issue of whether or not we trust the file staleness.
    
    Secondly, we do not want to retry the GETATTR once a file has been declared
    stale by the server: we rather want to discard that inode as soon as
    possible, since there are broken servers still in use out there that reuse
    filehandles on new files.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 8850df999cd16aa141098e2e8be04a590276f3cc
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Fri Sep 28 17:20:07 2007 -0400

    NFS: Fix atime revalidation in read()
    
    NFSv3 will correctly update atime on a read() call, so there is no need to
    set the NFS_INO_INVALID_ATIME flag unless the call to nfs_refresh_inode()
    fails.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit c4812998398d9cbce8646494704c52297359ede0
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Fri Sep 28 17:11:45 2007 -0400

    NFS: Fix atime revalidation in readdir()
    
    NFSv3 will correctly update atime on a readdir call, so there is no need to
    set the NFS_INO_INVALID_ATIME flag unless the call to nfs_refresh_inode()
    fails.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 57fa76f2da05d0fee597b26bbc1f05242252beab
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Sun Sep 30 18:01:13 2007 -0400

    NFS: Don't use readdirplus data if the page cache is invalid
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 47aabaa7e45385fee4a535a6f6e523ff944e1684
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Thu Sep 27 15:57:24 2007 -0400

    NFSv4: Don't use ctime/mtime for determining when to invalidate the caches
    
    In NFSv4 we should only be looking at the change attribute.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 17cadc95372e28024be0874e67329c1862912c5d
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Thu Sep 27 10:07:31 2007 -0400

    NFS: Don't force a dcache revalidation if nfs_wcc_update_inode succeeds
    
    The reason is that if the weak cache consistency update was successful,
    then we know that our client must be the only one that changed the
    directory, and we've already updated the dcache to reflect the change.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit e323ea46d95d7f8c789effd1194dfc120284dbbd
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Sun Sep 30 17:03:25 2007 -0400

    NFS: nfs_wcc_update_inode: directory caches are always invalidated
    
    We must ensure that the readdir data is always invalidated whether or not
    the weak cache consistency data update succeeds.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 6ecc5e8fcad7ad64d68c098249359831331bd299
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Fri Sep 28 14:20:33 2007 -0400

    NFS: Fix dcache revalidation bugs
    
    We don't need to force a dentry lookup just because we're making changes to
    the directory.
    
    Don't update nfsi->cache_change_attribute in nfs_end_data_update: that
    overrides the NFSv3/v4 weak consistency checking that tells us our update
    was the only one, and that tells us the dcache is still valid.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 7957c1418f4b6c66e28d4ac3c4d7a8c19d526c48
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Fri Sep 28 14:20:12 2007 -0400

    NFS: fix nfs_verify_change_attribute
    
    We always want to check that the verifier and directory
    cache_change_attribute match. This also allows us to remove the 'wraparound
    hack' for the cache_change_attribute. If we're only checking for equality,
    then we don't care about wraparound issues.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 68e8a70d3cae23716f6b2b3872eba10eccea148c
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Wed Aug 15 12:59:12 2007 -0400

    NFS: nfs_post_op_update_inode() should call nfs_refresh_inode()
    
    Ensure that we don't clobber the results from a more recent getattr call...
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit f2115dc9877d480392e48e3c83bc8cbb4b418fee
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Wed Aug 15 12:49:17 2007 -0400

    NFS: Fix over-conservative attribute invalidation in nfs_update_inode()
    
    We should always be declaring the attribute cache as valid after having
    updated it.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 76b32999dfff6e59252a8af17a5671a4cf3bcf9b
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Fri Aug 10 17:45:11 2007 -0400

    NFSv4: Make NFSv4 ACCESS calls return attributes too...
    
    It doesn't really make sense to cache an access call without also
    revalidating the attributes.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit af22f94ae02ab9dd4fd7fe628c8434a59cc293be
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Fri Aug 10 17:45:10 2007 -0400

    NFSv4: Simplify _nfs4_do_access()
    
    Currently, _nfs4_do_access() is just a copy of nfs_do_access() with added
    conversion of the open flags into an access mask. This patch merges the
    duplicate functionality.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit cd3758e37ddea66fccca7d93c4b601e8a2e51926
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Fri Aug 10 17:44:32 2007 -0400

    NFS: Replace file->private_data with calls to nfs_file_open_context()
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit c03025d55540bd648f2546659090140ecc835572
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Fri Aug 10 17:44:28 2007 -0400

    NFS: Add a helper to extract the nfs_open_context from a struct file
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 8fb559f87fee7f71dbf9a595095ad7d8e84c55e7
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Mon Sep 24 15:40:16 2007 -0400

    NFS: Eliminate nfs_refresh_verifier()
    
    nfs_set_verifier() and nfs_refresh_verifier() do exactly the same thing, so
    replace one with the other.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 77a55a1fe8f26f7d022986a599b68002e21d968a
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Mon Sep 24 15:40:11 2007 -0400

    NFS: Eliminate nfs_renew_times()
    
    The nfs_renew_times() function plants the current time in jiffies in
    dentry->d_time.  But a call to nfs_renew_times() is always followed by
    another call that overwrites dentry->d_time.  Get rid of the
    nfs_renew_times() calls.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 92f6c178250170222f6d80c8ae725400765aa7a4
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Mon Sep 24 15:40:06 2007 -0400

    NFS: Don't call nfs_renew_times() in nfs_dentry_iput()
    
    Negative dentries need to be reverified after an asynchronous unlink.
    
    Quoth Trond:
    
    "Unfortunately I don't think that we can avoid revalidating the
    resulting negative dentry since the UNLINK call is asynchronous,
    and so the new verifier on the directory will only be known a
    posteriori."
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 1321d8d971028e796978f6a48d195c09158b3bcd
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Mon Sep 24 15:40:00 2007 -0400

    SUNRPC: Fix bytes-per-op accounting for RPC over UDP
    
    NFS performance metrics reported zero bytes sent per op when mounting with
    UDP.  The UDP socket transport wasn't properly counting the number of bytes
    sent.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit bcf35617a7c3474ad12892dfbb089a572e5c06d2
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Mon Sep 24 15:39:55 2007 -0400

    NFS: Show "nointr" mount option
    
    The default "intr" setting is different for NFS and NFSv4.  To avoid
    confusion on this issue, don't hide the "nointr" option in /proc/mounts.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 6e88e0618cb1e354a44cc49a996df4dd89511039
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Mon Sep 24 15:39:50 2007 -0400

    NFS: Verify server address before invoking in-kernel mount client
    
    Re-order mount option sanity checking slightly to ensure we have a valid
    server address *before* trying to do the mountd RPC call.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 113632d00acb569420b14eb7575833ac7e2eb311
Author: \"Talpey, Thomas\ <Thomas.Talpey@netapp.com>
Date:   Thu Sep 20 17:37:58 2007 -0400

    SUNRPC: Add RDMA dependency to SUNRPC_XPRT_RDMA
    
    Add a dependency on RDMA before enabling SUNRPC_XPRT_RDMA
    Yes, "INFINIBAND" also turns on iWARP and other RDMA support.
    
    Signed-off-by: Tom Talpey <talpey@netapp.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit c56c65fb67d6461f6059dd83b1750a1973a91185
Author: \"Talpey, Thomas\ <Thomas.Talpey@netapp.com>
Date:   Mon Sep 10 13:51:18 2007 -0400

    RPCRDMA: rpc rdma verbs interface implementation
    
    This implements the interface from rpcrdma to the RDMA verbs interface
    supported by Infniband and iWARP.
    
    Signed-off-by: Tom Talpey <talpey@netapp.com>
    Signed-off-by: James Lentini <jlentini@netapp.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit e96018280cb36210f4c69663561825114a57e7e1
Author: \"Talpey, Thomas\ <Thomas.Talpey@netapp.com>
Date:   Mon Sep 10 13:50:42 2007 -0400

    RPCRDMA: rpc rdma protocol implementation
    
    This implements the marshaling and unmarshaling of the rpcrdma transport
    headers. Connection management is also addressed.
    
    Signed-off-by: Tom Talpey <talpey@netapp.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit f58851e6b0f148fb4b2a1c6f70beb2f125863c0f
Author: \"Talpey, Thomas\ <Thomas.Talpey@netapp.com>
Date:   Mon Sep 10 13:50:12 2007 -0400

    RPCRDMA: rpc rdma transport switch
    
    This implements the configuration and building of the core transport
    switch implementation of the rpcrdma transport. Stubs are provided for
    the rpcrdma protocol handling, and the infiniband/iwarp verbs interface.
    These are provided in following patches.
    
    Signed-off-by: Tom Talpey <talpey@netapp.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 2cf7ff7a37cc149bd59c4f3bad432f686a4619c8
Author: \"Talpey, Thomas\ <Thomas.Talpey@netapp.com>
Date:   Mon Sep 10 13:49:41 2007 -0400

    NFS: support RDMA mounts
    
    Adds hooks to the string-based NFS mount to support an "rdma" protocol option.
    
    Signed-off-by: Tom Talpey <tmt@netapp.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit c3a57ed7471a17b07844d531534d970b84b69faf
Author: \"Talpey, Thomas\ <Thomas.Talpey@netapp.com>
Date:   Mon Sep 10 13:49:15 2007 -0400

    RPCRDMA: Kconfig and header file with rpcrdma protocol definitions
    
    This file implements the configuration target, protocol template and
    constants for the rpcrdma transport framing, for use by the xprtrdma
    rpc transport implementation.
    
    Signed-off-by: Tom Talpey <talpey@netapp.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 56928edd5afb51d684c38c0bed56594e93ffe4c7
Author: \"Talpey, Thomas\ <Thomas.Talpey@netapp.com>
Date:   Mon Sep 10 13:48:47 2007 -0400

    NFS - print accurate transport protocol
    
    Use the per-transport strings to display the transport protocol accurately.
    
    Signed-off-by: Tom Talpey <tmt@netapp.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 0896a725a1c5fdc8773a4d1ab0b73059507f5925
Author: \"Talpey, Thomas\ <Thomas.Talpey@netapp.com>
Date:   Mon Sep 10 13:48:23 2007 -0400

    NFS/SUNRPC: use transport protocol naming
    
    Instead of an { address family, raw IP protocol number }-tuple, use the
    newly-defined RPC identifier when creating clients in the upper layers.
    
    Signed-off-by: Tom Talpey <tmt@netapp.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 4fa016eb248cac875541fa199af550a8aefa0e90
Author: \"Talpey, Thomas\ <Thomas.Talpey@netapp.com>
Date:   Mon Sep 10 13:47:57 2007 -0400

    NFS/SUNRPC: support transport protocol naming
    
    To prepare for including non-sockets-based RPC transports, select
    RPC transports by an identifier (to be used in following patches).
    
    Signed-off-by: Tom Talpey <tmt@netapp.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 49c36fcc441baf6a4d698e3645d1adf28edaf57b
Author: \"Talpey, Thomas\ <Thomas.Talpey@netapp.com>
Date:   Mon Sep 10 13:47:31 2007 -0400

    SUNRPC: rearrange RPC sockets definitions
    
    To prepare for including non-sockets-based RPC transports, move the
    sockets-dependent definitions into their own file.
    
    Signed-off-by: Tom Talpey <tmt@netapp.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 3c341b0b925eee01daae2c594b81e673f659d7cd
Author: \"Talpey, Thomas\ <Thomas.Talpey@netapp.com>
Date:   Mon Sep 10 13:47:07 2007 -0400

    SUNRPC: rename the rpc_xprtsock_create structure
    
    To prepare for including non-sockets-based RPC transports, change the
    overly suggestive name of the transport creation arguments struct.
    
    Signed-off-by: Tom Talpey <tmt@netapp.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit bc25571e21e8bd053554209f5b1b228ad71e6b99
Author: \"Talpey, Thomas\ <Thomas.Talpey@netapp.com>
Date:   Mon Sep 10 13:46:39 2007 -0400

    SUNRPC: Finish API to load RPC transport implementations dynamically
    
    Allow RPC client transport implementations to be loaded as needed, or
    as they become available from distributors or third-party vendors.
    
    Note that we leave the IP sockets implementation in sunrpc.o
    permanently, as IP functionality is always available in any
    kernel that runs NFS.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Tom Talpey <tmt@netapp.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 81c098af3da7981902e9f8163aeccc2467c4ba6d
Author: \"Talpey, Thomas\ <Thomas.Talpey@netapp.com>
Date:   Mon Sep 10 13:46:00 2007 -0400

    SUNRPC: Provide a new API for registering transport implementations
    
    To allow transport capabilities to be loaded dynamically, provide an API
    for registering and unregistering the transports with the RPC client.
    Eventually xprt_create_transport() will be changed to search the list of
    registered transports when initializing a fresh transport.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Tom Talpey <tmt@netapp.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 1244480976d357447aeddd3f44977586bfa0462b
Author: \"Talpey, Thomas\ <Thomas.Talpey@netapp.com>
Date:   Mon Sep 10 13:45:36 2007 -0400

    SUNRPC: add EXPORT_SYMBOL_GPL for generic transport functions
    
    SUNRPC: add EXPORT_SYMBOL_GPL for generic transport functions
    
    As a preface to allowing arbitrary transport modules to be loaded
    dynamically, add EXPORT_SYMBOL_GPL for all generic transport functions
    that a transport implementation might want to use.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Acked-by: Tom Talpey <tmt@netapp.com>
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 4f22ccc3460ef65e9899ec271d36fc4ef795c68d
Author: \"Talpey, Thomas\ <Thomas.Talpey@netapp.com>
Date:   Mon Sep 10 13:44:58 2007 -0400

    SUNRPC: mark bulk read/write data in xdrbuf
    
    Adds a flag word to the xdrbuf struct which indicates any bulk
    disposition of the data. This enables RPC transport providers to
    marshal it efficiently/appropriately, and may enable other
    optimizations.
    
    Signed-off-by: Tom Talpey <tmt@netapp.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 20c71f5e0f954b00d75009542db2c1f844d94a1e
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Thu Sep 20 20:23:51 2007 -0400

    NFSv4: Fix a bug in nfs4_validate_mount_data()
    
    The previous patch introduced a bug when copying the server address.
    
    Also clarify a copy into the auth_flavours array: currently the two
    size calculations are equivalent, but we may decide to change the size
    of auth_flavors[] at some point.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 91ea40b9c6303ddab5c84f078f96b29084b45817
Author: \"Talpey, Thomas\ <Thomas.Talpey@netapp.com>
Date:   Mon Sep 10 13:44:33 2007 -0400

    NFS: use in-kernel mount argument structure for nfsv4 mounts
    
    The user-visible nfs4_mount_data does not contain sufficient data to
    describe new mount options, and also is now a legacy structure. Replace
    it with the internal nfs_parsed_mount_data for nfsv4 in-kernel use.
    
    Signed-off-by: Tom Talpey <tmt@netapp.com>
    Acked-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 2283f8d6ed21ea2221df4cc329314b93f35351b0
Author: \"Talpey, Thomas\ <Thomas.Talpey@netapp.com>
Date:   Mon Sep 10 13:43:56 2007 -0400

    NFS: use in-kernel mount argument structure for nfsv[23] mounts
    
    The user-visible nfs_mount_data does not contain sufficient data to
    describe new mount options, and also is now a legacy structure. Replace
    it with the internal nfs_parsed_mount_data for nfsv[23] in-kernel use.
    
    Signed-off-by: Tom Talpey <tmt@netapp.com>
    Acked-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 6b18eaa0821a559c5e2b7ed4b90f8aca5a8e6228
Author: \"Talpey, Thomas\ <Thomas.Talpey@netapp.com>
Date:   Mon Sep 10 13:43:29 2007 -0400

    NFS: move nfs_parsed_mount_data structure definition
    
    In preparation for rearranging the nfs mount argument passing, make the
    nfs_parsed_mount_data struct visible across nfs kernel files.
    
    Signed-off-by: Tom Talpey <tmt@netapp.com>
    Acked-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 4417c8c41a51a2ae95b2a2fa2811640b368c4151
Author: \"Talpey, Thomas\ <Thomas.Talpey@netapp.com>
Date:   Mon Sep 10 13:43:05 2007 -0400

    SUNRPC: export per-transport rpcbind netid's
    
    The rpcbind (v3+) netid is provided by each RPC client transport. This fixes
    an omission in IPv6 rpcbind client support, and enables future extension.
    
    Signed-off-by: Tom Talpey <tmt@netapp.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 4f40ee4a02a2d017b714d5b2faaf5c25bf9eae47
Author: \"Talpey, Thomas\ <Thomas.Talpey@netapp.com>
Date:   Mon Sep 10 13:42:38 2007 -0400

    SUNRPC: move per-transport rpcbind netid's
    
    Move the TCP/UDP rpcbind netid's from the rpcbind client to a global header.
    
    Signed-off-by: Tom Talpey <tmt@netapp.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 817cb9d43d4c330f9fc023d96e5beaa1abe8c4b7
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Tue Sep 11 18:01:20 2007 -0400

    NFSD: Convert printk's to dprintk's in NFSD's nfs4xdr
    
    Due to recent edict to remove or replace printk's that can flood the system
    log.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit e159a08b6ab14e255536fddae75d448395295c6f
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Tue Sep 11 18:01:15 2007 -0400

    LOCKD: Convert printk's to dprintk's in lockd XDR routines
    
    Due to recent edict to remove or replace printk's that might flood the
    system log.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit fe82a183ca3c9188945c4ebeebc2ca45abfa24e5
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Tue Sep 11 18:01:10 2007 -0400

    NFS: Convert printk's to dprintk's in fs/nfs/nfs?xdr.c
    
    Due to recent edict to replace or remove printk's that can be triggered en
    masse by remote misbehavior.  Left a few that only occur just before a BUG.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 0ac83779fa5bffb90e32a97abc61f1840af31ee9
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Tue Sep 11 18:01:04 2007 -0400

    NFS: Add new 'mountaddr=' mount option
    
    I got the 'mounthost=' option wrong - it shouldn't look for an address
    value, but rather a hostname value.  However, the in-kernel mount client
    and NFS client cannot resolve a hostname by themselves; they rely on
    user-land to pass in the resolved address.
    
    Create a new mount option that does take an address so that the mount
    program's address can be passed in.  The mount hostname is now ignored
    by the kernel.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit aad700073557c7932ef9f81c19a5e0647f8a6850
Author: James Lentini <jlentini@netapp.com>
Date:   Mon Sep 24 17:32:49 2007 -0400

    [NFS] [PATCH] NFS: initialize default port in kernel mount client
    
    If no mount server port number is specified, the previous change to the
    kernel mount client inadvertently allows the NFS server's port number to be
    the used as the mount server's port number. If the user specifies an NFS
    server port (-o port=x), the mount will fail.
    
    The fix below sets the mount server's port to 0 if no mount server
    port is specified by the user.
    
    Signed-off-by: James Lentini <jlentini@netapp.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit efd8340bb19c26a43e77c92fee9283b1f5777204
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Tue Sep 11 18:00:58 2007 -0400

    NFS: Kernel mount client should use async bind
    
    Simplify the in-kernel mount client by using autobind instead of an
    explicit call to rpc_getport_sync.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit b79dc8ced1412e7056f3969bef40a30cc75ee530
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Tue Sep 11 18:00:52 2007 -0400

    SUNRPC: RPC bind failures should be permanent for NULL requests
    
    The purpose of an RPC ping (a NULL request) is to determine whether the
    remote end is operating and supports the RPC program and version of the
    request.
    
    If we do an RPC bind and the remote's rpcbind service says "this
    program or service isn't supported" then we have our answer already,
    and we should give up immediately.
    
    This is good for the kernel mount client, as it will cause the request
    to fail, and then allow an immediate retry with different options.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 906462af4c707ba0238f3579fdb2b594c4ea29c3
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Tue Sep 11 18:00:47 2007 -0400

    SUNRPC: Split another new rpcbind retry error code from EACCES
    
    Add more new error code processing to the kernel's rpcbind client
    and to call_bind_status() to distinguish two cases:
    
    Case 1: the remote has replied that the program/version tuple is not
    registered (returns EACCES)
    
    Case 2: retry with a lesser rpcbind version (rpcb now returns EPFNOSUPPORT)
    
    This change allows more specific error processing for each of these two
    cases.  We now fail case 2 instead of retrying... it's a server
    configuration error not to support even rpcbind version 2.  And don't
    expose this new error code to user land -- convert it to EIO before
    failing the RPC.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 2429cbf6a1566b8e92436d615387e4250feab46b
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Tue Sep 11 18:00:41 2007 -0400

    SUNRPC: Add a new error code for retry waiting for another binder
    
    Add new error code processing to the kernel's rpcbind client and to
    call_bind_status() to distinguish two cases:
    
    Case 1: the remote has replied that the program/version tuple is not
    registered (returns -EACCES)
    
    Case 2: another process is already in the middle of binding on this
    transport (now returns -EAGAIN)
    
    This change allows more specific retry processing for each of these two
    cases.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 4784cb51a3f66d401f8a08810231aa7dc8f44e43
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Tue Sep 11 18:00:36 2007 -0400

    SUNRPC: Retry bad rpcbind replies
    
    When a server returns a bad rpcbind reply, make rpcbind client recovery logic
    retry with an older protocol version.  Older versions are more likely to work
    correctly.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit e65fe3976f594603ed7b1b4a99d3e9b867f573ea
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Tue Sep 11 18:00:31 2007 -0400

    SUNRPC: Make rpcb_decode_getaddr more picky about universal addresses
    
    Add better sanity checking of server replies to the GETVERSADDR reply
    decoder.  Change the error return code: EIO is what other XDR decoding
    routines return if there is a failure while decoding.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit d66968f207b6402fd12c20145cb31dbe3608979c
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Tue Sep 11 18:00:25 2007 -0400

    SUNRPC: Clean up in rpc_show_tasks
    
    /home/cel/linux/net/sunrpc/clnt.c: In function ‘rpc_show_tasks’:
    /home/cel/linux/net/sunrpc/clnt.c:1538: warning:
    	signed and unsigned type in conditional expression
    
    This points out another case where a conditional expression returns a
    signed value in one arm and an unsigned value in the other.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 06b8d2552d50f802a3277137a565febcd59ef037
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Tue Sep 11 18:00:20 2007 -0400

    SUNRPC: Make sure server name is reasonable before trying to print it
    
    Check the length of the passed-in server name before trying to print it in
    the log.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 6d0aa06afd62a868d83c6021335622a316469527
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Tue Sep 11 18:00:15 2007 -0400

    SUNRPC: Use correct argument type in memcpy()
    
    Noticed by Tom Talpey <tmt@netapp.com>:
    
    OBTW, there's a nit on that memcpy, too. The r_addr is an array, so
    
    memcpy(&map->r_addr
    
    is passing the address of the array as a char **. It's the same as
    map->r_addr, but technically the wrong type.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 89eb21c35b61b5157940e1b78c2c6d0529d11c63
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Tue Sep 11 18:00:09 2007 -0400

    SUNRPC: fix a signed v. unsigned comparison nit in rpc_bind_new_program
    
    /home/cel/linux/net/sunrpc/clnt.c: In function ‘rpc_bind_new_program’:
    /home/cel/linux/net/sunrpc/clnt.c:445: warning:
    	comparison between signed and unsigned
    
    RPC version numbers are u32, not int.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 5d34da3af923e0f950a89f160540d2506ca046ce
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Tue Sep 11 18:00:03 2007 -0400

    SUNRPC: Only one dprintk is needed during client creation
    
    Remove one of two identical dprintk's that occur when an RPC client is
    created.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 143b6c4008a7928de7e139c3a77a90e4cad8db2c
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Thu Aug 16 16:03:31 2007 -0400

    SUNRPC: Fix generation of universal addresses for
    
    Fix some problems with rpcbind v3 and v4 queries from the in-kernel rpcbind
    client:
    
    1.  The r_addr argument must be a full universal address, not just an IP
    address, and
    
    2.  The universal address in r_addr is the address of the remote rpcbind
    server, not the RPC service being requested
    
    This addresses bugzilla.kernel.org report 8891 for 2.6.23-rc and greater.
    
    In addition, if the rpcbind client is unable to start the rpcbind request,
    make sure not to leak the xprt.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>

commit 756805e7a76bcd2aae07fe31786fe453375e60b1
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Thu Aug 16 16:03:26 2007 -0400

    SUNRPC: Add support for formatted universal addresses
    
    "Universal addresses" are a string representation of an IP address and
    port.  They are described fully in RFC 3530, section 2.2.  Add support
    for generating them in the RPC client's socket transport module.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>

commit 8945ee5e27156ef9708bc3a11da87ba689aa38b6
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Mon Aug 6 11:58:04 2007 -0400

    SUNRPC: Split xs_reclassify_socket into an IPv4 and IPv6 version
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 95392c593e13fa7546857425971f87e4ded6e0c1
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Mon Aug 6 11:57:58 2007 -0400

    SUNRPC: Add a helper for extracting the address using the correct type
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 8f9d5b1a2e717fb9e0c4d2c60a224ecce905bd23
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Mon Aug 6 11:57:53 2007 -0400

    SUNRPC: Add IPv6 address support to net/sunrpc/xprtsock.c
    
    Finalize support for setting up RPC client transports to remote RPC
    services addressed via IPv6.
    
    Based on work done by Gilles Quillard at Bull Open Source.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 68e220bd5c9fc52d8029275cd42e08f573ce3600
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Mon Aug 6 11:57:48 2007 -0400

    SUNRPC: create connect workers for IPv6
    
    Clone separate connect worker functions for connecting AF_INET6 sockets.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 9c3d72de28eed3e882becd7054da2118f8a73131
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Mon Aug 6 11:57:43 2007 -0400

    SUNRPC: Rename IPv4 connect workers
    
    Prepare for introduction of IPv6 versions of same.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 16be2d20d999cb65daebfdaf0e560225e28fcb9d
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Mon Aug 6 11:57:38 2007 -0400

    SUNRPC: Refactor a part of socket connect logic into a helper function
    
    Finishing a socket connect is the same for IPv4 and IPv6, so split it out
    into a helper.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 90058d37c30ffce0e033ea3dcc6a539111483a58
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Mon Aug 6 11:57:33 2007 -0400

    SUNRPC: create an IPv6-savvy mechanism for binding to a reserved port
    
    Clone xs_bindresvport into two functions, one that can handle IPv4
    addresses, and one that can handle IPv6 addresses.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 7dc753f0391ad94868609376f37be4833671b57d
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Mon Aug 6 11:57:28 2007 -0400

    SUNRPC: Rename xs_bind() to prepare for IPv6-specific bind method
    
    Prepare for introduction of IPv6-specific socket bind function.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 20612005c51a5ba1bb17902276b9216825958724
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Mon Aug 6 11:57:23 2007 -0400

    SUNRPC: Introduce support for setting the port number in IPv6 addresses
    
    We could clone xs_set_port, but this is easier overall.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit d5b6443014940eb83788161c69b17cb7a1ffaaed
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Mon Aug 6 11:57:18 2007 -0400

    SUNRPC: add support for IPv6 to the kernel's rpcbind client
    
    Prepare for adding IPv6 support to the RPC client by adding IPv6
    capabilities to rpcbind.  Note that this is support on the query side
    only; registering IPv6 addresses with the local portmapper will come
    later.
    
    Note we have to take care not to fall back to using version 2 of the
    rpcbind protocol if we're dealing with IPv6 address.  Version 2 doesn't
    support IPv6 at all.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 4b6473fba4e832ee1d15737bc38779501c349a61
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Mon Aug 6 11:57:12 2007 -0400

    SUNRPC: add a function to format IPv6 addresses
    
    Clone xs_format_ipv4_peer_addresses into an IPv6 version.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit ba10f2c23471b2ef106eb0c71ead4e9862766b8d
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Mon Aug 6 11:57:07 2007 -0400

    SUNRPC: Rename xs_format_peer_addresses
    
    Prepare to add an IPv6 version of xs_format_peer_addresses by renaming it
    to xs_format_ipv4_peer_addresses.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit fbfe3cc677c1a62ca6472abf24d03d4bf9f03a55
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Mon Aug 6 11:57:02 2007 -0400

    SUNRPC: Add hex-formatted address support to rpc_peeraddr2str()
    
    Add support for the NFS client's need to export volume information
    with IP addresses formatted in hex instead of decimal.
    
    This isn't used yet, but subsequent patches (not in this series) will
    change the NFS client to use this functionality.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 0c43b3d81cca46ab2469f8802f8bd68b49f1b2a5
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Mon Aug 6 11:56:57 2007 -0400

    SUNRPC: Free address buffers in a loop
    
    Use more generic logic to free buffers holding formatted addresses.  This
    makes it less likely a bug will be introduced when adding additional buffer
    types in xs_format_peer_address().
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit bda243df2f5beebce92bae22bc01960544783984
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Mon Aug 6 11:56:52 2007 -0400

    SUNRPC: Use standard macros for printing IP addresses
    
    include/linux/kernel.h gives us some nice macros for formatting IP
    addresses.  Use them.
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit b595bb15061567441546be1af883b256bcdfff9c
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Mon Aug 6 11:56:42 2007 -0400

    SUNRPC: Fix a signed v. unsigned comparison in net/sunrpc/xprtsock.c
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit adc24df81d6903d2d6280d77e936aa5e1670d7a1
Author: Chuck Lever <chuck.lever@oracle.com>
Date:   Mon Aug 6 11:56:31 2007 -0400

    SUNRPC: Fix a signed v. unsigned comparison in rpcbind's XDR routines
    
    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit ddc01c0813dc07ca7a2bd32c143a9b54a64915ce
Author: Jeff Layton <jlayton@redhat.com>
Date:   Mon Jul 30 08:47:38 2007 -0400

    [NFS] [PATCH] NFS: show addr=ipaddr in /proc/mounts rather than
    
    A minor thing, but useful when working with a server with multiple
    addrs. This looks like it might also be necessary if Miklos' effort
    to eliminate /etc/mtab ever comes to fruition.
    
    When displaying mount options in /proc/mounts, the kernel prints
    "addr=hostname". This info is redundant since we already have the
    hostname displayed as part of the "device" section of the mount. This
    patch changes it to display the IP address to which the socket is
    connected.
    
    Signed-off-by: Jeff Layton <jlayton@redhat.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit f8cf3678f4885d66475b19b10ec2e9918ff2ca1c
Author: Christoph Hellwig <hch@lst.de>
Date:   Fri Aug 3 16:20:32 2007 +0200

    [NFS] [PATCH] nfs: tiny makefile cleanup
    
    no need to set up foo-objs these days.
    
    Signed-off-by: Christoph Hellwig <hch@lst.de>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit c7e15961115028b99f6142266b5fb08acca0e8dd
Author: Fabio Olive Leite <fleite@redhat.com>
Date:   Thu Jul 26 22:59:00 2007 -0300

    Re: [NFS] [PATCH] Attribute timeout handling and wrapping u32 jiffies
    
    I would like to discuss the idea that the current checks for attribute
    timeout using time_after are inadequate for 32bit architectures, since
    time_after works correctly only when the two timestamps being compared
    are within 2^31 jiffies of each other. The signed overflow caused by
    comparing values more than 2^31 jiffies apart will flip the result,
    causing incorrect assumptions of validity.
    
    2^31 jiffies is a fairly large period of time (~25 days) when compared
    to the lifetime of most kernel data structures, but for long lived NFS
    mounts that can sit idle for months (think that for some reason autofs
    cannot be used), it is easy to compare inode attribute timestamps with
    very disparate or even bogus values (as in when jiffies have wrapped
    many times, where the comparison doesn't even make sense).
    
    Currently the code tests for attribute timeout by simply adding the
    desired amount of jiffies to the stored timestamp and comparing that
    with the current timestamp of obtained attribute data with time_after.
    This is incorrect, as it returns true for the desired timeout period
    and another full 2^31 range of jiffies.
    
    In testing with artificial jumps (several small jumps, not one big
    crank) of the jiffies I was able to reproduce a problem found in a
    server with very long lived NFS mounts, where attributes would not be
    refreshed even after touching files and directories in the server:
    
    Initial uptime:
    03:42:01 up 6 min, 0 users, load average: 0.01, 0.12, 0.07
    
    NFS volume is mounted and time is advanced:
    03:38:09 up 25 days, 2 min, 0 users, load average: 1.22, 1.05, 1.08
    
    # ls -l /local/A/foo/bar /nfs/A/foo/bar
    -rw-r--r--  1 root root 0 Dec 17 03:38 /local/A/foo/bar
    -rw-r--r--  1 root root 0 Nov 22 00:36 /nfs/A/foo/bar
    
    # touch /local/A/foo/bar
    
    # ls -l /local/A/foo/bar /nfs/A/foo/bar
    -rw-r--r--  1 root root 0 Dec 17 03:47 /local/A/foo/bar
    -rw-r--r--  1 root root 0 Nov 22 00:36 /nfs/A/foo/bar
    
    We can see the local mtime is updated, but the NFS mount still shows
    the old value. The patch below makes it work:
    
    Initial setup...
    07:11:02 up 25 days, 1 min,  0 users,  load average: 0.15, 0.03, 0.04
    
    # ls -l /local/A/foo/bar /nfs/A/foo/bar
    -rw-r--r--  1 root root 0 Jan 11 07:11 /local/A/foo/bar
    -rw-r--r--  1 root root 0 Jan 11 07:11 /nfs/A/foo/bar
    
    # touch /local/A/foo/bar
    
    # ls -l /local/A/foo/bar /nfs/A/foo/bar
    -rw-r--r--  1 root root 0 Jan 11 07:14 /local/A/foo/bar
    -rw-r--r--  1 root root 0 Jan 11 07:14 /nfs/A/foo/bar
    
    Signed-off-by: Fabio Olive Leite <fleite@redhat.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 4e769b934e7638038e232c05b64f644e7269a90f
Author: Peter Staubach <staubach@redhat.com>
Date:   Fri Aug 3 15:07:10 2007 -0400

    64 bit ino support for NFS client
    
    Hi.
    
    Attached is a patch to modify the NFS client code to support
    64 bit ino's, as appropriate for the system and the NFS
    protocol version.
    
    The code basically just expand the NFS interfaces for routines
    which handle ino's from using ino_t to u64 and then uses the
    fileid in the nfs_inode instead of i_ino in the inode.  The
    code paths that were updated are in the getattr method and
    the readdir methods.
    
    This should be no real change on 64 bit platforms.  Since
    the ino_t is an unsigned long, it would already be 64 bits
    wide.
    
        Thanx...
    
               ps
    
    Signed-off-by: Peter Staubach <staubach@redhat.com>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 50e437d522a6cc34a882b2f740297f1b6b4c3af3
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Thu Jun 7 22:44:34 2007 -0400

    SUNRPC: Convert rpc_pipefs to use the generic filesystem notification hooks
    
    This will allow rpc.gssd to use inotify instead of dnotify in order to
    locate new rpc upcall pipes.
    
    This also requires the exporting of __audit_inode_child(), which is used by
    fsnotify_create() and fsnotify_mkdir(). Ccing David Woodhouse.
    
    Cc: David Woodhouse <dwmw2@infradead.org>
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 7b159fc18d417980f57aef64cab3417ee6af70f8
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Wed Jul 25 14:09:54 2007 -0400

    NFS: Fall back to synchronous writes when a background write errors...
    
    This helps prevent huge queues of background writes from building up
    whenever the server runs out of disk or quota space, or if someone changes
    the file access modes behind our backs.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 34901f70d119d88126e7390351b8c780646628e1
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Wed Jul 25 14:09:54 2007 -0400

    NFS: Writeback optimisation
    
    Schedule writes using WB_SYNC_NONE first, then come back for a second pass
    using WB_SYNC_ALL.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit ed90ef51a33f572fa7d00c8b05f7457be727e74f
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Fri Jul 20 13:13:28 2007 -0400

    NFS: Clean up NFS writeback flush code
    
    The only user of nfs_sync_mapping_range() is nfs_getattr(), which uses it
    to flush out the entire inode without sending a commit. We therefore
    replace nfs_sync_mapping_range with a more appropriate helper.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 90e9a3f9b0a14198a8ae5a0a5c13ad30f0b8b40d
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Sun Jul 22 19:27:46 2007 -0400

    VFS: Remove writeback_control->fs_private
    
    The only user of this field was NFS.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit f758c885199611504ff681e3ba66c410b4e9e995
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Sun Jul 22 19:27:32 2007 -0400

    NFS: Clean up nfs_writepages()
    
    Just call write_cache_pages directly instead of hacking the writeback
    control structure in order to find out if we were called from writepages()
    or directly from the VM.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 9cccef95052c7169040c3577e17d4f6fa230cc28
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Sun Jul 22 17:09:05 2007 -0400

    NFS: Clean up write code...
    
    The addition of nfs_page_mkwrite means that We should no longer need to
    create requests inside nfs_writepage()
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 94387fb1aa16ee853d00f959373132a181b0196b
Author: Trond Myklebust <Trond.Myklebust@netapp.com>
Date:   Sun Jul 22 17:09:05 2007 -0400

    NFS: Add the helper nfs_vm_page_mkwrite
    
    This is needed in order to set up a proper nfs_page request for mmapped
    files.
    
    Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 Documentation/kernel-parameters.txt |    7 
 fs/Kconfig                          |    8 
 fs/lockd/mon.c                      |    3 
 fs/lockd/xdr.c                      |    8 
 fs/lockd/xdr4.c                     |    8 
 fs/nfs/Makefile                     |    1 
 fs/nfs/client.c                     |   49 
 fs/nfs/delegation.c                 |    2 
 fs/nfs/dir.c                        |  263 +---
 fs/nfs/direct.c                     |    8 
 fs/nfs/file.c                       |  105 +
 fs/nfs/inode.c                      |  273 ++--
 fs/nfs/internal.h                   |   50 
 fs/nfs/nfs2xdr.c                    |   20 
 fs/nfs/nfs3acl.c                    |    2 
 fs/nfs/nfs3proc.c                   |   17 
 fs/nfs/nfs3xdr.c                    |   25 
 fs/nfs/nfs4proc.c                   |   85 -
 fs/nfs/nfs4state.c                  |    2 
 fs/nfs/nfs4xdr.c                    |   72 -
 fs/nfs/nfsroot.c                    |    3 
 fs/nfs/proc.c                       |    5 
 fs/nfs/read.c                       |    9 
 fs/nfs/super.c                      |  389 ++----
 fs/nfs/unlink.c                     |    3 
 fs/nfs/write.c                      |  199 +--
 fs/nfsd/nfs4xdr.c                   |   16 
 include/linux/jiffies.h             |    4 
 include/linux/nfs_fs.h              |   78 -
 include/linux/nfs_page.h            |    1 
 include/linux/nfs_xdr.h             |    6 
 include/linux/sunrpc/clnt.h         |    2 
 include/linux/sunrpc/debug.h        |    5 
 include/linux/sunrpc/msg_prot.h     |   13 
 include/linux/sunrpc/rpc_rdma.h     |  116 +
 include/linux/sunrpc/xdr.h          |    5 
 include/linux/sunrpc/xprt.h         |   42 
 include/linux/sunrpc/xprtrdma.h     |   85 +
 include/linux/sunrpc/xprtsock.h     |   51 
 include/linux/writeback.h           |    2 
 kernel/auditsc.c                    |    1 
 net/sunrpc/Makefile                 |    1 
 net/sunrpc/auth_gss/gss_krb5_wrap.c |    6 
 net/sunrpc/clnt.c                   |   52 
 net/sunrpc/rpc_pipe.c               |    8 
 net/sunrpc/rpcb_clnt.c              |  149 +-
 net/sunrpc/sched.c                  |    2 
 net/sunrpc/socklib.c                |    3 
 net/sunrpc/sunrpc_syms.c            |    2 
 net/sunrpc/timer.c                  |    4 
 net/sunrpc/xprt.c                   |  116 +
 net/sunrpc/xprtrdma/Makefile        |    3 
 net/sunrpc/xprtrdma/rpc_rdma.c      |  868 +++++++++++++
 net/sunrpc/xprtrdma/transport.c     |  800 ++++++++++++
 net/sunrpc/xprtrdma/verbs.c         | 1626 ++++++++++++++++++++++++++
 net/sunrpc/xprtrdma/xprt_rdma.h     |  330 +++++
 net/sunrpc/xprtsock.c               |  504 ++++++--
 57 files changed, 5414 insertions(+), 1103 deletions(-)

diff -puN Documentation/kernel-parameters.txt~git-nfs Documentation/kernel-parameters.txt
--- a/Documentation/kernel-parameters.txt~git-nfs
+++ a/Documentation/kernel-parameters.txt
@@ -1082,6 +1082,13 @@ and is between 256 and 4096 characters. 
 			[NFS] set the maximum lifetime for idmapper cache
 			entries.
 
+	nfs.enable_ino64=
+			[NFS] enable 64-bit inode numbers.
+			If zero, the NFS client will fake up a 32-bit inode
+			number for the readdir() and stat() syscalls instead
+			of returning the full 64-bit number.
+			The default is to return 64-bit inode numbers.
+
 	nmi_watchdog=	[KNL,BUGS=X86-32] Debugging features for SMP kernels
 
 	no387		[BUGS=X86-32] Tells the kernel to use the 387 maths
diff -puN fs/Kconfig~git-nfs fs/Kconfig
--- a/fs/Kconfig~git-nfs
+++ a/fs/Kconfig
@@ -1755,6 +1755,14 @@ config SUNRPC
 config SUNRPC_GSS
 	tristate
 
+config SUNRPC_XPRT_RDMA
+	tristate "RDMA transport for sunrpc (EXPERIMENTAL)"
+	depends on SUNRPC && INFINIBAND && EXPERIMENTAL
+	default m
+	help
+	  Adds a client RPC transport for supporting kernel NFS over RDMA
+	  mounts, including Infiniband and iWARP. Experimental.
+
 config SUNRPC_BIND34
 	bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)"
 	depends on SUNRPC && EXPERIMENTAL
diff -puN fs/lockd/mon.c~git-nfs fs/lockd/mon.c
--- a/fs/lockd/mon.c~git-nfs
+++ a/fs/lockd/mon.c
@@ -10,6 +10,7 @@
 #include <linux/utsname.h>
 #include <linux/kernel.h>
 #include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/xprtsock.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/sm_inter.h>
@@ -132,7 +133,7 @@ nsm_create(void)
 		.sin_port	= 0,
 	};
 	struct rpc_create_args args = {
-		.protocol	= IPPROTO_UDP,
+		.protocol	= XPRT_TRANSPORT_UDP,
 		.address	= (struct sockaddr *)&sin,
 		.addrsize	= sizeof(sin),
 		.servername	= "localhost",
diff -puN fs/lockd/xdr.c~git-nfs fs/lockd/xdr.c
--- a/fs/lockd/xdr.c~git-nfs
+++ a/fs/lockd/xdr.c
@@ -62,8 +62,9 @@ static __be32 *nlm_decode_cookie(__be32 
 	}
 	else 
 	{
-		printk(KERN_NOTICE
-			"lockd: bad cookie size %d (only cookies under %d bytes are supported.)\n", len, NLM_MAXCOOKIELEN);
+		dprintk("lockd: bad cookie size %d (only cookies under "
+			"%d bytes are supported.)\n",
+				len, NLM_MAXCOOKIELEN);
 		return NULL;
 	}
 	return p;
@@ -84,8 +85,7 @@ nlm_decode_fh(__be32 *p, struct nfs_fh *
 	unsigned int	len;
 
 	if ((len = ntohl(*p++)) != NFS2_FHSIZE) {
-		printk(KERN_NOTICE
-			"lockd: bad fhandle size %d (should be %d)\n",
+		dprintk("lockd: bad fhandle size %d (should be %d)\n",
 			len, NFS2_FHSIZE);
 		return NULL;
 	}
diff -puN fs/lockd/xdr4.c~git-nfs fs/lockd/xdr4.c
--- a/fs/lockd/xdr4.c~git-nfs
+++ a/fs/lockd/xdr4.c
@@ -64,8 +64,9 @@ nlm4_decode_cookie(__be32 *p, struct nlm
 	}
 	else 
 	{
-		printk(KERN_NOTICE
-			"lockd: bad cookie size %d (only cookies under %d bytes are supported.)\n", len, NLM_MAXCOOKIELEN);
+		dprintk("lockd: bad cookie size %d (only cookies under "
+			"%d bytes are supported.)\n",
+				len, NLM_MAXCOOKIELEN);
 		return NULL;
 	}
 	return p;
@@ -86,8 +87,7 @@ nlm4_decode_fh(__be32 *p, struct nfs_fh 
 	memset(f->data, 0, sizeof(f->data));
 	f->size = ntohl(*p++);
 	if (f->size > NFS_MAXFHSIZE) {
-		printk(KERN_NOTICE
-			"lockd: bad fhandle size %d (should be <=%d)\n",
+		dprintk("lockd: bad fhandle size %d (should be <=%d)\n",
 			f->size, NFS_MAXFHSIZE);
 		return NULL;
 	}
diff -puN fs/nfs/Makefile~git-nfs fs/nfs/Makefile
--- a/fs/nfs/Makefile~git-nfs
+++ a/fs/nfs/Makefile
@@ -16,4 +16,3 @@ nfs-$(CONFIG_NFS_V4)	+= nfs4proc.o nfs4x
 			   nfs4namespace.o
 nfs-$(CONFIG_NFS_DIRECTIO) += direct.o
 nfs-$(CONFIG_SYSCTL) += sysctl.o
-nfs-objs		:= $(nfs-y)
diff -puN fs/nfs/client.c~git-nfs fs/nfs/client.c
--- a/fs/nfs/client.c~git-nfs
+++ a/fs/nfs/client.c
@@ -23,6 +23,8 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/stats.h>
 #include <linux/sunrpc/metrics.h>
+#include <linux/sunrpc/xprtsock.h>
+#include <linux/sunrpc/xprtrdma.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
 #include <linux/nfs4_mount.h>
@@ -340,7 +342,8 @@ static void nfs_init_timeout_values(stru
 		to->to_retries = 2;
 
 	switch (proto) {
-	case IPPROTO_TCP:
+	case XPRT_TRANSPORT_TCP:
+	case XPRT_TRANSPORT_RDMA:
 		if (!to->to_initval)
 			to->to_initval = 60 * HZ;
 		if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
@@ -349,7 +352,7 @@ static void nfs_init_timeout_values(stru
 		to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
 		to->to_exponential = 0;
 		break;
-	case IPPROTO_UDP:
+	case XPRT_TRANSPORT_UDP:
 	default:
 		if (!to->to_initval)
 			to->to_initval = 11 * HZ / 10;
@@ -501,9 +504,9 @@ static int nfs_init_server_rpcclient(str
 /*
  * Initialise an NFS2 or NFS3 client
  */
-static int nfs_init_client(struct nfs_client *clp, const struct nfs_mount_data *data)
+static int nfs_init_client(struct nfs_client *clp,
+			   const struct nfs_parsed_mount_data *data)
 {
-	int proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
 	int error;
 
 	if (clp->cl_cons_state == NFS_CS_READY) {
@@ -522,8 +525,8 @@ static int nfs_init_client(struct nfs_cl
 	 * Create a client RPC handle for doing FSSTAT with UNIX auth only
 	 * - RFC 2623, sec 2.3.2
 	 */
-	error = nfs_create_rpc_client(clp, proto, data->timeo, data->retrans,
-					RPC_AUTH_UNIX, 0);
+	error = nfs_create_rpc_client(clp, data->nfs_server.protocol,
+				data->timeo, data->retrans, RPC_AUTH_UNIX, 0);
 	if (error < 0)
 		goto error;
 	nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -538,7 +541,8 @@ error:
 /*
  * Create a version 2 or 3 client
  */
-static int nfs_init_server(struct nfs_server *server, const struct nfs_mount_data *data)
+static int nfs_init_server(struct nfs_server *server,
+			   const struct nfs_parsed_mount_data *data)
 {
 	struct nfs_client *clp;
 	int error, nfsvers = 2;
@@ -551,7 +555,8 @@ static int nfs_init_server(struct nfs_se
 #endif
 
 	/* Allocate or find a client reference we can use */
-	clp = nfs_get_client(data->hostname, &data->addr, nfsvers);
+	clp = nfs_get_client(data->nfs_server.hostname,
+				&data->nfs_server.address, nfsvers);
 	if (IS_ERR(clp)) {
 		dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
 		return PTR_ERR(clp);
@@ -581,7 +586,7 @@ static int nfs_init_server(struct nfs_se
 	if (error < 0)
 		goto error;
 
-	error = nfs_init_server_rpcclient(server, data->pseudoflavor);
+	error = nfs_init_server_rpcclient(server, data->auth_flavors[0]);
 	if (error < 0)
 		goto error;
 
@@ -760,7 +765,7 @@ void nfs_free_server(struct nfs_server *
  * Create a version 2 or 3 volume record
  * - keyed on server and FSID
  */
-struct nfs_server *nfs_create_server(const struct nfs_mount_data *data,
+struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
 				     struct nfs_fh *mntfh)
 {
 	struct nfs_server *server;
@@ -906,7 +911,7 @@ error:
  * Create a version 4 volume record
  */
 static int nfs4_init_server(struct nfs_server *server,
-		const struct nfs4_mount_data *data, rpc_authflavor_t authflavour)
+		const struct nfs_parsed_mount_data *data)
 {
 	int error;
 
@@ -926,7 +931,7 @@ static int nfs4_init_server(struct nfs_s
 	server->acdirmin = data->acdirmin * HZ;
 	server->acdirmax = data->acdirmax * HZ;
 
-	error = nfs_init_server_rpcclient(server, authflavour);
+	error = nfs_init_server_rpcclient(server, data->auth_flavors[0]);
 
 	/* Done */
 	dprintk("<-- nfs4_init_server() = %d\n", error);
@@ -937,12 +942,7 @@ static int nfs4_init_server(struct nfs_s
  * Create a version 4 volume record
  * - keyed on server and FSID
  */
-struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data,
-				      const char *hostname,
-				      const struct sockaddr_in *addr,
-				      const char *mntpath,
-				      const char *ip_addr,
-				      rpc_authflavor_t authflavour,
+struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
 				      struct nfs_fh *mntfh)
 {
 	struct nfs_fattr fattr;
@@ -956,13 +956,18 @@ struct nfs_server *nfs4_create_server(co
 		return ERR_PTR(-ENOMEM);
 
 	/* Get a client record */
-	error = nfs4_set_client(server, hostname, addr, ip_addr, authflavour,
-			data->proto, data->timeo, data->retrans);
+	error = nfs4_set_client(server,
+			data->nfs_server.hostname,
+			&data->nfs_server.address,
+			data->client_address,
+			data->auth_flavors[0],
+			data->nfs_server.protocol,
+			data->timeo, data->retrans);
 	if (error < 0)
 		goto error;
 
 	/* set up the general RPC client */
-	error = nfs4_init_server(server, data, authflavour);
+	error = nfs4_init_server(server, data);
 	if (error < 0)
 		goto error;
 
@@ -971,7 +976,7 @@ struct nfs_server *nfs4_create_server(co
 	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
 
 	/* Probe the root fh to retrieve its FSID */
-	error = nfs4_path_walk(server, mntfh, mntpath);
+	error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path);
 	if (error < 0)
 		goto error;
 
diff -puN fs/nfs/delegation.c~git-nfs fs/nfs/delegation.c
--- a/fs/nfs/delegation.c~git-nfs
+++ a/fs/nfs/delegation.c
@@ -52,7 +52,7 @@ static int nfs_delegation_claim_locks(st
 	for (fl = inode->i_flock; fl != 0; fl = fl->fl_next) {
 		if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
 			continue;
-		if ((struct nfs_open_context *)fl->fl_file->private_data != ctx)
+		if (nfs_file_open_context(fl->fl_file) != ctx)
 			continue;
 		status = nfs4_lock_delegation_recall(state, fl);
 		if (status >= 0)
diff -puN fs/nfs/dir.c~git-nfs fs/nfs/dir.c
--- a/fs/nfs/dir.c~git-nfs
+++ a/fs/nfs/dir.c
@@ -200,9 +200,6 @@ int nfs_readdir_filler(nfs_readdir_descr
 	desc->timestamp = timestamp;
 	desc->timestamp_valid = 1;
 	SetPageUptodate(page);
-	spin_lock(&inode->i_lock);
-	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
-	spin_unlock(&inode->i_lock);
 	/* Ensure consistent page alignment of the data.
 	 * Note: assumes we have exclusive access to this mapping either
 	 *	 through inode->i_mutex or some other mechanism.
@@ -214,9 +211,7 @@ int nfs_readdir_filler(nfs_readdir_descr
 	unlock_page(page);
 	return 0;
  error:
-	SetPageError(page);
 	unlock_page(page);
-	nfs_zap_caches(inode);
 	desc->error = error;
 	return -EIO;
 }
@@ -407,7 +402,7 @@ int nfs_do_filldir(nfs_readdir_descripto
 	struct file	*file = desc->file;
 	struct nfs_entry *entry = desc->entry;
 	struct dentry	*dentry = NULL;
-	unsigned long	fileid;
+	u64		fileid;
 	int		loop_count = 0,
 			res;
 
@@ -418,7 +413,7 @@ int nfs_do_filldir(nfs_readdir_descripto
 		unsigned d_type = DT_UNKNOWN;
 		/* Note: entry->prev_cookie contains the cookie for
 		 *	 retrieving the current dirent on the server */
-		fileid = nfs_fileid_to_ino_t(entry->ino);
+		fileid = entry->ino;
 
 		/* Get a dentry if we have one */
 		if (dentry != NULL)
@@ -428,11 +423,12 @@ int nfs_do_filldir(nfs_readdir_descripto
 		/* Use readdirplus info */
 		if (dentry != NULL && dentry->d_inode != NULL) {
 			d_type = dt_type(dentry->d_inode);
-			fileid = dentry->d_inode->i_ino;
+			fileid = NFS_FILEID(dentry->d_inode);
 		}
 
 		res = filldir(dirent, entry->name, entry->len, 
-			      file->f_pos, fileid, d_type);
+			      file->f_pos, nfs_compat_user_ino64(fileid),
+			      d_type);
 		if (res < 0)
 			break;
 		file->f_pos++;
@@ -490,9 +486,6 @@ int uncached_readdir(nfs_readdir_descrip
 						page,
 						NFS_SERVER(inode)->dtsize,
 						desc->plus);
-	spin_lock(&inode->i_lock);
-	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
-	spin_unlock(&inode->i_lock);
 	desc->page = page;
 	desc->ptr = kmap(page);		/* matching kunmap in nfs_do_filldir */
 	if (desc->error >= 0) {
@@ -558,7 +551,7 @@ static int nfs_readdir(struct file *filp
 	memset(desc, 0, sizeof(*desc));
 
 	desc->file = filp;
-	desc->dir_cookie = &((struct nfs_open_context *)filp->private_data)->dir_cookie;
+	desc->dir_cookie = &nfs_file_open_context(filp)->dir_cookie;
 	desc->decode = NFS_PROTO(inode)->decode_dirent;
 	desc->plus = NFS_USE_READDIRPLUS(inode);
 
@@ -623,7 +616,7 @@ static loff_t nfs_llseek_dir(struct file
 	}
 	if (offset != filp->f_pos) {
 		filp->f_pos = offset;
-		((struct nfs_open_context *)filp->private_data)->dir_cookie = 0;
+		nfs_file_open_context(filp)->dir_cookie = 0;
 	}
 out:
 	mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex);
@@ -650,36 +643,18 @@ static int nfs_fsync_dir(struct file *fi
  */
 static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
 {
-	unsigned long verf;
-
 	if (IS_ROOT(dentry))
 		return 1;
-	verf = dentry->d_time;
-	if (nfs_caches_unstable(dir)
-			|| verf != NFS_I(dir)->cache_change_attribute)
+	if (!nfs_verify_change_attribute(dir, dentry->d_time))
+		return 0;
+	/* Revalidate nfsi->cache_change_attribute before we declare a match */
+	if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
+		return 0;
+	if (!nfs_verify_change_attribute(dir, dentry->d_time))
 		return 0;
 	return 1;
 }
 
-static inline void nfs_set_verifier(struct dentry * dentry, unsigned long verf)
-{
-	dentry->d_time = verf;
-}
-
-static void nfs_refresh_verifier(struct dentry * dentry, unsigned long verf)
-{
-	nfs_set_verifier(dentry, verf);
-}
-
-/*
- * Whenever an NFS operation succeeds, we know that the dentry
- * is valid, so we update the revalidation timestamp.
- */
-static inline void nfs_renew_times(struct dentry * dentry)
-{
-	dentry->d_time = jiffies;
-}
-
 /*
  * Return the intent data that applies to this particular path component
  *
@@ -695,6 +670,19 @@ static inline unsigned int nfs_lookup_ch
 }
 
 /*
+ * Use intent information to check whether or not we're going to do
+ * an O_EXCL create using this path component.
+ */
+static int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd)
+{
+	if (NFS_PROTO(dir)->version == 2)
+		return 0;
+	if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0)
+		return 0;
+	return (nd->intent.open.flags & O_EXCL) != 0;
+}
+
+/*
  * Inode and filehandle revalidation for lookups.
  *
  * We force revalidation in the cases where the VFS sets LOOKUP_REVAL,
@@ -717,6 +705,7 @@ int nfs_lookup_verify_inode(struct inode
 				(S_ISREG(inode->i_mode) ||
 				 S_ISDIR(inode->i_mode)))
 			goto out_force;
+		return 0;
 	}
 	return nfs_revalidate_inode(server, inode);
 out_force:
@@ -759,7 +748,6 @@ static int nfs_lookup_revalidate(struct 
 	int error;
 	struct nfs_fh fhandle;
 	struct nfs_fattr fattr;
-	unsigned long verifier;
 
 	parent = dget_parent(dentry);
 	lock_kernel();
@@ -767,10 +755,6 @@ static int nfs_lookup_revalidate(struct 
 	nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
 	inode = dentry->d_inode;
 
-	/* Revalidate parent directory attribute cache */
-	if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
-		goto out_zap_parent;
-
 	if (!inode) {
 		if (nfs_neg_need_reval(dir, dentry, nd))
 			goto out_bad;
@@ -785,7 +769,7 @@ static int nfs_lookup_revalidate(struct 
 	}
 
 	/* Force a full look up iff the parent directory has changed */
-	if (nfs_check_verifier(dir, dentry)) {
+	if (!nfs_is_exclusive_create(dir, nd) && nfs_check_verifier(dir, dentry)) {
 		if (nfs_lookup_verify_inode(inode, nd))
 			goto out_zap_parent;
 		goto out_valid;
@@ -794,7 +778,6 @@ static int nfs_lookup_revalidate(struct 
 	if (NFS_STALE(inode))
 		goto out_bad;
 
-	verifier = nfs_save_change_attribute(dir);
 	error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr);
 	if (error)
 		goto out_bad;
@@ -803,8 +786,7 @@ static int nfs_lookup_revalidate(struct 
 	if ((error = nfs_refresh_inode(inode, &fattr)) != 0)
 		goto out_bad;
 
-	nfs_renew_times(dentry);
-	nfs_refresh_verifier(dentry, verifier);
+	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
  out_valid:
 	unlock_kernel();
 	dput(parent);
@@ -815,7 +797,7 @@ static int nfs_lookup_revalidate(struct 
 out_zap_parent:
 	nfs_zap_caches(dir);
  out_bad:
-	NFS_CACHEINV(dir);
+	nfs_mark_for_revalidate(dir);
 	if (inode && S_ISDIR(inode->i_mode)) {
 		/* Purge readdir caches. */
 		nfs_zap_caches(inode);
@@ -872,8 +854,6 @@ static void nfs_dentry_iput(struct dentr
 		nfs_complete_unlink(dentry, inode);
 		unlock_kernel();
 	}
-	/* When creating a negative dentry, we want to renew d_time */
-	nfs_renew_times(dentry);
 	iput(inode);
 }
 
@@ -883,30 +863,6 @@ struct dentry_operations nfs_dentry_oper
 	.d_iput		= nfs_dentry_iput,
 };
 
-/*
- * Use intent information to check whether or not we're going to do
- * an O_EXCL create using this path component.
- */
-static inline
-int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd)
-{
-	if (NFS_PROTO(dir)->version == 2)
-		return 0;
-	if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0)
-		return 0;
-	return (nd->intent.open.flags & O_EXCL) != 0;
-}
-
-static inline int nfs_reval_fsid(struct inode *dir, const struct nfs_fattr *fattr)
-{
-	struct nfs_server *server = NFS_SERVER(dir);
-
-	if (!nfs_fsid_equal(&server->fsid, &fattr->fsid))
-		/* Revalidate fsid using the parent directory */
-		return __nfs_revalidate_inode(server, dir);
-	return 0;
-}
-
 static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
 {
 	struct dentry *res;
@@ -945,11 +901,6 @@ static struct dentry *nfs_lookup(struct 
 		res = ERR_PTR(error);
 		goto out_unlock;
 	}
-	error = nfs_reval_fsid(dir, &fattr);
-	if (error < 0) {
-		res = ERR_PTR(error);
-		goto out_unlock;
-	}
 	inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr);
 	res = (struct dentry *)inode;
 	if (IS_ERR(res))
@@ -958,17 +909,10 @@ static struct dentry *nfs_lookup(struct 
 no_entry:
 	res = d_materialise_unique(dentry, inode);
 	if (res != NULL) {
-		struct dentry *parent;
 		if (IS_ERR(res))
 			goto out_unlock;
-		/* Was a directory renamed! */
-		parent = dget_parent(res);
-		if (!IS_ROOT(parent))
-			nfs_mark_for_revalidate(parent->d_inode);
-		dput(parent);
 		dentry = res;
 	}
-	nfs_renew_times(dentry);
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 out_unlock:
 	unlock_kernel();
@@ -1020,28 +964,16 @@ static struct dentry *nfs_atomic_lookup(
 	}
 	dentry->d_op = NFS_PROTO(dir)->dentry_ops;
 
-	/* Let vfs_create() deal with O_EXCL */
+	/* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash
+	 * the dentry. */
 	if (nd->intent.open.flags & O_EXCL) {
-		d_add(dentry, NULL);
+		d_instantiate(dentry, NULL);
 		goto out;
 	}
 
 	/* Open the file on the server */
 	lock_kernel();
-	/* Revalidate parent directory attribute cache */
-	error = nfs_revalidate_inode(NFS_SERVER(dir), dir);
-	if (error < 0) {
-		res = ERR_PTR(error);
-		unlock_kernel();
-		goto out;
-	}
-
-	if (nd->intent.open.flags & O_CREAT) {
-		nfs_begin_data_update(dir);
-		res = nfs4_atomic_open(dir, dentry, nd);
-		nfs_end_data_update(dir);
-	} else
-		res = nfs4_atomic_open(dir, dentry, nd);
+	res = nfs4_atomic_open(dir, dentry, nd);
 	unlock_kernel();
 	if (IS_ERR(res)) {
 		error = PTR_ERR(res);
@@ -1063,8 +995,6 @@ static struct dentry *nfs_atomic_lookup(
 		}
 	} else if (res != NULL)
 		dentry = res;
-	nfs_renew_times(dentry);
-	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 out:
 	return res;
 no_open:
@@ -1076,7 +1006,6 @@ static int nfs_open_revalidate(struct de
 	struct dentry *parent = NULL;
 	struct inode *inode = dentry->d_inode;
 	struct inode *dir;
-	unsigned long verifier;
 	int openflags, ret = 0;
 
 	parent = dget_parent(dentry);
@@ -1086,8 +1015,12 @@ static int nfs_open_revalidate(struct de
 	/* We can't create new files in nfs_open_revalidate(), so we
 	 * optimize away revalidation of negative dentries.
 	 */
-	if (inode == NULL)
+	if (inode == NULL) {
+		if (!nfs_neg_need_reval(dir, dentry, nd))
+			ret = 1;
 		goto out;
+	}
+
 	/* NFS only supports OPEN on regular files */
 	if (!S_ISREG(inode->i_mode))
 		goto no_open;
@@ -1104,10 +1037,7 @@ static int nfs_open_revalidate(struct de
 	 * change attribute *before* we do the RPC call.
 	 */
 	lock_kernel();
-	verifier = nfs_save_change_attribute(dir);
 	ret = nfs4_open_revalidate(dir, dentry, openflags, nd);
-	if (!ret)
-		nfs_refresh_verifier(dentry, verifier);
 	unlock_kernel();
 out:
 	dput(parent);
@@ -1133,6 +1063,7 @@ static struct dentry *nfs_readdir_lookup
 		.len = entry->len,
 	};
 	struct inode *inode;
+	unsigned long verf = nfs_save_change_attribute(dir);
 
 	switch (name.len) {
 		case 2:
@@ -1143,6 +1074,14 @@ static struct dentry *nfs_readdir_lookup
 			if (name.name[0] == '.')
 				return dget(parent);
 	}
+
+	spin_lock(&dir->i_lock);
+	if (NFS_I(dir)->cache_validity & NFS_INO_INVALID_DATA) {
+		spin_unlock(&dir->i_lock);
+		return NULL;
+	}
+	spin_unlock(&dir->i_lock);
+
 	name.hash = full_name_hash(name.name, name.len);
 	dentry = d_lookup(parent, &name);
 	if (dentry != NULL) {
@@ -1183,12 +1122,8 @@ static struct dentry *nfs_readdir_lookup
 		dentry = alias;
 	}
 
-	nfs_renew_times(dentry);
-	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-	return dentry;
 out_renew:
-	nfs_renew_times(dentry);
-	nfs_refresh_verifier(dentry, nfs_save_change_attribute(dir));
+	nfs_set_verifier(dentry, verf);
 	return dentry;
 }
 
@@ -1198,32 +1133,40 @@ out_renew:
 int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
 				struct nfs_fattr *fattr)
 {
+	struct dentry *parent = dget_parent(dentry);
+	struct inode *dir = parent->d_inode;
 	struct inode *inode;
 	int error = -EACCES;
 
+	d_drop(dentry);
+
 	/* We may have been initialized further down */
 	if (dentry->d_inode)
-		return 0;
+		goto out;
 	if (fhandle->size == 0) {
-		struct inode *dir = dentry->d_parent->d_inode;
 		error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
 		if (error)
-			return error;
+			goto out_error;
 	}
+	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	if (!(fattr->valid & NFS_ATTR_FATTR)) {
 		struct nfs_server *server = NFS_SB(dentry->d_sb);
 		error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr);
 		if (error < 0)
-			return error;
+			goto out_error;
 	}
 	inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
 	error = PTR_ERR(inode);
 	if (IS_ERR(inode))
-		return error;
-	d_instantiate(dentry, inode);
-	if (d_unhashed(dentry))
-		d_rehash(dentry);
+		goto out_error;
+	d_add(dentry, inode);
+out:
+	dput(parent);
 	return 0;
+out_error:
+	nfs_mark_for_revalidate(dir);
+	dput(parent);
+	return error;
 }
 
 /*
@@ -1249,13 +1192,9 @@ static int nfs_create(struct inode *dir,
 		open_flags = nd->intent.open.flags;
 
 	lock_kernel();
-	nfs_begin_data_update(dir);
 	error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd);
-	nfs_end_data_update(dir);
 	if (error != 0)
 		goto out_err;
-	nfs_renew_times(dentry);
-	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	unlock_kernel();
 	return 0;
 out_err:
@@ -1283,13 +1222,9 @@ nfs_mknod(struct inode *dir, struct dent
 	attr.ia_valid = ATTR_MODE;
 
 	lock_kernel();
-	nfs_begin_data_update(dir);
 	status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev);
-	nfs_end_data_update(dir);
 	if (status != 0)
 		goto out_err;
-	nfs_renew_times(dentry);
-	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	unlock_kernel();
 	return 0;
 out_err:
@@ -1313,13 +1248,9 @@ static int nfs_mkdir(struct inode *dir, 
 	attr.ia_mode = mode | S_IFDIR;
 
 	lock_kernel();
-	nfs_begin_data_update(dir);
 	error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
-	nfs_end_data_update(dir);
 	if (error != 0)
 		goto out_err;
-	nfs_renew_times(dentry);
-	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	unlock_kernel();
 	return 0;
 out_err:
@@ -1336,12 +1267,10 @@ static int nfs_rmdir(struct inode *dir, 
 			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
 
 	lock_kernel();
-	nfs_begin_data_update(dir);
 	error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
 	/* Ensure the VFS deletes this inode */
 	if (error == 0 && dentry->d_inode != NULL)
 		clear_nlink(dentry->d_inode);
-	nfs_end_data_update(dir);
 	unlock_kernel();
 
 	return error;
@@ -1350,9 +1279,9 @@ static int nfs_rmdir(struct inode *dir, 
 static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 {
 	static unsigned int sillycounter;
-	const int      i_inosize  = sizeof(dir->i_ino)*2;
+	const int      fileidsize  = sizeof(NFS_FILEID(dentry->d_inode))*2;
 	const int      countersize = sizeof(sillycounter)*2;
-	const int      slen       = sizeof(".nfs") + i_inosize + countersize - 1;
+	const int      slen        = sizeof(".nfs")+fileidsize+countersize-1;
 	char           silly[slen+1];
 	struct qstr    qsilly;
 	struct dentry *sdentry;
@@ -1370,8 +1299,9 @@ static int nfs_sillyrename(struct inode 
 	if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
 		goto out;
 
-	sprintf(silly, ".nfs%*.*lx",
-		i_inosize, i_inosize, dentry->d_inode->i_ino);
+	sprintf(silly, ".nfs%*.*Lx",
+		fileidsize, fileidsize,
+		(unsigned long long)NFS_FILEID(dentry->d_inode));
 
 	/* Return delegation in anticipation of the rename */
 	nfs_inode_return_delegation(dentry->d_inode);
@@ -1398,19 +1328,14 @@ static int nfs_sillyrename(struct inode 
 
 	qsilly.name = silly;
 	qsilly.len  = strlen(silly);
-	nfs_begin_data_update(dir);
 	if (dentry->d_inode) {
-		nfs_begin_data_update(dentry->d_inode);
 		error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
 				dir, &qsilly);
 		nfs_mark_for_revalidate(dentry->d_inode);
-		nfs_end_data_update(dentry->d_inode);
 	} else
 		error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
 				dir, &qsilly);
-	nfs_end_data_update(dir);
 	if (!error) {
-		nfs_renew_times(dentry);
 		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 		d_move(dentry, sdentry);
 		error = nfs_async_unlink(dir, dentry);
@@ -1443,19 +1368,15 @@ static int nfs_safe_remove(struct dentry
 		goto out;
 	}
 
-	nfs_begin_data_update(dir);
 	if (inode != NULL) {
 		nfs_inode_return_delegation(inode);
-		nfs_begin_data_update(inode);
 		error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
 		/* The VFS may want to delete this inode */
 		if (error == 0)
 			drop_nlink(inode);
 		nfs_mark_for_revalidate(inode);
-		nfs_end_data_update(inode);
 	} else
 		error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
-	nfs_end_data_update(dir);
 out:
 	return error;
 }
@@ -1493,7 +1414,6 @@ static int nfs_unlink(struct inode *dir,
 	spin_unlock(&dcache_lock);
 	error = nfs_safe_remove(dentry);
 	if (!error) {
-		nfs_renew_times(dentry);
 		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	} else if (need_rehash)
 		d_rehash(dentry);
@@ -1548,9 +1468,7 @@ static int nfs_symlink(struct inode *dir
 		memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen);
 	kunmap_atomic(kaddr, KM_USER0);
 
-	nfs_begin_data_update(dir);
 	error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
-	nfs_end_data_update(dir);
 	if (error != 0) {
 		dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n",
 			dir->i_sb->s_id, dir->i_ino,
@@ -1590,15 +1508,12 @@ nfs_link(struct dentry *old_dentry, stru
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
 	lock_kernel();
-	nfs_begin_data_update(dir);
-	nfs_begin_data_update(inode);
+	d_drop(dentry);
 	error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
 	if (error == 0) {
 		atomic_inc(&inode->i_count);
-		d_instantiate(dentry, inode);
+		d_add(dentry, inode);
 	}
-	nfs_end_data_update(inode);
-	nfs_end_data_update(dir);
 	unlock_kernel();
 	return error;
 }
@@ -1701,22 +1616,16 @@ go_ahead:
 		d_delete(new_dentry);
 	}
 
-	nfs_begin_data_update(old_dir);
-	nfs_begin_data_update(new_dir);
-	nfs_begin_data_update(old_inode);
 	error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name,
 					   new_dir, &new_dentry->d_name);
 	nfs_mark_for_revalidate(old_inode);
-	nfs_end_data_update(old_inode);
-	nfs_end_data_update(new_dir);
-	nfs_end_data_update(old_dir);
 out:
 	if (rehash)
 		d_rehash(rehash);
 	if (!error) {
 		d_move(old_dentry, new_dentry);
-		nfs_renew_times(new_dentry);
-		nfs_refresh_verifier(new_dentry, nfs_save_change_attribute(new_dir));
+		nfs_set_verifier(new_dentry,
+					nfs_save_change_attribute(new_dir));
 	}
 
 	/* new dentry created? */
@@ -1842,7 +1751,7 @@ static struct nfs_access_entry *nfs_acce
 	return NULL;
 }
 
-int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
+static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_access_entry *cache;
@@ -1854,7 +1763,7 @@ int nfs_access_get_cached(struct inode *
 	cache = nfs_access_search_rbtree(inode, cred);
 	if (cache == NULL)
 		goto out;
-	if (time_after(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode)))
+	if (!time_in_range(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
 		goto out_stale;
 	res->jiffies = cache->jiffies;
 	res->cred = cache->cred;
@@ -1909,7 +1818,7 @@ found:
 	nfs_access_free_entry(entry);
 }
 
-void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
+static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
 {
 	struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL);
 	if (cache == NULL)
@@ -1957,6 +1866,24 @@ out:
 	return -EACCES;
 }
 
+static int nfs_open_permission_mask(int openflags)
+{
+	int mask = 0;
+
+	if (openflags & FMODE_READ)
+		mask |= MAY_READ;
+	if (openflags & FMODE_WRITE)
+		mask |= MAY_WRITE;
+	if (openflags & FMODE_EXEC)
+		mask |= MAY_EXEC;
+	return mask;
+}
+
+int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
+{
+	return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags));
+}
+
 int nfs_permission(struct inode *inode, int mask, struct nameidata *nd)
 {
 	struct rpc_cred *cred;
diff -puN fs/nfs/direct.c~git-nfs fs/nfs/direct.c
--- a/fs/nfs/direct.c~git-nfs
+++ a/fs/nfs/direct.c
@@ -368,7 +368,7 @@ static ssize_t nfs_direct_read(struct ki
 		return -ENOMEM;
 
 	dreq->inode = inode;
-	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
+	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 
@@ -510,7 +510,6 @@ static void nfs_direct_write_complete(st
 			nfs_direct_write_reschedule(dreq);
 			break;
 		default:
-			nfs_end_data_update(inode);
 			if (dreq->commit_data != NULL)
 				nfs_commit_free(dreq->commit_data);
 			nfs_direct_free_writedata(dreq);
@@ -533,7 +532,6 @@ static inline void nfs_alloc_commit_data
 
 static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
 {
-	nfs_end_data_update(inode);
 	nfs_direct_free_writedata(dreq);
 	nfs_zap_mapping(inode, inode->i_mapping);
 	nfs_direct_complete(dreq);
@@ -718,14 +716,12 @@ static ssize_t nfs_direct_write(struct k
 		sync = FLUSH_STABLE;
 
 	dreq->inode = inode;
-	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
+	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 
 	nfs_add_stats(inode, NFSIOS_DIRECTWRITTENBYTES, count);
 
-	nfs_begin_data_update(inode);
-
 	rpc_clnt_sigmask(clnt, &oldset);
 	result = nfs_direct_write_schedule(dreq, user_addr, count, pos, sync);
 	if (!result)
diff -puN fs/nfs/file.c~git-nfs fs/nfs/file.c
--- a/fs/nfs/file.c~git-nfs
+++ a/fs/nfs/file.c
@@ -33,6 +33,7 @@
 #include <asm/system.h>
 
 #include "delegation.h"
+#include "internal.h"
 #include "iostat.h"
 
 #define NFSDBG_FACILITY		NFSDBG_FILE
@@ -55,6 +56,8 @@ static int nfs_lock(struct file *filp, i
 static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
 static int nfs_setlease(struct file *file, long arg, struct file_lock **fl);
 
+static struct vm_operations_struct nfs_file_vm_ops;
+
 const struct file_operations nfs_file_operations = {
 	.llseek		= nfs_file_llseek,
 	.read		= do_sync_read,
@@ -174,13 +177,38 @@ static loff_t nfs_file_llseek(struct fil
 }
 
 /*
+ * Helper for nfs_file_flush() and nfs_fsync()
+ *
+ * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
+ * disk, but it retrieves and clears ctx->error after synching, despite
+ * the two being set at the same time in nfs_context_set_write_error().
+ * This is because the former is used to notify the _next_ call to
+ * nfs_file_write() that a write error occured, and hence cause it to
+ * fall back to doing a synchronous write.
+ */
+static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode)
+{
+	int have_error, status;
+	int ret = 0;
+
+	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
+	status = nfs_wb_all(inode);
+	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
+	if (have_error)
+		ret = xchg(&ctx->error, 0);
+	if (!ret)
+		ret = status;
+	return ret;
+}
+
+/*
  * Flush all dirty pages, and check for write errors.
  *
  */
 static int
 nfs_file_flush(struct file *file, fl_owner_t id)
 {
-	struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
+	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	struct inode	*inode = file->f_path.dentry->d_inode;
 	int		status;
 
@@ -189,16 +217,11 @@ nfs_file_flush(struct file *file, fl_own
 	if ((file->f_mode & FMODE_WRITE) == 0)
 		return 0;
 	nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
-	lock_kernel();
+
 	/* Ensure that data+attribute caches are up to date after close() */
-	status = nfs_wb_all(inode);
-	if (!status) {
-		status = ctx->error;
-		ctx->error = 0;
-		if (!status)
-			nfs_revalidate_inode(NFS_SERVER(inode), inode);
-	}
-	unlock_kernel();
+	status = nfs_do_fsync(ctx, inode);
+	if (!status)
+		nfs_revalidate_inode(NFS_SERVER(inode), inode);
 	return status;
 }
 
@@ -257,8 +280,11 @@ nfs_file_mmap(struct file * file, struct
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
 	status = nfs_revalidate_mapping(inode, file->f_mapping);
-	if (!status)
-		status = generic_file_mmap(file, vma);
+	if (!status) {
+		vma->vm_ops = &nfs_file_vm_ops;
+		vma->vm_flags |= VM_CAN_NONLINEAR;
+		file_accessed(file);
+	}
 	return status;
 }
 
@@ -270,21 +296,13 @@ nfs_file_mmap(struct file * file, struct
 static int
 nfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 {
-	struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
+	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	struct inode *inode = dentry->d_inode;
-	int status;
 
 	dfprintk(VFS, "nfs: fsync(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino);
 
 	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
-	lock_kernel();
-	status = nfs_wb_all(inode);
-	if (!status) {
-		status = ctx->error;
-		ctx->error = 0;
-	}
-	unlock_kernel();
-	return status;
+	return nfs_do_fsync(ctx, inode);
 }
 
 /*
@@ -333,7 +351,7 @@ static int nfs_launder_page(struct page 
 const struct address_space_operations nfs_file_aops = {
 	.readpage = nfs_readpage,
 	.readpages = nfs_readpages,
-	.set_page_dirty = nfs_set_page_dirty,
+	.set_page_dirty = __set_page_dirty_nobuffers,
 	.writepage = nfs_writepage,
 	.writepages = nfs_writepages,
 	.prepare_write = nfs_prepare_write,
@@ -346,6 +364,43 @@ const struct address_space_operations nf
 	.launder_page = nfs_launder_page,
 };
 
+static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+	struct file *filp = vma->vm_file;
+	unsigned pagelen;
+	int ret = -EINVAL;
+
+	lock_page(page);
+	if (page->mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping)
+		goto out_unlock;
+	pagelen = nfs_page_length(page);
+	if (pagelen == 0)
+		goto out_unlock;
+	ret = nfs_prepare_write(filp, page, 0, pagelen);
+	if (!ret)
+		ret = nfs_commit_write(filp, page, 0, pagelen);
+out_unlock:
+	unlock_page(page);
+	return ret;
+}
+
+static struct vm_operations_struct nfs_file_vm_ops = {
+	.fault = filemap_fault,
+	.page_mkwrite = nfs_vm_page_mkwrite,
+};
+
+static int nfs_need_sync_write(struct file *filp, struct inode *inode)
+{
+	struct nfs_open_context *ctx;
+
+	if (IS_SYNC(inode) || (filp->f_flags & O_SYNC))
+		return 1;
+	ctx = nfs_file_open_context(filp);
+	if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags))
+		return 1;
+	return 0;
+}
+
 static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
 				unsigned long nr_segs, loff_t pos)
 {
@@ -382,8 +437,8 @@ static ssize_t nfs_file_write(struct kio
 	nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
 	result = generic_file_aio_write(iocb, iov, nr_segs, pos);
 	/* Return error values for O_SYNC and IS_SYNC() */
-	if (result >= 0 && (IS_SYNC(inode) || (iocb->ki_filp->f_flags & O_SYNC))) {
-		int err = nfs_fsync(iocb->ki_filp, dentry, 1);
+	if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
+		int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode);
 		if (err < 0)
 			result = err;
 	}
diff -puN fs/nfs/inode.c~git-nfs fs/nfs/inode.c
--- a/fs/nfs/inode.c~git-nfs
+++ a/fs/nfs/inode.c
@@ -49,6 +49,11 @@
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
+#define NFS_64_BIT_INODE_NUMBERS_ENABLED	1
+
+/* Default is to see 64-bit inode numbers */
+static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
+
 static void nfs_invalidate_inode(struct inode *);
 static int nfs_update_inode(struct inode *, struct nfs_fattr *);
 
@@ -62,6 +67,25 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fat
 	return nfs_fileid_to_ino_t(fattr->fileid);
 }
 
+/**
+ * nfs_compat_user_ino64 - returns the user-visible inode number
+ * @fileid: 64-bit fileid
+ *
+ * This function returns a 32-bit inode number if the boot parameter
+ * nfs.enable_ino64 is zero.
+ */
+u64 nfs_compat_user_ino64(u64 fileid)
+{
+	int ino;
+
+	if (enable_ino64)
+		return fileid;
+	ino = fileid;
+	if (sizeof(ino) < sizeof(fileid))
+		ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) * 8;
+	return ino;
+}
+
 int nfs_write_inode(struct inode *inode, int sync)
 {
 	int ret;
@@ -85,7 +109,6 @@ void nfs_clear_inode(struct inode *inode
 	 */
 	BUG_ON(nfs_have_writebacks(inode));
 	BUG_ON(!list_empty(&NFS_I(inode)->open_files));
-	BUG_ON(atomic_read(&NFS_I(inode)->data_updates) != 0);
 	nfs_zap_acl_cache(inode);
 	nfs_access_zap_cache(inode);
 }
@@ -118,8 +141,8 @@ static void nfs_zap_caches_locked(struct
 
 	nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
 
-	NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode);
-	NFS_ATTRTIMEO_UPDATE(inode) = jiffies;
+	nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
+	nfsi->attrtimeo_timestamp = jiffies;
 
 	memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode)));
 	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
@@ -156,6 +179,13 @@ static void nfs_zap_acl_cache(struct ino
 	spin_unlock(&inode->i_lock);
 }
 
+void nfs_invalidate_atime(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
+	spin_unlock(&inode->i_lock);
+}
+
 /*
  * Invalidate, but do not unhash, the inode.
  * NB: must be called with inode->i_lock held!
@@ -338,7 +368,6 @@ nfs_setattr(struct dentry *dentry, struc
 		return 0;
 
 	lock_kernel();
-	nfs_begin_data_update(inode);
 	/* Write all dirty data */
 	if (S_ISREG(inode->i_mode)) {
 		filemap_write_and_wait(inode->i_mapping);
@@ -352,7 +381,6 @@ nfs_setattr(struct dentry *dentry, struc
 	error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr);
 	if (error == 0)
 		nfs_refresh_inode(inode, &fattr);
-	nfs_end_data_update(inode);
 	unlock_kernel();
 	return error;
 }
@@ -431,7 +459,7 @@ int nfs_getattr(struct vfsmount *mnt, st
 
 	/* Flush out writes to the server in order to update c/mtime */
 	if (S_ISREG(inode->i_mode))
-		nfs_sync_mapping_range(inode->i_mapping, 0, 0, FLUSH_NOCOMMIT);
+		nfs_wb_nocommit(inode);
 
 	/*
 	 * We may force a getattr if the user cares about atime.
@@ -450,8 +478,10 @@ int nfs_getattr(struct vfsmount *mnt, st
 		err = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
 	else
 		err = nfs_revalidate_inode(NFS_SERVER(inode), inode);
-	if (!err)
+	if (!err) {
 		generic_fillattr(inode, stat);
+		stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
+	}
 	return err;
 }
 
@@ -536,7 +566,7 @@ struct nfs_open_context *nfs_find_open_c
 static void nfs_file_clear_open_context(struct file *filp)
 {
 	struct inode *inode = filp->f_path.dentry->d_inode;
-	struct nfs_open_context *ctx = (struct nfs_open_context *)filp->private_data;
+	struct nfs_open_context *ctx = nfs_file_open_context(filp);
 
 	if (ctx) {
 		filp->private_data = NULL;
@@ -598,16 +628,10 @@ __nfs_revalidate_inode(struct nfs_server
 	status = nfs_wait_on_inode(inode);
 	if (status < 0)
 		goto out;
-	if (NFS_STALE(inode)) {
-		status = -ESTALE;
-		/* Do we trust the cached ESTALE? */
-		if (NFS_ATTRTIMEO(inode) != 0) {
-			if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME)) {
-				/* no */
-			} else
-				goto out;
-		}
-	}
+
+	status = -ESTALE;
+	if (NFS_STALE(inode))
+		goto out;
 
 	status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr);
 	if (status != 0) {
@@ -654,7 +678,7 @@ int nfs_attribute_timeout(struct inode *
 
 	if (nfs_have_delegation(inode, FMODE_READ))
 		return 0;
-	return time_after(jiffies, nfsi->read_cache_jiffies+nfsi->attrtimeo);
+	return !time_in_range(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
 }
 
 /**
@@ -683,11 +707,8 @@ static int nfs_invalidate_mapping_nolock
 	}
 	spin_lock(&inode->i_lock);
 	nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
-	if (S_ISDIR(inode->i_mode)) {
+	if (S_ISDIR(inode->i_mode))
 		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
-		/* This ensures we revalidate child dentries */
-		nfsi->cache_change_attribute = jiffies;
-	}
 	spin_unlock(&inode->i_lock);
 	nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
 	dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
@@ -756,56 +777,27 @@ out:
 	return ret;
 }
 
-/**
- * nfs_begin_data_update
- * @inode - pointer to inode
- * Declare that a set of operations will update file data on the server
- */
-void nfs_begin_data_update(struct inode *inode)
-{
-	atomic_inc(&NFS_I(inode)->data_updates);
-}
-
-/**
- * nfs_end_data_update
- * @inode - pointer to inode
- * Declare end of the operations that will update file data
- * This will mark the inode as immediately needing revalidation
- * of its attribute cache.
- */
-void nfs_end_data_update(struct inode *inode)
-{
-	struct nfs_inode *nfsi = NFS_I(inode);
-
-	/* Directories: invalidate page cache */
-	if (S_ISDIR(inode->i_mode)) {
-		spin_lock(&inode->i_lock);
-		nfsi->cache_validity |= NFS_INO_INVALID_DATA;
-		spin_unlock(&inode->i_lock);
-	}
-	nfsi->cache_change_attribute = jiffies;
-	atomic_dec(&nfsi->data_updates);
-}
-
 static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
-	unsigned long now = jiffies;
 
+	if ((fattr->valid & NFS_ATTR_WCC_V4) != 0 &&
+			nfsi->change_attr == fattr->pre_change_attr) {
+		nfsi->change_attr = fattr->change_attr;
+		if (S_ISDIR(inode->i_mode))
+			nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+	}
 	/* If we have atomic WCC data, we may update some attributes */
 	if ((fattr->valid & NFS_ATTR_WCC) != 0) {
-		if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) {
+		if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
 			memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
-			nfsi->cache_change_attribute = now;
-		}
 		if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
 			memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
-			nfsi->cache_change_attribute = now;
+			if (S_ISDIR(inode->i_mode))
+				nfsi->cache_validity |= NFS_INO_INVALID_DATA;
 		}
-		if (inode->i_size == fattr->pre_size && nfsi->npages == 0) {
+		if (inode->i_size == fattr->pre_size && nfsi->npages == 0)
 			inode->i_size = fattr->size;
-			nfsi->cache_change_attribute = now;
-		}
 	}
 }
 
@@ -822,7 +814,7 @@ static int nfs_check_inode_attributes(st
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	loff_t cur_size, new_isize;
-	int data_unstable;
+	unsigned long invalid = 0;
 
 
 	/* Has the inode gone and changed behind our back? */
@@ -831,37 +823,41 @@ static int nfs_check_inode_attributes(st
 		return -EIO;
 	}
 
-	/* Are we in the process of updating data on the server? */
-	data_unstable = nfs_caches_unstable(inode);
-
 	/* Do atomic weak cache consistency updates */
 	nfs_wcc_update_inode(inode, fattr);
 
 	if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
 			nfsi->change_attr != fattr->change_attr)
-		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 
 	/* Verify a few of the more important attributes */
 	if (!timespec_equal(&inode->i_mtime, &fattr->mtime))
-		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 
 	cur_size = i_size_read(inode);
  	new_isize = nfs_size_to_loff_t(fattr->size);
 	if (cur_size != new_isize && nfsi->npages == 0)
-		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 
 	/* Have any file permissions changed? */
 	if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)
 			|| inode->i_uid != fattr->uid
 			|| inode->i_gid != fattr->gid)
-		nfsi->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
+		invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
 
 	/* Has the link count changed? */
 	if (inode->i_nlink != fattr->nlink)
-		nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+		invalid |= NFS_INO_INVALID_ATTR;
 
 	if (!timespec_equal(&inode->i_atime, &fattr->atime))
-		nfsi->cache_validity |= NFS_INO_INVALID_ATIME;
+		invalid |= NFS_INO_INVALID_ATIME;
+
+	if (invalid != 0)
+		nfsi->cache_validity |= invalid;
+	else
+		nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_ATIME
+				| NFS_INO_REVAL_PAGECACHE);
 
 	nfsi->read_cache_jiffies = fattr->time_start;
 	return 0;
@@ -911,17 +907,41 @@ int nfs_refresh_inode(struct inode *inod
 int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
-	int status = 0;
 
 	spin_lock(&inode->i_lock);
-	if (unlikely((fattr->valid & NFS_ATTR_FATTR) == 0)) {
-		nfsi->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
-		goto out;
-	}
-	status = nfs_update_inode(inode, fattr);
-out:
+	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+	if (S_ISDIR(inode->i_mode))
+		nfsi->cache_validity |= NFS_INO_INVALID_DATA;
 	spin_unlock(&inode->i_lock);
-	return status;
+	return nfs_refresh_inode(inode, fattr);
+}
+
+/**
+ * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache
+ * @inode - pointer to inode
+ * @fattr - updated attributes
+ *
+ * After an operation that has changed the inode metadata, mark the
+ * attribute cache as being invalid, then try to update it. Fake up
+ * weak cache consistency data, if none exist.
+ *
+ * This function is mainly designed to be used by the ->write_done() functions.
+ */
+int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr)
+{
+	if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
+			(fattr->valid & NFS_ATTR_WCC_V4) == 0) {
+		fattr->pre_change_attr = NFS_I(inode)->change_attr;
+		fattr->valid |= NFS_ATTR_WCC_V4;
+	}
+	if ((fattr->valid & NFS_ATTR_FATTR) != 0 &&
+			(fattr->valid & NFS_ATTR_WCC) == 0) {
+		memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime));
+		memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime));
+		fattr->pre_size = inode->i_size;
+		fattr->valid |= NFS_ATTR_WCC;
+	}
+	return nfs_post_op_update_inode(inode, fattr);
 }
 
 /*
@@ -941,9 +961,8 @@ static int nfs_update_inode(struct inode
 	struct nfs_server *server;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	loff_t cur_isize, new_isize;
-	unsigned int	invalid = 0;
+	unsigned long invalid = 0;
 	unsigned long now = jiffies;
-	int data_stable;
 
 	dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n",
 			__FUNCTION__, inode->i_sb->s_id, inode->i_ino,
@@ -968,57 +987,51 @@ static int nfs_update_inode(struct inode
 	 * Update the read time so we don't revalidate too often.
 	 */
 	nfsi->read_cache_jiffies = fattr->time_start;
-	nfsi->last_updated = now;
 
-	/* Fix a wraparound issue with nfsi->cache_change_attribute */
-	if (time_before(now, nfsi->cache_change_attribute))
-		nfsi->cache_change_attribute = now - 600*HZ;
-
-	/* Are we racing with known updates of the metadata on the server? */
-	data_stable = nfs_verify_change_attribute(inode, fattr->time_start);
-	if (data_stable)
-		nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATIME);
+	nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME
+			| NFS_INO_REVAL_PAGECACHE);
 
 	/* Do atomic weak cache consistency updates */
 	nfs_wcc_update_inode(inode, fattr);
 
+	/* More cache consistency checks */
+	if (!(fattr->valid & NFS_ATTR_FATTR_V4)) {
+		/* NFSv2/v3: Check if the mtime agrees */
+		if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
+			dprintk("NFS: mtime change on server for file %s/%ld\n",
+					inode->i_sb->s_id, inode->i_ino);
+			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+			nfsi->cache_change_attribute = now;
+		}
+		/* If ctime has changed we should definitely clear access+acl caches */
+		if (!timespec_equal(&inode->i_ctime, &fattr->ctime))
+			invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+	} else if (nfsi->change_attr != fattr->change_attr) {
+		dprintk("NFS: change_attr change on server for file %s/%ld\n",
+				inode->i_sb->s_id, inode->i_ino);
+		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+		nfsi->cache_change_attribute = now;
+	}
+
 	/* Check if our cached file size is stale */
  	new_isize = nfs_size_to_loff_t(fattr->size);
 	cur_isize = i_size_read(inode);
 	if (new_isize != cur_isize) {
-		/* Do we perhaps have any outstanding writes? */
-		if (nfsi->npages == 0) {
-			/* No, but did we race with nfs_end_data_update()? */
-			if (data_stable) {
-				inode->i_size = new_isize;
-				invalid |= NFS_INO_INVALID_DATA;
-			}
-			invalid |= NFS_INO_INVALID_ATTR;
-		} else if (new_isize > cur_isize) {
+		/* Do we perhaps have any outstanding writes, or has
+		 * the file grown beyond our last write? */
+		if (nfsi->npages == 0 || new_isize > cur_isize) {
 			inode->i_size = new_isize;
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
 		}
-		nfsi->cache_change_attribute = now;
 		dprintk("NFS: isize change on server for file %s/%ld\n",
 				inode->i_sb->s_id, inode->i_ino);
 	}
 
-	/* Check if the mtime agrees */
-	if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
-		memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
-		dprintk("NFS: mtime change on server for file %s/%ld\n",
-				inode->i_sb->s_id, inode->i_ino);
-		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
-		nfsi->cache_change_attribute = now;
-	}
 
-	/* If ctime has changed we should definitely clear access+acl caches */
-	if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
-		invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-		memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
-		nfsi->cache_change_attribute = now;
-	}
+	memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
+	memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
 	memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
+	nfsi->change_attr = fattr->change_attr;
 
 	if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) ||
 	    inode->i_uid != fattr->uid ||
@@ -1039,31 +1052,29 @@ static int nfs_update_inode(struct inode
  		inode->i_blocks = fattr->du.nfs2.blocks;
  	}
 
-	if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
-			nfsi->change_attr != fattr->change_attr) {
-		dprintk("NFS: change_attr change on server for file %s/%ld\n",
-				inode->i_sb->s_id, inode->i_ino);
-		nfsi->change_attr = fattr->change_attr;
-		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-		nfsi->cache_change_attribute = now;
-	}
-
 	/* Update attrtimeo value if we're out of the unstable period */
 	if (invalid & NFS_INO_INVALID_ATTR) {
 		nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
 		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
 		nfsi->attrtimeo_timestamp = now;
-	} else if (time_after(now, nfsi->attrtimeo_timestamp+nfsi->attrtimeo)) {
-		if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
-			nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
-		nfsi->attrtimeo_timestamp = now;
+		nfsi->last_updated = now;
+	} else {
+		if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
+			if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
+				nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
+			nfsi->attrtimeo_timestamp = now;
+		}
+		/*
+		 * Avoid jiffy wraparound issues with nfsi->last_updated
+		 */
+		if (!time_in_range(nfsi->last_updated, nfsi->read_cache_jiffies, now))
+			nfsi->last_updated = nfsi->read_cache_jiffies;
 	}
+	invalid &= ~NFS_INO_INVALID_ATTR;
 	/* Don't invalidate the data if we were to blame */
 	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
 				|| S_ISLNK(inode->i_mode)))
 		invalid &= ~NFS_INO_INVALID_DATA;
-	if (data_stable)
-		invalid &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME|NFS_INO_REVAL_PAGECACHE);
 	if (!nfs_have_delegation(inode, FMODE_READ) ||
 			(nfsi->cache_validity & NFS_INO_REVAL_FORCED))
 		nfsi->cache_validity |= invalid;
@@ -1152,7 +1163,6 @@ static void init_once(void * foo, struct
 	INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
 	INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
 	INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
-	atomic_set(&nfsi->data_updates, 0);
 	nfsi->ncommit = 0;
 	nfsi->npages = 0;
 	nfs4_init_once(nfsi);
@@ -1249,6 +1259,7 @@ static void __exit exit_nfs_fs(void)
 /* Not quite true; I just maintain it */
 MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
 MODULE_LICENSE("GPL");
+module_param(enable_ino64, bool, 0644);
 
 module_init(init_nfs_fs)
 module_exit(exit_nfs_fs)
diff -puN fs/nfs/internal.h~git-nfs fs/nfs/internal.h
--- a/fs/nfs/internal.h~git-nfs
+++ a/fs/nfs/internal.h
@@ -5,8 +5,6 @@
 #include <linux/mount.h>
 
 struct nfs_string;
-struct nfs_mount_data;
-struct nfs4_mount_data;
 
 /* Maximum number of readahead requests
  * FIXME: this should really be a sysctl so that users may tune it to suit
@@ -27,20 +25,50 @@ struct nfs_clone_mount {
 	rpc_authflavor_t authflavor;
 };
 
+/*
+ * In-kernel mount arguments
+ */
+struct nfs_parsed_mount_data {
+	int			flags;
+	int			rsize, wsize;
+	int			timeo, retrans;
+	int			acregmin, acregmax,
+				acdirmin, acdirmax;
+	int			namlen;
+	unsigned int		bsize;
+	unsigned int		auth_flavor_len;
+	rpc_authflavor_t	auth_flavors[1];
+	char			*client_address;
+
+	struct {
+		struct sockaddr_in	address;
+		char			*hostname;
+		unsigned int		program;
+		unsigned int		version;
+		unsigned short		port;
+		int			protocol;
+	} mount_server;
+
+	struct {
+		struct sockaddr_in	address;
+		char			*hostname;
+		char			*export_path;
+		unsigned int		program;
+		int			protocol;
+	} nfs_server;
+};
+
 /* client.c */
 extern struct rpc_program nfs_program;
 
 extern void nfs_put_client(struct nfs_client *);
 extern struct nfs_client *nfs_find_client(const struct sockaddr_in *, int);
-extern struct nfs_server *nfs_create_server(const struct nfs_mount_data *,
-					    struct nfs_fh *);
-extern struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *,
-					     const char *,
-					     const struct sockaddr_in *,
-					     const char *,
-					     const char *,
-					     rpc_authflavor_t,
-					     struct nfs_fh *);
+extern struct nfs_server *nfs_create_server(
+					const struct nfs_parsed_mount_data *,
+					struct nfs_fh *);
+extern struct nfs_server *nfs4_create_server(
+					const struct nfs_parsed_mount_data *,
+					struct nfs_fh *);
 extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *,
 						      struct nfs_fh *);
 extern void nfs_free_server(struct nfs_server *server);
diff -puN fs/nfs/nfs2xdr.c~git-nfs fs/nfs/nfs2xdr.c
--- a/fs/nfs/nfs2xdr.c~git-nfs
+++ a/fs/nfs/nfs2xdr.c
@@ -251,6 +251,7 @@ nfs_xdr_readargs(struct rpc_rqst *req, _
 	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readres_sz) << 2;
 	xdr_inline_pages(&req->rq_rcv_buf, replen,
 			 args->pages, args->pgbase, count);
+	req->rq_rcv_buf.flags |= XDRBUF_READ;
 	return 0;
 }
 
@@ -271,7 +272,7 @@ nfs_xdr_readres(struct rpc_rqst *req, __
 	res->eof = 0;
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	if (iov->iov_len < hdrlen) {
-		printk(KERN_WARNING "NFS: READ reply header overflowed:"
+		dprintk("NFS: READ reply header overflowed:"
 				"length %d > %Zu\n", hdrlen, iov->iov_len);
 		return -errno_NFSERR_IO;
 	} else if (iov->iov_len != hdrlen) {
@@ -281,7 +282,7 @@ nfs_xdr_readres(struct rpc_rqst *req, __
 
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (count > recvd) {
-		printk(KERN_WARNING "NFS: server cheating in read reply: "
+		dprintk("NFS: server cheating in read reply: "
 			"count %d > recvd %d\n", count, recvd);
 		count = recvd;
 	}
@@ -313,6 +314,7 @@ nfs_xdr_writeargs(struct rpc_rqst *req, 
 
 	/* Copy the page array */
 	xdr_encode_pages(sndbuf, args->pages, args->pgbase, count);
+	sndbuf->flags |= XDRBUF_WRITE;
 	return 0;
 }
 
@@ -431,7 +433,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req,
 
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	if (iov->iov_len < hdrlen) {
-		printk(KERN_WARNING "NFS: READDIR reply header overflowed:"
+		dprintk("NFS: READDIR reply header overflowed:"
 				"length %d > %Zu\n", hdrlen, iov->iov_len);
 		return -errno_NFSERR_IO;
 	} else if (iov->iov_len != hdrlen) {
@@ -454,7 +456,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req,
 		len = ntohl(*p++);
 		p += XDR_QUADLEN(len) + 1;	/* name plus cookie */
 		if (len > NFS2_MAXNAMLEN) {
-			printk(KERN_WARNING "NFS: giant filename in readdir (len 0x%x)!\n",
+			dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
 						len);
 			goto err_unmap;
 		}
@@ -471,7 +473,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req,
 	entry[0] = entry[1] = 0;
 	/* truncate listing ? */
 	if (!nr) {
-		printk(KERN_NOTICE "NFS: readdir reply truncated!\n");
+		dprintk("NFS: readdir reply truncated!\n");
 		entry[1] = 1;
 	}
 	goto out;
@@ -583,12 +585,12 @@ nfs_xdr_readlinkres(struct rpc_rqst *req
 	/* Convert length of symlink */
 	len = ntohl(*p++);
 	if (len >= rcvbuf->page_len || len <= 0) {
-		dprintk(KERN_WARNING "nfs: server returned giant symlink!\n");
+		dprintk("nfs: server returned giant symlink!\n");
 		return -ENAMETOOLONG;
 	}
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	if (iov->iov_len < hdrlen) {
-		printk(KERN_WARNING "NFS: READLINK reply header overflowed:"
+		dprintk("NFS: READLINK reply header overflowed:"
 				"length %d > %Zu\n", hdrlen, iov->iov_len);
 		return -errno_NFSERR_IO;
 	} else if (iov->iov_len != hdrlen) {
@@ -597,7 +599,7 @@ nfs_xdr_readlinkres(struct rpc_rqst *req
 	}
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (recvd < len) {
-		printk(KERN_WARNING "NFS: server cheating in readlink reply: "
+		dprintk("NFS: server cheating in readlink reply: "
 				"count %u > recvd %u\n", len, recvd);
 		return -EIO;
 	}
@@ -695,7 +697,7 @@ nfs_stat_to_errno(int stat)
 		if (nfs_errtbl[i].stat == stat)
 			return nfs_errtbl[i].errno;
 	}
-	printk(KERN_ERR "nfs_stat_to_errno: bad nfs status return value: %d\n", stat);
+	dprintk("nfs_stat_to_errno: bad nfs status return value: %d\n", stat);
 	return nfs_errtbl[i].errno;
 }
 
diff -puN fs/nfs/nfs3acl.c~git-nfs fs/nfs/nfs3acl.c
--- a/fs/nfs/nfs3acl.c~git-nfs
+++ a/fs/nfs/nfs3acl.c
@@ -317,13 +317,11 @@ static int nfs3_proc_setacls(struct inod
 	}
 
 	dprintk("NFS call setacl\n");
-	nfs_begin_data_update(inode);
 	msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
 	status = rpc_call_sync(server->client_acl, &msg, 0);
 	spin_lock(&inode->i_lock);
 	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS;
 	spin_unlock(&inode->i_lock);
-	nfs_end_data_update(inode);
 	dprintk("NFS reply setacl: %d\n", status);
 
 	/* pages may have been allocated at the xdr layer. */
diff -puN fs/nfs/nfs3proc.c~git-nfs fs/nfs/nfs3proc.c
--- a/fs/nfs/nfs3proc.c~git-nfs
+++ a/fs/nfs/nfs3proc.c
@@ -166,6 +166,7 @@ nfs3_proc_lookup(struct inode *dir, stru
 	nfs_fattr_init(&dir_attr);
 	nfs_fattr_init(fattr);
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	nfs_refresh_inode(dir, &dir_attr);
 	if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) {
 		msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
 		msg.rpc_argp = fhandle;
@@ -173,8 +174,6 @@ nfs3_proc_lookup(struct inode *dir, stru
 		status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	}
 	dprintk("NFS reply lookup: %d\n", status);
-	if (status >= 0)
-		status = nfs_refresh_inode(dir, &dir_attr);
 	return status;
 }
 
@@ -607,6 +606,9 @@ nfs3_proc_readdir(struct dentry *dentry,
 
 	nfs_fattr_init(&dir_attr);
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+
+	nfs_invalidate_atime(dir);
+
 	nfs_refresh_inode(dir, &dir_attr);
 	dprintk("NFS reply readdir: %d\n", status);
 	return status;
@@ -724,9 +726,9 @@ static int nfs3_read_done(struct rpc_tas
 {
 	if (nfs3_async_handle_jukebox(task, data->inode))
 		return -EAGAIN;
-	/* Call back common NFS readpage processing */
-	if (task->tk_status >= 0)
-		nfs_refresh_inode(data->inode, &data->fattr);
+
+	nfs_invalidate_atime(data->inode);
+	nfs_refresh_inode(data->inode, &data->fattr);
 	return 0;
 }
 
@@ -747,7 +749,7 @@ static int nfs3_write_done(struct rpc_ta
 	if (nfs3_async_handle_jukebox(task, data->inode))
 		return -EAGAIN;
 	if (task->tk_status >= 0)
-		nfs_post_op_update_inode(data->inode, data->res.fattr);
+		nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr);
 	return 0;
 }
 
@@ -775,8 +777,7 @@ static int nfs3_commit_done(struct rpc_t
 {
 	if (nfs3_async_handle_jukebox(task, data->inode))
 		return -EAGAIN;
-	if (task->tk_status >= 0)
-		nfs_post_op_update_inode(data->inode, data->res.fattr);
+	nfs_refresh_inode(data->inode, data->res.fattr);
 	return 0;
 }
 
diff -puN fs/nfs/nfs3xdr.c~git-nfs fs/nfs/nfs3xdr.c
--- a/fs/nfs/nfs3xdr.c~git-nfs
+++ a/fs/nfs/nfs3xdr.c
@@ -346,6 +346,7 @@ nfs3_xdr_readargs(struct rpc_rqst *req, 
 	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readres_sz) << 2;
 	xdr_inline_pages(&req->rq_rcv_buf, replen,
 			 args->pages, args->pgbase, count);
+	req->rq_rcv_buf.flags |= XDRBUF_READ;
 	return 0;
 }
 
@@ -367,6 +368,7 @@ nfs3_xdr_writeargs(struct rpc_rqst *req,
 
 	/* Copy the page array */
 	xdr_encode_pages(sndbuf, args->pages, args->pgbase, count);
+	sndbuf->flags |= XDRBUF_WRITE;
 	return 0;
 }
 
@@ -524,7 +526,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req
 
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	if (iov->iov_len < hdrlen) {
-		printk(KERN_WARNING "NFS: READDIR reply header overflowed:"
+		dprintk("NFS: READDIR reply header overflowed:"
 				"length %d > %Zu\n", hdrlen, iov->iov_len);
 		return -errno_NFSERR_IO;
 	} else if (iov->iov_len != hdrlen) {
@@ -547,7 +549,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req
 		len = ntohl(*p++);		/* string length */
 		p += XDR_QUADLEN(len) + 2;	/* name + cookie */
 		if (len > NFS3_MAXNAMLEN) {
-			printk(KERN_WARNING "NFS: giant filename in readdir (len %x)!\n",
+			dprintk("NFS: giant filename in readdir (len %x)!\n",
 						len);
 			goto err_unmap;
 		}
@@ -567,7 +569,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req
 					goto short_pkt;
 				len = ntohl(*p++);
 				if (len > NFS3_FHSIZE) {
-					printk(KERN_WARNING "NFS: giant filehandle in "
+					dprintk("NFS: giant filehandle in "
 						"readdir (len %x)!\n", len);
 					goto err_unmap;
 				}
@@ -588,7 +590,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req
 	entry[0] = entry[1] = 0;
 	/* truncate listing ? */
 	if (!nr) {
-		printk(KERN_NOTICE "NFS: readdir reply truncated!\n");
+		dprintk("NFS: readdir reply truncated!\n");
 		entry[1] = 1;
 	}
 	goto out;
@@ -826,22 +828,23 @@ nfs3_xdr_readlinkres(struct rpc_rqst *re
 	/* Convert length of symlink */
 	len = ntohl(*p++);
 	if (len >= rcvbuf->page_len || len <= 0) {
-		dprintk(KERN_WARNING "nfs: server returned giant symlink!\n");
+		dprintk("nfs: server returned giant symlink!\n");
 		return -ENAMETOOLONG;
 	}
 
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	if (iov->iov_len < hdrlen) {
-		printk(KERN_WARNING "NFS: READLINK reply header overflowed:"
+		dprintk("NFS: READLINK reply header overflowed:"
 				"length %d > %Zu\n", hdrlen, iov->iov_len);
 		return -errno_NFSERR_IO;
 	} else if (iov->iov_len != hdrlen) {
-		dprintk("NFS: READLINK header is short. iovec will be shifted.\n");
+		dprintk("NFS: READLINK header is short. "
+			"iovec will be shifted.\n");
 		xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
 	}
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (recvd < len) {
-		printk(KERN_WARNING "NFS: server cheating in readlink reply: "
+		dprintk("NFS: server cheating in readlink reply: "
 				"count %u > recvd %u\n", len, recvd);
 		return -EIO;
 	}
@@ -876,13 +879,13 @@ nfs3_xdr_readres(struct rpc_rqst *req, _
 	ocount   = ntohl(*p++);
 
 	if (ocount != count) {
-		printk(KERN_WARNING "NFS: READ count doesn't match RPC opaque count.\n");
+		dprintk("NFS: READ count doesn't match RPC opaque count.\n");
 		return -errno_NFSERR_IO;
 	}
 
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	if (iov->iov_len < hdrlen) {
-		printk(KERN_WARNING "NFS: READ reply header overflowed:"
+		dprintk("NFS: READ reply header overflowed:"
 				"length %d > %Zu\n", hdrlen, iov->iov_len);
        		return -errno_NFSERR_IO;
 	} else if (iov->iov_len != hdrlen) {
@@ -892,7 +895,7 @@ nfs3_xdr_readres(struct rpc_rqst *req, _
 
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (count > recvd) {
-		printk(KERN_WARNING "NFS: server cheating in read reply: "
+		dprintk("NFS: server cheating in read reply: "
 			"count %d > recvd %d\n", count, recvd);
 		count = recvd;
 		res->eof = 0;
diff -puN fs/nfs/nfs4proc.c~git-nfs fs/nfs/nfs4proc.c
--- a/fs/nfs/nfs4proc.c~git-nfs
+++ a/fs/nfs/nfs4proc.c
@@ -62,10 +62,8 @@ struct nfs4_opendata;
 static int _nfs4_proc_open(struct nfs4_opendata *data);
 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
 static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *);
-static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry);
 static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception);
 static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp);
-static int _nfs4_do_access(struct inode *inode, struct rpc_cred *cred, int openflags);
 static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
 
@@ -177,7 +175,7 @@ static void nfs4_setup_readdir(u64 cooki
 		*p++ = xdr_one;                         /* bitmap length */
 		*p++ = htonl(FATTR4_WORD0_FILEID);             /* bitmap */
 		*p++ = htonl(8);              /* attribute buffer length */
-		p = xdr_encode_hyper(p, dentry->d_inode->i_ino);
+		p = xdr_encode_hyper(p, NFS_FILEID(dentry->d_inode));
 	}
 	
 	*p++ = xdr_one;                                  /* next */
@@ -189,7 +187,7 @@ static void nfs4_setup_readdir(u64 cooki
 	*p++ = xdr_one;                         /* bitmap length */
 	*p++ = htonl(FATTR4_WORD0_FILEID);             /* bitmap */
 	*p++ = htonl(8);              /* attribute buffer length */
-	p = xdr_encode_hyper(p, dentry->d_parent->d_inode->i_ino);
+	p = xdr_encode_hyper(p, NFS_FILEID(dentry->d_parent->d_inode));
 
 	readdir->pgbase = (char *)p - (char *)start;
 	readdir->count -= readdir->pgbase;
@@ -211,8 +209,9 @@ static void update_changeattr(struct ino
 
 	spin_lock(&dir->i_lock);
 	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
-	if (cinfo->before == nfsi->change_attr && cinfo->atomic)
-		nfsi->change_attr = cinfo->after;
+	if (!cinfo->atomic || cinfo->before != nfsi->change_attr)
+		nfsi->cache_change_attribute = jiffies;
+	nfsi->change_attr = cinfo->after;
 	spin_unlock(&dir->i_lock);
 }
 
@@ -454,7 +453,7 @@ static struct nfs4_state *nfs4_try_open_
 		memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data));
 		rcu_read_unlock();
 		lock_kernel();
-		ret = _nfs4_do_access(state->inode, state->owner->so_cred, open_mode);
+		ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode);
 		unlock_kernel();
 		if (ret != 0)
 			goto out;
@@ -948,36 +947,6 @@ static int _nfs4_proc_open(struct nfs4_o
 	return 0;
 }
 
-static int _nfs4_do_access(struct inode *inode, struct rpc_cred *cred, int openflags)
-{
-	struct nfs_access_entry cache;
-	int mask = 0;
-	int status;
-
-	if (openflags & FMODE_READ)
-		mask |= MAY_READ;
-	if (openflags & FMODE_WRITE)
-		mask |= MAY_WRITE;
-	if (openflags & FMODE_EXEC)
-		mask |= MAY_EXEC;
-	status = nfs_access_get_cached(inode, cred, &cache);
-	if (status == 0)
-		goto out;
-
-	/* Be clever: ask server to check for all possible rights */
-	cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
-	cache.cred = cred;
-	cache.jiffies = jiffies;
-	status = _nfs4_proc_access(inode, &cache);
-	if (status != 0)
-		return status;
-	nfs_access_add_cache(inode, &cache);
-out:
-	if ((cache.mask & mask) == mask)
-		return 0;
-	return -EACCES;
-}
-
 static int nfs4_recover_expired_lease(struct nfs_server *server)
 {
 	struct nfs_client *clp = server->nfs_client;
@@ -1381,7 +1350,7 @@ static int nfs4_intent_set_file(struct n
 
 	/* If the open_intent is for execute, we have an extra check to make */
 	if (nd->intent.open.flags & FMODE_EXEC) {
-		ret = _nfs4_do_access(state->inode,
+		ret = nfs_may_open(state->inode,
 				state->owner->so_cred,
 				nd->intent.open.flags);
 		if (ret < 0)
@@ -1390,7 +1359,7 @@ static int nfs4_intent_set_file(struct n
 	filp = lookup_instantiate_filp(nd, path->dentry, NULL);
 	if (!IS_ERR(filp)) {
 		struct nfs_open_context *ctx;
-		ctx = (struct nfs_open_context *)filp->private_data;
+		ctx = nfs_file_open_context(filp);
 		ctx->state = state;
 		return 0;
 	}
@@ -1428,13 +1397,16 @@ nfs4_atomic_open(struct inode *dir, stru
 	state = nfs4_do_open(dir, &path, nd->intent.open.flags, &attr, cred);
 	put_rpccred(cred);
 	if (IS_ERR(state)) {
-		if (PTR_ERR(state) == -ENOENT)
+		if (PTR_ERR(state) == -ENOENT) {
 			d_add(dentry, NULL);
+			nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+		}
 		return (struct dentry *)state;
 	}
 	res = d_add_unique(dentry, igrab(state->inode));
 	if (res != NULL)
 		path.dentry = res;
+	nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir));
 	nfs4_intent_set_file(nd, &path, state);
 	return res;
 }
@@ -1468,6 +1440,7 @@ nfs4_open_revalidate(struct inode *dir, 
 		}
 	}
 	if (state->inode == dentry->d_inode) {
+		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 		nfs4_intent_set_file(nd, &path, state);
 		return 1;
 	}
@@ -1757,10 +1730,16 @@ static int nfs4_proc_lookup(struct inode
 
 static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
 {
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_fattr fattr;
 	struct nfs4_accessargs args = {
 		.fh = NFS_FH(inode),
+		.bitmask = server->attr_bitmask,
+	};
+	struct nfs4_accessres res = {
+		.server = server,
+		.fattr = &fattr,
 	};
-	struct nfs4_accessres res = { 0 };
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS],
 		.rpc_argp = &args,
@@ -1786,6 +1765,7 @@ static int _nfs4_proc_access(struct inod
 		if (mode & MAY_EXEC)
 			args.access |= NFS4_ACCESS_EXECUTE;
 	}
+	nfs_fattr_init(&fattr);
 	status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
 	if (!status) {
 		entry->mask = 0;
@@ -1795,6 +1775,7 @@ static int _nfs4_proc_access(struct inod
 			entry->mask |= MAY_WRITE;
 		if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE))
 			entry->mask |= MAY_EXEC;
+		nfs_refresh_inode(inode, &fattr);
 	}
 	return status;
 }
@@ -1900,11 +1881,13 @@ nfs4_proc_create(struct inode *dir, stru
 	}
 	state = nfs4_do_open(dir, &path, flags, sattr, cred);
 	put_rpccred(cred);
+	d_drop(dentry);
 	if (IS_ERR(state)) {
 		status = PTR_ERR(state);
 		goto out;
 	}
-	d_instantiate(dentry, igrab(state->inode));
+	d_add(dentry, igrab(state->inode));
+	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	if (flags & O_EXCL) {
 		struct nfs_fattr fattr;
 		status = nfs4_do_setattr(state->inode, &fattr, sattr, state);
@@ -2218,6 +2201,9 @@ static int _nfs4_proc_readdir(struct den
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	if (status == 0)
 		memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE);
+
+	nfs_invalidate_atime(dir);
+
 	dprintk("%s: returns %d\n", __FUNCTION__, status);
 	return status;
 }
@@ -2414,6 +2400,8 @@ static int nfs4_read_done(struct rpc_tas
 		rpc_restart_call(task);
 		return -EAGAIN;
 	}
+
+	nfs_invalidate_atime(data->inode);
 	if (task->tk_status > 0)
 		renew_lease(server, data->timestamp);
 	return 0;
@@ -2443,7 +2431,7 @@ static int nfs4_write_done(struct rpc_ta
 	}
 	if (task->tk_status >= 0) {
 		renew_lease(NFS_SERVER(inode), data->timestamp);
-		nfs_post_op_update_inode(inode, data->res.fattr);
+		nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
 	}
 	return 0;
 }
@@ -2485,8 +2473,7 @@ static int nfs4_commit_done(struct rpc_t
 		rpc_restart_call(task);
 		return -EAGAIN;
 	}
-	if (task->tk_status >= 0)
-		nfs_post_op_update_inode(inode, data->res.fattr);
+	nfs_refresh_inode(inode, data->res.fattr);
 	return 0;
 }
 
@@ -3056,7 +3043,7 @@ static int _nfs4_proc_delegreturn(struct
 	if (status == 0) {
 		status = data->rpc_status;
 		if (status == 0)
-			nfs_post_op_update_inode(inode, &data->fattr);
+			nfs_refresh_inode(inode, &data->fattr);
 	}
 	rpc_put_task(task);
 	return status;
@@ -3303,7 +3290,7 @@ static int nfs4_proc_unlck(struct nfs4_s
 	status = -ENOMEM;
 	if (seqid == NULL)
 		goto out;
-	task = nfs4_do_unlck(request, request->fl_file->private_data, lsp, seqid);
+	task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid);
 	status = PTR_ERR(task);
 	if (IS_ERR(task))
 		goto out;
@@ -3447,7 +3434,7 @@ static int _nfs4_do_setlk(struct nfs4_st
 	int ret;
 
 	dprintk("%s: begin!\n", __FUNCTION__);
-	data = nfs4_alloc_lockdata(fl, fl->fl_file->private_data,
+	data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
 			fl->fl_u.nfs4_fl.owner);
 	if (data == NULL)
 		return -ENOMEM;
@@ -3573,7 +3560,7 @@ nfs4_proc_lock(struct file *filp, int cm
 	int status;
 
 	/* verify open state */
-	ctx = (struct nfs_open_context *)filp->private_data;
+	ctx = nfs_file_open_context(filp);
 	state = ctx->state;
 
 	if (request->fl_start < 0 || request->fl_end < 0)
diff -puN fs/nfs/nfs4state.c~git-nfs fs/nfs/nfs4state.c
--- a/fs/nfs/nfs4state.c~git-nfs
+++ a/fs/nfs/nfs4state.c
@@ -774,7 +774,7 @@ static int nfs4_reclaim_locks(struct nfs
 	for (fl = inode->i_flock; fl != 0; fl = fl->fl_next) {
 		if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
 			continue;
-		if (((struct nfs_open_context *)fl->fl_file->private_data)->state != state)
+		if (nfs_file_open_context(fl->fl_file)->state != state)
 			continue;
 		status = ops->recover_lock(state, fl);
 		if (status >= 0)
diff -puN fs/nfs/nfs4xdr.c~git-nfs fs/nfs/nfs4xdr.c
--- a/fs/nfs/nfs4xdr.c~git-nfs
+++ a/fs/nfs/nfs4xdr.c
@@ -376,10 +376,12 @@ static int nfs4_stat_to_errno(int);
 				decode_locku_maxsz)
 #define NFS4_enc_access_sz	(compound_encode_hdr_maxsz + \
 				encode_putfh_maxsz + \
-				encode_access_maxsz)
+				encode_access_maxsz + \
+				encode_getattr_maxsz)
 #define NFS4_dec_access_sz	(compound_decode_hdr_maxsz + \
 				decode_putfh_maxsz + \
-				decode_access_maxsz)
+				decode_access_maxsz + \
+				decode_getattr_maxsz)
 #define NFS4_enc_getattr_sz	(compound_encode_hdr_maxsz + \
 				encode_putfh_maxsz + \
 				encode_getattr_maxsz)
@@ -562,7 +564,6 @@ struct compound_hdr {
 
 #define RESERVE_SPACE(nbytes)	do {				\
 	p = xdr_reserve_space(xdr, nbytes);			\
-	if (!p) printk("RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __FUNCTION__); \
 	BUG_ON(!p);						\
 } while (0)
 
@@ -628,8 +629,8 @@ static int encode_attrs(struct xdr_strea
 	if (iap->ia_valid & ATTR_UID) {
 		owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name);
 		if (owner_namelen < 0) {
-			printk(KERN_WARNING "nfs: couldn't resolve uid %d to string\n",
-			       iap->ia_uid);
+			dprintk("nfs: couldn't resolve uid %d to string\n",
+					iap->ia_uid);
 			/* XXX */
 			strcpy(owner_name, "nobody");
 			owner_namelen = sizeof("nobody") - 1;
@@ -640,8 +641,8 @@ static int encode_attrs(struct xdr_strea
 	if (iap->ia_valid & ATTR_GID) {
 		owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group);
 		if (owner_grouplen < 0) {
-			printk(KERN_WARNING "nfs4: couldn't resolve gid %d to string\n",
-			       iap->ia_gid);
+			dprintk("nfs: couldn't resolve gid %d to string\n",
+					iap->ia_gid);
 			strcpy(owner_group, "nobody");
 			owner_grouplen = sizeof("nobody") - 1;
 			/* goto out; */
@@ -711,7 +712,7 @@ static int encode_attrs(struct xdr_strea
 	 * Now we backfill the bitmap and the attribute buffer length.
 	 */
 	if (len != ((char *)p - (char *)q) + 4) {
-		printk ("encode_attr: Attr length calculation error! %u != %Zu\n",
+		printk(KERN_ERR "nfs: Attr length error, %u != %Zu\n",
 				len, ((char *)p - (char *)q) + 4);
 		BUG();
 	}
@@ -1376,14 +1377,20 @@ static int nfs4_xdr_enc_access(struct rp
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.nops = 2,
+		.nops = 3,
 	};
 	int status;
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
 	encode_compound_hdr(&xdr, &hdr);
-	if ((status = encode_putfh(&xdr, args->fh)) == 0)
-		status = encode_access(&xdr, args->access);
+	status = encode_putfh(&xdr, args->fh);
+	if (status != 0)
+		goto out;
+	status = encode_access(&xdr, args->access);
+	if (status != 0)
+		goto out;
+	status = encode_getfattr(&xdr, args->bitmask);
+out:
 	return status;
 }
 
@@ -1857,6 +1864,7 @@ static int nfs4_xdr_enc_read(struct rpc_
 	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_read_sz) << 2;
 	xdr_inline_pages(&req->rq_rcv_buf, replen,
 			 args->pages, args->pgbase, args->count);
+	req->rq_rcv_buf.flags |= XDRBUF_READ;
 out:
 	return status;
 }
@@ -1933,6 +1941,7 @@ static int nfs4_xdr_enc_write(struct rpc
 	status = encode_write(&xdr, args);
 	if (status)
 		goto out;
+	req->rq_snd_buf.flags |= XDRBUF_WRITE;
 	status = encode_getfattr(&xdr, args->bitmask);
 out:
 	return status;
@@ -2180,9 +2189,9 @@ out:
 #define READ_BUF(nbytes)  do { \
 	p = xdr_inline_decode(xdr, nbytes); \
 	if (unlikely(!p)) { \
-		printk(KERN_INFO "%s: prematurely hit end of receive" \
+		dprintk("nfs: %s: prematurely hit end of receive" \
 				" buffer\n", __FUNCTION__); \
-		printk(KERN_INFO "%s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \
+		dprintk("nfs: %s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \
 				__FUNCTION__, xdr->p, nbytes, xdr->end); \
 		return -EIO; \
 	} \
@@ -2223,9 +2232,8 @@ static int decode_op_hdr(struct xdr_stre
 	READ_BUF(8);
 	READ32(opnum);
 	if (opnum != expected) {
-		printk(KERN_NOTICE
-				"nfs4_decode_op_hdr: Server returned operation"
-			       	" %d but we issued a request for %d\n",
+		dprintk("nfs: Server returned operation"
+			" %d but we issued a request for %d\n",
 				opnum, expected);
 		return -EIO;
 	}
@@ -2758,7 +2766,7 @@ static int decode_attr_owner(struct xdr_
 				dprintk("%s: nfs_map_name_to_uid failed!\n",
 						__FUNCTION__);
 		} else
-			printk(KERN_WARNING "%s: name too long (%u)!\n",
+			dprintk("%s: name too long (%u)!\n",
 					__FUNCTION__, len);
 		bitmap[1] &= ~FATTR4_WORD1_OWNER;
 	}
@@ -2783,7 +2791,7 @@ static int decode_attr_group(struct xdr_
 				dprintk("%s: nfs_map_group_to_gid failed!\n",
 						__FUNCTION__);
 		} else
-			printk(KERN_WARNING "%s: name too long (%u)!\n",
+			dprintk("%s: name too long (%u)!\n",
 					__FUNCTION__, len);
 		bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP;
 	}
@@ -2950,7 +2958,8 @@ static int verify_attr_len(struct xdr_st
 	unsigned int nwords = xdr->p - savep;
 
 	if (unlikely(attrwords != nwords)) {
-		printk(KERN_WARNING "%s: server returned incorrect attribute length: %u %c %u\n",
+		dprintk("%s: server returned incorrect attribute length: "
+			"%u %c %u\n",
 				__FUNCTION__,
 				attrwords << 2,
 				(attrwords < nwords) ? '<' : '>',
@@ -3451,7 +3460,7 @@ static int decode_read(struct xdr_stream
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (count > recvd) {
-		printk(KERN_WARNING "NFS: server cheating in read reply: "
+		dprintk("NFS: server cheating in read reply: "
 				"count %u > recvd %u\n", count, recvd);
 		count = recvd;
 		eof = 0;
@@ -3500,7 +3509,8 @@ static int decode_readdir(struct xdr_str
 		p += 2;			/* cookie */
 		len = ntohl(*p++);	/* filename length */
 		if (len > NFS4_MAXNAMLEN) {
-			printk(KERN_WARNING "NFS: giant filename in readdir (len 0x%x)\n", len);
+			dprintk("NFS: giant filename in readdir (len 0x%x)\n",
+					len);
 			goto err_unmap;
 		}
 		xlen = XDR_QUADLEN(len);
@@ -3528,7 +3538,7 @@ short_pkt:
 	entry[0] = entry[1] = 0;
 	/* truncate listing ? */
 	if (!nr) {
-		printk(KERN_NOTICE "NFS: readdir reply truncated!\n");
+		dprintk("NFS: readdir reply truncated!\n");
 		entry[1] = 1;
 	}
 	goto out;
@@ -3554,13 +3564,13 @@ static int decode_readlink(struct xdr_st
 	READ_BUF(4);
 	READ32(len);
 	if (len >= rcvbuf->page_len || len <= 0) {
-		dprintk(KERN_WARNING "nfs: server returned giant symlink!\n");
+		dprintk("nfs: server returned giant symlink!\n");
 		return -ENAMETOOLONG;
 	}
 	hdrlen = (char *) xdr->p - (char *) iov->iov_base;
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (recvd < len) {
-		printk(KERN_WARNING "NFS: server cheating in readlink reply: "
+		dprintk("NFS: server cheating in readlink reply: "
 				"count %u > recvd %u\n", len, recvd);
 		return -EIO;
 	}
@@ -3643,7 +3653,7 @@ static int decode_getacl(struct xdr_stre
 		hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base;
 		recvd = req->rq_rcv_buf.len - hdrlen;
 		if (attrlen > recvd) {
-			printk(KERN_WARNING "NFS: server cheating in getattr"
+			dprintk("NFS: server cheating in getattr"
 					" acl reply: attrlen %u > recvd %u\n",
 					attrlen, recvd);
 			return -EINVAL;
@@ -3688,8 +3698,7 @@ static int decode_setclientid(struct xdr
 	READ_BUF(8);
 	READ32(opnum);
 	if (opnum != OP_SETCLIENTID) {
-		printk(KERN_NOTICE
-				"nfs4_decode_setclientid: Server returned operation"
+		dprintk("nfs: decode_setclientid: Server returned operation"
 			       	" %d\n", opnum);
 		return -EIO;
 	}
@@ -3783,8 +3792,13 @@ static int nfs4_xdr_dec_access(struct rp
 	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
 	if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
 		goto out;
-	if ((status = decode_putfh(&xdr)) == 0)
-		status = decode_access(&xdr, res);
+	status = decode_putfh(&xdr);
+	if (status != 0)
+		goto out;
+	status = decode_access(&xdr, res);
+	if (status != 0)
+		goto out;
+	decode_getfattr(&xdr, res->fattr, res->server);
 out:
 	return status;
 }
diff -puN fs/nfs/nfsroot.c~git-nfs fs/nfs/nfsroot.c
--- a/fs/nfs/nfsroot.c~git-nfs
+++ a/fs/nfs/nfsroot.c
@@ -76,6 +76,7 @@
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/xprtsock.h>
 #include <linux/nfs.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
@@ -491,7 +492,7 @@ static int __init root_nfs_get_handle(vo
 	struct sockaddr_in sin;
 	int status;
 	int protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
-					IPPROTO_TCP : IPPROTO_UDP;
+					XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP;
 	int version = (nfs_data.flags & NFS_MOUNT_VER3) ?
 					NFS_MNT3_VERSION : NFS_MNT_VERSION;
 
diff -puN fs/nfs/proc.c~git-nfs fs/nfs/proc.c
--- a/fs/nfs/proc.c~git-nfs
+++ a/fs/nfs/proc.c
@@ -476,6 +476,8 @@ nfs_proc_readdir(struct dentry *dentry, 
 	dprintk("NFS call  readdir %d\n", (unsigned int)cookie);
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 
+	nfs_invalidate_atime(dir);
+
 	dprintk("NFS reply readdir: %d\n", status);
 	return status;
 }
@@ -550,6 +552,7 @@ nfs_proc_pathconf(struct nfs_server *ser
 
 static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
 {
+	nfs_invalidate_atime(data->inode);
 	if (task->tk_status >= 0) {
 		nfs_refresh_inode(data->inode, data->res.fattr);
 		/* Emulate the eof flag, which isn't normally needed in NFSv2
@@ -576,7 +579,7 @@ static void nfs_proc_read_setup(struct n
 static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	if (task->tk_status >= 0)
-		nfs_post_op_update_inode(data->inode, data->res.fattr);
+		nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr);
 	return 0;
 }
 
diff -puN fs/nfs/read.c~git-nfs fs/nfs/read.c
--- a/fs/nfs/read.c~git-nfs
+++ a/fs/nfs/read.c
@@ -341,9 +341,6 @@ int nfs_readpage_result(struct rpc_task 
 		set_bit(NFS_INO_STALE, &NFS_FLAGS(data->inode));
 		nfs_mark_for_revalidate(data->inode);
 	}
-	spin_lock(&data->inode->i_lock);
-	NFS_I(data->inode)->cache_validity |= NFS_INO_INVALID_ATIME;
-	spin_unlock(&data->inode->i_lock);
 	return 0;
 }
 
@@ -497,8 +494,7 @@ int nfs_readpage(struct file *file, stru
 		if (ctx == NULL)
 			goto out_unlock;
 	} else
-		ctx = get_nfs_open_context((struct nfs_open_context *)
-				file->private_data);
+		ctx = get_nfs_open_context(nfs_file_open_context(file));
 
 	error = nfs_readpage_async(ctx, inode, page);
 
@@ -576,8 +572,7 @@ int nfs_readpages(struct file *filp, str
 		if (desc.ctx == NULL)
 			return -EBADF;
 	} else
-		desc.ctx = get_nfs_open_context((struct nfs_open_context *)
-				filp->private_data);
+		desc.ctx = get_nfs_open_context(nfs_file_open_context(filp));
 	if (rsize < PAGE_CACHE_SIZE)
 		nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
 	else
diff -puN fs/nfs/super.c~git-nfs fs/nfs/super.c
--- a/fs/nfs/super.c~git-nfs
+++ a/fs/nfs/super.c
@@ -33,6 +33,8 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/stats.h>
 #include <linux/sunrpc/metrics.h>
+#include <linux/sunrpc/xprtsock.h>
+#include <linux/sunrpc/xprtrdma.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
 #include <linux/nfs4_mount.h>
@@ -58,36 +60,6 @@
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
-
-struct nfs_parsed_mount_data {
-	int			flags;
-	int			rsize, wsize;
-	int			timeo, retrans;
-	int			acregmin, acregmax,
-				acdirmin, acdirmax;
-	int			namlen;
-	unsigned int		bsize;
-	unsigned int		auth_flavor_len;
-	rpc_authflavor_t	auth_flavors[1];
-	char			*client_address;
-
-	struct {
-		struct sockaddr_in	address;
-		unsigned int		program;
-		unsigned int		version;
-		unsigned short		port;
-		int			protocol;
-	} mount_server;
-
-	struct {
-		struct sockaddr_in	address;
-		char			*hostname;
-		char			*export_path;
-		unsigned int		program;
-		int			protocol;
-	} nfs_server;
-};
-
 enum {
 	/* Mount options that take no arguments */
 	Opt_soft, Opt_hard,
@@ -97,7 +69,7 @@ enum {
 	Opt_ac, Opt_noac,
 	Opt_lock, Opt_nolock,
 	Opt_v2, Opt_v3,
-	Opt_udp, Opt_tcp,
+	Opt_udp, Opt_tcp, Opt_rdma,
 	Opt_acl, Opt_noacl,
 	Opt_rdirplus, Opt_nordirplus,
 	Opt_sharecache, Opt_nosharecache,
@@ -116,7 +88,7 @@ enum {
 
 	/* Mount options that take string arguments */
 	Opt_sec, Opt_proto, Opt_mountproto,
-	Opt_addr, Opt_mounthost, Opt_clientaddr,
+	Opt_addr, Opt_mountaddr, Opt_clientaddr,
 
 	/* Mount options that are ignored */
 	Opt_userspace, Opt_deprecated,
@@ -143,6 +115,7 @@ static match_table_t nfs_mount_option_to
 	{ Opt_v3, "v3" },
 	{ Opt_udp, "udp" },
 	{ Opt_tcp, "tcp" },
+	{ Opt_rdma, "rdma" },
 	{ Opt_acl, "acl" },
 	{ Opt_noacl, "noacl" },
 	{ Opt_rdirplus, "rdirplus" },
@@ -175,13 +148,14 @@ static match_table_t nfs_mount_option_to
 	{ Opt_mountproto, "mountproto=%s" },
 	{ Opt_addr, "addr=%s" },
 	{ Opt_clientaddr, "clientaddr=%s" },
-	{ Opt_mounthost, "mounthost=%s" },
+	{ Opt_userspace, "mounthost=%s" },
+	{ Opt_mountaddr, "mountaddr=%s" },
 
 	{ Opt_err, NULL }
 };
 
 enum {
-	Opt_xprt_udp, Opt_xprt_tcp,
+	Opt_xprt_udp, Opt_xprt_tcp, Opt_xprt_rdma,
 
 	Opt_xprt_err
 };
@@ -189,6 +163,7 @@ enum {
 static match_table_t nfs_xprt_protocol_tokens = {
 	{ Opt_xprt_udp, "udp" },
 	{ Opt_xprt_tcp, "tcp" },
+	{ Opt_xprt_rdma, "rdma" },
 
 	{ Opt_xprt_err, NULL }
 };
@@ -449,7 +424,7 @@ static void nfs_show_mount_options(struc
 		const char *nostr;
 	} nfs_info[] = {
 		{ NFS_MOUNT_SOFT, ",soft", ",hard" },
-		{ NFS_MOUNT_INTR, ",intr", "" },
+		{ NFS_MOUNT_INTR, ",intr", ",nointr" },
 		{ NFS_MOUNT_NOCTO, ",nocto", "" },
 		{ NFS_MOUNT_NOAC, ",noac", "" },
 		{ NFS_MOUNT_NONLM, ",nolock", "" },
@@ -460,8 +435,6 @@ static void nfs_show_mount_options(struc
 	};
 	const struct proc_nfs_info *nfs_infop;
 	struct nfs_client *clp = nfss->nfs_client;
-	char buf[12];
-	const char *proto;
 
 	seq_printf(m, ",vers=%d", clp->rpc_ops->version);
 	seq_printf(m, ",rsize=%d", nfss->rsize);
@@ -480,18 +453,8 @@ static void nfs_show_mount_options(struc
 		else
 			seq_puts(m, nfs_infop->nostr);
 	}
-	switch (nfss->client->cl_xprt->prot) {
-		case IPPROTO_TCP:
-			proto = "tcp";
-			break;
-		case IPPROTO_UDP:
-			proto = "udp";
-			break;
-		default:
-			snprintf(buf, sizeof(buf), "%u", nfss->client->cl_xprt->prot);
-			proto = buf;
-	}
-	seq_printf(m, ",proto=%s", proto);
+	seq_printf(m, ",proto=%s",
+		   rpc_peeraddr2str(nfss->client, RPC_DISPLAY_PROTO));
 	seq_printf(m, ",timeo=%lu", 10U * clp->retrans_timeo / HZ);
 	seq_printf(m, ",retrans=%u", clp->retrans_count);
 	seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor));
@@ -506,8 +469,8 @@ static int nfs_show_options(struct seq_f
 
 	nfs_show_mount_options(m, nfss, 0);
 
-	seq_puts(m, ",addr=");
-	seq_escape(m, nfss->nfs_client->cl_hostname, " \t\n\\");
+	seq_printf(m, ",addr="NIPQUAD_FMT,
+		NIPQUAD(nfss->nfs_client->cl_addr.sin_addr));
 
 	return 0;
 }
@@ -698,13 +661,19 @@ static int nfs_parse_mount_options(char 
 			break;
 		case Opt_udp:
 			mnt->flags &= ~NFS_MOUNT_TCP;
-			mnt->nfs_server.protocol = IPPROTO_UDP;
+			mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
 			mnt->timeo = 7;
 			mnt->retrans = 5;
 			break;
 		case Opt_tcp:
 			mnt->flags |= NFS_MOUNT_TCP;
-			mnt->nfs_server.protocol = IPPROTO_TCP;
+			mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+			mnt->timeo = 600;
+			mnt->retrans = 2;
+			break;
+		case Opt_rdma:
+			mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */
+			mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
 			mnt->timeo = 600;
 			mnt->retrans = 2;
 			break;
@@ -913,13 +882,20 @@ static int nfs_parse_mount_options(char 
 			switch (token) {
 			case Opt_xprt_udp:
 				mnt->flags &= ~NFS_MOUNT_TCP;
-				mnt->nfs_server.protocol = IPPROTO_UDP;
+				mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
 				mnt->timeo = 7;
 				mnt->retrans = 5;
 				break;
 			case Opt_xprt_tcp:
 				mnt->flags |= NFS_MOUNT_TCP;
-				mnt->nfs_server.protocol = IPPROTO_TCP;
+				mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+				mnt->timeo = 600;
+				mnt->retrans = 2;
+				break;
+			case Opt_xprt_rdma:
+				/* vector side protocols to TCP */
+				mnt->flags |= NFS_MOUNT_TCP;
+				mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
 				mnt->timeo = 600;
 				mnt->retrans = 2;
 				break;
@@ -937,11 +913,12 @@ static int nfs_parse_mount_options(char 
 
 			switch (token) {
 			case Opt_xprt_udp:
-				mnt->mount_server.protocol = IPPROTO_UDP;
+				mnt->mount_server.protocol = XPRT_TRANSPORT_UDP;
 				break;
 			case Opt_xprt_tcp:
-				mnt->mount_server.protocol = IPPROTO_TCP;
+				mnt->mount_server.protocol = XPRT_TRANSPORT_TCP;
 				break;
+			case Opt_xprt_rdma: /* not used for side protocols */
 			default:
 				goto out_unrec_xprt;
 			}
@@ -961,7 +938,7 @@ static int nfs_parse_mount_options(char 
 				goto out_nomem;
 			mnt->client_address = string;
 			break;
-		case Opt_mounthost:
+		case Opt_mountaddr:
 			string = match_strdup(args);
 			if (string == NULL)
 				goto out_nomem;
@@ -1027,16 +1004,10 @@ static int nfs_try_mount(struct nfs_pars
 		sin = args->mount_server.address;
 	else
 		sin = args->nfs_server.address;
-	if (args->mount_server.port == 0) {
-		status = rpcb_getport_sync(&sin,
-					   args->mount_server.program,
-					   args->mount_server.version,
-					   args->mount_server.protocol);
-		if (status < 0)
-			goto out_err;
-		sin.sin_port = htons(status);
-	} else
-		sin.sin_port = htons(args->mount_server.port);
+	/*
+	 * autobind will be used if mount_server.port == 0
+	 */
+	sin.sin_port = htons(args->mount_server.port);
 
 	/*
 	 * Now ask the mount server to map our export path
@@ -1049,14 +1020,11 @@ static int nfs_try_mount(struct nfs_pars
 			   args->mount_server.version,
 			   args->mount_server.protocol,
 			   root_fh);
-	if (status < 0)
-		goto out_err;
-
-	return status;
+	if (status == 0)
+		return 0;
 
-out_err:
-	dfprintk(MOUNT, "NFS: unable to contact server on host "
-		 NIPQUAD_FMT "\n", NIPQUAD(sin.sin_addr.s_addr));
+	dfprintk(MOUNT, "NFS: unable to mount server " NIPQUAD_FMT
+			", error %d\n", NIPQUAD(sin.sin_addr.s_addr), status);
 	return status;
 }
 
@@ -1079,15 +1047,31 @@ out_err:
  * XXX: as far as I can tell, changing the NFS program number is not
  *      supported in the NFS client.
  */
-static int nfs_validate_mount_data(struct nfs_mount_data **options,
+static int nfs_validate_mount_data(void *options,
+				   struct nfs_parsed_mount_data *args,
 				   struct nfs_fh *mntfh,
 				   const char *dev_name)
 {
-	struct nfs_mount_data *data = *options;
+	struct nfs_mount_data *data = (struct nfs_mount_data *)options;
 
 	if (data == NULL)
 		goto out_no_data;
 
+	memset(args, 0, sizeof(*args));
+	args->flags		= (NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
+	args->rsize		= NFS_MAX_FILE_IO_SIZE;
+	args->wsize		= NFS_MAX_FILE_IO_SIZE;
+	args->timeo		= 600;
+	args->retrans		= 2;
+	args->acregmin		= 3;
+	args->acregmax		= 60;
+	args->acdirmin		= 30;
+	args->acdirmax		= 60;
+	args->mount_server.protocol = XPRT_TRANSPORT_UDP;
+	args->mount_server.program = NFS_MNT_PROGRAM;
+	args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+	args->nfs_server.program = NFS_PROGRAM;
+
 	switch (data->version) {
 	case 1:
 		data->namlen = 0;
@@ -1116,92 +1100,73 @@ static int nfs_validate_mount_data(struc
 		if (mntfh->size < sizeof(mntfh->data))
 			memset(mntfh->data + mntfh->size, 0,
 			       sizeof(mntfh->data) - mntfh->size);
+
+		if (!nfs_verify_server_address((struct sockaddr *) &data->addr))
+			goto out_no_address;
+
+		/*
+		 * Translate to nfs_parsed_mount_data, which nfs_fill_super
+		 * can deal with.
+		 */
+		args->flags		= data->flags;
+		args->rsize		= data->rsize;
+		args->wsize		= data->wsize;
+		args->flags		= data->flags;
+		args->timeo		= data->timeo;
+		args->retrans		= data->retrans;
+		args->acregmin		= data->acregmin;
+		args->acregmax		= data->acregmax;
+		args->acdirmin		= data->acdirmin;
+		args->acdirmax		= data->acdirmax;
+		args->nfs_server.address = data->addr;
+		if (!(data->flags & NFS_MOUNT_TCP))
+			args->nfs_server.protocol = XPRT_TRANSPORT_UDP;
+		/* N.B. caller will free nfs_server.hostname in all cases */
+		args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL);
+		args->namlen		= data->namlen;
+		args->bsize		= data->bsize;
+		args->auth_flavors[0]	= data->pseudoflavor;
 		break;
 	default: {
 		unsigned int len;
 		char *c;
 		int status;
-		struct nfs_parsed_mount_data args = {
-			.flags		= (NFS_MOUNT_VER3 | NFS_MOUNT_TCP),
-			.rsize		= NFS_MAX_FILE_IO_SIZE,
-			.wsize		= NFS_MAX_FILE_IO_SIZE,
-			.timeo		= 600,
-			.retrans	= 2,
-			.acregmin	= 3,
-			.acregmax	= 60,
-			.acdirmin	= 30,
-			.acdirmax	= 60,
-			.mount_server.protocol = IPPROTO_UDP,
-			.mount_server.program = NFS_MNT_PROGRAM,
-			.nfs_server.protocol = IPPROTO_TCP,
-			.nfs_server.program = NFS_PROGRAM,
-		};
 
-		if (nfs_parse_mount_options((char *) *options, &args) == 0)
+		if (nfs_parse_mount_options((char *)options, args) == 0)
 			return -EINVAL;
 
-		data = kzalloc(sizeof(*data), GFP_KERNEL);
-		if (data == NULL)
-			return -ENOMEM;
-
-		/*
-		 * NB: after this point, caller will free "data"
-		 * if we return an error
-		 */
-		*options = data;
+		if (!nfs_verify_server_address((struct sockaddr *)
+						&args->nfs_server.address))
+			goto out_no_address;
 
 		c = strchr(dev_name, ':');
 		if (c == NULL)
 			return -EINVAL;
 		len = c - dev_name;
-		if (len > sizeof(data->hostname))
-			return -ENAMETOOLONG;
-		strncpy(data->hostname, dev_name, len);
-		args.nfs_server.hostname = data->hostname;
+		/* N.B. caller will free nfs_server.hostname in all cases */
+		args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL);
 
 		c++;
 		if (strlen(c) > NFS_MAXPATHLEN)
 			return -ENAMETOOLONG;
-		args.nfs_server.export_path = c;
+		args->nfs_server.export_path = c;
 
-		status = nfs_try_mount(&args, mntfh);
+		status = nfs_try_mount(args, mntfh);
 		if (status)
 			return status;
 
-		/*
-		 * Translate to nfs_mount_data, which nfs_fill_super
-		 * can deal with.
-		 */
-		data->version		= 6;
-		data->flags		= args.flags;
-		data->rsize		= args.rsize;
-		data->wsize		= args.wsize;
-		data->timeo		= args.timeo;
-		data->retrans		= args.retrans;
-		data->acregmin		= args.acregmin;
-		data->acregmax		= args.acregmax;
-		data->acdirmin		= args.acdirmin;
-		data->acdirmax		= args.acdirmax;
-		data->addr		= args.nfs_server.address;
-		data->namlen		= args.namlen;
-		data->bsize		= args.bsize;
-		data->pseudoflavor	= args.auth_flavors[0];
-
 		break;
 		}
 	}
 
-	if (!(data->flags & NFS_MOUNT_SECFLAVOUR))
-		data->pseudoflavor = RPC_AUTH_UNIX;
+	if (!(args->flags & NFS_MOUNT_SECFLAVOUR))
+		args->auth_flavors[0] = RPC_AUTH_UNIX;
 
 #ifndef CONFIG_NFS_V3
-	if (data->flags & NFS_MOUNT_VER3)
+	if (args->flags & NFS_MOUNT_VER3)
 		goto out_v3_not_compiled;
 #endif /* !CONFIG_NFS_V3 */
 
-	if (!nfs_verify_server_address((struct sockaddr *) &data->addr))
-		goto out_no_address;
-
 	return 0;
 
 out_no_data:
@@ -1258,7 +1223,8 @@ static inline void nfs_initialise_sb(str
 /*
  * Finish setting up an NFS2/3 superblock
  */
-static void nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data)
+static void nfs_fill_super(struct super_block *sb,
+			   struct nfs_parsed_mount_data *data)
 {
 	struct nfs_server *server = NFS_SB(sb);
 
@@ -1379,7 +1345,7 @@ static int nfs_get_sb(struct file_system
 	struct nfs_server *server = NULL;
 	struct super_block *s;
 	struct nfs_fh mntfh;
-	struct nfs_mount_data *data = raw_data;
+	struct nfs_parsed_mount_data data;
 	struct dentry *mntroot;
 	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
 	struct nfs_sb_mountdata sb_mntdata = {
@@ -1388,12 +1354,12 @@ static int nfs_get_sb(struct file_system
 	int error;
 
 	/* Validate the mount data */
-	error = nfs_validate_mount_data(&data, &mntfh, dev_name);
+	error = nfs_validate_mount_data(raw_data, &data, &mntfh, dev_name);
 	if (error < 0)
 		goto out;
 
 	/* Get a volume representation */
-	server = nfs_create_server(data, &mntfh);
+	server = nfs_create_server(&data, &mntfh);
 	if (IS_ERR(server)) {
 		error = PTR_ERR(server);
 		goto out;
@@ -1417,7 +1383,7 @@ static int nfs_get_sb(struct file_system
 
 	if (!s->s_root) {
 		/* initial superblock/root creation */
-		nfs_fill_super(s, data);
+		nfs_fill_super(s, &data);
 	}
 
 	mntroot = nfs_get_root(s, &mntfh);
@@ -1432,8 +1398,7 @@ static int nfs_get_sb(struct file_system
 	error = 0;
 
 out:
-	if (data != raw_data)
-		kfree(data);
+	kfree(data.nfs_server.hostname);
 	return error;
 
 out_err_nosb:
@@ -1559,38 +1524,49 @@ static void nfs4_fill_super(struct super
 /*
  * Validate NFSv4 mount options
  */
-static int nfs4_validate_mount_data(struct nfs4_mount_data **options,
-				    const char *dev_name,
-				    struct sockaddr_in *addr,
-				    rpc_authflavor_t *authflavour,
-				    char **hostname,
-				    char **mntpath,
-				    char **ip_addr)
+static int nfs4_validate_mount_data(void *options,
+				    struct nfs_parsed_mount_data *args,
+				    const char *dev_name)
 {
-	struct nfs4_mount_data *data = *options;
+	struct nfs4_mount_data *data = (struct nfs4_mount_data *)options;
 	char *c;
 
 	if (data == NULL)
 		goto out_no_data;
 
+	memset(args, 0, sizeof(*args));
+	args->rsize		= NFS_MAX_FILE_IO_SIZE;
+	args->wsize		= NFS_MAX_FILE_IO_SIZE;
+	args->timeo		= 600;
+	args->retrans		= 2;
+	args->acregmin		= 3;
+	args->acregmax		= 60;
+	args->acdirmin		= 30;
+	args->acdirmax		= 60;
+	args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+
 	switch (data->version) {
 	case 1:
-		if (data->host_addrlen != sizeof(*addr))
+		if (data->host_addrlen != sizeof(args->nfs_server.address))
 			goto out_no_address;
-		if (copy_from_user(addr, data->host_addr, sizeof(*addr)))
+		if (copy_from_user(&args->nfs_server.address,
+				   data->host_addr,
+				   sizeof(args->nfs_server.address)))
 			return -EFAULT;
-		if (addr->sin_port == 0)
-			addr->sin_port = htons(NFS_PORT);
-		if (!nfs_verify_server_address((struct sockaddr *) addr))
+		if (args->nfs_server.address.sin_port == 0)
+			args->nfs_server.address.sin_port = htons(NFS_PORT);
+		if (!nfs_verify_server_address((struct sockaddr *)
+						&args->nfs_server.address))
 			goto out_no_address;
 
 		switch (data->auth_flavourlen) {
 		case 0:
-			*authflavour = RPC_AUTH_UNIX;
+			args->auth_flavors[0] = RPC_AUTH_UNIX;
 			break;
 		case 1:
-			if (copy_from_user(authflavour, data->auth_flavours,
-					   sizeof(*authflavour)))
+			if (copy_from_user(&args->auth_flavors[0],
+					   data->auth_flavours,
+					   sizeof(args->auth_flavors[0])))
 				return -EFAULT;
 			break;
 		default:
@@ -1600,75 +1576,57 @@ static int nfs4_validate_mount_data(stru
 		c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
 		if (IS_ERR(c))
 			return PTR_ERR(c);
-		*hostname = c;
+		args->nfs_server.hostname = c;
 
 		c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN);
 		if (IS_ERR(c))
 			return PTR_ERR(c);
-		*mntpath = c;
-		dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *mntpath);
+		args->nfs_server.export_path = c;
+		dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c);
 
 		c = strndup_user(data->client_addr.data, 16);
 		if (IS_ERR(c))
 			return PTR_ERR(c);
-		*ip_addr = c;
+		args->client_address = c;
+
+		/*
+		 * Translate to nfs_parsed_mount_data, which nfs4_fill_super
+		 * can deal with.
+		 */
+
+		args->flags	= data->flags & NFS4_MOUNT_FLAGMASK;
+		args->rsize	= data->rsize;
+		args->wsize	= data->wsize;
+		args->timeo	= data->timeo;
+		args->retrans	= data->retrans;
+		args->acregmin	= data->acregmin;
+		args->acregmax	= data->acregmax;
+		args->acdirmin	= data->acdirmin;
+		args->acdirmax	= data->acdirmax;
+		args->nfs_server.protocol = data->proto;
 
 		break;
 	default: {
 		unsigned int len;
-		struct nfs_parsed_mount_data args = {
-			.rsize		= NFS_MAX_FILE_IO_SIZE,
-			.wsize		= NFS_MAX_FILE_IO_SIZE,
-			.timeo		= 600,
-			.retrans	= 2,
-			.acregmin	= 3,
-			.acregmax	= 60,
-			.acdirmin	= 30,
-			.acdirmax	= 60,
-			.nfs_server.protocol = IPPROTO_TCP,
-		};
 
-		if (nfs_parse_mount_options((char *) *options, &args) == 0)
+		if (nfs_parse_mount_options((char *)options, args) == 0)
 			return -EINVAL;
 
 		if (!nfs_verify_server_address((struct sockaddr *)
-						&args.nfs_server.address))
+						&args->nfs_server.address))
 			return -EINVAL;
-		*addr = args.nfs_server.address;
 
-		switch (args.auth_flavor_len) {
+		switch (args->auth_flavor_len) {
 		case 0:
-			*authflavour = RPC_AUTH_UNIX;
+			args->auth_flavors[0] = RPC_AUTH_UNIX;
 			break;
 		case 1:
-			*authflavour = (rpc_authflavor_t) args.auth_flavors[0];
 			break;
 		default:
 			goto out_inval_auth;
 		}
 
 		/*
-		 * Translate to nfs4_mount_data, which nfs4_fill_super
-		 * can deal with.
-		 */
-		data = kzalloc(sizeof(*data), GFP_KERNEL);
-		if (data == NULL)
-			return -ENOMEM;
-		*options = data;
-
-		data->version	= 1;
-		data->flags	= args.flags & NFS4_MOUNT_FLAGMASK;
-		data->rsize	= args.rsize;
-		data->wsize	= args.wsize;
-		data->timeo	= args.timeo;
-		data->retrans	= args.retrans;
-		data->acregmin	= args.acregmin;
-		data->acregmax	= args.acregmax;
-		data->acdirmin	= args.acdirmin;
-		data->acdirmax	= args.acdirmax;
-		data->proto	= args.nfs_server.protocol;
-
-		/*
 		 * Split "dev_name" into "hostname:mntpath".
 		 */
 		c = strchr(dev_name, ':');
@@ -1678,27 +1636,25 @@ static int nfs4_validate_mount_data(stru
 		len = c - dev_name;
 		if (len > NFS4_MAXNAMLEN)
 			return -ENAMETOOLONG;
-		*hostname = kzalloc(len, GFP_KERNEL);
-		if (*hostname == NULL)
+		args->nfs_server.hostname = kzalloc(len, GFP_KERNEL);
+		if (args->nfs_server.hostname == NULL)
 			return -ENOMEM;
-		strncpy(*hostname, dev_name, len - 1);
+		strncpy(args->nfs_server.hostname, dev_name, len - 1);
 
 		c++;			/* step over the ':' */
 		len = strlen(c);
 		if (len > NFS4_MAXPATHLEN)
 			return -ENAMETOOLONG;
-		*mntpath = kzalloc(len + 1, GFP_KERNEL);
-		if (*mntpath == NULL)
+		args->nfs_server.export_path = kzalloc(len + 1, GFP_KERNEL);
+		if (args->nfs_server.export_path == NULL)
 			return -ENOMEM;
-		strncpy(*mntpath, c, len);
+		strncpy(args->nfs_server.export_path, c, len);
 
-		dprintk("MNTPATH: %s\n", *mntpath);
+		dprintk("MNTPATH: %s\n", args->nfs_server.export_path);
 
-		if (args.client_address == NULL)
+		if (args->client_address == NULL)
 			goto out_no_client_address;
 
-		*ip_addr = args.client_address;
-
 		break;
 		}
 	}
@@ -1729,14 +1685,11 @@ out_no_client_address:
 static int nfs4_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
 {
-	struct nfs4_mount_data *data = raw_data;
+	struct nfs_parsed_mount_data data;
 	struct super_block *s;
 	struct nfs_server *server;
-	struct sockaddr_in addr;
-	rpc_authflavor_t authflavour;
 	struct nfs_fh mntfh;
 	struct dentry *mntroot;
-	char *mntpath = NULL, *hostname = NULL, *ip_addr = NULL;
 	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
 	struct nfs_sb_mountdata sb_mntdata = {
 		.mntflags = flags,
@@ -1744,14 +1697,12 @@ static int nfs4_get_sb(struct file_syste
 	int error;
 
 	/* Validate the mount data */
-	error = nfs4_validate_mount_data(&data, dev_name, &addr, &authflavour,
-					 &hostname, &mntpath, &ip_addr);
+	error = nfs4_validate_mount_data(raw_data, &data, dev_name);
 	if (error < 0)
 		goto out;
 
 	/* Get a volume representation */
-	server = nfs4_create_server(data, hostname, &addr, mntpath, ip_addr,
-				    authflavour, &mntfh);
+	server = nfs4_create_server(&data, &mntfh);
 	if (IS_ERR(server)) {
 		error = PTR_ERR(server);
 		goto out;
@@ -1790,9 +1741,9 @@ static int nfs4_get_sb(struct file_syste
 	error = 0;
 
 out:
-	kfree(ip_addr);
-	kfree(mntpath);
-	kfree(hostname);
+	kfree(data.client_address);
+	kfree(data.nfs_server.export_path);
+	kfree(data.nfs_server.hostname);
 	return error;
 
 out_free:
diff -puN fs/nfs/unlink.c~git-nfs fs/nfs/unlink.c
--- a/fs/nfs/unlink.c~git-nfs
+++ a/fs/nfs/unlink.c
@@ -66,7 +66,6 @@ static void nfs_async_unlink_init(struct
 		.rpc_cred = data->cred,
 	};
 
-	nfs_begin_data_update(dir);
 	NFS_PROTO(dir)->unlink_setup(&msg, dir);
 	rpc_call_setup(task, &msg, 0);
 }
@@ -84,8 +83,6 @@ static void nfs_async_unlink_done(struct
 
 	if (!NFS_PROTO(dir)->unlink_done(task, dir))
 		rpc_restart_call(task);
-	else
-		nfs_end_data_update(dir);
 }
 
 /**
diff -puN fs/nfs/write.c~git-nfs fs/nfs/write.c
--- a/fs/nfs/write.c~git-nfs
+++ a/fs/nfs/write.c
@@ -110,6 +110,13 @@ void nfs_writedata_release(void *wdata)
 	nfs_writedata_free(wdata);
 }
 
+static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
+{
+	ctx->error = error;
+	smp_wmb();
+	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
+}
+
 static struct nfs_page *nfs_page_find_request_locked(struct page *page)
 {
 	struct nfs_page *req = NULL;
@@ -243,10 +250,7 @@ static void nfs_end_page_writeback(struc
 
 /*
  * Find an associated nfs write request, and prepare to flush it out
- * Returns 1 if there was no write request, or if the request was
- * already tagged by nfs_set_page_dirty.Returns 0 if the request
- * was not tagged.
- * May also return an error if the user signalled nfs_wait_on_request().
+ * May return an error if the user signalled nfs_wait_on_request().
  */
 static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 				struct page *page)
@@ -261,7 +265,7 @@ static int nfs_page_async_flush(struct n
 		req = nfs_page_find_request_locked(page);
 		if (req == NULL) {
 			spin_unlock(&inode->i_lock);
-			return 1;
+			return 0;
 		}
 		if (nfs_lock_request_dontget(req))
 			break;
@@ -282,7 +286,7 @@ static int nfs_page_async_flush(struct n
 		spin_unlock(&inode->i_lock);
 		nfs_unlock_request(req);
 		nfs_pageio_complete(pgio);
-		return 1;
+		return 0;
 	}
 	if (nfs_set_page_writeback(page) != 0) {
 		spin_unlock(&inode->i_lock);
@@ -290,70 +294,56 @@ static int nfs_page_async_flush(struct n
 	}
 	radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
 			NFS_PAGE_TAG_LOCKED);
-	ret = test_bit(PG_NEED_FLUSH, &req->wb_flags);
 	spin_unlock(&inode->i_lock);
 	nfs_pageio_add_request(pgio, req);
-	return ret;
+	return 0;
 }
 
-/*
- * Write an mmapped page to the server.
- */
-static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
+static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
 {
-	struct nfs_pageio_descriptor mypgio, *pgio;
-	struct nfs_open_context *ctx;
 	struct inode *inode = page->mapping->host;
-	unsigned offset;
-	int err;
 
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
 	nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
 
-	if (wbc->for_writepages)
-		pgio = wbc->fs_private;
-	else {
-		nfs_pageio_init_write(&mypgio, inode, wb_priority(wbc));
-		pgio = &mypgio;
-	}
-
 	nfs_pageio_cond_complete(pgio, page->index);
+	return nfs_page_async_flush(pgio, page);
+}
 
-	err = nfs_page_async_flush(pgio, page);
-	if (err <= 0)
-		goto out;
-	err = 0;
-	offset = nfs_page_length(page);
-	if (!offset)
-		goto out;
-
-	nfs_pageio_cond_complete(pgio, page->index);
+/*
+ * Write an mmapped page to the server.
+ */
+static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
+{
+	struct nfs_pageio_descriptor pgio;
+	int err;
 
-	ctx = nfs_find_open_context(inode, NULL, FMODE_WRITE);
-	if (ctx == NULL) {
-		err = -EBADF;
-		goto out;
-	}
-	err = nfs_writepage_setup(ctx, page, 0, offset);
-	put_nfs_open_context(ctx);
-	if (err != 0)
-		goto out;
-	err = nfs_page_async_flush(pgio, page);
-	if (err > 0)
-		err = 0;
-out:
-	if (!wbc->for_writepages)
-		nfs_pageio_complete(pgio);
-	return err;
+	nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc));
+	err = nfs_do_writepage(page, wbc, &pgio);
+	nfs_pageio_complete(&pgio);
+	if (err < 0)
+		return err;
+	if (pgio.pg_error < 0)
+		return pgio.pg_error;
+	return 0;
 }
 
 int nfs_writepage(struct page *page, struct writeback_control *wbc)
 {
-	int err;
+	int ret;
+
+	ret = nfs_writepage_locked(page, wbc);
+	unlock_page(page);
+	return ret;
+}
+
+static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data)
+{
+	int ret;
 
-	err = nfs_writepage_locked(page, wbc);
+	ret = nfs_do_writepage(page, wbc, data);
 	unlock_page(page);
-	return err; 
+	return ret;
 }
 
 int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
@@ -365,12 +355,11 @@ int nfs_writepages(struct address_space 
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
 
 	nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
-	wbc->fs_private = &pgio;
-	err = generic_writepages(mapping, wbc);
+	err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
 	nfs_pageio_complete(&pgio);
-	if (err)
+	if (err < 0)
 		return err;
-	if (pgio.pg_error)
+	if (pgio.pg_error < 0)
 		return pgio.pg_error;
 	return 0;
 }
@@ -389,14 +378,11 @@ static int nfs_inode_add_request(struct 
 		return error;
 	if (!nfsi->npages) {
 		igrab(inode);
-		nfs_begin_data_update(inode);
 		if (nfs_have_delegation(inode, FMODE_WRITE))
 			nfsi->change_attr++;
 	}
 	SetPagePrivate(req->wb_page);
 	set_page_private(req->wb_page, (unsigned long)req);
-	if (PageDirty(req->wb_page))
-		set_bit(PG_NEED_FLUSH, &req->wb_flags);
 	nfsi->npages++;
 	kref_get(&req->wb_kref);
 	return 0;
@@ -416,12 +402,9 @@ static void nfs_inode_remove_request(str
 	set_page_private(req->wb_page, 0);
 	ClearPagePrivate(req->wb_page);
 	radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
-	if (test_and_clear_bit(PG_NEED_FLUSH, &req->wb_flags))
-		__set_page_dirty_nobuffers(req->wb_page);
 	nfsi->npages--;
 	if (!nfsi->npages) {
 		spin_unlock(&inode->i_lock);
-		nfs_end_data_update(inode);
 		iput(inode);
 	} else
 		spin_unlock(&inode->i_lock);
@@ -682,7 +665,7 @@ static struct nfs_page * nfs_update_requ
 
 int nfs_flush_incompatible(struct file *file, struct page *page)
 {
-	struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
+	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	struct nfs_page	*req;
 	int do_flush, status;
 	/*
@@ -716,7 +699,7 @@ int nfs_flush_incompatible(struct file *
 int nfs_updatepage(struct file *file, struct page *page,
 		unsigned int offset, unsigned int count)
 {
-	struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
+	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	struct inode	*inode = page->mapping->host;
 	int		status = 0;
 
@@ -967,7 +950,7 @@ static void nfs_writeback_done_partial(s
 
 	if (task->tk_status < 0) {
 		nfs_set_pageerror(page);
-		req->wb_context->error = task->tk_status;
+		nfs_context_set_write_error(req->wb_context, task->tk_status);
 		dprintk(", error = %d\n", task->tk_status);
 		goto out;
 	}
@@ -1030,7 +1013,7 @@ static void nfs_writeback_done_full(stru
 
 		if (task->tk_status < 0) {
 			nfs_set_pageerror(page);
-			req->wb_context->error = task->tk_status;
+			nfs_context_set_write_error(req->wb_context, task->tk_status);
 			dprintk(", error = %d\n", task->tk_status);
 			goto remove_request;
 		}
@@ -1244,7 +1227,7 @@ static void nfs_commit_done(struct rpc_t
 			req->wb_bytes,
 			(long long)req_offset(req));
 		if (task->tk_status < 0) {
-			req->wb_context->error = task->tk_status;
+			nfs_context_set_write_error(req->wb_context, task->tk_status);
 			nfs_inode_remove_request(req);
 			dprintk(", error = %d\n", task->tk_status);
 			goto next;
@@ -1347,53 +1330,52 @@ long nfs_sync_mapping_wait(struct addres
 	return ret;
 }
 
-/*
- * flush the inode to disk.
- */
-int nfs_wb_all(struct inode *inode)
+static int __nfs_write_mapping(struct address_space *mapping, struct writeback_control *wbc, int how)
 {
-	struct address_space *mapping = inode->i_mapping;
-	struct writeback_control wbc = {
-		.bdi = mapping->backing_dev_info,
-		.sync_mode = WB_SYNC_ALL,
-		.nr_to_write = LONG_MAX,
-		.for_writepages = 1,
-		.range_cyclic = 1,
-	};
 	int ret;
 
-	ret = nfs_writepages(mapping, &wbc);
+	ret = nfs_writepages(mapping, wbc);
 	if (ret < 0)
 		goto out;
-	ret = nfs_sync_mapping_wait(mapping, &wbc, 0);
-	if (ret >= 0)
-		return 0;
+	ret = nfs_sync_mapping_wait(mapping, wbc, how);
+	if (ret < 0)
+		goto out;
+	return 0;
 out:
 	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 	return ret;
 }
 
-int nfs_sync_mapping_range(struct address_space *mapping, loff_t range_start, loff_t range_end, int how)
+/* Two pass sync: first using WB_SYNC_NONE, then WB_SYNC_ALL */
+static int nfs_write_mapping(struct address_space *mapping, int how)
 {
 	struct writeback_control wbc = {
 		.bdi = mapping->backing_dev_info,
-		.sync_mode = WB_SYNC_ALL,
+		.sync_mode = WB_SYNC_NONE,
 		.nr_to_write = LONG_MAX,
-		.range_start = range_start,
-		.range_end = range_end,
 		.for_writepages = 1,
+		.range_cyclic = 1,
 	};
 	int ret;
 
-	ret = nfs_writepages(mapping, &wbc);
+	ret = __nfs_write_mapping(mapping, &wbc, how);
 	if (ret < 0)
-		goto out;
-	ret = nfs_sync_mapping_wait(mapping, &wbc, how);
-	if (ret >= 0)
-		return 0;
-out:
-	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-	return ret;
+		return ret;
+	wbc.sync_mode = WB_SYNC_ALL;
+	return __nfs_write_mapping(mapping, &wbc, how);
+}
+
+/*
+ * flush the inode to disk.
+ */
+int nfs_wb_all(struct inode *inode)
+{
+	return nfs_write_mapping(inode->i_mapping, 0);
+}
+
+int nfs_wb_nocommit(struct inode *inode)
+{
+	return nfs_write_mapping(inode->i_mapping, FLUSH_NOCOMMIT);
 }
 
 int nfs_wb_page_cancel(struct inode *inode, struct page *page)
@@ -1477,35 +1459,6 @@ int nfs_wb_page(struct inode *inode, str
 	return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
 }
 
-int nfs_set_page_dirty(struct page *page)
-{
-	struct address_space *mapping = page->mapping;
-	struct inode *inode;
-	struct nfs_page *req;
-	int ret;
-
-	if (!mapping)
-		goto out_raced;
-	inode = mapping->host;
-	if (!inode)
-		goto out_raced;
-	spin_lock(&inode->i_lock);
-	req = nfs_page_find_request_locked(page);
-	if (req != NULL) {
-		/* Mark any existing write requests for flushing */
-		ret = !test_and_set_bit(PG_NEED_FLUSH, &req->wb_flags);
-		spin_unlock(&inode->i_lock);
-		nfs_release_request(req);
-		return ret;
-	}
-	ret = __set_page_dirty_nobuffers(page);
-	spin_unlock(&inode->i_lock);
-	return ret;
-out_raced:
-	return !TestSetPageDirty(page);
-}
-
-
 int __init nfs_init_writepagecache(void)
 {
 	nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
diff -puN fs/nfsd/nfs4xdr.c~git-nfs fs/nfsd/nfs4xdr.c
--- a/fs/nfsd/nfs4xdr.c~git-nfs
+++ a/fs/nfsd/nfs4xdr.c
@@ -102,7 +102,8 @@ check_filename(char *str, int len, __be3
 out:						\
 	return status;				\
 xdr_error:					\
-	printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__);	\
+	dprintk("NFSD: xdr error (%s:%d)\n",	\
+			__FILE__, __LINE__);	\
 	status = nfserr_bad_xdr;		\
 	goto out
 
@@ -124,7 +125,8 @@ xdr_error:					\
 	if (!(x = (p==argp->tmp || p == argp->tmpp) ? \
  		savemem(argp, p, nbytes) :	\
  		(char *)p)) {			\
-		printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); \
+		dprintk("NFSD: xdr error (%s:%d)\n", \
+				__FILE__, __LINE__); \
 		goto xdr_error;			\
 		}				\
 	p += XDR_QUADLEN(nbytes);		\
@@ -140,7 +142,8 @@ xdr_error:					\
 		p = argp->p;			\
 		argp->p += XDR_QUADLEN(nbytes);	\
 	} else if (!(p = read_buf(argp, nbytes))) { \
-		printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); \
+		dprintk("NFSD: xdr error (%s:%d)\n", \
+				__FILE__, __LINE__); \
 		goto xdr_error;			\
 	}					\
 } while (0)
@@ -948,7 +951,8 @@ nfsd4_decode_write(struct nfsd4_compound
 	 */
 	avail = (char*)argp->end - (char*)argp->p;
 	if (avail + argp->pagelen < write->wr_buflen) {
-		printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); 
+		dprintk("NFSD: xdr error (%s:%d)\n",
+				__FILE__, __LINE__);
 		goto xdr_error;
 	}
 	argp->rqstp->rq_vec[0].iov_base = p;
@@ -1019,7 +1023,7 @@ nfsd4_decode_compound(struct nfsd4_compo
 		argp->ops = kmalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL);
 		if (!argp->ops) {
 			argp->ops = argp->iops;
-			printk(KERN_INFO "nfsd: couldn't allocate room for COMPOUND\n");
+			dprintk("nfsd: couldn't allocate room for COMPOUND\n");
 			goto xdr_error;
 		}
 	}
@@ -1326,7 +1330,7 @@ static char *nfsd4_path(struct svc_rqst 
 	path = exp->ex_path;
 
 	if (strncmp(path, rootpath, strlen(rootpath))) {
-		printk("nfsd: fs_locations failed;"
+		dprintk("nfsd: fs_locations failed;"
 			"%s is not contained in %s\n", path, rootpath);
 		*stat = nfserr_notsupp;
 		return NULL;
diff -puN include/linux/jiffies.h~git-nfs include/linux/jiffies.h
--- a/include/linux/jiffies.h~git-nfs
+++ a/include/linux/jiffies.h
@@ -115,6 +115,10 @@ static inline u64 get_jiffies_64(void)
 	 ((long)(a) - (long)(b) >= 0))
 #define time_before_eq(a,b)	time_after_eq(b,a)
 
+#define time_in_range(a,b,c) \
+	(time_after_eq(a,b) && \
+	 time_before_eq(a,c))
+
 /* Same as above, but does so with platform independent 64bit types.
  * These must be used when utilizing jiffies_64 (i.e. return value of
  * get_jiffies_64() */
diff -puN include/linux/nfs_fs.h~git-nfs include/linux/nfs_fs.h
--- a/include/linux/nfs_fs.h~git-nfs
+++ a/include/linux/nfs_fs.h
@@ -47,10 +47,8 @@
 #include <linux/nfs3.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_xdr.h>
-
 #include <linux/nfs_fs_sb.h>
 
-#include <linux/rwsem.h>
 #include <linux/mempool.h>
 
 /*
@@ -77,6 +75,9 @@ struct nfs_open_context {
 	struct nfs4_state *state;
 	fl_owner_t lockowner;
 	int mode;
+
+	unsigned long flags;
+#define NFS_CONTEXT_ERROR_WRITE		(0)
 	int error;
 
 	struct list_head list;
@@ -133,11 +134,6 @@ struct nfs_inode {
 	 * server.
 	 */
 	unsigned long		cache_change_attribute;
-	/*
-	 * Counter indicating the number of outstanding requests that
-	 * will cause a file data update.
-	 */
-	atomic_t		data_updates;
 
 	struct rb_root		access_cache;
 	struct list_head	access_cache_entry_lru;
@@ -205,27 +201,18 @@ static inline struct nfs_inode *NFS_I(st
 #define NFS_CLIENT(inode)		(NFS_SERVER(inode)->client)
 #define NFS_PROTO(inode)		(NFS_SERVER(inode)->nfs_client->rpc_ops)
 #define NFS_COOKIEVERF(inode)		(NFS_I(inode)->cookieverf)
-#define NFS_READTIME(inode)		(NFS_I(inode)->read_cache_jiffies)
-#define NFS_CHANGE_ATTR(inode)		(NFS_I(inode)->change_attr)
-#define NFS_ATTRTIMEO(inode)		(NFS_I(inode)->attrtimeo)
 #define NFS_MINATTRTIMEO(inode) \
 	(S_ISDIR(inode->i_mode)? NFS_SERVER(inode)->acdirmin \
 			       : NFS_SERVER(inode)->acregmin)
 #define NFS_MAXATTRTIMEO(inode) \
 	(S_ISDIR(inode->i_mode)? NFS_SERVER(inode)->acdirmax \
 			       : NFS_SERVER(inode)->acregmax)
-#define NFS_ATTRTIMEO_UPDATE(inode)	(NFS_I(inode)->attrtimeo_timestamp)
 
 #define NFS_FLAGS(inode)		(NFS_I(inode)->flags)
 #define NFS_STALE(inode)		(test_bit(NFS_INO_STALE, &NFS_FLAGS(inode)))
 
 #define NFS_FILEID(inode)		(NFS_I(inode)->fileid)
 
-static inline int nfs_caches_unstable(struct inode *inode)
-{
-	return atomic_read(&NFS_I(inode)->data_updates) != 0;
-}
-
 static inline void nfs_mark_for_revalidate(struct inode *inode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
@@ -237,12 +224,6 @@ static inline void nfs_mark_for_revalida
 	spin_unlock(&inode->i_lock);
 }
 
-static inline void NFS_CACHEINV(struct inode *inode)
-{
-	if (!nfs_caches_unstable(inode))
-		nfs_mark_for_revalidate(inode);
-}
-
 static inline int nfs_server_capable(struct inode *inode, int cap)
 {
 	return NFS_SERVER(inode)->caps & cap;
@@ -253,28 +234,33 @@ static inline int NFS_USE_READDIRPLUS(st
 	return test_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
 }
 
+static inline void nfs_set_verifier(struct dentry * dentry, unsigned long verf)
+{
+	dentry->d_time = verf;
+}
+
 /**
  * nfs_save_change_attribute - Returns the inode attribute change cookie
- * @inode - pointer to inode
+ * @dir - pointer to parent directory inode
  * The "change attribute" is updated every time we finish an operation
  * that will result in a metadata change on the server.
  */
-static inline long nfs_save_change_attribute(struct inode *inode)
+static inline unsigned long nfs_save_change_attribute(struct inode *dir)
 {
-	return NFS_I(inode)->cache_change_attribute;
+	return NFS_I(dir)->cache_change_attribute;
 }
 
 /**
- * nfs_verify_change_attribute - Detects NFS inode cache updates
- * @inode - pointer to inode
+ * nfs_verify_change_attribute - Detects NFS remote directory changes
+ * @dir - pointer to parent directory inode
  * @chattr - previously saved change attribute
- * Return "false" if metadata has been updated (or is in the process of
- * being updated) since the change attribute was saved.
+ * Return "false" if the verifiers doesn't match the change attribute.
+ * This would usually indicate that the directory contents have changed on
+ * the server, and that any dentries need revalidating.
  */
-static inline int nfs_verify_change_attribute(struct inode *inode, unsigned long chattr)
+static inline int nfs_verify_change_attribute(struct inode *dir, unsigned long chattr)
 {
-	return !nfs_caches_unstable(inode)
-		&& time_after_eq(chattr, NFS_I(inode)->cache_change_attribute);
+	return chattr == NFS_I(dir)->cache_change_attribute;
 }
 
 /*
@@ -283,15 +269,14 @@ static inline int nfs_verify_change_attr
 extern int nfs_sync_mapping(struct address_space *mapping);
 extern void nfs_zap_mapping(struct inode *inode, struct address_space *mapping);
 extern void nfs_zap_caches(struct inode *);
+extern void nfs_invalidate_atime(struct inode *);
 extern struct inode *nfs_fhget(struct super_block *, struct nfs_fh *,
 				struct nfs_fattr *);
 extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *);
 extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr);
+extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr);
 extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int nfs_permission(struct inode *, int, struct nameidata *);
-extern int nfs_access_get_cached(struct inode *, struct rpc_cred *, struct nfs_access_entry *);
-extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *);
-extern void nfs_access_zap_cache(struct inode *inode);
 extern int nfs_open(struct inode *, struct file *);
 extern int nfs_release(struct inode *, struct file *);
 extern int nfs_attribute_timeout(struct inode *inode);
@@ -301,13 +286,10 @@ extern int nfs_revalidate_mapping(struct
 extern int nfs_revalidate_mapping_nolock(struct inode *inode, struct address_space *mapping);
 extern int nfs_setattr(struct dentry *, struct iattr *);
 extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr);
-extern void nfs_begin_attr_update(struct inode *);
-extern void nfs_end_attr_update(struct inode *);
-extern void nfs_begin_data_update(struct inode *);
-extern void nfs_end_data_update(struct inode *);
 extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx);
 extern void put_nfs_open_context(struct nfs_open_context *ctx);
 extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode);
+extern u64 nfs_compat_user_ino64(u64 fileid);
 
 /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */
 extern __be32 root_nfs_parse_addr(char *name); /*__init*/
@@ -328,14 +310,15 @@ extern const struct inode_operations nfs
 extern const struct file_operations nfs_file_operations;
 extern const struct address_space_operations nfs_file_aops;
 
-static inline struct rpc_cred *nfs_file_cred(struct file *file)
+static inline struct nfs_open_context *nfs_file_open_context(struct file *filp)
 {
-	if (file != NULL) {
-		struct nfs_open_context *ctx;
+	return filp->private_data;
+}
 
-		ctx = (struct nfs_open_context*)file->private_data;
-		return ctx->cred;
-	}
+static inline struct rpc_cred *nfs_file_cred(struct file *file)
+{
+	if (file != NULL)
+		return nfs_file_open_context(file)->cred;
 	return NULL;
 }
 
@@ -378,6 +361,8 @@ extern const struct file_operations nfs_
 extern struct dentry_operations nfs_dentry_operations;
 
 extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr);
+extern int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags);
+extern void nfs_access_zap_cache(struct inode *inode);
 
 /*
  * linux/fs/nfs/symlink.c
@@ -420,15 +405,14 @@ extern int  nfs_flush_incompatible(struc
 extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
 extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
 extern void nfs_writedata_release(void *);
-extern int nfs_set_page_dirty(struct page *);
 
 /*
  * Try to write back everything synchronously (but check the
  * return value!)
  */
 extern long nfs_sync_mapping_wait(struct address_space *, struct writeback_control *, int);
-extern int nfs_sync_mapping_range(struct address_space *, loff_t, loff_t, int);
 extern int nfs_wb_all(struct inode *inode);
+extern int nfs_wb_nocommit(struct inode *inode);
 extern int nfs_wb_page(struct inode *inode, struct page* page);
 extern int nfs_wb_page_priority(struct inode *inode, struct page* page, int how);
 extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
diff -puN include/linux/nfs_page.h~git-nfs include/linux/nfs_page.h
--- a/include/linux/nfs_page.h~git-nfs
+++ a/include/linux/nfs_page.h
@@ -30,7 +30,6 @@
 #define PG_BUSY			0
 #define PG_NEED_COMMIT		1
 #define PG_NEED_RESCHED		2
-#define PG_NEED_FLUSH		3
 
 struct nfs_inode;
 struct nfs_page {
diff -puN include/linux/nfs_xdr.h~git-nfs include/linux/nfs_xdr.h
--- a/include/linux/nfs_xdr.h~git-nfs
+++ a/include/linux/nfs_xdr.h
@@ -62,7 +62,8 @@ struct nfs_fattr {
 #define NFS_ATTR_FATTR		0x0002		/* post-op attributes */
 #define NFS_ATTR_FATTR_V3	0x0004		/* NFSv3 attributes */
 #define NFS_ATTR_FATTR_V4	0x0008		/* NFSv4 change attribute */
-#define NFS_ATTR_FATTR_V4_REFERRAL	0x0010		/* NFSv4 referral */
+#define NFS_ATTR_WCC_V4		0x0010		/* pre-op change attribute */
+#define NFS_ATTR_FATTR_V4_REFERRAL	0x0020		/* NFSv4 referral */
 
 /*
  * Info on the file system
@@ -538,10 +539,13 @@ typedef u64 clientid4;
 
 struct nfs4_accessargs {
 	const struct nfs_fh *		fh;
+	const u32 *			bitmask;
 	u32				access;
 };
 
 struct nfs4_accessres {
+	const struct nfs_server *	server;
+	struct nfs_fattr *		fattr;
 	u32				supported;
 	u32				access;
 };
diff -puN include/linux/sunrpc/clnt.h~git-nfs include/linux/sunrpc/clnt.h
--- a/include/linux/sunrpc/clnt.h~git-nfs
+++ a/include/linux/sunrpc/clnt.h
@@ -117,7 +117,7 @@ struct rpc_create_args {
 
 struct rpc_clnt *rpc_create(struct rpc_create_args *args);
 struct rpc_clnt	*rpc_bind_new_program(struct rpc_clnt *,
-				struct rpc_program *, int);
+				struct rpc_program *, u32);
 struct rpc_clnt *rpc_clone_client(struct rpc_clnt *);
 void		rpc_shutdown_client(struct rpc_clnt *);
 void		rpc_release_client(struct rpc_clnt *);
diff -puN include/linux/sunrpc/debug.h~git-nfs include/linux/sunrpc/debug.h
--- a/include/linux/sunrpc/debug.h~git-nfs
+++ a/include/linux/sunrpc/debug.h
@@ -88,6 +88,11 @@ enum {
 	CTL_SLOTTABLE_TCP,
 	CTL_MIN_RESVPORT,
 	CTL_MAX_RESVPORT,
+	CTL_SLOTTABLE_RDMA,
+	CTL_RDMA_MAXINLINEREAD,
+	CTL_RDMA_MAXINLINEWRITE,
+	CTL_RDMA_WRITEPADDING,
+	CTL_RDMA_MEMREG,
 };
 
 #endif /* _LINUX_SUNRPC_DEBUG_H_ */
diff -puN include/linux/sunrpc/msg_prot.h~git-nfs include/linux/sunrpc/msg_prot.h
--- a/include/linux/sunrpc/msg_prot.h~git-nfs
+++ a/include/linux/sunrpc/msg_prot.h
@@ -138,6 +138,19 @@ typedef __be32	rpc_fraghdr;
 #define RPC_MAX_HEADER_WITH_AUTH \
 	(RPC_CALLHDRSIZE + 2*(2+RPC_MAX_AUTH_SIZE/4))
 
+/*
+ * RFC1833/RFC3530 rpcbind (v3+) well-known netid's.
+ */
+#define RPCBIND_NETID_UDP	"udp"
+#define RPCBIND_NETID_TCP	"tcp"
+#define RPCBIND_NETID_UDP6	"udp6"
+#define RPCBIND_NETID_TCP6	"tcp6"
+
+/*
+ * Note that RFC 1833 does not put any size restrictions on the
+ * netid string, but all currently defined netid's fit in 4 bytes.
+ */
+#define RPCBIND_MAXNETIDLEN	(4u)
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SUNRPC_MSGPROT_H_ */
diff -puN /dev/null include/linux/sunrpc/rpc_rdma.h
--- /dev/null
+++ a/include/linux/sunrpc/rpc_rdma.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_SUNRPC_RPC_RDMA_H
+#define _LINUX_SUNRPC_RPC_RDMA_H
+
+struct rpcrdma_segment {
+	uint32_t rs_handle;	/* Registered memory handle */
+	uint32_t rs_length;	/* Length of the chunk in bytes */
+	uint64_t rs_offset;	/* Chunk virtual address or offset */
+};
+
+/*
+ * read chunk(s), encoded as a linked list.
+ */
+struct rpcrdma_read_chunk {
+	uint32_t rc_discrim;	/* 1 indicates presence */
+	uint32_t rc_position;	/* Position in XDR stream */
+	struct rpcrdma_segment rc_target;
+};
+
+/*
+ * write chunk, and reply chunk.
+ */
+struct rpcrdma_write_chunk {
+	struct rpcrdma_segment wc_target;
+};
+
+/*
+ * write chunk(s), encoded as a counted array.
+ */
+struct rpcrdma_write_array {
+	uint32_t wc_discrim;	/* 1 indicates presence */
+	uint32_t wc_nchunks;	/* Array count */
+	struct rpcrdma_write_chunk wc_array[0];
+};
+
+struct rpcrdma_msg {
+	uint32_t rm_xid;	/* Mirrors the RPC header xid */
+	uint32_t rm_vers;	/* Version of this protocol */
+	uint32_t rm_credit;	/* Buffers requested/granted */
+	uint32_t rm_type;	/* Type of message (enum rpcrdma_proc) */
+	union {
+
+		struct {			/* no chunks */
+			uint32_t rm_empty[3];	/* 3 empty chunk lists */
+		} rm_nochunks;
+
+		struct {			/* no chunks and padded */
+			uint32_t rm_align;	/* Padding alignment */
+			uint32_t rm_thresh;	/* Padding threshold */
+			uint32_t rm_pempty[3];	/* 3 empty chunk lists */
+		} rm_padded;
+
+		uint32_t rm_chunks[0];	/* read, write and reply chunks */
+
+	} rm_body;
+};
+
+#define RPCRDMA_HDRLEN_MIN	28
+
+enum rpcrdma_errcode {
+	ERR_VERS = 1,
+	ERR_CHUNK = 2
+};
+
+struct rpcrdma_err_vers {
+	uint32_t rdma_vers_low;	/* Version range supported by peer */
+	uint32_t rdma_vers_high;
+};
+
+enum rpcrdma_proc {
+	RDMA_MSG = 0,		/* An RPC call or reply msg */
+	RDMA_NOMSG = 1,		/* An RPC call or reply msg - separate body */
+	RDMA_MSGP = 2,		/* An RPC call or reply msg with padding */
+	RDMA_DONE = 3,		/* Client signals reply completion */
+	RDMA_ERROR = 4		/* An RPC RDMA encoding error */
+};
+
+#endif				/* _LINUX_SUNRPC_RPC_RDMA_H */
diff -puN include/linux/sunrpc/xdr.h~git-nfs include/linux/sunrpc/xdr.h
--- a/include/linux/sunrpc/xdr.h~git-nfs
+++ a/include/linux/sunrpc/xdr.h
@@ -70,7 +70,10 @@ struct xdr_buf {
 
 	struct page **	pages;		/* Array of contiguous pages */
 	unsigned int	page_base,	/* Start of page data */
-			page_len;	/* Length of page data */
+			page_len,	/* Length of page data */
+			flags;		/* Flags for data disposition */
+#define XDRBUF_READ		0x01		/* target of file read */
+#define XDRBUF_WRITE		0x02		/* source of file write */
 
 	unsigned int	buflen,		/* Total length of storage buffer */
 			len;		/* Length of XDR encoded message */
diff -puN include/linux/sunrpc/xprt.h~git-nfs include/linux/sunrpc/xprt.h
--- a/include/linux/sunrpc/xprt.h~git-nfs
+++ a/include/linux/sunrpc/xprt.h
@@ -19,25 +19,11 @@
 
 #ifdef __KERNEL__
 
-extern unsigned int xprt_udp_slot_table_entries;
-extern unsigned int xprt_tcp_slot_table_entries;
-
 #define RPC_MIN_SLOT_TABLE	(2U)
 #define RPC_DEF_SLOT_TABLE	(16U)
 #define RPC_MAX_SLOT_TABLE	(128U)
 
 /*
- * Parameters for choosing a free port
- */
-extern unsigned int xprt_min_resvport;
-extern unsigned int xprt_max_resvport;
-
-#define RPC_MIN_RESVPORT	(1U)
-#define RPC_MAX_RESVPORT	(65535U)
-#define RPC_DEF_MIN_RESVPORT	(665U)
-#define RPC_DEF_MAX_RESVPORT	(1023U)
-
-/*
  * This describes a timeout strategy
  */
 struct rpc_timeout {
@@ -53,6 +39,10 @@ enum rpc_display_format_t {
 	RPC_DISPLAY_PORT,
 	RPC_DISPLAY_PROTO,
 	RPC_DISPLAY_ALL,
+	RPC_DISPLAY_HEX_ADDR,
+	RPC_DISPLAY_HEX_PORT,
+	RPC_DISPLAY_UNIVERSAL_ADDR,
+	RPC_DISPLAY_NETID,
 	RPC_DISPLAY_MAX,
 };
 
@@ -196,14 +186,22 @@ struct rpc_xprt {
 	char *			address_strings[RPC_DISPLAY_MAX];
 };
 
-struct rpc_xprtsock_create {
-	int			proto;		/* IPPROTO_UDP or IPPROTO_TCP */
+struct xprt_create {
+	int			ident;		/* XPRT_TRANSPORT identifier */
 	struct sockaddr *	srcaddr;	/* optional local address */
 	struct sockaddr *	dstaddr;	/* remote peer address */
 	size_t			addrlen;
 	struct rpc_timeout *	timeout;	/* optional timeout parameters */
 };
 
+struct xprt_class {
+	struct list_head	list;
+	int			ident;		/* XPRT_TRANSPORT identifier */
+	struct rpc_xprt *	(*setup)(struct xprt_create *);
+	struct module		*owner;
+	char			name[32];
+};
+
 /*
  * Transport operations used by ULPs
  */
@@ -212,7 +210,7 @@ void			xprt_set_timeout(struct rpc_timeo
 /*
  * Generic internal transport functions
  */
-struct rpc_xprt *	xprt_create_transport(struct rpc_xprtsock_create *args);
+struct rpc_xprt		*xprt_create_transport(struct xprt_create *args);
 void			xprt_connect(struct rpc_task *task);
 void			xprt_reserve(struct rpc_task *task);
 int			xprt_reserve_xprt(struct rpc_task *task);
@@ -235,6 +233,8 @@ static inline __be32 *xprt_skip_transpor
 /*
  * Transport switch helper functions
  */
+int			xprt_register_transport(struct xprt_class *type);
+int			xprt_unregister_transport(struct xprt_class *type);
 void			xprt_set_retrans_timeout_def(struct rpc_task *task);
 void			xprt_set_retrans_timeout_rtt(struct rpc_task *task);
 void			xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status);
@@ -248,14 +248,6 @@ void			xprt_release_rqst_cong(struct rpc
 void			xprt_disconnect(struct rpc_xprt *xprt);
 
 /*
- * Socket transport setup operations
- */
-struct rpc_xprt *	xs_setup_udp(struct rpc_xprtsock_create *args);
-struct rpc_xprt *	xs_setup_tcp(struct rpc_xprtsock_create *args);
-int			init_socket_xprt(void);
-void			cleanup_socket_xprt(void);
-
-/*
  * Reserved bit positions in xprt->state
  */
 #define XPRT_LOCKED		(0)
diff -puN /dev/null include/linux/sunrpc/xprtrdma.h
--- /dev/null
+++ a/include/linux/sunrpc/xprtrdma.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_SUNRPC_XPRTRDMA_H
+#define _LINUX_SUNRPC_XPRTRDMA_H
+
+/*
+ * RPC transport identifier for RDMA
+ */
+#define XPRT_TRANSPORT_RDMA	256
+
+/*
+ * rpcbind (v3+) RDMA netid.
+ */
+#define RPCBIND_NETID_RDMA	"rdma"
+
+/*
+ * Constants. Max RPC/NFS header is big enough to account for
+ * additional marshaling buffers passed down by Linux client.
+ *
+ * RDMA header is currently fixed max size, and is big enough for a
+ * fully-chunked NFS message (read chunks are the largest). Note only
+ * a single chunk type per message is supported currently.
+ */
+#define RPCRDMA_MIN_SLOT_TABLE	(2U)
+#define RPCRDMA_DEF_SLOT_TABLE	(32U)
+#define RPCRDMA_MAX_SLOT_TABLE	(256U)
+
+#define RPCRDMA_DEF_INLINE  (1024)	/* default inline max */
+
+#define RPCRDMA_INLINE_PAD_THRESH  (512)/* payload threshold to pad (bytes) */
+
+#define RDMA_RESOLVE_TIMEOUT	(5*HZ)	/* TBD 5 seconds */
+#define RDMA_CONNECT_RETRY_MAX	(2)	/* retries if no listener backlog */
+
+/* memory registration strategies */
+#define RPCRDMA_PERSISTENT_REGISTRATION (1)
+
+enum rpcrdma_memreg {
+	RPCRDMA_BOUNCEBUFFERS = 0,
+	RPCRDMA_REGISTER,
+	RPCRDMA_MEMWINDOWS,
+	RPCRDMA_MEMWINDOWS_ASYNC,
+	RPCRDMA_MTHCAFMR,
+	RPCRDMA_ALLPHYSICAL,
+	RPCRDMA_LAST
+};
+
+#endif /* _LINUX_SUNRPC_XPRTRDMA_H */
diff -puN /dev/null include/linux/sunrpc/xprtsock.h
--- /dev/null
+++ a/include/linux/sunrpc/xprtsock.h
@@ -0,0 +1,51 @@
+/*
+ *  linux/include/linux/sunrpc/xprtsock.h
+ *
+ *  Declarations for the RPC transport socket provider.
+ */
+
+#ifndef _LINUX_SUNRPC_XPRTSOCK_H
+#define _LINUX_SUNRPC_XPRTSOCK_H
+
+#ifdef __KERNEL__
+
+/*
+ * Socket transport setup operations
+ */
+struct rpc_xprt *xs_setup_udp(struct xprt_create *args);
+struct rpc_xprt *xs_setup_tcp(struct xprt_create *args);
+
+int		init_socket_xprt(void);
+void		cleanup_socket_xprt(void);
+
+/*
+ * RPC transport identifiers for UDP, TCP
+ *
+ * To preserve compatibility with the historical use of raw IP protocol
+ * id's for transport selection, these are specified with the previous
+ * values. No such restriction exists for new transports, except that
+ * they may not collide with these values (17 and 6, respectively).
+ */
+#define XPRT_TRANSPORT_UDP	IPPROTO_UDP
+#define XPRT_TRANSPORT_TCP	IPPROTO_TCP
+
+/*
+ * RPC slot table sizes for UDP, TCP transports
+ */
+extern unsigned int xprt_udp_slot_table_entries;
+extern unsigned int xprt_tcp_slot_table_entries;
+
+/*
+ * Parameters for choosing a free port
+ */
+extern unsigned int xprt_min_resvport;
+extern unsigned int xprt_max_resvport;
+
+#define RPC_MIN_RESVPORT	(1U)
+#define RPC_MAX_RESVPORT	(65535U)
+#define RPC_DEF_MIN_RESVPORT	(665U)
+#define RPC_DEF_MAX_RESVPORT	(1023U)
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_SUNRPC_XPRTSOCK_H */
diff -puN include/linux/writeback.h~git-nfs include/linux/writeback.h
--- a/include/linux/writeback.h~git-nfs
+++ a/include/linux/writeback.h
@@ -61,8 +61,6 @@ struct writeback_control {
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned for_writepages:1;	/* This is a writepages() call */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
-
-	void *fs_private;		/* For use by ->writepages() */
 };
 
 /*
diff -puN kernel/auditsc.c~git-nfs kernel/auditsc.c
--- a/kernel/auditsc.c~git-nfs
+++ a/kernel/auditsc.c
@@ -1746,6 +1746,7 @@ add_names:
 			context->names[idx].ino = (unsigned long)-1;
 	}
 }
+EXPORT_SYMBOL_GPL(__audit_inode_child);
 
 /**
  * auditsc_get_stamp - get local copies of audit_context values
diff -puN net/sunrpc/Makefile~git-nfs net/sunrpc/Makefile
--- a/net/sunrpc/Makefile~git-nfs
+++ a/net/sunrpc/Makefile
@@ -5,6 +5,7 @@
 
 obj-$(CONFIG_SUNRPC) += sunrpc.o
 obj-$(CONFIG_SUNRPC_GSS) += auth_gss/
+obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/
 
 sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
 	    auth.o auth_null.o auth_unix.o \
diff -puN net/sunrpc/auth_gss/gss_krb5_wrap.c~git-nfs net/sunrpc/auth_gss/gss_krb5_wrap.c
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c~git-nfs
+++ a/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -42,7 +42,7 @@ gss_krb5_remove_padding(struct xdr_buf *
 {
 	u8 *ptr;
 	u8 pad;
-	int len = buf->len;
+	size_t len = buf->len;
 
 	if (len <= buf->head[0].iov_len) {
 		pad = *(u8 *)(buf->head[0].iov_base + len - 1);
@@ -53,9 +53,9 @@ gss_krb5_remove_padding(struct xdr_buf *
 	} else
 		len -= buf->head[0].iov_len;
 	if (len <= buf->page_len) {
-		int last = (buf->page_base + len - 1)
+		unsigned int last = (buf->page_base + len - 1)
 					>>PAGE_CACHE_SHIFT;
-		int offset = (buf->page_base + len - 1)
+		unsigned int offset = (buf->page_base + len - 1)
 					& (PAGE_CACHE_SIZE - 1);
 		ptr = kmap_atomic(buf->pages[last], KM_USER0);
 		pad = *(ptr + offset);
diff -puN net/sunrpc/clnt.c~git-nfs net/sunrpc/clnt.c
--- a/net/sunrpc/clnt.c~git-nfs
+++ a/net/sunrpc/clnt.c
@@ -127,7 +127,14 @@ static struct rpc_clnt * rpc_new_client(
 	struct rpc_clnt		*clnt = NULL;
 	struct rpc_auth		*auth;
 	int err;
-	int len;
+	size_t len;
+
+	/* sanity check the name before trying to print it */
+	err = -EINVAL;
+	len = strlen(servname);
+	if (len > RPC_MAXNETNAMELEN)
+		goto out_no_rpciod;
+	len++;
 
 	dprintk("RPC:       creating %s client for %s (xprt %p)\n",
 			program->name, servname, xprt);
@@ -148,7 +155,6 @@ static struct rpc_clnt * rpc_new_client(
 	clnt->cl_parent = clnt;
 
 	clnt->cl_server = clnt->cl_inline_name;
-	len = strlen(servname) + 1;
 	if (len > sizeof(clnt->cl_inline_name)) {
 		char *buf = kmalloc(len, GFP_KERNEL);
 		if (buf != 0)
@@ -234,8 +240,8 @@ struct rpc_clnt *rpc_create(struct rpc_c
 {
 	struct rpc_xprt *xprt;
 	struct rpc_clnt *clnt;
-	struct rpc_xprtsock_create xprtargs = {
-		.proto = args->protocol,
+	struct xprt_create xprtargs = {
+		.ident = args->protocol,
 		.srcaddr = args->saddress,
 		.dstaddr = args->address,
 		.addrlen = args->addrsize,
@@ -253,7 +259,7 @@ struct rpc_clnt *rpc_create(struct rpc_c
 	 */
 	if (args->servername == NULL) {
 		struct sockaddr_in *addr =
-					(struct sockaddr_in *) &args->address;
+					(struct sockaddr_in *) args->address;
 		snprintf(servername, sizeof(servername), NIPQUAD_FMT,
 			NIPQUAD(addr->sin_addr.s_addr));
 		args->servername = servername;
@@ -269,9 +275,6 @@ struct rpc_clnt *rpc_create(struct rpc_c
 	if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT)
 		xprt->resvport = 0;
 
-	dprintk("RPC:       creating %s client for %s (xprt %p)\n",
-			args->program->name, args->servername, xprt);
-
 	clnt = rpc_new_client(xprt, args->servername, args->program,
 				args->version, args->authflavor);
 	if (IS_ERR(clnt))
@@ -439,7 +442,7 @@ rpc_release_client(struct rpc_clnt *clnt
  */
 struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old,
 				      struct rpc_program *program,
-				      int vers)
+				      u32 vers)
 {
 	struct rpc_clnt *clnt;
 	struct rpc_version *version;
@@ -843,8 +846,7 @@ call_allocate(struct rpc_task *task)
 	dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid);
 
 	if (RPC_IS_ASYNC(task) || !signalled()) {
-		xprt_release(task);
-		task->tk_action = call_reserve;
+		task->tk_action = call_allocate;
 		rpc_delay(task, HZ>>4);
 		return;
 	}
@@ -871,6 +873,7 @@ rpc_xdr_buf_init(struct xdr_buf *buf, vo
 	buf->head[0].iov_len = len;
 	buf->tail[0].iov_len = 0;
 	buf->page_len = 0;
+	buf->flags = 0;
 	buf->len = 0;
 	buf->buflen = len;
 }
@@ -937,7 +940,7 @@ call_bind(struct rpc_task *task)
 static void
 call_bind_status(struct rpc_task *task)
 {
-	int status = -EACCES;
+	int status = -EIO;
 
 	if (task->tk_status >= 0) {
 		dprint_status(task);
@@ -947,9 +950,20 @@ call_bind_status(struct rpc_task *task)
 	}
 
 	switch (task->tk_status) {
+	case -EAGAIN:
+		dprintk("RPC: %5u rpcbind waiting for another request "
+				"to finish\n", task->tk_pid);
+		/* avoid busy-waiting here -- could be a network outage. */
+		rpc_delay(task, 5*HZ);
+		goto retry_timeout;
 	case -EACCES:
 		dprintk("RPC: %5u remote rpcbind: RPC program/version "
 				"unavailable\n", task->tk_pid);
+		/* fail immediately if this is an RPC ping */
+		if (task->tk_msg.rpc_proc->p_proc == 0) {
+			status = -EOPNOTSUPP;
+			break;
+		}
 		rpc_delay(task, 3*HZ);
 		goto retry_timeout;
 	case -ETIMEDOUT:
@@ -957,6 +971,7 @@ call_bind_status(struct rpc_task *task)
 				task->tk_pid);
 		goto retry_timeout;
 	case -EPFNOSUPPORT:
+		/* server doesn't support any rpcbind version we know of */
 		dprintk("RPC: %5u remote rpcbind service unavailable\n",
 				task->tk_pid);
 		break;
@@ -969,7 +984,6 @@ call_bind_status(struct rpc_task *task)
 	default:
 		dprintk("RPC: %5u unrecognized rpcbind error (%d)\n",
 				task->tk_pid, -task->tk_status);
-		status = -EIO;
 	}
 
 	rpc_exit(task, status);
@@ -1257,7 +1271,6 @@ call_refresh(struct rpc_task *task)
 {
 	dprint_status(task);
 
-	xprt_release(task);	/* Must do to obtain new XID */
 	task->tk_action = call_refreshresult;
 	task->tk_status = 0;
 	task->tk_client->cl_stats->rpcauthrefresh++;
@@ -1375,6 +1388,8 @@ call_verify(struct rpc_task *task)
 			dprintk("RPC: %5u %s: retry stale creds\n",
 					task->tk_pid, __FUNCTION__);
 			rpcauth_invalcred(task);
+			/* Ensure we obtain a new XID! */
+			xprt_release(task);
 			task->tk_action = call_refresh;
 			goto out_retry;
 		case RPC_AUTH_BADCRED:
@@ -1523,13 +1538,18 @@ void rpc_show_tasks(void)
 		spin_lock(&clnt->cl_lock);
 		list_for_each_entry(t, &clnt->cl_tasks, tk_task) {
 			const char *rpc_waitq = "none";
+			int proc;
+
+			if (t->tk_msg.rpc_proc)
+				proc = t->tk_msg.rpc_proc->p_proc;
+			else
+				proc = -1;
 
 			if (RPC_IS_QUEUED(t))
 				rpc_waitq = rpc_qname(t->u.tk_wait.rpc_waitq);
 
 			printk("%5u %04d %04x %6d %8p %6d %8p %8ld %8s %8p %8p\n",
-				t->tk_pid,
-				(t->tk_msg.rpc_proc ? t->tk_msg.rpc_proc->p_proc : -1),
+				t->tk_pid, proc,
 				t->tk_flags, t->tk_status,
 				t->tk_client,
 				(t->tk_client ? t->tk_client->cl_prog : 0),
diff -puN net/sunrpc/rpc_pipe.c~git-nfs net/sunrpc/rpc_pipe.c
--- a/net/sunrpc/rpc_pipe.c~git-nfs
+++ a/net/sunrpc/rpc_pipe.c
@@ -14,7 +14,7 @@
 #include <linux/pagemap.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
-#include <linux/dnotify.h>
+#include <linux/fsnotify.h>
 #include <linux/kernel.h>
 
 #include <asm/ioctls.h>
@@ -329,6 +329,7 @@ rpc_show_info(struct seq_file *m, void *
 			clnt->cl_prog, clnt->cl_vers);
 	seq_printf(m, "address: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR));
 	seq_printf(m, "protocol: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_PROTO));
+	seq_printf(m, "port: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_PORT));
 	return 0;
 }
 
@@ -585,6 +586,7 @@ rpc_populate(struct dentry *parent,
 		if (S_ISDIR(mode))
 			inc_nlink(dir);
 		d_add(dentry, inode);
+		fsnotify_create(dir, dentry);
 	}
 	mutex_unlock(&dir->i_mutex);
 	return 0;
@@ -606,7 +608,7 @@ __rpc_mkdir(struct inode *dir, struct de
 	inode->i_ino = iunique(dir->i_sb, 100);
 	d_instantiate(dentry, inode);
 	inc_nlink(dir);
-	inode_dir_notify(dir, DN_CREATE);
+	fsnotify_mkdir(dir, dentry);
 	return 0;
 out_err:
 	printk(KERN_WARNING "%s: %s failed to allocate inode for dentry %s\n",
@@ -748,7 +750,7 @@ rpc_mkpipe(struct dentry *parent, const 
 	rpci->flags = flags;
 	rpci->ops = ops;
 	rpci->nkern_readwriters = 1;
-	inode_dir_notify(dir, DN_CREATE);
+	fsnotify_create(dir, dentry);
 	dget(dentry);
 out:
 	mutex_unlock(&dir->i_mutex);
diff -puN net/sunrpc/rpcb_clnt.c~git-nfs net/sunrpc/rpcb_clnt.c
--- a/net/sunrpc/rpcb_clnt.c~git-nfs
+++ a/net/sunrpc/rpcb_clnt.c
@@ -16,11 +16,14 @@
 
 #include <linux/types.h>
 #include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/in6.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/xprtsock.h>
 
 #ifdef RPC_DEBUG
 # define RPCDBG_FACILITY	RPCDBG_BIND
@@ -91,26 +94,6 @@ enum {
 #define RPCB_MAXADDRLEN		(128u)
 
 /*
- * r_netid
- *
- * Quoting RFC 3530, section 2.2:
- *
- * For TCP over IPv4 the value of r_netid is the string "tcp".  For UDP
- * over IPv4 the value of r_netid is the string "udp".
- *
- * ...
- *
- * For TCP over IPv6 the value of r_netid is the string "tcp6".  For UDP
- * over IPv6 the value of r_netid is the string "udp6".
- */
-#define RPCB_NETID_UDP	"\165\144\160"		/* "udp" */
-#define RPCB_NETID_TCP	"\164\143\160"		/* "tcp" */
-#define RPCB_NETID_UDP6	"\165\144\160\066"	/* "udp6" */
-#define RPCB_NETID_TCP6	"\164\143\160\066"	/* "tcp6" */
-
-#define RPCB_MAXNETIDLEN	(4u)
-
-/*
  * r_owner
  *
  * The "owner" is allowed to unset a service in the rpcbind database.
@@ -120,7 +103,7 @@ enum {
 #define RPCB_MAXOWNERLEN	sizeof(RPCB_OWNER_STRING)
 
 static void			rpcb_getport_done(struct rpc_task *, void *);
-extern struct rpc_program	rpcb_program;
+static struct rpc_program	rpcb_program;
 
 struct rpcbind_args {
 	struct rpc_xprt *	r_xprt;
@@ -137,10 +120,13 @@ struct rpcbind_args {
 static struct rpc_procinfo rpcb_procedures2[];
 static struct rpc_procinfo rpcb_procedures3[];
 
-static struct rpcb_info {
+struct rpcb_info {
 	int			rpc_vers;
 	struct rpc_procinfo *	rpc_proc;
-} rpcb_next_version[];
+};
+
+static struct rpcb_info rpcb_next_version[];
+static struct rpcb_info rpcb_next_version6[];
 
 static void rpcb_getport_prepare(struct rpc_task *task, void *calldata)
 {
@@ -190,7 +176,17 @@ static struct rpc_clnt *rpcb_create(char
 				   RPC_CLNT_CREATE_INTR),
 	};
 
-	((struct sockaddr_in *)srvaddr)->sin_port = htons(RPCBIND_PORT);
+	switch (srvaddr->sa_family) {
+	case AF_INET:
+		((struct sockaddr_in *)srvaddr)->sin_port = htons(RPCBIND_PORT);
+		break;
+	case AF_INET6:
+		((struct sockaddr_in6 *)srvaddr)->sin6_port = htons(RPCBIND_PORT);
+		break;
+	default:
+		return NULL;
+	}
+
 	if (!privileged)
 		args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
 	return rpc_create(&args);
@@ -234,7 +230,7 @@ int rpcb_register(u32 prog, u32 vers, in
 			prog, vers, prot, port);
 
 	rpcb_clnt = rpcb_create("localhost", (struct sockaddr *) &sin,
-					IPPROTO_UDP, 2, 1);
+					XPRT_TRANSPORT_UDP, 2, 1);
 	if (IS_ERR(rpcb_clnt))
 		return PTR_ERR(rpcb_clnt);
 
@@ -316,6 +312,7 @@ void rpcb_getport_async(struct rpc_task 
 	struct rpc_task	*child;
 	struct sockaddr addr;
 	int status;
+	struct rpcb_info *info;
 
 	dprintk("RPC: %5u %s(%s, %u, %u, %d)\n",
 		task->tk_pid, __FUNCTION__,
@@ -325,7 +322,7 @@ void rpcb_getport_async(struct rpc_task 
 	BUG_ON(clnt->cl_parent != clnt);
 
 	if (xprt_test_and_set_binding(xprt)) {
-		status = -EACCES;		/* tell caller to check again */
+		status = -EAGAIN;	/* tell caller to check again */
 		dprintk("RPC: %5u %s: waiting for another binder\n",
 			task->tk_pid, __FUNCTION__);
 		goto bailout_nowake;
@@ -343,18 +340,43 @@ void rpcb_getport_async(struct rpc_task 
 		goto bailout_nofree;
 	}
 
-	if (rpcb_next_version[xprt->bind_index].rpc_proc == NULL) {
+	rpc_peeraddr(clnt, (void *)&addr, sizeof(addr));
+
+	/* Don't ever use rpcbind v2 for AF_INET6 requests */
+	switch (addr.sa_family) {
+	case AF_INET:
+		info = rpcb_next_version;
+		break;
+	case AF_INET6:
+		info = rpcb_next_version6;
+		break;
+	default:
+		status = -EAFNOSUPPORT;
+		dprintk("RPC: %5u %s: bad address family\n",
+				task->tk_pid, __FUNCTION__);
+		goto bailout_nofree;
+	}
+	if (info[xprt->bind_index].rpc_proc == NULL) {
 		xprt->bind_index = 0;
-		status = -EACCES;	/* tell caller to try again later */
+		status = -EPFNOSUPPORT;
 		dprintk("RPC: %5u %s: no more getport versions available\n",
 			task->tk_pid, __FUNCTION__);
 		goto bailout_nofree;
 	}
-	bind_version = rpcb_next_version[xprt->bind_index].rpc_vers;
+	bind_version = info[xprt->bind_index].rpc_vers;
 
 	dprintk("RPC: %5u %s: trying rpcbind version %u\n",
 		task->tk_pid, __FUNCTION__, bind_version);
 
+	rpcb_clnt = rpcb_create(clnt->cl_server, &addr, xprt->prot,
+				bind_version, 0);
+	if (IS_ERR(rpcb_clnt)) {
+		status = PTR_ERR(rpcb_clnt);
+		dprintk("RPC: %5u %s: rpcb_create failed, error %ld\n",
+			task->tk_pid, __FUNCTION__, PTR_ERR(rpcb_clnt));
+		goto bailout_nofree;
+	}
+
 	map = kzalloc(sizeof(struct rpcbind_args), GFP_ATOMIC);
 	if (!map) {
 		status = -ENOMEM;
@@ -367,28 +389,19 @@ void rpcb_getport_async(struct rpc_task 
 	map->r_prot = xprt->prot;
 	map->r_port = 0;
 	map->r_xprt = xprt_get(xprt);
-	map->r_netid = (xprt->prot == IPPROTO_TCP) ? RPCB_NETID_TCP :
-						   RPCB_NETID_UDP;
-	memcpy(&map->r_addr, rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR),
-			sizeof(map->r_addr));
+	map->r_netid = rpc_peeraddr2str(clnt, RPC_DISPLAY_NETID);
+	memcpy(map->r_addr,
+	       rpc_peeraddr2str(rpcb_clnt, RPC_DISPLAY_UNIVERSAL_ADDR),
+	       sizeof(map->r_addr));
 	map->r_owner = RPCB_OWNER_STRING;	/* ignored for GETADDR */
 
-	rpc_peeraddr(clnt, (void *)&addr, sizeof(addr));
-	rpcb_clnt = rpcb_create(clnt->cl_server, &addr, xprt->prot, bind_version, 0);
-	if (IS_ERR(rpcb_clnt)) {
-		status = PTR_ERR(rpcb_clnt);
-		dprintk("RPC: %5u %s: rpcb_create failed, error %ld\n",
-			task->tk_pid, __FUNCTION__, PTR_ERR(rpcb_clnt));
-		goto bailout;
-	}
-
 	child = rpc_run_task(rpcb_clnt, RPC_TASK_ASYNC, &rpcb_getport_ops, map);
 	rpc_release_client(rpcb_clnt);
 	if (IS_ERR(child)) {
 		status = -EIO;
 		dprintk("RPC: %5u %s: rpc_run_task failed\n",
 			task->tk_pid, __FUNCTION__);
-		goto bailout_nofree;
+		goto bailout;
 	}
 	rpc_put_task(child);
 
@@ -403,6 +416,7 @@ bailout_nofree:
 bailout_nowake:
 	task->tk_status = status;
 }
+EXPORT_SYMBOL_GPL(rpcb_getport_async);
 
 /*
  * Rpcbind child task calls this callback via tk_exit.
@@ -413,6 +427,10 @@ static void rpcb_getport_done(struct rpc
 	struct rpc_xprt *xprt = map->r_xprt;
 	int status = child->tk_status;
 
+	/* Garbage reply: retry with a lesser rpcbind version */
+	if (status == -EIO)
+		status = -EPROTONOSUPPORT;
+
 	/* rpcbind server doesn't support this rpcbind protocol version */
 	if (status == -EPROTONOSUPPORT)
 		xprt->bind_index++;
@@ -490,16 +508,24 @@ static int rpcb_decode_getaddr(struct rp
 			       unsigned short *portp)
 {
 	char *addr;
-	int addr_len, c, i, f, first, val;
+	u32 addr_len;
+	int c, i, f, first, val;
 
 	*portp = 0;
-	addr_len = (unsigned int) ntohl(*p++);
-	if (addr_len > RPCB_MAXADDRLEN)			/* sanity */
-		return -EINVAL;
-
-	dprintk("RPC:       rpcb_decode_getaddr returned string: '%s'\n",
-			(char *) p);
+	addr_len = ntohl(*p++);
 
+	/*
+	 * Simple sanity check.  The smallest possible universal
+	 * address is an IPv4 address string containing 11 bytes.
+	 */
+	if (addr_len < 11 || addr_len > RPCB_MAXADDRLEN)
+		goto out_err;
+
+	/*
+	 * Start at the end and walk backwards until the first dot
+	 * is encountered.  When the second dot is found, we have
+	 * both parts of the port number.
+	 */
 	addr = (char *)p;
 	val = 0;
 	first = 1;
@@ -521,8 +547,19 @@ static int rpcb_decode_getaddr(struct rp
 		}
 	}
 
+	/*
+	 * Simple sanity check.  If we never saw a dot in the reply,
+	 * then this was probably just garbage.
+	 */
+	if (first)
+		goto out_err;
+
 	dprintk("RPC:       rpcb_decode_getaddr port=%u\n", *portp);
 	return 0;
+
+out_err:
+	dprintk("RPC:       rpcbind server returned malformed reply\n");
+	return -EIO;
 }
 
 #define RPCB_program_sz		(1u)
@@ -531,7 +568,7 @@ static int rpcb_decode_getaddr(struct rp
 #define RPCB_port_sz		(1u)
 #define RPCB_boolean_sz		(1u)
 
-#define RPCB_netid_sz		(1+XDR_QUADLEN(RPCB_MAXNETIDLEN))
+#define RPCB_netid_sz		(1+XDR_QUADLEN(RPCBIND_MAXNETIDLEN))
 #define RPCB_addr_sz		(1+XDR_QUADLEN(RPCB_MAXADDRLEN))
 #define RPCB_ownerstring_sz	(1+XDR_QUADLEN(RPCB_MAXOWNERLEN))
 
@@ -593,6 +630,14 @@ static struct rpcb_info rpcb_next_versio
 	{ 0, NULL },
 };
 
+static struct rpcb_info rpcb_next_version6[] = {
+#ifdef CONFIG_SUNRPC_BIND34
+	{ 4, &rpcb_procedures4[RPCBPROC_GETVERSADDR] },
+	{ 3, &rpcb_procedures3[RPCBPROC_GETADDR] },
+#endif
+	{ 0, NULL },
+};
+
 static struct rpc_version rpcb_version2 = {
 	.number		= 2,
 	.nrprocs	= RPCB_HIGHPROC_2,
@@ -621,7 +666,7 @@ static struct rpc_version *rpcb_version[
 
 static struct rpc_stat rpcb_stats;
 
-struct rpc_program rpcb_program = {
+static struct rpc_program rpcb_program = {
 	.name		= "rpcbind",
 	.number		= RPCBIND_PROGRAM,
 	.nrvers		= ARRAY_SIZE(rpcb_version),
diff -puN net/sunrpc/sched.c~git-nfs net/sunrpc/sched.c
--- a/net/sunrpc/sched.c~git-nfs
+++ a/net/sunrpc/sched.c
@@ -777,6 +777,7 @@ void *rpc_malloc(struct rpc_task *task, 
 			task->tk_pid, size, buf);
 	return &buf->data;
 }
+EXPORT_SYMBOL_GPL(rpc_malloc);
 
 /**
  * rpc_free - free buffer allocated via rpc_malloc
@@ -802,6 +803,7 @@ void rpc_free(void *buffer)
 	else
 		kfree(buf);
 }
+EXPORT_SYMBOL_GPL(rpc_free);
 
 /*
  * Creation and deletion of RPC task structures
diff -puN net/sunrpc/socklib.c~git-nfs net/sunrpc/socklib.c
--- a/net/sunrpc/socklib.c~git-nfs
+++ a/net/sunrpc/socklib.c
@@ -34,6 +34,7 @@ size_t xdr_skb_read_bits(struct xdr_skb_
 	desc->offset += len;
 	return len;
 }
+EXPORT_SYMBOL_GPL(xdr_skb_read_bits);
 
 /**
  * xdr_skb_read_and_csum_bits - copy and checksum from skb to buffer
@@ -137,6 +138,7 @@ copy_tail:
 out:
 	return copied;
 }
+EXPORT_SYMBOL_GPL(xdr_partial_copy_from_skb);
 
 /**
  * csum_partial_copy_to_xdr - checksum and copy data
@@ -179,3 +181,4 @@ no_checksum:
 		return -1;
 	return 0;
 }
+EXPORT_SYMBOL_GPL(csum_partial_copy_to_xdr);
diff -puN net/sunrpc/sunrpc_syms.c~git-nfs net/sunrpc/sunrpc_syms.c
--- a/net/sunrpc/sunrpc_syms.c~git-nfs
+++ a/net/sunrpc/sunrpc_syms.c
@@ -20,7 +20,7 @@
 #include <linux/sunrpc/auth.h>
 #include <linux/workqueue.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
-
+#include <linux/sunrpc/xprtsock.h>
 
 /* RPC scheduler */
 EXPORT_SYMBOL(rpc_execute);
diff -puN net/sunrpc/timer.c~git-nfs net/sunrpc/timer.c
--- a/net/sunrpc/timer.c~git-nfs
+++ a/net/sunrpc/timer.c
@@ -17,6 +17,7 @@
 
 #include <linux/types.h>
 #include <linux/unistd.h>
+#include <linux/module.h>
 
 #include <linux/sunrpc/clnt.h>
 
@@ -40,6 +41,7 @@ rpc_init_rtt(struct rpc_rtt *rt, unsigne
 		rt->ntimeouts[i] = 0;
 	}
 }
+EXPORT_SYMBOL_GPL(rpc_init_rtt);
 
 /*
  * NB: When computing the smoothed RTT and standard deviation,
@@ -75,6 +77,7 @@ rpc_update_rtt(struct rpc_rtt *rt, unsig
 	if (*sdrtt < RPC_RTO_MIN)
 		*sdrtt = RPC_RTO_MIN;
 }
+EXPORT_SYMBOL_GPL(rpc_update_rtt);
 
 /*
  * Estimate rto for an nfs rpc sent via. an unreliable datagram.
@@ -103,3 +106,4 @@ rpc_calc_rto(struct rpc_rtt *rt, unsigne
 
 	return res;
 }
+EXPORT_SYMBOL_GPL(rpc_calc_rto);
diff -puN net/sunrpc/xprt.c~git-nfs net/sunrpc/xprt.c
--- a/net/sunrpc/xprt.c~git-nfs
+++ a/net/sunrpc/xprt.c
@@ -62,6 +62,9 @@ static inline void	do_xprt_reserve(struc
 static void	xprt_connect_status(struct rpc_task *task);
 static int      __xprt_get_cong(struct rpc_xprt *, struct rpc_task *);
 
+static spinlock_t xprt_list_lock = SPIN_LOCK_UNLOCKED;
+static LIST_HEAD(xprt_list);
+
 /*
  * The transport code maintains an estimate on the maximum number of out-
  * standing RPC requests, using a smoothed version of the congestion
@@ -81,6 +84,78 @@ static int      __xprt_get_cong(struct r
 #define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd)
 
 /**
+ * xprt_register_transport - register a transport implementation
+ * @transport: transport to register
+ *
+ * If a transport implementation is loaded as a kernel module, it can
+ * call this interface to make itself known to the RPC client.
+ *
+ * Returns:
+ * 0:		transport successfully registered
+ * -EEXIST:	transport already registered
+ * -EINVAL:	transport module being unloaded
+ */
+int xprt_register_transport(struct xprt_class *transport)
+{
+	struct xprt_class *t;
+	int result;
+
+	result = -EEXIST;
+	spin_lock(&xprt_list_lock);
+	list_for_each_entry(t, &xprt_list, list) {
+		/* don't register the same transport class twice */
+		if (t->ident == transport->ident)
+			goto out;
+	}
+
+	result = -EINVAL;
+	if (try_module_get(THIS_MODULE)) {
+		list_add_tail(&transport->list, &xprt_list);
+		printk(KERN_INFO "RPC: Registered %s transport module.\n",
+			transport->name);
+		result = 0;
+	}
+
+out:
+	spin_unlock(&xprt_list_lock);
+	return result;
+}
+EXPORT_SYMBOL_GPL(xprt_register_transport);
+
+/**
+ * xprt_unregister_transport - unregister a transport implementation
+ * transport: transport to unregister
+ *
+ * Returns:
+ * 0:		transport successfully unregistered
+ * -ENOENT:	transport never registered
+ */
+int xprt_unregister_transport(struct xprt_class *transport)
+{
+	struct xprt_class *t;
+	int result;
+
+	result = 0;
+	spin_lock(&xprt_list_lock);
+	list_for_each_entry(t, &xprt_list, list) {
+		if (t == transport) {
+			printk(KERN_INFO
+				"RPC: Unregistered %s transport module.\n",
+				transport->name);
+			list_del_init(&transport->list);
+			module_put(THIS_MODULE);
+			goto out;
+		}
+	}
+	result = -ENOENT;
+
+out:
+	spin_unlock(&xprt_list_lock);
+	return result;
+}
+EXPORT_SYMBOL_GPL(xprt_unregister_transport);
+
+/**
  * xprt_reserve_xprt - serialize write access to transports
  * @task: task that is requesting access to the transport
  *
@@ -118,6 +193,7 @@ out_sleep:
 		rpc_sleep_on(&xprt->sending, task, NULL, NULL);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(xprt_reserve_xprt);
 
 static void xprt_clear_locked(struct rpc_xprt *xprt)
 {
@@ -167,6 +243,7 @@ out_sleep:
 		rpc_sleep_on(&xprt->sending, task, NULL, NULL);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong);
 
 static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task)
 {
@@ -246,6 +323,7 @@ void xprt_release_xprt(struct rpc_xprt *
 		__xprt_lock_write_next(xprt);
 	}
 }
+EXPORT_SYMBOL_GPL(xprt_release_xprt);
 
 /**
  * xprt_release_xprt_cong - allow other requests to use a transport
@@ -262,6 +340,7 @@ void xprt_release_xprt_cong(struct rpc_x
 		__xprt_lock_write_next_cong(xprt);
 	}
 }
+EXPORT_SYMBOL_GPL(xprt_release_xprt_cong);
 
 static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task)
 {
@@ -314,6 +393,7 @@ void xprt_release_rqst_cong(struct rpc_t
 {
 	__xprt_put_cong(task->tk_xprt, task->tk_rqstp);
 }
+EXPORT_SYMBOL_GPL(xprt_release_rqst_cong);
 
 /**
  * xprt_adjust_cwnd - adjust transport congestion window
@@ -345,6 +425,7 @@ void xprt_adjust_cwnd(struct rpc_task *t
 	xprt->cwnd = cwnd;
 	__xprt_put_cong(xprt, req);
 }
+EXPORT_SYMBOL_GPL(xprt_adjust_cwnd);
 
 /**
  * xprt_wake_pending_tasks - wake all tasks on a transport's pending queue
@@ -359,6 +440,7 @@ void xprt_wake_pending_tasks(struct rpc_
 	else
 		rpc_wake_up(&xprt->pending);
 }
+EXPORT_SYMBOL_GPL(xprt_wake_pending_tasks);
 
 /**
  * xprt_wait_for_buffer_space - wait for transport output buffer to clear
@@ -373,6 +455,7 @@ void xprt_wait_for_buffer_space(struct r
 	task->tk_timeout = req->rq_timeout;
 	rpc_sleep_on(&xprt->pending, task, NULL, NULL);
 }
+EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space);
 
 /**
  * xprt_write_space - wake the task waiting for transport output buffer space
@@ -393,6 +476,7 @@ void xprt_write_space(struct rpc_xprt *x
 	}
 	spin_unlock_bh(&xprt->transport_lock);
 }
+EXPORT_SYMBOL_GPL(xprt_write_space);
 
 /**
  * xprt_set_retrans_timeout_def - set a request's retransmit timeout
@@ -406,6 +490,7 @@ void xprt_set_retrans_timeout_def(struct
 {
 	task->tk_timeout = task->tk_rqstp->rq_timeout;
 }
+EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_def);
 
 /*
  * xprt_set_retrans_timeout_rtt - set a request's retransmit timeout
@@ -425,6 +510,7 @@ void xprt_set_retrans_timeout_rtt(struct
 	if (task->tk_timeout > max_timeout || task->tk_timeout == 0)
 		task->tk_timeout = max_timeout;
 }
+EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_rtt);
 
 static void xprt_reset_majortimeo(struct rpc_rqst *req)
 {
@@ -500,6 +586,7 @@ void xprt_disconnect(struct rpc_xprt *xp
 	xprt_wake_pending_tasks(xprt, -ENOTCONN);
 	spin_unlock_bh(&xprt->transport_lock);
 }
+EXPORT_SYMBOL_GPL(xprt_disconnect);
 
 static void
 xprt_init_autodisconnect(unsigned long data)
@@ -610,6 +697,7 @@ struct rpc_rqst *xprt_lookup_rqst(struct
 	xprt->stat.bad_xids++;
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(xprt_lookup_rqst);
 
 /**
  * xprt_update_rtt - update an RPC client's RTT state after receiving a reply
@@ -629,6 +717,7 @@ void xprt_update_rtt(struct rpc_task *ta
 		rpc_set_timeo(rtt, timer, req->rq_ntrans - 1);
 	}
 }
+EXPORT_SYMBOL_GPL(xprt_update_rtt);
 
 /**
  * xprt_complete_rqst - called when reply processing is complete
@@ -653,6 +742,7 @@ void xprt_complete_rqst(struct rpc_task 
 	req->rq_received = req->rq_private_buf.len = copied;
 	rpc_wake_up_task(task);
 }
+EXPORT_SYMBOL_GPL(xprt_complete_rqst);
 
 static void xprt_timer(struct rpc_task *task)
 {
@@ -889,23 +979,25 @@ void xprt_set_timeout(struct rpc_timeout
  * @args: rpc transport creation arguments
  *
  */
-struct rpc_xprt *xprt_create_transport(struct rpc_xprtsock_create *args)
+struct rpc_xprt *xprt_create_transport(struct xprt_create *args)
 {
 	struct rpc_xprt	*xprt;
 	struct rpc_rqst	*req;
+	struct xprt_class *t;
 
-	switch (args->proto) {
-	case IPPROTO_UDP:
-		xprt = xs_setup_udp(args);
-		break;
-	case IPPROTO_TCP:
-		xprt = xs_setup_tcp(args);
-		break;
-	default:
-		printk(KERN_ERR "RPC: unrecognized transport protocol: %d\n",
-				args->proto);
-		return ERR_PTR(-EIO);
+	spin_lock(&xprt_list_lock);
+	list_for_each_entry(t, &xprt_list, list) {
+		if (t->ident == args->ident) {
+			spin_unlock(&xprt_list_lock);
+			goto found;
+		}
 	}
+	spin_unlock(&xprt_list_lock);
+	printk(KERN_ERR "RPC: transport (%d) not supported\n", args->ident);
+	return ERR_PTR(-EIO);
+
+found:
+	xprt = t->setup(args);
 	if (IS_ERR(xprt)) {
 		dprintk("RPC:       xprt_create_transport: failed, %ld\n",
 				-PTR_ERR(xprt));
diff -puN /dev/null net/sunrpc/xprtrdma/Makefile
--- /dev/null
+++ a/net/sunrpc/xprtrdma/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o
+
+xprtrdma-y := transport.o rpc_rdma.o verbs.o
diff -puN /dev/null net/sunrpc/xprtrdma/rpc_rdma.c
--- /dev/null
+++ a/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -0,0 +1,868 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * rpc_rdma.c
+ *
+ * This file contains the guts of the RPC RDMA protocol, and
+ * does marshaling/unmarshaling, etc. It is also where interfacing
+ * to the Linux RPC framework lives.
+ */
+
+#include "xprt_rdma.h"
+
+#include <linux/highmem.h>
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+enum rpcrdma_chunktype {
+	rpcrdma_noch = 0,
+	rpcrdma_readch,
+	rpcrdma_areadch,
+	rpcrdma_writech,
+	rpcrdma_replych
+};
+
+#ifdef RPC_DEBUG
+static const char transfertypes[][12] = {
+	"pure inline",	/* no chunks */
+	" read chunk",	/* some argument via rdma read */
+	"*read chunk",	/* entire request via rdma read */
+	"write chunk",	/* some result via rdma write */
+	"reply chunk"	/* entire reply via rdma write */
+};
+#endif
+
+/*
+ * Chunk assembly from upper layer xdr_buf.
+ *
+ * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk
+ * elements. Segments are then coalesced when registered, if possible
+ * within the selected memreg mode.
+ *
+ * Note, this routine is never called if the connection's memory
+ * registration strategy is 0 (bounce buffers).
+ */
+
+static int
+rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, int pos,
+	enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs)
+{
+	int len, n = 0, p;
+
+	if (pos == 0 && xdrbuf->head[0].iov_len) {
+		seg[n].mr_page = NULL;
+		seg[n].mr_offset = xdrbuf->head[0].iov_base;
+		seg[n].mr_len = xdrbuf->head[0].iov_len;
+		pos += xdrbuf->head[0].iov_len;
+		++n;
+	}
+
+	if (xdrbuf->page_len && (xdrbuf->pages[0] != NULL)) {
+		if (n == nsegs)
+			return 0;
+		seg[n].mr_page = xdrbuf->pages[0];
+		seg[n].mr_offset = (void *)(unsigned long) xdrbuf->page_base;
+		seg[n].mr_len = min_t(u32,
+			PAGE_SIZE - xdrbuf->page_base, xdrbuf->page_len);
+		len = xdrbuf->page_len - seg[n].mr_len;
+		pos += len;
+		++n;
+		p = 1;
+		while (len > 0) {
+			if (n == nsegs)
+				return 0;
+			seg[n].mr_page = xdrbuf->pages[p];
+			seg[n].mr_offset = NULL;
+			seg[n].mr_len = min_t(u32, PAGE_SIZE, len);
+			len -= seg[n].mr_len;
+			++n;
+			++p;
+		}
+	}
+
+	if (pos < xdrbuf->len && xdrbuf->tail[0].iov_len) {
+		if (n == nsegs)
+			return 0;
+		seg[n].mr_page = NULL;
+		seg[n].mr_offset = xdrbuf->tail[0].iov_base;
+		seg[n].mr_len = xdrbuf->tail[0].iov_len;
+		pos += xdrbuf->tail[0].iov_len;
+		++n;
+	}
+
+	if (pos < xdrbuf->len)
+		dprintk("RPC:       %s: marshaled only %d of %d\n",
+				__func__, pos, xdrbuf->len);
+
+	return n;
+}
+
+/*
+ * Create read/write chunk lists, and reply chunks, for RDMA
+ *
+ *   Assume check against THRESHOLD has been done, and chunks are required.
+ *   Assume only encoding one list entry for read|write chunks. The NFSv3
+ *     protocol is simple enough to allow this as it only has a single "bulk
+ *     result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The
+ *     RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.)
+ *
+ * When used for a single reply chunk (which is a special write
+ * chunk used for the entire reply, rather than just the data), it
+ * is used primarily for READDIR and READLINK which would otherwise
+ * be severely size-limited by a small rdma inline read max. The server
+ * response will come back as an RDMA Write, followed by a message
+ * of type RDMA_NOMSG carrying the xid and length. As a result, reply
+ * chunks do not provide data alignment, however they do not require
+ * "fixup" (moving the response to the upper layer buffer) either.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
+ *  Read chunklist (a linked list):
+ *   N elements, position P (same P for all chunks of same arg!):
+ *    1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
+ *
+ *  Write chunklist (a list of (one) counted array):
+ *   N elements:
+ *    1 - N - HLOO - HLOO - ... - HLOO - 0
+ *
+ *  Reply chunk (a counted array):
+ *   N elements:
+ *    1 - N - HLOO - HLOO - ... - HLOO
+ */
+
+static unsigned int
+rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
+		struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
+{
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_task->tk_xprt);
+	int nsegs, nchunks = 0;
+	int pos;
+	struct rpcrdma_mr_seg *seg = req->rl_segments;
+	struct rpcrdma_read_chunk *cur_rchunk = NULL;
+	struct rpcrdma_write_array *warray = NULL;
+	struct rpcrdma_write_chunk *cur_wchunk = NULL;
+	u32 *iptr = headerp->rm_body.rm_chunks;
+
+	if (type == rpcrdma_readch || type == rpcrdma_areadch) {
+		/* a read chunk - server will RDMA Read our memory */
+		cur_rchunk = (struct rpcrdma_read_chunk *) iptr;
+	} else {
+		/* a write or reply chunk - server will RDMA Write our memory */
+		*iptr++ = xdr_zero;	/* encode a NULL read chunk list */
+		if (type == rpcrdma_replych)
+			*iptr++ = xdr_zero;	/* a NULL write chunk list */
+		warray = (struct rpcrdma_write_array *) iptr;
+		cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1);
+	}
+
+	if (type == rpcrdma_replych || type == rpcrdma_areadch)
+		pos = 0;
+	else
+		pos = target->head[0].iov_len;
+
+	nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
+	if (nsegs == 0)
+		return 0;
+
+	do {
+		/* bind/register the memory, then build chunk from result. */
+		int n = rpcrdma_register_external(seg, nsegs,
+						cur_wchunk != NULL, r_xprt);
+		if (n <= 0)
+			goto out;
+		if (cur_rchunk) {	/* read */
+			cur_rchunk->rc_discrim = xdr_one;
+			/* all read chunks have the same "position" */
+			cur_rchunk->rc_position = htonl(pos);
+			cur_rchunk->rc_target.rs_handle = htonl(seg->mr_rkey);
+			cur_rchunk->rc_target.rs_length = htonl(seg->mr_len);
+			xdr_encode_hyper(
+					(u32 *)&cur_rchunk->rc_target.rs_offset,
+					seg->mr_base);
+			dprintk("RPC:       %s: read chunk "
+				"elem %d@0x%llx:0x%x pos %d (%s)\n", __func__,
+				seg->mr_len, seg->mr_base, seg->mr_rkey, pos,
+				n < nsegs ? "more" : "last");
+			cur_rchunk++;
+			r_xprt->rx_stats.read_chunk_count++;
+		} else {		/* write/reply */
+			cur_wchunk->wc_target.rs_handle = htonl(seg->mr_rkey);
+			cur_wchunk->wc_target.rs_length = htonl(seg->mr_len);
+			xdr_encode_hyper(
+					(u32 *)&cur_wchunk->wc_target.rs_offset,
+					seg->mr_base);
+			dprintk("RPC:       %s: %s chunk "
+				"elem %d@0x%llx:0x%x (%s)\n", __func__,
+				(type == rpcrdma_replych) ? "reply" : "write",
+				seg->mr_len, seg->mr_base, seg->mr_rkey,
+				n < nsegs ? "more" : "last");
+			cur_wchunk++;
+			if (type == rpcrdma_replych)
+				r_xprt->rx_stats.reply_chunk_count++;
+			else
+				r_xprt->rx_stats.write_chunk_count++;
+			r_xprt->rx_stats.total_rdma_request += seg->mr_len;
+		}
+		nchunks++;
+		seg   += n;
+		nsegs -= n;
+	} while (nsegs);
+
+	/* success. all failures return above */
+	req->rl_nchunks = nchunks;
+
+	BUG_ON(nchunks == 0);
+
+	/*
+	 * finish off header. If write, marshal discrim and nchunks.
+	 */
+	if (cur_rchunk) {
+		iptr = (u32 *) cur_rchunk;
+		*iptr++ = xdr_zero;	/* finish the read chunk list */
+		*iptr++ = xdr_zero;	/* encode a NULL write chunk list */
+		*iptr++ = xdr_zero;	/* encode a NULL reply chunk */
+	} else {
+		warray->wc_discrim = xdr_one;
+		warray->wc_nchunks = htonl(nchunks);
+		iptr = (u32 *) cur_wchunk;
+		if (type == rpcrdma_writech) {
+			*iptr++ = xdr_zero; /* finish the write chunk list */
+			*iptr++ = xdr_zero; /* encode a NULL reply chunk */
+		}
+	}
+
+	/*
+	 * Return header size.
+	 */
+	return (unsigned char *)iptr - (unsigned char *)headerp;
+
+out:
+	for (pos = 0; nchunks--;)
+		pos += rpcrdma_deregister_external(
+				&req->rl_segments[pos], r_xprt, NULL);
+	return 0;
+}
+
+/*
+ * Copy write data inline.
+ * This function is used for "small" requests. Data which is passed
+ * to RPC via iovecs (or page list) is copied directly into the
+ * pre-registered memory buffer for this request. For small amounts
+ * of data, this is efficient. The cutoff value is tunable.
+ */
+static int
+rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
+{
+	int i, npages, curlen;
+	int copy_len;
+	unsigned char *srcp, *destp;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+
+	destp = rqst->rq_svec[0].iov_base;
+	curlen = rqst->rq_svec[0].iov_len;
+	destp += curlen;
+	/*
+	 * Do optional padding where it makes sense. Alignment of write
+	 * payload can help the server, if our setting is accurate.
+	 */
+	pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/);
+	if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH)
+		pad = 0;	/* don't pad this request */
+
+	dprintk("RPC:       %s: pad %d destp 0x%p len %d hdrlen %d\n",
+		__func__, pad, destp, rqst->rq_slen, curlen);
+
+	copy_len = rqst->rq_snd_buf.page_len;
+	r_xprt->rx_stats.pullup_copy_count += copy_len;
+	npages = PAGE_ALIGN(rqst->rq_snd_buf.page_base+copy_len) >> PAGE_SHIFT;
+	for (i = 0; copy_len && i < npages; i++) {
+		if (i == 0)
+			curlen = PAGE_SIZE - rqst->rq_snd_buf.page_base;
+		else
+			curlen = PAGE_SIZE;
+		if (curlen > copy_len)
+			curlen = copy_len;
+		dprintk("RPC:       %s: page %d destp 0x%p len %d curlen %d\n",
+			__func__, i, destp, copy_len, curlen);
+		srcp = kmap_atomic(rqst->rq_snd_buf.pages[i],
+					KM_SKB_SUNRPC_DATA);
+		if (i == 0)
+			memcpy(destp, srcp+rqst->rq_snd_buf.page_base, curlen);
+		else
+			memcpy(destp, srcp, curlen);
+		kunmap_atomic(srcp, KM_SKB_SUNRPC_DATA);
+		rqst->rq_svec[0].iov_len += curlen;
+		destp += curlen;
+		copy_len -= curlen;
+	}
+	if (rqst->rq_snd_buf.tail[0].iov_len) {
+		curlen = rqst->rq_snd_buf.tail[0].iov_len;
+		if (destp != rqst->rq_snd_buf.tail[0].iov_base) {
+			memcpy(destp,
+				rqst->rq_snd_buf.tail[0].iov_base, curlen);
+			r_xprt->rx_stats.pullup_copy_count += curlen;
+		}
+		dprintk("RPC:       %s: tail destp 0x%p len %d curlen %d\n",
+			__func__, destp, copy_len, curlen);
+		rqst->rq_svec[0].iov_len += curlen;
+	}
+	/* header now contains entire send message */
+	return pad;
+}
+
+/*
+ * Marshal a request: the primary job of this routine is to choose
+ * the transfer modes. See comments below.
+ *
+ * Uses multiple RDMA IOVs for a request:
+ *  [0] -- RPC RDMA header, which uses memory from the *start* of the
+ *         preregistered buffer that already holds the RPC data in
+ *         its middle.
+ *  [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
+ *  [2] -- optional padding.
+ *  [3] -- if padded, header only in [1] and data here.
+ */
+
+int
+rpcrdma_marshal_req(struct rpc_rqst *rqst)
+{
+	struct rpc_xprt *xprt = rqst->rq_task->tk_xprt;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	char *base;
+	size_t hdrlen, rpclen, padlen;
+	enum rpcrdma_chunktype rtype, wtype;
+	struct rpcrdma_msg *headerp;
+
+	/*
+	 * rpclen gets amount of data in first buffer, which is the
+	 * pre-registered buffer.
+	 */
+	base = rqst->rq_svec[0].iov_base;
+	rpclen = rqst->rq_svec[0].iov_len;
+
+	/* build RDMA header in private area at front */
+	headerp = (struct rpcrdma_msg *) req->rl_base;
+	/* don't htonl XID, it's already done in request */
+	headerp->rm_xid = rqst->rq_xid;
+	headerp->rm_vers = xdr_one;
+	headerp->rm_credit = htonl(r_xprt->rx_buf.rb_max_requests);
+	headerp->rm_type = __constant_htonl(RDMA_MSG);
+
+	/*
+	 * Chunks needed for results?
+	 *
+	 * o If the expected result is under the inline threshold, all ops
+	 *   return as inline (but see later).
+	 * o Large non-read ops return as a single reply chunk.
+	 * o Large read ops return data as write chunk(s), header as inline.
+	 *
+	 * Note: the NFS code sending down multiple result segments implies
+	 * the op is one of read, readdir[plus], readlink or NFSv4 getacl.
+	 */
+
+	/*
+	 * This code can handle read chunks, write chunks OR reply
+	 * chunks -- only one type. If the request is too big to fit
+	 * inline, then we will choose read chunks. If the request is
+	 * a READ, then use write chunks to separate the file data
+	 * into pages; otherwise use reply chunks.
+	 */
+	if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
+		wtype = rpcrdma_noch;
+	else if (rqst->rq_rcv_buf.page_len == 0)
+		wtype = rpcrdma_replych;
+	else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+		wtype = rpcrdma_writech;
+	else
+		wtype = rpcrdma_replych;
+
+	/*
+	 * Chunks needed for arguments?
+	 *
+	 * o If the total request is under the inline threshold, all ops
+	 *   are sent as inline.
+	 * o Large non-write ops are sent with the entire message as a
+	 *   single read chunk (protocol 0-position special case).
+	 * o Large write ops transmit data as read chunk(s), header as
+	 *   inline.
+	 *
+	 * Note: the NFS code sending down multiple argument segments
+	 * implies the op is a write.
+	 * TBD check NFSv4 setacl
+	 */
+	if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
+		rtype = rpcrdma_noch;
+	else if (rqst->rq_snd_buf.page_len == 0)
+		rtype = rpcrdma_areadch;
+	else
+		rtype = rpcrdma_readch;
+
+	/* The following simplification is not true forever */
+	if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
+		wtype = rpcrdma_noch;
+	BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch);
+
+	if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS &&
+	    (rtype != rpcrdma_noch || wtype != rpcrdma_noch)) {
+		/* forced to "pure inline"? */
+		dprintk("RPC:       %s: too much data (%d/%d) for inline\n",
+			__func__, rqst->rq_rcv_buf.len, rqst->rq_snd_buf.len);
+		return -1;
+	}
+
+	hdrlen = 28; /*sizeof *headerp;*/
+	padlen = 0;
+
+	/*
+	 * Pull up any extra send data into the preregistered buffer.
+	 * When padding is in use and applies to the transfer, insert
+	 * it and change the message type.
+	 */
+	if (rtype == rpcrdma_noch) {
+
+		padlen = rpcrdma_inline_pullup(rqst,
+						RPCRDMA_INLINE_PAD_VALUE(rqst));
+
+		if (padlen) {
+			headerp->rm_type = __constant_htonl(RDMA_MSGP);
+			headerp->rm_body.rm_padded.rm_align =
+				htonl(RPCRDMA_INLINE_PAD_VALUE(rqst));
+			headerp->rm_body.rm_padded.rm_thresh =
+				__constant_htonl(RPCRDMA_INLINE_PAD_THRESH);
+			headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero;
+			headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
+			headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
+			hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
+			BUG_ON(wtype != rpcrdma_noch);
+
+		} else {
+			headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
+			headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
+			headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
+			/* new length after pullup */
+			rpclen = rqst->rq_svec[0].iov_len;
+			/*
+			 * Currently we try to not actually use read inline.
+			 * Reply chunks have the desirable property that
+			 * they land, packed, directly in the target buffers
+			 * without headers, so they require no fixup. The
+			 * additional RDMA Write op sends the same amount
+			 * of data, streams on-the-wire and adds no overhead
+			 * on receive. Therefore, we request a reply chunk
+			 * for non-writes wherever feasible and efficient.
+			 */
+			if (wtype == rpcrdma_noch &&
+			    r_xprt->rx_ia.ri_memreg_strategy > RPCRDMA_REGISTER)
+				wtype = rpcrdma_replych;
+		}
+	}
+
+	/*
+	 * Marshal chunks. This routine will return the header length
+	 * consumed by marshaling.
+	 */
+	if (rtype != rpcrdma_noch) {
+		hdrlen = rpcrdma_create_chunks(rqst,
+					&rqst->rq_snd_buf, headerp, rtype);
+		wtype = rtype;	/* simplify dprintk */
+
+	} else if (wtype != rpcrdma_noch) {
+		hdrlen = rpcrdma_create_chunks(rqst,
+					&rqst->rq_rcv_buf, headerp, wtype);
+	}
+
+	if (hdrlen == 0)
+		return -1;
+
+	dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd padlen %zd\n"
+		"                   headerp 0x%p base 0x%p lkey 0x%x\n",
+		__func__, transfertypes[wtype], hdrlen, rpclen, padlen,
+		headerp, base, req->rl_iov.lkey);
+
+	/*
+	 * initialize send_iov's - normally only two: rdma chunk header and
+	 * single preregistered RPC header buffer, but if padding is present,
+	 * then use a preregistered (and zeroed) pad buffer between the RPC
+	 * header and any write data. In all non-rdma cases, any following
+	 * data has been copied into the RPC header buffer.
+	 */
+	req->rl_send_iov[0].addr = req->rl_iov.addr;
+	req->rl_send_iov[0].length = hdrlen;
+	req->rl_send_iov[0].lkey = req->rl_iov.lkey;
+
+	req->rl_send_iov[1].addr = req->rl_iov.addr + (base - req->rl_base);
+	req->rl_send_iov[1].length = rpclen;
+	req->rl_send_iov[1].lkey = req->rl_iov.lkey;
+
+	req->rl_niovs = 2;
+
+	if (padlen) {
+		struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+
+		req->rl_send_iov[2].addr = ep->rep_pad.addr;
+		req->rl_send_iov[2].length = padlen;
+		req->rl_send_iov[2].lkey = ep->rep_pad.lkey;
+
+		req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen;
+		req->rl_send_iov[3].length = rqst->rq_slen - rpclen;
+		req->rl_send_iov[3].lkey = req->rl_iov.lkey;
+
+		req->rl_niovs = 4;
+	}
+
+	return 0;
+}
+
+/*
+ * Chase down a received write or reply chunklist to get length
+ * RDMA'd by server. See map at rpcrdma_create_chunks()! :-)
+ */
+static int
+rpcrdma_count_chunks(struct rpcrdma_rep *rep, int max, int wrchunk, u32 **iptrp)
+{
+	unsigned int i, total_len;
+	struct rpcrdma_write_chunk *cur_wchunk;
+
+	i = ntohl(**iptrp);	/* get array count */
+	if (i > max)
+		return -1;
+	cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
+	total_len = 0;
+	while (i--) {
+		struct rpcrdma_segment *seg = &cur_wchunk->wc_target;
+		ifdebug(FACILITY) {
+			u64 off;
+			xdr_decode_hyper((u32 *)&seg->rs_offset, &off);
+			dprintk("RPC:       %s: chunk %d@0x%llx:0x%x\n",
+				__func__,
+				ntohl(seg->rs_length),
+				off,
+				ntohl(seg->rs_handle));
+		}
+		total_len += ntohl(seg->rs_length);
+		++cur_wchunk;
+	}
+	/* check and adjust for properly terminated write chunk */
+	if (wrchunk) {
+		u32 *w = (u32 *) cur_wchunk;
+		if (*w++ != xdr_zero)
+			return -1;
+		cur_wchunk = (struct rpcrdma_write_chunk *) w;
+	}
+	if ((char *) cur_wchunk > rep->rr_base + rep->rr_len)
+		return -1;
+
+	*iptrp = (u32 *) cur_wchunk;
+	return total_len;
+}
+
+/*
+ * Scatter inline received data back into provided iov's.
+ */
+static void
+rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len)
+{
+	int i, npages, curlen, olen;
+	char *destp;
+
+	curlen = rqst->rq_rcv_buf.head[0].iov_len;
+	if (curlen > copy_len) {	/* write chunk header fixup */
+		curlen = copy_len;
+		rqst->rq_rcv_buf.head[0].iov_len = curlen;
+	}
+
+	dprintk("RPC:       %s: srcp 0x%p len %d hdrlen %d\n",
+		__func__, srcp, copy_len, curlen);
+
+	/* Shift pointer for first receive segment only */
+	rqst->rq_rcv_buf.head[0].iov_base = srcp;
+	srcp += curlen;
+	copy_len -= curlen;
+
+	olen = copy_len;
+	i = 0;
+	rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen;
+	if (copy_len && rqst->rq_rcv_buf.page_len) {
+		npages = PAGE_ALIGN(rqst->rq_rcv_buf.page_base +
+			rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT;
+		for (; i < npages; i++) {
+			if (i == 0)
+				curlen = PAGE_SIZE - rqst->rq_rcv_buf.page_base;
+			else
+				curlen = PAGE_SIZE;
+			if (curlen > copy_len)
+				curlen = copy_len;
+			dprintk("RPC:       %s: page %d"
+				" srcp 0x%p len %d curlen %d\n",
+				__func__, i, srcp, copy_len, curlen);
+			destp = kmap_atomic(rqst->rq_rcv_buf.pages[i],
+						KM_SKB_SUNRPC_DATA);
+			if (i == 0)
+				memcpy(destp + rqst->rq_rcv_buf.page_base,
+						srcp, curlen);
+			else
+				memcpy(destp, srcp, curlen);
+			flush_dcache_page(rqst->rq_rcv_buf.pages[i]);
+			kunmap_atomic(destp, KM_SKB_SUNRPC_DATA);
+			srcp += curlen;
+			copy_len -= curlen;
+			if (copy_len == 0)
+				break;
+		}
+		rqst->rq_rcv_buf.page_len = olen - copy_len;
+	} else
+		rqst->rq_rcv_buf.page_len = 0;
+
+	if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
+		curlen = copy_len;
+		if (curlen > rqst->rq_rcv_buf.tail[0].iov_len)
+			curlen = rqst->rq_rcv_buf.tail[0].iov_len;
+		if (rqst->rq_rcv_buf.tail[0].iov_base != srcp)
+			memcpy(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen);
+		dprintk("RPC:       %s: tail srcp 0x%p len %d curlen %d\n",
+			__func__, srcp, copy_len, curlen);
+		rqst->rq_rcv_buf.tail[0].iov_len = curlen;
+		copy_len -= curlen; ++i;
+	} else
+		rqst->rq_rcv_buf.tail[0].iov_len = 0;
+
+	if (copy_len)
+		dprintk("RPC:       %s: %d bytes in"
+			" %d extra segments (%d lost)\n",
+			__func__, olen, i, copy_len);
+
+	/* TBD avoid a warning from call_decode() */
+	rqst->rq_private_buf = rqst->rq_rcv_buf;
+}
+
+/*
+ * This function is called when an async event is posted to
+ * the connection which changes the connection state. All it
+ * does at this point is mark the connection up/down, the rpc
+ * timers do the rest.
+ */
+void
+rpcrdma_conn_func(struct rpcrdma_ep *ep)
+{
+	struct rpc_xprt *xprt = ep->rep_xprt;
+
+	spin_lock_bh(&xprt->transport_lock);
+	if (ep->rep_connected > 0) {
+		if (!xprt_test_and_set_connected(xprt))
+			xprt_wake_pending_tasks(xprt, 0);
+	} else {
+		if (xprt_test_and_clear_connected(xprt))
+			xprt_wake_pending_tasks(xprt, ep->rep_connected);
+	}
+	spin_unlock_bh(&xprt->transport_lock);
+}
+
+/*
+ * This function is called when memory window unbind which we are waiting
+ * for completes. Just use rr_func (zeroed by upcall) to signal completion.
+ */
+static void
+rpcrdma_unbind_func(struct rpcrdma_rep *rep)
+{
+	wake_up(&rep->rr_unbind);
+}
+
+/*
+ * Called as a tasklet to do req/reply match and complete a request
+ * Errors must result in the RPC task either being awakened, or
+ * allowed to timeout, to discover the errors at that time.
+ */
+void
+rpcrdma_reply_handler(struct rpcrdma_rep *rep)
+{
+	struct rpcrdma_msg *headerp;
+	struct rpcrdma_req *req;
+	struct rpc_rqst *rqst;
+	struct rpc_xprt *xprt = rep->rr_xprt;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	u32 *iptr;
+	int i, rdmalen, status;
+
+	/* Check status. If bad, signal disconnect and return rep to pool */
+	if (rep->rr_len == ~0U) {
+		rpcrdma_recv_buffer_put(rep);
+		if (r_xprt->rx_ep.rep_connected == 1) {
+			r_xprt->rx_ep.rep_connected = -EIO;
+			rpcrdma_conn_func(&r_xprt->rx_ep);
+		}
+		return;
+	}
+	if (rep->rr_len < 28) {
+		dprintk("RPC:       %s: short/invalid reply\n", __func__);
+		goto repost;
+	}
+	headerp = (struct rpcrdma_msg *) rep->rr_base;
+	if (headerp->rm_vers != xdr_one) {
+		dprintk("RPC:       %s: invalid version %d\n",
+			__func__, ntohl(headerp->rm_vers));
+		goto repost;
+	}
+
+	/* Get XID and try for a match. */
+	spin_lock(&xprt->transport_lock);
+	rqst = xprt_lookup_rqst(xprt, headerp->rm_xid);
+	if (rqst == NULL) {
+		spin_unlock(&xprt->transport_lock);
+		dprintk("RPC:       %s: reply 0x%p failed "
+			"to match any request xid 0x%08x len %d\n",
+			__func__, rep, headerp->rm_xid, rep->rr_len);
+repost:
+		r_xprt->rx_stats.bad_reply_count++;
+		rep->rr_func = rpcrdma_reply_handler;
+		if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
+			rpcrdma_recv_buffer_put(rep);
+
+		return;
+	}
+
+	/* get request object */
+	req = rpcr_to_rdmar(rqst);
+
+	dprintk("RPC:       %s: reply 0x%p completes request 0x%p\n"
+		"                   RPC request 0x%p xid 0x%08x\n",
+			__func__, rep, req, rqst, headerp->rm_xid);
+
+	BUG_ON(!req || req->rl_reply);
+
+	/* from here on, the reply is no longer an orphan */
+	req->rl_reply = rep;
+
+	/* check for expected message types */
+	/* The order of some of these tests is important. */
+	switch (headerp->rm_type) {
+	case __constant_htonl(RDMA_MSG):
+		/* never expect read chunks */
+		/* never expect reply chunks (two ways to check) */
+		/* never expect write chunks without having offered RDMA */
+		if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
+		    (headerp->rm_body.rm_chunks[1] == xdr_zero &&
+		     headerp->rm_body.rm_chunks[2] != xdr_zero) ||
+		    (headerp->rm_body.rm_chunks[1] != xdr_zero &&
+		     req->rl_nchunks == 0))
+			goto badheader;
+		if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
+			/* count any expected write chunks in read reply */
+			/* start at write chunk array count */
+			iptr = &headerp->rm_body.rm_chunks[2];
+			rdmalen = rpcrdma_count_chunks(rep,
+						req->rl_nchunks, 1, &iptr);
+			/* check for validity, and no reply chunk after */
+			if (rdmalen < 0 || *iptr++ != xdr_zero)
+				goto badheader;
+			rep->rr_len -=
+			    ((unsigned char *)iptr - (unsigned char *)headerp);
+			status = rep->rr_len + rdmalen;
+			r_xprt->rx_stats.total_rdma_reply += rdmalen;
+		} else {
+			/* else ordinary inline */
+			iptr = (u32 *)((unsigned char *)headerp + 28);
+			rep->rr_len -= 28; /*sizeof *headerp;*/
+			status = rep->rr_len;
+		}
+		/* Fix up the rpc results for upper layer */
+		rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len);
+		break;
+
+	case __constant_htonl(RDMA_NOMSG):
+		/* never expect read or write chunks, always reply chunks */
+		if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
+		    headerp->rm_body.rm_chunks[1] != xdr_zero ||
+		    headerp->rm_body.rm_chunks[2] != xdr_one ||
+		    req->rl_nchunks == 0)
+			goto badheader;
+		iptr = (u32 *)((unsigned char *)headerp + 28);
+		rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr);
+		if (rdmalen < 0)
+			goto badheader;
+		r_xprt->rx_stats.total_rdma_reply += rdmalen;
+		/* Reply chunk buffer already is the reply vector - no fixup. */
+		status = rdmalen;
+		break;
+
+badheader:
+	default:
+		dprintk("%s: invalid rpcrdma reply header (type %d):"
+				" chunks[012] == %d %d %d"
+				" expected chunks <= %d\n",
+				__func__, ntohl(headerp->rm_type),
+				headerp->rm_body.rm_chunks[0],
+				headerp->rm_body.rm_chunks[1],
+				headerp->rm_body.rm_chunks[2],
+				req->rl_nchunks);
+		status = -EIO;
+		r_xprt->rx_stats.bad_reply_count++;
+		break;
+	}
+
+	/* If using mw bind, start the deregister process now. */
+	/* (Note: if mr_free(), cannot perform it here, in tasklet context) */
+	if (req->rl_nchunks) switch (r_xprt->rx_ia.ri_memreg_strategy) {
+	case RPCRDMA_MEMWINDOWS:
+		for (i = 0; req->rl_nchunks-- > 1;)
+			i += rpcrdma_deregister_external(
+				&req->rl_segments[i], r_xprt, NULL);
+		/* Optionally wait (not here) for unbinds to complete */
+		rep->rr_func = rpcrdma_unbind_func;
+		(void) rpcrdma_deregister_external(&req->rl_segments[i],
+						   r_xprt, rep);
+		break;
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+		for (i = 0; req->rl_nchunks--;)
+			i += rpcrdma_deregister_external(&req->rl_segments[i],
+							 r_xprt, NULL);
+		break;
+	default:
+		break;
+	}
+
+	dprintk("RPC:       %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
+			__func__, xprt, rqst, status);
+	xprt_complete_rqst(rqst->rq_task, status);
+	spin_unlock(&xprt->transport_lock);
+}
diff -puN /dev/null net/sunrpc/xprtrdma/transport.c
--- /dev/null
+++ a/net/sunrpc/xprtrdma/transport.c
@@ -0,0 +1,800 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * transport.c
+ *
+ * This file contains the top-level implementation of an RPC RDMA
+ * transport.
+ *
+ * Naming convention: functions beginning with xprt_ are part of the
+ * transport switch. All others are RPC RDMA internal.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+
+#include "xprt_rdma.h"
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+MODULE_LICENSE("Dual BSD/GPL");
+
+MODULE_DESCRIPTION("RPC/RDMA Transport for Linux kernel NFS");
+MODULE_AUTHOR("Network Appliance, Inc.");
+
+/*
+ * tunables
+ */
+
+static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
+static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
+static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
+static unsigned int xprt_rdma_inline_write_padding;
+#if !RPCRDMA_PERSISTENT_REGISTRATION
+static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_REGISTER; /* FMR? */
+#else
+static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_ALLPHYSICAL;
+#endif
+
+#ifdef RPC_DEBUG
+
+static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
+static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
+static unsigned int zero;
+static unsigned int max_padding = PAGE_SIZE;
+static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
+static unsigned int max_memreg = RPCRDMA_LAST - 1;
+
+static struct ctl_table_header *sunrpc_table_header;
+
+static ctl_table xr_tunables_table[] = {
+	{
+		.ctl_name	= CTL_SLOTTABLE_RDMA,
+		.procname	= "rdma_slot_table_entries",
+		.data		= &xprt_rdma_slot_table_entries,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &min_slot_table_size,
+		.extra2		= &max_slot_table_size
+	},
+	{
+		.ctl_name	= CTL_RDMA_MAXINLINEREAD,
+		.procname	= "rdma_max_inline_read",
+		.data		= &xprt_rdma_max_inline_read,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+	},
+	{
+		.ctl_name	= CTL_RDMA_MAXINLINEWRITE,
+		.procname	= "rdma_max_inline_write",
+		.data		= &xprt_rdma_max_inline_write,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+	},
+	{
+		.ctl_name	= CTL_RDMA_WRITEPADDING,
+		.procname	= "rdma_inline_write_padding",
+		.data		= &xprt_rdma_inline_write_padding,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &max_padding,
+	},
+	{
+		.ctl_name	= CTL_RDMA_MEMREG,
+		.procname	= "rdma_memreg_strategy",
+		.data		= &xprt_rdma_memreg_strategy,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &min_memreg,
+		.extra2		= &max_memreg,
+	},
+	{
+		.ctl_name = 0,
+	},
+};
+
+static ctl_table sunrpc_table[] = {
+	{
+		.ctl_name	= CTL_SUNRPC,
+		.procname	= "sunrpc",
+		.mode		= 0555,
+		.child		= xr_tunables_table
+	},
+	{
+		.ctl_name = 0,
+	},
+};
+
+#endif
+
+static struct rpc_xprt_ops xprt_rdma_procs;	/* forward reference */
+
+static void
+xprt_rdma_format_addresses(struct rpc_xprt *xprt)
+{
+	struct sockaddr_in *addr = (struct sockaddr_in *)
+					&rpcx_to_rdmad(xprt).addr;
+	char *buf;
+
+	buf = kzalloc(20, GFP_KERNEL);
+	if (buf)
+		snprintf(buf, 20, NIPQUAD_FMT, NIPQUAD(addr->sin_addr.s_addr));
+	xprt->address_strings[RPC_DISPLAY_ADDR] = buf;
+
+	buf = kzalloc(8, GFP_KERNEL);
+	if (buf)
+		snprintf(buf, 8, "%u", ntohs(addr->sin_port));
+	xprt->address_strings[RPC_DISPLAY_PORT] = buf;
+
+	xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
+
+	buf = kzalloc(48, GFP_KERNEL);
+	if (buf)
+		snprintf(buf, 48, "addr="NIPQUAD_FMT" port=%u proto=%s",
+			NIPQUAD(addr->sin_addr.s_addr),
+			ntohs(addr->sin_port), "rdma");
+	xprt->address_strings[RPC_DISPLAY_ALL] = buf;
+
+	buf = kzalloc(10, GFP_KERNEL);
+	if (buf)
+		snprintf(buf, 10, "%02x%02x%02x%02x",
+			NIPQUAD(addr->sin_addr.s_addr));
+	xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = buf;
+
+	buf = kzalloc(8, GFP_KERNEL);
+	if (buf)
+		snprintf(buf, 8, "%4hx", ntohs(addr->sin_port));
+	xprt->address_strings[RPC_DISPLAY_HEX_PORT] = buf;
+
+	buf = kzalloc(30, GFP_KERNEL);
+	if (buf)
+		snprintf(buf, 30, NIPQUAD_FMT".%u.%u",
+			NIPQUAD(addr->sin_addr.s_addr),
+			ntohs(addr->sin_port) >> 8,
+			ntohs(addr->sin_port) & 0xff);
+	xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR] = buf;
+
+	/* netid */
+	xprt->address_strings[RPC_DISPLAY_NETID] = "rdma";
+}
+
+static void
+xprt_rdma_free_addresses(struct rpc_xprt *xprt)
+{
+	kfree(xprt->address_strings[RPC_DISPLAY_ADDR]);
+	kfree(xprt->address_strings[RPC_DISPLAY_PORT]);
+	kfree(xprt->address_strings[RPC_DISPLAY_ALL]);
+	kfree(xprt->address_strings[RPC_DISPLAY_HEX_ADDR]);
+	kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]);
+	kfree(xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR]);
+}
+
+static void
+xprt_rdma_connect_worker(struct work_struct *work)
+{
+	struct rpcrdma_xprt *r_xprt =
+		container_of(work, struct rpcrdma_xprt, rdma_connect.work);
+	struct rpc_xprt *xprt = &r_xprt->xprt;
+	int rc = 0;
+
+	if (!xprt->shutdown) {
+		xprt_clear_connected(xprt);
+
+		dprintk("RPC:       %s: %sconnect\n", __func__,
+				r_xprt->rx_ep.rep_connected != 0 ? "re" : "");
+		rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
+		if (rc)
+			goto out;
+	}
+	goto out_clear;
+
+out:
+	xprt_wake_pending_tasks(xprt, rc);
+
+out_clear:
+	dprintk("RPC:       %s: exit\n", __func__);
+	xprt_clear_connecting(xprt);
+}
+
+/*
+ * xprt_rdma_destroy
+ *
+ * Destroy the xprt.
+ * Free all memory associated with the object, including its own.
+ * NOTE: none of the *destroy methods free memory for their top-level
+ * objects, even though they may have allocated it (they do free
+ * private memory). It's up to the caller to handle it. In this
+ * case (RDMA transport), all structure memory is inlined with the
+ * struct rpcrdma_xprt.
+ */
+static void
+xprt_rdma_destroy(struct rpc_xprt *xprt)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	int rc;
+
+	dprintk("RPC:       %s: called\n", __func__);
+
+	cancel_delayed_work(&r_xprt->rdma_connect);
+	flush_scheduled_work();
+
+	xprt_clear_connected(xprt);
+
+	rpcrdma_buffer_destroy(&r_xprt->rx_buf);
+	rc = rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
+	if (rc)
+		dprintk("RPC:       %s: rpcrdma_ep_destroy returned %i\n",
+			__func__, rc);
+	rpcrdma_ia_close(&r_xprt->rx_ia);
+
+	xprt_rdma_free_addresses(xprt);
+
+	kfree(xprt->slot);
+	xprt->slot = NULL;
+	kfree(xprt);
+
+	dprintk("RPC:       %s: returning\n", __func__);
+
+	module_put(THIS_MODULE);
+}
+
+/**
+ * xprt_setup_rdma - Set up transport to use RDMA
+ *
+ * @args: rpc transport arguments
+ */
+static struct rpc_xprt *
+xprt_setup_rdma(struct xprt_create *args)
+{
+	struct rpcrdma_create_data_internal cdata;
+	struct rpc_xprt *xprt;
+	struct rpcrdma_xprt *new_xprt;
+	struct rpcrdma_ep *new_ep;
+	struct sockaddr_in *sin;
+	int rc;
+
+	if (args->addrlen > sizeof(xprt->addr)) {
+		dprintk("RPC:       %s: address too large\n", __func__);
+		return ERR_PTR(-EBADF);
+	}
+
+	xprt = kzalloc(sizeof(struct rpcrdma_xprt), GFP_KERNEL);
+	if (xprt == NULL) {
+		dprintk("RPC:       %s: couldn't allocate rpcrdma_xprt\n",
+			__func__);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	xprt->max_reqs = xprt_rdma_slot_table_entries;
+	xprt->slot = kcalloc(xprt->max_reqs,
+				sizeof(struct rpc_rqst), GFP_KERNEL);
+	if (xprt->slot == NULL) {
+		kfree(xprt);
+		dprintk("RPC:       %s: couldn't allocate %d slots\n",
+			__func__, xprt->max_reqs);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* 60 second timeout, no retries */
+	xprt_set_timeout(&xprt->timeout, 0, 60UL * HZ);
+	xprt->bind_timeout = (60U * HZ);
+	xprt->connect_timeout = (60U * HZ);
+	xprt->reestablish_timeout = (5U * HZ);
+	xprt->idle_timeout = (5U * 60 * HZ);
+
+	xprt->resvport = 0;		/* privileged port not needed */
+	xprt->tsh_size = 0;		/* RPC-RDMA handles framing */
+	xprt->max_payload = RPCRDMA_MAX_DATA_SEGS * PAGE_SIZE;
+	xprt->ops = &xprt_rdma_procs;
+
+	/*
+	 * Set up RDMA-specific connect data.
+	 */
+
+	/* Put server RDMA address in local cdata */
+	memcpy(&cdata.addr, args->dstaddr, args->addrlen);
+
+	/* Ensure xprt->addr holds valid server TCP (not RDMA)
+	 * address, for any side protocols which peek at it */
+	xprt->prot = IPPROTO_TCP;
+	xprt->addrlen = args->addrlen;
+	memcpy(&xprt->addr, &cdata.addr, xprt->addrlen);
+
+	sin = (struct sockaddr_in *)&cdata.addr;
+	if (ntohs(sin->sin_port) != 0)
+		xprt_set_bound(xprt);
+
+	dprintk("RPC:       %s: %u.%u.%u.%u:%u\n", __func__,
+			NIPQUAD(sin->sin_addr.s_addr), ntohs(sin->sin_port));
+
+	/* Set max requests */
+	cdata.max_requests = xprt->max_reqs;
+
+	/* Set some length limits */
+	cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */
+	cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */
+
+	cdata.inline_wsize = xprt_rdma_max_inline_write;
+	if (cdata.inline_wsize > cdata.wsize)
+		cdata.inline_wsize = cdata.wsize;
+
+	cdata.inline_rsize = xprt_rdma_max_inline_read;
+	if (cdata.inline_rsize > cdata.rsize)
+		cdata.inline_rsize = cdata.rsize;
+
+	cdata.padding = xprt_rdma_inline_write_padding;
+
+	/*
+	 * Create new transport instance, which includes initialized
+	 *  o ia
+	 *  o endpoint
+	 *  o buffers
+	 */
+
+	new_xprt = rpcx_to_rdmax(xprt);
+
+	rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr,
+				xprt_rdma_memreg_strategy);
+	if (rc)
+		goto out1;
+
+	/*
+	 * initialize and create ep
+	 */
+	new_xprt->rx_data = cdata;
+	new_ep = &new_xprt->rx_ep;
+	new_ep->rep_remote_addr = cdata.addr;
+
+	rc = rpcrdma_ep_create(&new_xprt->rx_ep,
+				&new_xprt->rx_ia, &new_xprt->rx_data);
+	if (rc)
+		goto out2;
+
+	/*
+	 * Allocate pre-registered send and receive buffers for headers and
+	 * any inline data. Also specify any padding which will be provided
+	 * from a preregistered zero buffer.
+	 */
+	rc = rpcrdma_buffer_create(&new_xprt->rx_buf, new_ep, &new_xprt->rx_ia,
+				&new_xprt->rx_data);
+	if (rc)
+		goto out3;
+
+	/*
+	 * Register a callback for connection events. This is necessary because
+	 * connection loss notification is async. We also catch connection loss
+	 * when reaping receives.
+	 */
+	INIT_DELAYED_WORK(&new_xprt->rdma_connect, xprt_rdma_connect_worker);
+	new_ep->rep_func = rpcrdma_conn_func;
+	new_ep->rep_xprt = xprt;
+
+	xprt_rdma_format_addresses(xprt);
+
+	if (!try_module_get(THIS_MODULE))
+		goto out4;
+
+	return xprt;
+
+out4:
+	xprt_rdma_free_addresses(xprt);
+	rc = -EINVAL;
+out3:
+	(void) rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
+out2:
+	rpcrdma_ia_close(&new_xprt->rx_ia);
+out1:
+	kfree(xprt->slot);
+	kfree(xprt);
+	return ERR_PTR(rc);
+}
+
+/*
+ * Close a connection, during shutdown or timeout/reconnect
+ */
+static void
+xprt_rdma_close(struct rpc_xprt *xprt)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+	dprintk("RPC:       %s: closing\n", __func__);
+	xprt_disconnect(xprt);
+	(void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
+}
+
+static void
+xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
+{
+	struct sockaddr_in *sap;
+
+	sap = (struct sockaddr_in *)&xprt->addr;
+	sap->sin_port = htons(port);
+	sap = (struct sockaddr_in *)&rpcx_to_rdmad(xprt).addr;
+	sap->sin_port = htons(port);
+	dprintk("RPC:       %s: %u\n", __func__, port);
+}
+
+static void
+xprt_rdma_connect(struct rpc_task *task)
+{
+	struct rpc_xprt *xprt = (struct rpc_xprt *)task->tk_xprt;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+	if (!xprt_test_and_set_connecting(xprt)) {
+		if (r_xprt->rx_ep.rep_connected != 0) {
+			/* Reconnect */
+			schedule_delayed_work(&r_xprt->rdma_connect,
+				xprt->reestablish_timeout);
+		} else {
+			schedule_delayed_work(&r_xprt->rdma_connect, 0);
+			if (!RPC_IS_ASYNC(task))
+				flush_scheduled_work();
+		}
+	}
+}
+
+static int
+xprt_rdma_reserve_xprt(struct rpc_task *task)
+{
+	struct rpc_xprt *xprt = task->tk_xprt;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	int credits = atomic_read(&r_xprt->rx_buf.rb_credits);
+
+	/* == RPC_CWNDSCALE @ init, but *after* setup */
+	if (r_xprt->rx_buf.rb_cwndscale == 0UL) {
+		r_xprt->rx_buf.rb_cwndscale = xprt->cwnd;
+		dprintk("RPC:       %s: cwndscale %lu\n", __func__,
+			r_xprt->rx_buf.rb_cwndscale);
+		BUG_ON(r_xprt->rx_buf.rb_cwndscale <= 0);
+	}
+	xprt->cwnd = credits * r_xprt->rx_buf.rb_cwndscale;
+	return xprt_reserve_xprt_cong(task);
+}
+
+/*
+ * The RDMA allocate/free functions need the task structure as a place
+ * to hide the struct rpcrdma_req, which is necessary for the actual send/recv
+ * sequence. For this reason, the recv buffers are attached to send
+ * buffers for portions of the RPC. Note that the RPC layer allocates
+ * both send and receive buffers in the same call. We may register
+ * the receive buffer portion when using reply chunks.
+ */
+static void *
+xprt_rdma_allocate(struct rpc_task *task, size_t size)
+{
+	struct rpc_xprt *xprt = task->tk_xprt;
+	struct rpcrdma_req *req, *nreq;
+
+	req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf);
+	BUG_ON(NULL == req);
+
+	if (size > req->rl_size) {
+		dprintk("RPC:       %s: size %zd too large for buffer[%zd]: "
+			"prog %d vers %d proc %d\n",
+			__func__, size, req->rl_size,
+			task->tk_client->cl_prog, task->tk_client->cl_vers,
+			task->tk_msg.rpc_proc->p_proc);
+		/*
+		 * Outgoing length shortage. Our inline write max must have
+		 * been configured to perform direct i/o.
+		 *
+		 * This is therefore a large metadata operation, and the
+		 * allocate call was made on the maximum possible message,
+		 * e.g. containing long filename(s) or symlink data. In
+		 * fact, while these metadata operations *might* carry
+		 * large outgoing payloads, they rarely *do*. However, we
+		 * have to commit to the request here, so reallocate and
+		 * register it now. The data path will never require this
+		 * reallocation.
+		 *
+		 * If the allocation or registration fails, the RPC framework
+		 * will (doggedly) retry.
+		 */
+		if (rpcx_to_rdmax(xprt)->rx_ia.ri_memreg_strategy ==
+				RPCRDMA_BOUNCEBUFFERS) {
+			/* forced to "pure inline" */
+			dprintk("RPC:       %s: too much data (%zd) for inline "
+					"(r/w max %d/%d)\n", __func__, size,
+					rpcx_to_rdmad(xprt).inline_rsize,
+					rpcx_to_rdmad(xprt).inline_wsize);
+			size = req->rl_size;
+			rpc_exit(task, -EIO);		/* fail the operation */
+			rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++;
+			goto out;
+		}
+		if (task->tk_flags & RPC_TASK_SWAPPER)
+			nreq = kmalloc(sizeof *req + size, GFP_ATOMIC);
+		else
+			nreq = kmalloc(sizeof *req + size, GFP_NOFS);
+		if (nreq == NULL)
+			goto outfail;
+
+		if (rpcrdma_register_internal(&rpcx_to_rdmax(xprt)->rx_ia,
+				nreq->rl_base, size + sizeof(struct rpcrdma_req)
+				- offsetof(struct rpcrdma_req, rl_base),
+				&nreq->rl_handle, &nreq->rl_iov)) {
+			kfree(nreq);
+			goto outfail;
+		}
+		rpcx_to_rdmax(xprt)->rx_stats.hardway_register_count += size;
+		nreq->rl_size = size;
+		nreq->rl_niovs = 0;
+		nreq->rl_nchunks = 0;
+		nreq->rl_buffer = (struct rpcrdma_buffer *)req;
+		nreq->rl_reply = req->rl_reply;
+		memcpy(nreq->rl_segments,
+			req->rl_segments, sizeof nreq->rl_segments);
+		/* flag the swap with an unused field */
+		nreq->rl_iov.length = 0;
+		req->rl_reply = NULL;
+		req = nreq;
+	}
+	dprintk("RPC:       %s: size %zd, request 0x%p\n", __func__, size, req);
+out:
+	return req->rl_xdr_buf;
+
+outfail:
+	rpcrdma_buffer_put(req);
+	rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++;
+	return NULL;
+}
+
+/*
+ * This function returns all RDMA resources to the pool.
+ */
+static void
+xprt_rdma_free(void *buffer)
+{
+	struct rpcrdma_req *req;
+	struct rpcrdma_xprt *r_xprt;
+	struct rpcrdma_rep *rep;
+	int i;
+
+	if (buffer == NULL)
+		return;
+
+	req = container_of(buffer, struct rpcrdma_req, rl_xdr_buf[0]);
+	r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
+	rep = req->rl_reply;
+
+	dprintk("RPC:       %s: called on 0x%p%s\n",
+		__func__, rep, (rep && rep->rr_func) ? " (with waiter)" : "");
+
+	/*
+	 * Finish the deregistration. When using mw bind, this was
+	 * begun in rpcrdma_reply_handler(). In all other modes, we
+	 * do it here, in thread context. The process is considered
+	 * complete when the rr_func vector becomes NULL - this
+	 * was put in place during rpcrdma_reply_handler() - the wait
+	 * call below will not block if the dereg is "done". If
+	 * interrupted, our framework will clean up.
+	 */
+	for (i = 0; req->rl_nchunks;) {
+		--req->rl_nchunks;
+		i += rpcrdma_deregister_external(
+			&req->rl_segments[i], r_xprt, NULL);
+	}
+
+	if (rep && wait_event_interruptible(rep->rr_unbind, !rep->rr_func)) {
+		rep->rr_func = NULL;	/* abandon the callback */
+		req->rl_reply = NULL;
+	}
+
+	if (req->rl_iov.length == 0) {	/* see allocate above */
+		struct rpcrdma_req *oreq = (struct rpcrdma_req *)req->rl_buffer;
+		oreq->rl_reply = req->rl_reply;
+		(void) rpcrdma_deregister_internal(&r_xprt->rx_ia,
+						   req->rl_handle,
+						   &req->rl_iov);
+		kfree(req);
+		req = oreq;
+	}
+
+	/* Put back request+reply buffers */
+	rpcrdma_buffer_put(req);
+}
+
+/*
+ * send_request invokes the meat of RPC RDMA. It must do the following:
+ *  1.  Marshal the RPC request into an RPC RDMA request, which means
+ *	putting a header in front of data, and creating IOVs for RDMA
+ *	from those in the request.
+ *  2.  In marshaling, detect opportunities for RDMA, and use them.
+ *  3.  Post a recv message to set up asynch completion, then send
+ *	the request (rpcrdma_ep_post).
+ *  4.  No partial sends are possible in the RPC-RDMA protocol (as in UDP).
+ */
+
+static int
+xprt_rdma_send_request(struct rpc_task *task)
+{
+	struct rpc_rqst *rqst = task->tk_rqstp;
+	struct rpc_xprt *xprt = task->tk_xprt;
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+	/* marshal the send itself */
+	if (req->rl_niovs == 0 && rpcrdma_marshal_req(rqst) != 0) {
+		r_xprt->rx_stats.failed_marshal_count++;
+		dprintk("RPC:       %s: rpcrdma_marshal_req failed\n",
+			__func__);
+		return -EIO;
+	}
+
+	if (req->rl_reply == NULL) 		/* e.g. reconnection */
+		rpcrdma_recv_buffer_get(req);
+
+	if (req->rl_reply) {
+		req->rl_reply->rr_func = rpcrdma_reply_handler;
+		/* this need only be done once, but... */
+		req->rl_reply->rr_xprt = xprt;
+	}
+
+	if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) {
+		xprt_disconnect(xprt);
+		return -ENOTCONN;	/* implies disconnect */
+	}
+
+	rqst->rq_bytes_sent = 0;
+	return 0;
+}
+
+static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	long idle_time = 0;
+
+	if (xprt_connected(xprt))
+		idle_time = (long)(jiffies - xprt->last_used) / HZ;
+
+	seq_printf(seq,
+	  "\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu "
+	  "%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n",
+
+	   0,	/* need a local port? */
+	   xprt->stat.bind_count,
+	   xprt->stat.connect_count,
+	   xprt->stat.connect_time,
+	   idle_time,
+	   xprt->stat.sends,
+	   xprt->stat.recvs,
+	   xprt->stat.bad_xids,
+	   xprt->stat.req_u,
+	   xprt->stat.bklog_u,
+
+	   r_xprt->rx_stats.read_chunk_count,
+	   r_xprt->rx_stats.write_chunk_count,
+	   r_xprt->rx_stats.reply_chunk_count,
+	   r_xprt->rx_stats.total_rdma_request,
+	   r_xprt->rx_stats.total_rdma_reply,
+	   r_xprt->rx_stats.pullup_copy_count,
+	   r_xprt->rx_stats.fixup_copy_count,
+	   r_xprt->rx_stats.hardway_register_count,
+	   r_xprt->rx_stats.failed_marshal_count,
+	   r_xprt->rx_stats.bad_reply_count);
+}
+
+/*
+ * Plumbing for rpc transport switch and kernel module
+ */
+
+static struct rpc_xprt_ops xprt_rdma_procs = {
+	.reserve_xprt		= xprt_rdma_reserve_xprt,
+	.release_xprt		= xprt_release_xprt_cong, /* sunrpc/xprt.c */
+	.release_request	= xprt_release_rqst_cong,       /* ditto */
+	.set_retrans_timeout	= xprt_set_retrans_timeout_def, /* ditto */
+	.rpcbind		= rpcb_getport_async,	/* sunrpc/rpcb_clnt.c */
+	.set_port		= xprt_rdma_set_port,
+	.connect		= xprt_rdma_connect,
+	.buf_alloc		= xprt_rdma_allocate,
+	.buf_free		= xprt_rdma_free,
+	.send_request		= xprt_rdma_send_request,
+	.close			= xprt_rdma_close,
+	.destroy		= xprt_rdma_destroy,
+	.print_stats		= xprt_rdma_print_stats
+};
+
+static struct xprt_class xprt_rdma = {
+	.list			= LIST_HEAD_INIT(xprt_rdma.list),
+	.name			= "rdma",
+	.owner			= THIS_MODULE,
+	.ident			= XPRT_TRANSPORT_RDMA,
+	.setup			= xprt_setup_rdma,
+};
+
+static void __exit xprt_rdma_cleanup(void)
+{
+	int rc;
+
+	dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n");
+#ifdef RPC_DEBUG
+	if (sunrpc_table_header) {
+		unregister_sysctl_table(sunrpc_table_header);
+		sunrpc_table_header = NULL;
+	}
+#endif
+	rc = xprt_unregister_transport(&xprt_rdma);
+	if (rc)
+		dprintk("RPC:       %s: xprt_unregister returned %i\n",
+			__func__, rc);
+}
+
+static int __init xprt_rdma_init(void)
+{
+	int rc;
+
+	rc = xprt_register_transport(&xprt_rdma);
+
+	if (rc)
+		return rc;
+
+	dprintk(KERN_INFO "RPCRDMA Module Init, register RPC RDMA transport\n");
+
+	dprintk(KERN_INFO "Defaults:\n");
+	dprintk(KERN_INFO "\tSlots %d\n"
+		"\tMaxInlineRead %d\n\tMaxInlineWrite %d\n",
+		xprt_rdma_slot_table_entries,
+		xprt_rdma_max_inline_read, xprt_rdma_max_inline_write);
+	dprintk(KERN_INFO "\tPadding %d\n\tMemreg %d\n",
+		xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy);
+
+#ifdef RPC_DEBUG
+	if (!sunrpc_table_header)
+		sunrpc_table_header = register_sysctl_table(sunrpc_table);
+#endif
+	return 0;
+}
+
+module_init(xprt_rdma_init);
+module_exit(xprt_rdma_cleanup);
diff -puN /dev/null net/sunrpc/xprtrdma/verbs.c
--- /dev/null
+++ a/net/sunrpc/xprtrdma/verbs.c
@@ -0,0 +1,1626 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * verbs.c
+ *
+ * Encapsulates the major functions managing:
+ *  o adapters
+ *  o endpoints
+ *  o connections
+ *  o buffer memory
+ */
+
+#include <linux/pci.h>	/* for Tavor hack below */
+
+#include "xprt_rdma.h"
+
+/*
+ * Globals/Macros
+ */
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+/*
+ * internal functions
+ */
+
+/*
+ * handle replies in tasklet context, using a single, global list
+ * rdma tasklet function -- just turn around and call the func
+ * for all replies on the list
+ */
+
+static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
+static LIST_HEAD(rpcrdma_tasklets_g);
+
+static void
+rpcrdma_run_tasklet(unsigned long data)
+{
+	struct rpcrdma_rep *rep;
+	void (*func)(struct rpcrdma_rep *);
+	unsigned long flags;
+
+	data = data;
+	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
+	while (!list_empty(&rpcrdma_tasklets_g)) {
+		rep = list_entry(rpcrdma_tasklets_g.next,
+				 struct rpcrdma_rep, rr_list);
+		list_del(&rep->rr_list);
+		func = rep->rr_func;
+		rep->rr_func = NULL;
+		spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
+
+		if (func)
+			func(rep);
+		else
+			rpcrdma_recv_buffer_put(rep);
+
+		spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
+	}
+	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
+}
+
+static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
+
+static inline void
+rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
+	list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
+	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
+	tasklet_schedule(&rpcrdma_tasklet_g);
+}
+
+static void
+rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
+{
+	struct rpcrdma_ep *ep = context;
+
+	dprintk("RPC:       %s: QP error %X on device %s ep %p\n",
+		__func__, event->event, event->device->name, context);
+	if (ep->rep_connected == 1) {
+		ep->rep_connected = -EIO;
+		ep->rep_func(ep);
+		wake_up_all(&ep->rep_connect_wait);
+	}
+}
+
+static void
+rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
+{
+	struct rpcrdma_ep *ep = context;
+
+	dprintk("RPC:       %s: CQ error %X on device %s ep %p\n",
+		__func__, event->event, event->device->name, context);
+	if (ep->rep_connected == 1) {
+		ep->rep_connected = -EIO;
+		ep->rep_func(ep);
+		wake_up_all(&ep->rep_connect_wait);
+	}
+}
+
+static inline
+void rpcrdma_event_process(struct ib_wc *wc)
+{
+	struct rpcrdma_rep *rep =
+			(struct rpcrdma_rep *)(unsigned long) wc->wr_id;
+
+	dprintk("RPC:       %s: event rep %p status %X opcode %X length %u\n",
+		__func__, rep, wc->status, wc->opcode, wc->byte_len);
+
+	if (!rep) /* send or bind completion that we don't care about */
+		return;
+
+	if (IB_WC_SUCCESS != wc->status) {
+		dprintk("RPC:       %s: %s WC status %X, connection lost\n",
+			__func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send",
+			 wc->status);
+		rep->rr_len = ~0U;
+		rpcrdma_schedule_tasklet(rep);
+		return;
+	}
+
+	switch (wc->opcode) {
+	case IB_WC_RECV:
+		rep->rr_len = wc->byte_len;
+		ib_dma_sync_single_for_cpu(
+			rdmab_to_ia(rep->rr_buffer)->ri_id->device,
+			rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
+		/* Keep (only) the most recent credits, after check validity */
+		if (rep->rr_len >= 16) {
+			struct rpcrdma_msg *p =
+					(struct rpcrdma_msg *) rep->rr_base;
+			unsigned int credits = ntohl(p->rm_credit);
+			if (credits == 0) {
+				dprintk("RPC:       %s: server"
+					" dropped credits to 0!\n", __func__);
+				/* don't deadlock */
+				credits = 1;
+			} else if (credits > rep->rr_buffer->rb_max_requests) {
+				dprintk("RPC:       %s: server"
+					" over-crediting: %d (%d)\n",
+					__func__, credits,
+					rep->rr_buffer->rb_max_requests);
+				credits = rep->rr_buffer->rb_max_requests;
+			}
+			atomic_set(&rep->rr_buffer->rb_credits, credits);
+		}
+		/* fall through */
+	case IB_WC_BIND_MW:
+		rpcrdma_schedule_tasklet(rep);
+		break;
+	default:
+		dprintk("RPC:       %s: unexpected WC event %X\n",
+			__func__, wc->opcode);
+		break;
+	}
+}
+
+static inline int
+rpcrdma_cq_poll(struct ib_cq *cq)
+{
+	struct ib_wc wc;
+	int rc;
+
+	for (;;) {
+		rc = ib_poll_cq(cq, 1, &wc);
+		if (rc < 0) {
+			dprintk("RPC:       %s: ib_poll_cq failed %i\n",
+				__func__, rc);
+			return rc;
+		}
+		if (rc == 0)
+			break;
+
+		rpcrdma_event_process(&wc);
+	}
+
+	return 0;
+}
+
+/*
+ * rpcrdma_cq_event_upcall
+ *
+ * This upcall handles recv, send, bind and unbind events.
+ * It is reentrant but processes single events in order to maintain
+ * ordering of receives to keep server credits.
+ *
+ * It is the responsibility of the scheduled tasklet to return
+ * recv buffers to the pool. NOTE: this affects synchronization of
+ * connection shutdown. That is, the structures required for
+ * the completion of the reply handler must remain intact until
+ * all memory has been reclaimed.
+ *
+ * Note that send events are suppressed and do not result in an upcall.
+ */
+static void
+rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
+{
+	int rc;
+
+	rc = rpcrdma_cq_poll(cq);
+	if (rc)
+		return;
+
+	rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+	if (rc) {
+		dprintk("RPC:       %s: ib_req_notify_cq failed %i\n",
+			__func__, rc);
+		return;
+	}
+
+	rpcrdma_cq_poll(cq);
+}
+
+#ifdef RPC_DEBUG
+static const char * const conn[] = {
+	"address resolved",
+	"address error",
+	"route resolved",
+	"route error",
+	"connect request",
+	"connect response",
+	"connect error",
+	"unreachable",
+	"rejected",
+	"established",
+	"disconnected",
+	"device removal"
+};
+#endif
+
+static int
+rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
+{
+	struct rpcrdma_xprt *xprt = id->context;
+	struct rpcrdma_ia *ia = &xprt->rx_ia;
+	struct rpcrdma_ep *ep = &xprt->rx_ep;
+	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
+	struct ib_qp_attr attr;
+	struct ib_qp_init_attr iattr;
+	int connstate = 0;
+
+	switch (event->event) {
+	case RDMA_CM_EVENT_ADDR_RESOLVED:
+	case RDMA_CM_EVENT_ROUTE_RESOLVED:
+		complete(&ia->ri_done);
+		break;
+	case RDMA_CM_EVENT_ADDR_ERROR:
+		ia->ri_async_rc = -EHOSTUNREACH;
+		dprintk("RPC:       %s: CM address resolution error, ep 0x%p\n",
+			__func__, ep);
+		complete(&ia->ri_done);
+		break;
+	case RDMA_CM_EVENT_ROUTE_ERROR:
+		ia->ri_async_rc = -ENETUNREACH;
+		dprintk("RPC:       %s: CM route resolution error, ep 0x%p\n",
+			__func__, ep);
+		complete(&ia->ri_done);
+		break;
+	case RDMA_CM_EVENT_ESTABLISHED:
+		connstate = 1;
+		ib_query_qp(ia->ri_id->qp, &attr,
+			IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
+			&iattr);
+		dprintk("RPC:       %s: %d responder resources"
+			" (%d initiator)\n",
+			__func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
+		goto connected;
+	case RDMA_CM_EVENT_CONNECT_ERROR:
+		connstate = -ENOTCONN;
+		goto connected;
+	case RDMA_CM_EVENT_UNREACHABLE:
+		connstate = -ENETDOWN;
+		goto connected;
+	case RDMA_CM_EVENT_REJECTED:
+		connstate = -ECONNREFUSED;
+		goto connected;
+	case RDMA_CM_EVENT_DISCONNECTED:
+		connstate = -ECONNABORTED;
+		goto connected;
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+		connstate = -ENODEV;
+connected:
+		dprintk("RPC:       %s: %s: %u.%u.%u.%u:%u"
+			" (ep 0x%p event 0x%x)\n",
+			__func__,
+			(event->event <= 11) ? conn[event->event] :
+						"unknown connection error",
+			NIPQUAD(addr->sin_addr.s_addr),
+			ntohs(addr->sin_port),
+			ep, event->event);
+		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
+		dprintk("RPC:       %s: %sconnected\n",
+					__func__, connstate > 0 ? "" : "dis");
+		ep->rep_connected = connstate;
+		ep->rep_func(ep);
+		wake_up_all(&ep->rep_connect_wait);
+		break;
+	default:
+		ia->ri_async_rc = -EINVAL;
+		dprintk("RPC:       %s: unexpected CM event %X\n",
+			__func__, event->event);
+		complete(&ia->ri_done);
+		break;
+	}
+
+	return 0;
+}
+
+static struct rdma_cm_id *
+rpcrdma_create_id(struct rpcrdma_xprt *xprt,
+			struct rpcrdma_ia *ia, struct sockaddr *addr)
+{
+	struct rdma_cm_id *id;
+	int rc;
+
+	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
+	if (IS_ERR(id)) {
+		rc = PTR_ERR(id);
+		dprintk("RPC:       %s: rdma_create_id() failed %i\n",
+			__func__, rc);
+		return id;
+	}
+
+	ia->ri_async_rc = 0;
+	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
+	if (rc) {
+		dprintk("RPC:       %s: rdma_resolve_addr() failed %i\n",
+			__func__, rc);
+		goto out;
+	}
+	wait_for_completion(&ia->ri_done);
+	rc = ia->ri_async_rc;
+	if (rc)
+		goto out;
+
+	ia->ri_async_rc = 0;
+	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
+	if (rc) {
+		dprintk("RPC:       %s: rdma_resolve_route() failed %i\n",
+			__func__, rc);
+		goto out;
+	}
+	wait_for_completion(&ia->ri_done);
+	rc = ia->ri_async_rc;
+	if (rc)
+		goto out;
+
+	return id;
+
+out:
+	rdma_destroy_id(id);
+	return ERR_PTR(rc);
+}
+
+/*
+ * Drain any cq, prior to teardown.
+ */
+static void
+rpcrdma_clean_cq(struct ib_cq *cq)
+{
+	struct ib_wc wc;
+	int count = 0;
+
+	while (1 == ib_poll_cq(cq, 1, &wc))
+		++count;
+
+	if (count)
+		dprintk("RPC:       %s: flushed %d events (last 0x%x)\n",
+			__func__, count, wc.opcode);
+}
+
+/*
+ * Exported functions.
+ */
+
+/*
+ * Open and initialize an Interface Adapter.
+ *  o initializes fields of struct rpcrdma_ia, including
+ *    interface and provider attributes and protection zone.
+ */
+int
+rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+{
+	int rc;
+	struct rpcrdma_ia *ia = &xprt->rx_ia;
+
+	init_completion(&ia->ri_done);
+
+	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
+	if (IS_ERR(ia->ri_id)) {
+		rc = PTR_ERR(ia->ri_id);
+		goto out1;
+	}
+
+	ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
+	if (IS_ERR(ia->ri_pd)) {
+		rc = PTR_ERR(ia->ri_pd);
+		dprintk("RPC:       %s: ib_alloc_pd() failed %i\n",
+			__func__, rc);
+		goto out2;
+	}
+
+	/*
+	 * Optionally obtain an underlying physical identity mapping in
+	 * order to do a memory window-based bind. This base registration
+	 * is protected from remote access - that is enabled only by binding
+	 * for the specific bytes targeted during each RPC operation, and
+	 * revoked after the corresponding completion similar to a storage
+	 * adapter.
+	 */
+	if (memreg > RPCRDMA_REGISTER) {
+		int mem_priv = IB_ACCESS_LOCAL_WRITE;
+		switch (memreg) {
+#if RPCRDMA_PERSISTENT_REGISTRATION
+		case RPCRDMA_ALLPHYSICAL:
+			mem_priv |= IB_ACCESS_REMOTE_WRITE;
+			mem_priv |= IB_ACCESS_REMOTE_READ;
+			break;
+#endif
+		case RPCRDMA_MEMWINDOWS_ASYNC:
+		case RPCRDMA_MEMWINDOWS:
+			mem_priv |= IB_ACCESS_MW_BIND;
+			break;
+		default:
+			break;
+		}
+		ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
+		if (IS_ERR(ia->ri_bind_mem)) {
+			printk(KERN_ALERT "%s: ib_get_dma_mr for "
+				"phys register failed with %lX\n\t"
+				"Will continue with degraded performance\n",
+				__func__, PTR_ERR(ia->ri_bind_mem));
+			memreg = RPCRDMA_REGISTER;
+			ia->ri_bind_mem = NULL;
+		}
+	}
+
+	/* Else will do memory reg/dereg for each chunk */
+	ia->ri_memreg_strategy = memreg;
+
+	return 0;
+out2:
+	rdma_destroy_id(ia->ri_id);
+out1:
+	return rc;
+}
+
+/*
+ * Clean up/close an IA.
+ *   o if event handles and PD have been initialized, free them.
+ *   o close the IA
+ */
+void
+rpcrdma_ia_close(struct rpcrdma_ia *ia)
+{
+	int rc;
+
+	dprintk("RPC:       %s: entering\n", __func__);
+	if (ia->ri_bind_mem != NULL) {
+		rc = ib_dereg_mr(ia->ri_bind_mem);
+		dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
+			__func__, rc);
+	}
+	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id) && ia->ri_id->qp)
+		rdma_destroy_qp(ia->ri_id);
+	if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
+		rc = ib_dealloc_pd(ia->ri_pd);
+		dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
+			__func__, rc);
+	}
+	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id))
+		rdma_destroy_id(ia->ri_id);
+}
+
+/*
+ * Create unconnected endpoint.
+ */
+int
+rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+				struct rpcrdma_create_data_internal *cdata)
+{
+	struct ib_device_attr devattr;
+	int rc;
+
+	rc = ib_query_device(ia->ri_id->device, &devattr);
+	if (rc) {
+		dprintk("RPC:       %s: ib_query_device failed %d\n",
+			__func__, rc);
+		return rc;
+	}
+
+	/* check provider's send/recv wr limits */
+	if (cdata->max_requests > devattr.max_qp_wr)
+		cdata->max_requests = devattr.max_qp_wr;
+
+	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
+	ep->rep_attr.qp_context = ep;
+	/* send_cq and recv_cq initialized below */
+	ep->rep_attr.srq = NULL;
+	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
+	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+	case RPCRDMA_MEMWINDOWS:
+		/* Add room for mw_binds+unbinds - overkill! */
+		ep->rep_attr.cap.max_send_wr++;
+		ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
+		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
+			return -EINVAL;
+		break;
+	default:
+		break;
+	}
+	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
+	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
+	ep->rep_attr.cap.max_recv_sge = 1;
+	ep->rep_attr.cap.max_inline_data = 0;
+	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+	ep->rep_attr.qp_type = IB_QPT_RC;
+	ep->rep_attr.port_num = ~0;
+
+	dprintk("RPC:       %s: requested max: dtos: send %d recv %d; "
+		"iovs: send %d recv %d\n",
+		__func__,
+		ep->rep_attr.cap.max_send_wr,
+		ep->rep_attr.cap.max_recv_wr,
+		ep->rep_attr.cap.max_send_sge,
+		ep->rep_attr.cap.max_recv_sge);
+
+	/* set trigger for requesting send completion */
+	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /*  - 1*/;
+	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+	case RPCRDMA_MEMWINDOWS:
+		ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
+		break;
+	default:
+		break;
+	}
+	if (ep->rep_cqinit <= 2)
+		ep->rep_cqinit = 0;
+	INIT_CQCOUNT(ep);
+	ep->rep_ia = ia;
+	init_waitqueue_head(&ep->rep_connect_wait);
+
+	/*
+	 * Create a single cq for receive dto and mw_bind (only ever
+	 * care about unbind, really). Send completions are suppressed.
+	 * Use single threaded tasklet upcalls to maintain ordering.
+	 */
+	ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
+				  rpcrdma_cq_async_error_upcall, NULL,
+				  ep->rep_attr.cap.max_recv_wr +
+				  ep->rep_attr.cap.max_send_wr + 1, 0);
+	if (IS_ERR(ep->rep_cq)) {
+		rc = PTR_ERR(ep->rep_cq);
+		dprintk("RPC:       %s: ib_create_cq failed: %i\n",
+			__func__, rc);
+		goto out1;
+	}
+
+	rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
+	if (rc) {
+		dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
+			__func__, rc);
+		goto out2;
+	}
+
+	ep->rep_attr.send_cq = ep->rep_cq;
+	ep->rep_attr.recv_cq = ep->rep_cq;
+
+	/* Initialize cma parameters */
+
+	/* RPC/RDMA does not use private data */
+	ep->rep_remote_cma.private_data = NULL;
+	ep->rep_remote_cma.private_data_len = 0;
+
+	/* Client offers RDMA Read but does not initiate */
+	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_BOUNCEBUFFERS:
+		ep->rep_remote_cma.responder_resources = 0;
+		break;
+	case RPCRDMA_MTHCAFMR:
+	case RPCRDMA_REGISTER:
+		ep->rep_remote_cma.responder_resources = cdata->max_requests *
+				(RPCRDMA_MAX_DATA_SEGS / 8);
+		break;
+	case RPCRDMA_MEMWINDOWS:
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+#if RPCRDMA_PERSISTENT_REGISTRATION
+	case RPCRDMA_ALLPHYSICAL:
+#endif
+		ep->rep_remote_cma.responder_resources = cdata->max_requests *
+				(RPCRDMA_MAX_DATA_SEGS / 2);
+		break;
+	default:
+		break;
+	}
+	if (ep->rep_remote_cma.responder_resources > devattr.max_qp_rd_atom)
+		ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
+	ep->rep_remote_cma.initiator_depth = 0;
+
+	ep->rep_remote_cma.retry_count = 7;
+	ep->rep_remote_cma.flow_control = 0;
+	ep->rep_remote_cma.rnr_retry_count = 0;
+
+	return 0;
+
+out2:
+	if (ib_destroy_cq(ep->rep_cq))
+		;
+out1:
+	return rc;
+}
+
+/*
+ * rpcrdma_ep_destroy
+ *
+ * Disconnect and destroy endpoint. After this, the only
+ * valid operations on the ep are to free it (if dynamically
+ * allocated) or re-create it.
+ *
+ * The caller's error handling must be sure to not leak the endpoint
+ * if this function fails.
+ */
+int
+rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+{
+	int rc;
+
+	dprintk("RPC:       %s: entering, connected is %d\n",
+		__func__, ep->rep_connected);
+
+	if (ia->ri_id->qp) {
+		rc = rpcrdma_ep_disconnect(ep, ia);
+		if (rc)
+			dprintk("RPC:       %s: rpcrdma_ep_disconnect"
+				" returned %i\n", __func__, rc);
+	}
+
+	ep->rep_func = NULL;
+
+	/* padding - could be done in rpcrdma_buffer_destroy... */
+	if (ep->rep_pad_mr) {
+		rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
+		ep->rep_pad_mr = NULL;
+	}
+
+	if (ia->ri_id->qp) {
+		rdma_destroy_qp(ia->ri_id);
+		ia->ri_id->qp = NULL;
+	}
+
+	rpcrdma_clean_cq(ep->rep_cq);
+	rc = ib_destroy_cq(ep->rep_cq);
+	if (rc)
+		dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
+			__func__, rc);
+
+	return rc;
+}
+
+/*
+ * Connect unconnected endpoint.
+ */
+int
+rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+{
+	struct rdma_cm_id *id;
+	int rc = 0;
+	int retry_count = 0;
+	int reconnect = (ep->rep_connected != 0);
+
+	if (reconnect) {
+		struct rpcrdma_xprt *xprt;
+retry:
+		rc = rpcrdma_ep_disconnect(ep, ia);
+		if (rc && rc != -ENOTCONN)
+			dprintk("RPC:       %s: rpcrdma_ep_disconnect"
+				" status %i\n", __func__, rc);
+		rpcrdma_clean_cq(ep->rep_cq);
+
+		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
+		id = rpcrdma_create_id(xprt, ia,
+				(struct sockaddr *)&xprt->rx_data.addr);
+		if (IS_ERR(id)) {
+			rc = PTR_ERR(id);
+			goto out;
+		}
+		/* TEMP TEMP TEMP - fail if new device:
+		 * Deregister/remarshal *all* requests!
+		 * Close and recreate adapter, pd, etc!
+		 * Re-determine all attributes still sane!
+		 * More stuff I haven't thought of!
+		 * Rrrgh!
+		 */
+		if (ia->ri_id->device != id->device) {
+			printk("RPC:       %s: can't reconnect on "
+				"different device!\n", __func__);
+			rdma_destroy_id(id);
+			rc = -ENETDOWN;
+			goto out;
+		}
+		/* END TEMP */
+		rdma_destroy_id(ia->ri_id);
+		ia->ri_id = id;
+	}
+
+	rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
+	if (rc) {
+		dprintk("RPC:       %s: rdma_create_qp failed %i\n",
+			__func__, rc);
+		goto out;
+	}
+
+/* XXX Tavor device performs badly with 2K MTU! */
+if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
+	struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
+	if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
+	    (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
+	     pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
+		struct ib_qp_attr attr = {
+			.path_mtu = IB_MTU_1024
+		};
+		rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
+	}
+}
+
+	/* Theoretically a client initiator_depth > 0 is not needed,
+	 * but many peers fail to complete the connection unless they
+	 * == responder_resources! */
+	if (ep->rep_remote_cma.initiator_depth !=
+				ep->rep_remote_cma.responder_resources)
+		ep->rep_remote_cma.initiator_depth =
+			ep->rep_remote_cma.responder_resources;
+
+	ep->rep_connected = 0;
+
+	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
+	if (rc) {
+		dprintk("RPC:       %s: rdma_connect() failed with %i\n",
+				__func__, rc);
+		goto out;
+	}
+
+	if (reconnect)
+		return 0;
+
+	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
+
+	/*
+	 * Check state. A non-peer reject indicates no listener
+	 * (ECONNREFUSED), which may be a transient state. All
+	 * others indicate a transport condition which has already
+	 * undergone a best-effort.
+	 */
+	if (ep->rep_connected == -ECONNREFUSED
+	    && ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
+		dprintk("RPC:       %s: non-peer_reject, retry\n", __func__);
+		goto retry;
+	}
+	if (ep->rep_connected <= 0) {
+		/* Sometimes, the only way to reliably connect to remote
+		 * CMs is to use same nonzero values for ORD and IRD. */
+		ep->rep_remote_cma.initiator_depth =
+					ep->rep_remote_cma.responder_resources;
+		if (ep->rep_remote_cma.initiator_depth == 0)
+			++ep->rep_remote_cma.initiator_depth;
+		if (ep->rep_remote_cma.responder_resources == 0)
+			++ep->rep_remote_cma.responder_resources;
+		if (retry_count++ == 0)
+			goto retry;
+		rc = ep->rep_connected;
+	} else {
+		dprintk("RPC:       %s: connected\n", __func__);
+	}
+
+out:
+	if (rc)
+		ep->rep_connected = rc;
+	return rc;
+}
+
+/*
+ * rpcrdma_ep_disconnect
+ *
+ * This is separate from destroy to facilitate the ability
+ * to reconnect without recreating the endpoint.
+ *
+ * This call is not reentrant, and must not be made in parallel
+ * on the same endpoint.
+ */
+int
+rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+{
+	int rc;
+
+	rpcrdma_clean_cq(ep->rep_cq);
+	rc = rdma_disconnect(ia->ri_id);
+	if (!rc) {
+		/* returns without wait if not connected */
+		wait_event_interruptible(ep->rep_connect_wait,
+							ep->rep_connected != 1);
+		dprintk("RPC:       %s: after wait, %sconnected\n", __func__,
+			(ep->rep_connected == 1) ? "still " : "dis");
+	} else {
+		dprintk("RPC:       %s: rdma_disconnect %i\n", __func__, rc);
+		ep->rep_connected = rc;
+	}
+	return rc;
+}
+
+/*
+ * Initialize buffer memory
+ */
+int
+rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
+	struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
+{
+	char *p;
+	size_t len;
+	int i, rc;
+
+	buf->rb_max_requests = cdata->max_requests;
+	spin_lock_init(&buf->rb_lock);
+	atomic_set(&buf->rb_credits, 1);
+
+	/* Need to allocate:
+	 *   1.  arrays for send and recv pointers
+	 *   2.  arrays of struct rpcrdma_req to fill in pointers
+	 *   3.  array of struct rpcrdma_rep for replies
+	 *   4.  padding, if any
+	 *   5.  mw's, if any
+	 * Send/recv buffers in req/rep need to be registered
+	 */
+
+	len = buf->rb_max_requests *
+		(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
+	len += cdata->padding;
+	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_MTHCAFMR:
+		/* TBD we are perhaps overallocating here */
+		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
+				sizeof(struct rpcrdma_mw);
+		break;
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+	case RPCRDMA_MEMWINDOWS:
+		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
+				sizeof(struct rpcrdma_mw);
+		break;
+	default:
+		break;
+	}
+
+	/* allocate 1, 4 and 5 in one shot */
+	p = kzalloc(len, GFP_KERNEL);
+	if (p == NULL) {
+		dprintk("RPC:       %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
+			__func__, len);
+		rc = -ENOMEM;
+		goto out;
+	}
+	buf->rb_pool = p;	/* for freeing it later */
+
+	buf->rb_send_bufs = (struct rpcrdma_req **) p;
+	p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
+	buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
+	p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
+
+	/*
+	 * Register the zeroed pad buffer, if any.
+	 */
+	if (cdata->padding) {
+		rc = rpcrdma_register_internal(ia, p, cdata->padding,
+					    &ep->rep_pad_mr, &ep->rep_pad);
+		if (rc)
+			goto out;
+	}
+	p += cdata->padding;
+
+	/*
+	 * Allocate the fmr's, or mw's for mw_bind chunk registration.
+	 * We "cycle" the mw's in order to minimize rkey reuse,
+	 * and also reduce unbind-to-bind collision.
+	 */
+	INIT_LIST_HEAD(&buf->rb_mws);
+	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_MTHCAFMR:
+		{
+		struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
+		struct ib_fmr_attr fa = {
+			RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT
+		};
+		/* TBD we are perhaps overallocating here */
+		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
+			r->r.fmr = ib_alloc_fmr(ia->ri_pd,
+				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
+				&fa);
+			if (IS_ERR(r->r.fmr)) {
+				rc = PTR_ERR(r->r.fmr);
+				dprintk("RPC:       %s: ib_alloc_fmr"
+					" failed %i\n", __func__, rc);
+				goto out;
+			}
+			list_add(&r->mw_list, &buf->rb_mws);
+			++r;
+		}
+		}
+		break;
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+	case RPCRDMA_MEMWINDOWS:
+		{
+		struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
+		/* Allocate one extra request's worth, for full cycling */
+		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
+			r->r.mw = ib_alloc_mw(ia->ri_pd);
+			if (IS_ERR(r->r.mw)) {
+				rc = PTR_ERR(r->r.mw);
+				dprintk("RPC:       %s: ib_alloc_mw"
+					" failed %i\n", __func__, rc);
+				goto out;
+			}
+			list_add(&r->mw_list, &buf->rb_mws);
+			++r;
+		}
+		}
+		break;
+	default:
+		break;
+	}
+
+	/*
+	 * Allocate/init the request/reply buffers. Doing this
+	 * using kmalloc for now -- one for each buf.
+	 */
+	for (i = 0; i < buf->rb_max_requests; i++) {
+		struct rpcrdma_req *req;
+		struct rpcrdma_rep *rep;
+
+		len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
+		/* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
+		/* Typical ~2400b, so rounding up saves work later */
+		if (len < 4096)
+			len = 4096;
+		req = kmalloc(len, GFP_KERNEL);
+		if (req == NULL) {
+			dprintk("RPC:       %s: request buffer %d alloc"
+				" failed\n", __func__, i);
+			rc = -ENOMEM;
+			goto out;
+		}
+		memset(req, 0, sizeof(struct rpcrdma_req));
+		buf->rb_send_bufs[i] = req;
+		buf->rb_send_bufs[i]->rl_buffer = buf;
+
+		rc = rpcrdma_register_internal(ia, req->rl_base,
+				len - offsetof(struct rpcrdma_req, rl_base),
+				&buf->rb_send_bufs[i]->rl_handle,
+				&buf->rb_send_bufs[i]->rl_iov);
+		if (rc)
+			goto out;
+
+		buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
+
+		len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
+		rep = kmalloc(len, GFP_KERNEL);
+		if (rep == NULL) {
+			dprintk("RPC:       %s: reply buffer %d alloc failed\n",
+				__func__, i);
+			rc = -ENOMEM;
+			goto out;
+		}
+		memset(rep, 0, sizeof(struct rpcrdma_rep));
+		buf->rb_recv_bufs[i] = rep;
+		buf->rb_recv_bufs[i]->rr_buffer = buf;
+		init_waitqueue_head(&rep->rr_unbind);
+
+		rc = rpcrdma_register_internal(ia, rep->rr_base,
+				len - offsetof(struct rpcrdma_rep, rr_base),
+				&buf->rb_recv_bufs[i]->rr_handle,
+				&buf->rb_recv_bufs[i]->rr_iov);
+		if (rc)
+			goto out;
+
+	}
+	dprintk("RPC:       %s: max_requests %d\n",
+		__func__, buf->rb_max_requests);
+	/* done */
+	return 0;
+out:
+	rpcrdma_buffer_destroy(buf);
+	return rc;
+}
+
+/*
+ * Unregister and destroy buffer memory. Need to deal with
+ * partial initialization, so it's callable from failed create.
+ * Must be called before destroying endpoint, as registrations
+ * reference it.
+ */
+void
+rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
+{
+	int rc, i;
+	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+
+	/* clean up in reverse order from create
+	 *   1.  recv mr memory (mr free, then kfree)
+	 *   1a. bind mw memory
+	 *   2.  send mr memory (mr free, then kfree)
+	 *   3.  padding (if any) [moved to rpcrdma_ep_destroy]
+	 *   4.  arrays
+	 */
+	dprintk("RPC:       %s: entering\n", __func__);
+
+	for (i = 0; i < buf->rb_max_requests; i++) {
+		if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
+			rpcrdma_deregister_internal(ia,
+					buf->rb_recv_bufs[i]->rr_handle,
+					&buf->rb_recv_bufs[i]->rr_iov);
+			kfree(buf->rb_recv_bufs[i]);
+		}
+		if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
+			while (!list_empty(&buf->rb_mws)) {
+				struct rpcrdma_mw *r;
+				r = list_entry(buf->rb_mws.next,
+					struct rpcrdma_mw, mw_list);
+				list_del(&r->mw_list);
+				switch (ia->ri_memreg_strategy) {
+				case RPCRDMA_MTHCAFMR:
+					rc = ib_dealloc_fmr(r->r.fmr);
+					if (rc)
+						dprintk("RPC:       %s:"
+							" ib_dealloc_fmr"
+							" failed %i\n",
+							__func__, rc);
+					break;
+				case RPCRDMA_MEMWINDOWS_ASYNC:
+				case RPCRDMA_MEMWINDOWS:
+					rc = ib_dealloc_mw(r->r.mw);
+					if (rc)
+						dprintk("RPC:       %s:"
+							" ib_dealloc_mw"
+							" failed %i\n",
+							__func__, rc);
+					break;
+				default:
+					break;
+				}
+			}
+			rpcrdma_deregister_internal(ia,
+					buf->rb_send_bufs[i]->rl_handle,
+					&buf->rb_send_bufs[i]->rl_iov);
+			kfree(buf->rb_send_bufs[i]);
+		}
+	}
+
+	kfree(buf->rb_pool);
+}
+
+/*
+ * Get a set of request/reply buffers.
+ *
+ * Reply buffer (if needed) is attached to send buffer upon return.
+ * Rule:
+ *    rb_send_index and rb_recv_index MUST always be pointing to the
+ *    *next* available buffer (non-NULL). They are incremented after
+ *    removing buffers, and decremented *before* returning them.
+ */
+struct rpcrdma_req *
+rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
+{
+	struct rpcrdma_req *req;
+	unsigned long flags;
+
+	spin_lock_irqsave(&buffers->rb_lock, flags);
+	if (buffers->rb_send_index == buffers->rb_max_requests) {
+		spin_unlock_irqrestore(&buffers->rb_lock, flags);
+		dprintk("RPC:       %s: out of request buffers\n", __func__);
+		return ((struct rpcrdma_req *)NULL);
+	}
+
+	req = buffers->rb_send_bufs[buffers->rb_send_index];
+	if (buffers->rb_send_index < buffers->rb_recv_index) {
+		dprintk("RPC:       %s: %d extra receives outstanding (ok)\n",
+			__func__,
+			buffers->rb_recv_index - buffers->rb_send_index);
+		req->rl_reply = NULL;
+	} else {
+		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
+		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
+	}
+	buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
+	if (!list_empty(&buffers->rb_mws)) {
+		int i = RPCRDMA_MAX_SEGS - 1;
+		do {
+			struct rpcrdma_mw *r;
+			r = list_entry(buffers->rb_mws.next,
+					struct rpcrdma_mw, mw_list);
+			list_del(&r->mw_list);
+			req->rl_segments[i].mr_chunk.rl_mw = r;
+		} while (--i >= 0);
+	}
+	spin_unlock_irqrestore(&buffers->rb_lock, flags);
+	return req;
+}
+
+/*
+ * Put request/reply buffers back into pool.
+ * Pre-decrement counter/array index.
+ */
+void
+rpcrdma_buffer_put(struct rpcrdma_req *req)
+{
+	struct rpcrdma_buffer *buffers = req->rl_buffer;
+	struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
+	int i;
+	unsigned long flags;
+
+	BUG_ON(req->rl_nchunks != 0);
+	spin_lock_irqsave(&buffers->rb_lock, flags);
+	buffers->rb_send_bufs[--buffers->rb_send_index] = req;
+	req->rl_niovs = 0;
+	if (req->rl_reply) {
+		buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
+		init_waitqueue_head(&req->rl_reply->rr_unbind);
+		req->rl_reply->rr_func = NULL;
+		req->rl_reply = NULL;
+	}
+	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_MTHCAFMR:
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+	case RPCRDMA_MEMWINDOWS:
+		/*
+		 * Cycle mw's back in reverse order, and "spin" them.
+		 * This delays and scrambles reuse as much as possible.
+		 */
+		i = 1;
+		do {
+			struct rpcrdma_mw **mw;
+			mw = &req->rl_segments[i].mr_chunk.rl_mw;
+			list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
+			*mw = NULL;
+		} while (++i < RPCRDMA_MAX_SEGS);
+		list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
+					&buffers->rb_mws);
+		req->rl_segments[0].mr_chunk.rl_mw = NULL;
+		break;
+	default:
+		break;
+	}
+	spin_unlock_irqrestore(&buffers->rb_lock, flags);
+}
+
+/*
+ * Recover reply buffers from pool.
+ * This happens when recovering from error conditions.
+ * Post-increment counter/array index.
+ */
+void
+rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
+{
+	struct rpcrdma_buffer *buffers = req->rl_buffer;
+	unsigned long flags;
+
+	if (req->rl_iov.length == 0)	/* special case xprt_rdma_allocate() */
+		buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
+	spin_lock_irqsave(&buffers->rb_lock, flags);
+	if (buffers->rb_recv_index < buffers->rb_max_requests) {
+		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
+		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
+	}
+	spin_unlock_irqrestore(&buffers->rb_lock, flags);
+}
+
+/*
+ * Put reply buffers back into pool when not attached to
+ * request. This happens in error conditions, and when
+ * aborting unbinds. Pre-decrement counter/array index.
+ */
+void
+rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
+{
+	struct rpcrdma_buffer *buffers = rep->rr_buffer;
+	unsigned long flags;
+
+	rep->rr_func = NULL;
+	spin_lock_irqsave(&buffers->rb_lock, flags);
+	buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
+	spin_unlock_irqrestore(&buffers->rb_lock, flags);
+}
+
+/*
+ * Wrappers for internal-use kmalloc memory registration, used by buffer code.
+ */
+
+int
+rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
+				struct ib_mr **mrp, struct ib_sge *iov)
+{
+	struct ib_phys_buf ipb;
+	struct ib_mr *mr;
+	int rc;
+
+	/*
+	 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
+	 */
+	iov->addr = ib_dma_map_single(ia->ri_id->device,
+			va, len, DMA_BIDIRECTIONAL);
+	iov->length = len;
+
+	if (ia->ri_bind_mem != NULL) {
+		*mrp = NULL;
+		iov->lkey = ia->ri_bind_mem->lkey;
+		return 0;
+	}
+
+	ipb.addr = iov->addr;
+	ipb.size = iov->length;
+	mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
+			IB_ACCESS_LOCAL_WRITE, &iov->addr);
+
+	dprintk("RPC:       %s: phys convert: 0x%llx "
+			"registered 0x%llx length %d\n",
+			__func__, ipb.addr, iov->addr, len);
+
+	if (IS_ERR(mr)) {
+		*mrp = NULL;
+		rc = PTR_ERR(mr);
+		dprintk("RPC:       %s: failed with %i\n", __func__, rc);
+	} else {
+		*mrp = mr;
+		iov->lkey = mr->lkey;
+		rc = 0;
+	}
+
+	return rc;
+}
+
+int
+rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
+				struct ib_mr *mr, struct ib_sge *iov)
+{
+	int rc;
+
+	ib_dma_unmap_single(ia->ri_id->device,
+			iov->addr, iov->length, DMA_BIDIRECTIONAL);
+
+	if (NULL == mr)
+		return 0;
+
+	rc = ib_dereg_mr(mr);
+	if (rc)
+		dprintk("RPC:       %s: ib_dereg_mr failed %i\n", __func__, rc);
+	return rc;
+}
+
+/*
+ * Wrappers for chunk registration, shared by read/write chunk code.
+ */
+
+static void
+rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
+{
+	seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+	seg->mr_dmalen = seg->mr_len;
+	if (seg->mr_page)
+		seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
+				seg->mr_page, offset_in_page(seg->mr_offset),
+				seg->mr_dmalen, seg->mr_dir);
+	else
+		seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
+				seg->mr_offset,
+				seg->mr_dmalen, seg->mr_dir);
+}
+
+static void
+rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
+{
+	if (seg->mr_page)
+		ib_dma_unmap_page(ia->ri_id->device,
+				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
+	else
+		ib_dma_unmap_single(ia->ri_id->device,
+				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
+}
+
+int
+rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
+			int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
+				  IB_ACCESS_REMOTE_READ);
+	struct rpcrdma_mr_seg *seg1 = seg;
+	int i;
+	int rc = 0;
+
+	switch (ia->ri_memreg_strategy) {
+
+#if RPCRDMA_PERSISTENT_REGISTRATION
+	case RPCRDMA_ALLPHYSICAL:
+		rpcrdma_map_one(ia, seg, writing);
+		seg->mr_rkey = ia->ri_bind_mem->rkey;
+		seg->mr_base = seg->mr_dma;
+		seg->mr_nsegs = 1;
+		nsegs = 1;
+		break;
+#endif
+
+	/* Registration using fast memory registration */
+	case RPCRDMA_MTHCAFMR:
+		{
+		u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
+		int len, pageoff = offset_in_page(seg->mr_offset);
+		seg1->mr_offset -= pageoff;	/* start of page */
+		seg1->mr_len += pageoff;
+		len = -pageoff;
+		if (nsegs > RPCRDMA_MAX_DATA_SEGS)
+			nsegs = RPCRDMA_MAX_DATA_SEGS;
+		for (i = 0; i < nsegs;) {
+			rpcrdma_map_one(ia, seg, writing);
+			physaddrs[i] = seg->mr_dma;
+			len += seg->mr_len;
+			++seg;
+			++i;
+			/* Check for holes */
+			if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
+			    offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
+				break;
+		}
+		nsegs = i;
+		rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
+					physaddrs, nsegs, seg1->mr_dma);
+		if (rc) {
+			dprintk("RPC:       %s: failed ib_map_phys_fmr "
+				"%u@0x%llx+%i (%d)... status %i\n", __func__,
+				len, (unsigned long long)seg1->mr_dma,
+				pageoff, nsegs, rc);
+			while (nsegs--)
+				rpcrdma_unmap_one(ia, --seg);
+		} else {
+			seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
+			seg1->mr_base = seg1->mr_dma + pageoff;
+			seg1->mr_nsegs = nsegs;
+			seg1->mr_len = len;
+		}
+		}
+		break;
+
+	/* Registration using memory windows */
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+	case RPCRDMA_MEMWINDOWS:
+		{
+		struct ib_mw_bind param;
+		rpcrdma_map_one(ia, seg, writing);
+		param.mr = ia->ri_bind_mem;
+		param.wr_id = 0ULL;	/* no send cookie */
+		param.addr = seg->mr_dma;
+		param.length = seg->mr_len;
+		param.send_flags = 0;
+		param.mw_access_flags = mem_priv;
+
+		DECR_CQCOUNT(&r_xprt->rx_ep);
+		rc = ib_bind_mw(ia->ri_id->qp,
+					seg->mr_chunk.rl_mw->r.mw, &param);
+		if (rc) {
+			dprintk("RPC:       %s: failed ib_bind_mw "
+				"%u@0x%llx status %i\n",
+				__func__, seg->mr_len,
+				(unsigned long long)seg->mr_dma, rc);
+			rpcrdma_unmap_one(ia, seg);
+		} else {
+			seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
+			seg->mr_base = param.addr;
+			seg->mr_nsegs = 1;
+			nsegs = 1;
+		}
+		}
+		break;
+
+	/* Default registration each time */
+	default:
+		{
+		struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
+		int len = 0;
+		if (nsegs > RPCRDMA_MAX_DATA_SEGS)
+			nsegs = RPCRDMA_MAX_DATA_SEGS;
+		for (i = 0; i < nsegs;) {
+			rpcrdma_map_one(ia, seg, writing);
+			ipb[i].addr = seg->mr_dma;
+			ipb[i].size = seg->mr_len;
+			len += seg->mr_len;
+			++seg;
+			++i;
+			/* Check for holes */
+			if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
+			    offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
+				break;
+		}
+		nsegs = i;
+		seg1->mr_base = seg1->mr_dma;
+		seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
+					ipb, nsegs, mem_priv, &seg1->mr_base);
+		if (IS_ERR(seg1->mr_chunk.rl_mr)) {
+			rc = PTR_ERR(seg1->mr_chunk.rl_mr);
+			dprintk("RPC:       %s: failed ib_reg_phys_mr "
+				"%u@0x%llx (%d)... status %i\n",
+				__func__, len,
+				(unsigned long long)seg1->mr_dma, nsegs, rc);
+			while (nsegs--)
+				rpcrdma_unmap_one(ia, --seg);
+		} else {
+			seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
+			seg1->mr_nsegs = nsegs;
+			seg1->mr_len = len;
+		}
+		}
+		break;
+	}
+	if (rc)
+		return -1;
+
+	return nsegs;
+}
+
+int
+rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
+		struct rpcrdma_xprt *r_xprt, void *r)
+{
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	struct rpcrdma_mr_seg *seg1 = seg;
+	int nsegs = seg->mr_nsegs, rc;
+
+	switch (ia->ri_memreg_strategy) {
+
+#if RPCRDMA_PERSISTENT_REGISTRATION
+	case RPCRDMA_ALLPHYSICAL:
+		BUG_ON(nsegs != 1);
+		rpcrdma_unmap_one(ia, seg);
+		rc = 0;
+		break;
+#endif
+
+	case RPCRDMA_MTHCAFMR:
+		{
+		LIST_HEAD(l);
+		list_add(&seg->mr_chunk.rl_mw->r.fmr->list, &l);
+		rc = ib_unmap_fmr(&l);
+		while (seg1->mr_nsegs--)
+			rpcrdma_unmap_one(ia, seg++);
+		}
+		if (rc)
+			dprintk("RPC:       %s: failed ib_unmap_fmr,"
+				" status %i\n", __func__, rc);
+		break;
+
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+	case RPCRDMA_MEMWINDOWS:
+		{
+		struct ib_mw_bind param;
+		BUG_ON(nsegs != 1);
+		param.mr = ia->ri_bind_mem;
+		param.addr = 0ULL;	/* unbind */
+		param.length = 0;
+		param.mw_access_flags = 0;
+		if (r) {
+			param.wr_id = (u64) (unsigned long) r;
+			param.send_flags = IB_SEND_SIGNALED;
+			INIT_CQCOUNT(&r_xprt->rx_ep);
+		} else {
+			param.wr_id = 0ULL;
+			param.send_flags = 0;
+			DECR_CQCOUNT(&r_xprt->rx_ep);
+		}
+		rc = ib_bind_mw(ia->ri_id->qp,
+				seg->mr_chunk.rl_mw->r.mw, &param);
+		rpcrdma_unmap_one(ia, seg);
+		}
+		if (rc)
+			dprintk("RPC:       %s: failed ib_(un)bind_mw,"
+				" status %i\n", __func__, rc);
+		else
+			r = NULL;	/* will upcall on completion */
+		break;
+
+	default:
+		rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
+		seg1->mr_chunk.rl_mr = NULL;
+		while (seg1->mr_nsegs--)
+			rpcrdma_unmap_one(ia, seg++);
+		if (rc)
+			dprintk("RPC:       %s: failed ib_dereg_mr,"
+				" status %i\n", __func__, rc);
+		break;
+	}
+	if (r) {
+		struct rpcrdma_rep *rep = r;
+		void (*func)(struct rpcrdma_rep *) = rep->rr_func;
+		rep->rr_func = NULL;
+		func(rep);	/* dereg done, callback now */
+	}
+	return nsegs;
+}
+
+/*
+ * Prepost any receive buffer, then post send.
+ *
+ * Receive buffer is donated to hardware, reclaimed upon recv completion.
+ */
+int
+rpcrdma_ep_post(struct rpcrdma_ia *ia,
+		struct rpcrdma_ep *ep,
+		struct rpcrdma_req *req)
+{
+	struct ib_send_wr send_wr, *send_wr_fail;
+	struct rpcrdma_rep *rep = req->rl_reply;
+	int rc;
+
+	if (rep) {
+		rc = rpcrdma_ep_post_recv(ia, ep, rep);
+		if (rc)
+			goto out;
+		req->rl_reply = NULL;
+	}
+
+	send_wr.next = NULL;
+	send_wr.wr_id = 0ULL;	/* no send cookie */
+	send_wr.sg_list = req->rl_send_iov;
+	send_wr.num_sge = req->rl_niovs;
+	send_wr.opcode = IB_WR_SEND;
+	send_wr.imm_data = 0;
+	if (send_wr.num_sge == 4)	/* no need to sync any pad (constant) */
+		ib_dma_sync_single_for_device(ia->ri_id->device,
+			req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
+			DMA_TO_DEVICE);
+	ib_dma_sync_single_for_device(ia->ri_id->device,
+		req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
+		DMA_TO_DEVICE);
+	ib_dma_sync_single_for_device(ia->ri_id->device,
+		req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
+		DMA_TO_DEVICE);
+
+	if (DECR_CQCOUNT(ep) > 0)
+		send_wr.send_flags = 0;
+	else { /* Provider must take a send completion every now and then */
+		INIT_CQCOUNT(ep);
+		send_wr.send_flags = IB_SEND_SIGNALED;
+	}
+
+	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
+	if (rc)
+		dprintk("RPC:       %s: ib_post_send returned %i\n", __func__,
+			rc);
+out:
+	return rc;
+}
+
+/*
+ * (Re)post a receive buffer.
+ */
+int
+rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
+		     struct rpcrdma_ep *ep,
+		     struct rpcrdma_rep *rep)
+{
+	struct ib_recv_wr recv_wr, *recv_wr_fail;
+	int rc;
+
+	recv_wr.next = NULL;
+	recv_wr.wr_id = (u64) (unsigned long) rep;
+	recv_wr.sg_list = &rep->rr_iov;
+	recv_wr.num_sge = 1;
+
+	ib_dma_sync_single_for_cpu(ia->ri_id->device,
+		rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
+
+	DECR_CQCOUNT(ep);
+	rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
+
+	if (rc)
+		dprintk("RPC:       %s: ib_post_recv returned %i\n", __func__,
+			rc);
+	return rc;
+}
diff -puN /dev/null net/sunrpc/xprtrdma/xprt_rdma.h
--- /dev/null
+++ a/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_SUNRPC_XPRT_RDMA_H
+#define _LINUX_SUNRPC_XPRT_RDMA_H
+
+#include <linux/wait.h> 		/* wait_queue_head_t, etc */
+#include <linux/spinlock.h> 		/* spinlock_t, etc */
+#include <asm/atomic.h>			/* atomic_t, etc */
+
+#include <rdma/rdma_cm.h>		/* RDMA connection api */
+#include <rdma/ib_verbs.h>		/* RDMA verbs api */
+
+#include <linux/sunrpc/clnt.h> 		/* rpc_xprt */
+#include <linux/sunrpc/rpc_rdma.h> 	/* RPC/RDMA protocol */
+#include <linux/sunrpc/xprtrdma.h> 	/* xprt parameters */
+
+/*
+ * Interface Adapter -- one per transport instance
+ */
+struct rpcrdma_ia {
+	struct rdma_cm_id 	*ri_id;
+	struct ib_pd		*ri_pd;
+	struct ib_mr		*ri_bind_mem;
+	struct completion	ri_done;
+	int			ri_async_rc;
+	enum rpcrdma_memreg	ri_memreg_strategy;
+};
+
+/*
+ * RDMA Endpoint -- one per transport instance
+ */
+
+struct rpcrdma_ep {
+	atomic_t		rep_cqcount;
+	int			rep_cqinit;
+	int			rep_connected;
+	struct rpcrdma_ia	*rep_ia;
+	struct ib_cq		*rep_cq;
+	struct ib_qp_init_attr	rep_attr;
+	wait_queue_head_t 	rep_connect_wait;
+	struct ib_sge		rep_pad;	/* holds zeroed pad */
+	struct ib_mr		*rep_pad_mr;	/* holds zeroed pad */
+	void			(*rep_func)(struct rpcrdma_ep *);
+	struct rpc_xprt		*rep_xprt;	/* for rep_func */
+	struct rdma_conn_param	rep_remote_cma;
+	struct sockaddr_storage	rep_remote_addr;
+};
+
+#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
+#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
+
+/*
+ * struct rpcrdma_rep -- this structure encapsulates state required to recv
+ * and complete a reply, asychronously. It needs several pieces of
+ * state:
+ *   o recv buffer (posted to provider)
+ *   o ib_sge (also donated to provider)
+ *   o status of reply (length, success or not)
+ *   o bookkeeping state to get run by tasklet (list, etc)
+ *
+ * These are allocated during initialization, per-transport instance;
+ * however, the tasklet execution list itself is global, as it should
+ * always be pretty short.
+ *
+ * N of these are associated with a transport instance, and stored in
+ * struct rpcrdma_buffer. N is the max number of outstanding requests.
+ */
+
+/* temporary static scatter/gather max */
+#define RPCRDMA_MAX_DATA_SEGS	(8)	/* max scatter/gather */
+#define RPCRDMA_MAX_SEGS 	(RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
+#define MAX_RPCRDMAHDR	(\
+	/* max supported RPC/RDMA header */ \
+	sizeof(struct rpcrdma_msg) + (2 * sizeof(u32)) + \
+	(sizeof(struct rpcrdma_read_chunk) * RPCRDMA_MAX_SEGS) + sizeof(u32))
+
+struct rpcrdma_buffer;
+
+struct rpcrdma_rep {
+	unsigned int	rr_len;		/* actual received reply length */
+	struct rpcrdma_buffer *rr_buffer; /* home base for this structure */
+	struct rpc_xprt	*rr_xprt;	/* needed for request/reply matching */
+	void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */
+	struct list_head rr_list;	/* tasklet list */
+	wait_queue_head_t rr_unbind;	/* optional unbind wait */
+	struct ib_sge	rr_iov;		/* for posting */
+	struct ib_mr	*rr_handle;	/* handle for mem in rr_iov */
+	char	rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */
+};
+
+/*
+ * struct rpcrdma_req -- structure central to the request/reply sequence.
+ *
+ * N of these are associated with a transport instance, and stored in
+ * struct rpcrdma_buffer. N is the max number of outstanding requests.
+ *
+ * It includes pre-registered buffer memory for send AND recv.
+ * The recv buffer, however, is not owned by this structure, and
+ * is "donated" to the hardware when a recv is posted. When a
+ * reply is handled, the recv buffer used is given back to the
+ * struct rpcrdma_req associated with the request.
+ *
+ * In addition to the basic memory, this structure includes an array
+ * of iovs for send operations. The reason is that the iovs passed to
+ * ib_post_{send,recv} must not be modified until the work request
+ * completes.
+ *
+ * NOTES:
+ *   o RPCRDMA_MAX_SEGS is the max number of addressible chunk elements we
+ *     marshal. The number needed varies depending on the iov lists that
+ *     are passed to us, the memory registration mode we are in, and if
+ *     physical addressing is used, the layout.
+ */
+
+struct rpcrdma_mr_seg {		/* chunk descriptors */
+	union {				/* chunk memory handles */
+		struct ib_mr	*rl_mr;		/* if registered directly */
+		struct rpcrdma_mw {		/* if registered from region */
+			union {
+				struct ib_mw	*mw;
+				struct ib_fmr	*fmr;
+			} r;
+			struct list_head mw_list;
+		} *rl_mw;
+	} mr_chunk;
+	u64		mr_base;	/* registration result */
+	u32		mr_rkey;	/* registration result */
+	u32		mr_len;		/* length of chunk or segment */
+	int		mr_nsegs;	/* number of segments in chunk or 0 */
+	enum dma_data_direction	mr_dir;	/* segment mapping direction */
+	dma_addr_t	mr_dma;		/* segment mapping address */
+	size_t		mr_dmalen;	/* segment mapping length */
+	struct page	*mr_page;	/* owning page, if any */
+	char		*mr_offset;	/* kva if no page, else offset */
+};
+
+struct rpcrdma_req {
+	size_t 		rl_size;	/* actual length of buffer */
+	unsigned int	rl_niovs;	/* 0, 2 or 4 */
+	unsigned int	rl_nchunks;	/* non-zero if chunks */
+	struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
+	struct rpcrdma_rep	*rl_reply;/* holder for reply buffer */
+	struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */
+	struct ib_sge	rl_send_iov[4];	/* for active requests */
+	struct ib_sge	rl_iov;		/* for posting */
+	struct ib_mr	*rl_handle;	/* handle for mem in rl_iov */
+	char		rl_base[MAX_RPCRDMAHDR]; /* start of actual buffer */
+	__u32 		rl_xdr_buf[0];	/* start of returned rpc rq_buffer */
+};
+#define rpcr_to_rdmar(r) \
+	container_of((r)->rq_buffer, struct rpcrdma_req, rl_xdr_buf[0])
+
+/*
+ * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for
+ * inline requests/replies, and client/server credits.
+ *
+ * One of these is associated with a transport instance
+ */
+struct rpcrdma_buffer {
+	spinlock_t	rb_lock;	/* protects indexes */
+	atomic_t	rb_credits;	/* most recent server credits */
+	unsigned long	rb_cwndscale;	/* cached framework rpc_cwndscale */
+	int		rb_max_requests;/* client max requests */
+	struct list_head rb_mws;	/* optional memory windows/fmrs */
+	int		rb_send_index;
+	struct rpcrdma_req	**rb_send_bufs;
+	int		rb_recv_index;
+	struct rpcrdma_rep	**rb_recv_bufs;
+	char		*rb_pool;
+};
+#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
+
+/*
+ * Internal structure for transport instance creation. This
+ * exists primarily for modularity.
+ *
+ * This data should be set with mount options
+ */
+struct rpcrdma_create_data_internal {
+	struct sockaddr_storage	addr;	/* RDMA server address */
+	unsigned int	max_requests;	/* max requests (slots) in flight */
+	unsigned int	rsize;		/* mount rsize - max read hdr+data */
+	unsigned int	wsize;		/* mount wsize - max write hdr+data */
+	unsigned int	inline_rsize;	/* max non-rdma read data payload */
+	unsigned int	inline_wsize;	/* max non-rdma write data payload */
+	unsigned int	padding;	/* non-rdma write header padding */
+};
+
+#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \
+	(rpcx_to_rdmad(rq->rq_task->tk_xprt).inline_rsize)
+
+#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\
+	(rpcx_to_rdmad(rq->rq_task->tk_xprt).inline_wsize)
+
+#define RPCRDMA_INLINE_PAD_VALUE(rq)\
+	rpcx_to_rdmad(rq->rq_task->tk_xprt).padding
+
+/*
+ * Statistics for RPCRDMA
+ */
+struct rpcrdma_stats {
+	unsigned long		read_chunk_count;
+	unsigned long		write_chunk_count;
+	unsigned long		reply_chunk_count;
+
+	unsigned long long	total_rdma_request;
+	unsigned long long	total_rdma_reply;
+
+	unsigned long long	pullup_copy_count;
+	unsigned long long	fixup_copy_count;
+	unsigned long		hardway_register_count;
+	unsigned long		failed_marshal_count;
+	unsigned long		bad_reply_count;
+};
+
+/*
+ * RPCRDMA transport -- encapsulates the structures above for
+ * integration with RPC.
+ *
+ * The contained structures are embedded, not pointers,
+ * for convenience. This structure need not be visible externally.
+ *
+ * It is allocated and initialized during mount, and released
+ * during unmount.
+ */
+struct rpcrdma_xprt {
+	struct rpc_xprt		xprt;
+	struct rpcrdma_ia	rx_ia;
+	struct rpcrdma_ep	rx_ep;
+	struct rpcrdma_buffer	rx_buf;
+	struct rpcrdma_create_data_internal rx_data;
+	struct delayed_work	rdma_connect;
+	struct rpcrdma_stats	rx_stats;
+};
+
+#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt)
+#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data)
+
+/*
+ * Interface Adapter calls - xprtrdma/verbs.c
+ */
+int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
+void rpcrdma_ia_close(struct rpcrdma_ia *);
+
+/*
+ * Endpoint calls - xprtrdma/verbs.c
+ */
+int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
+				struct rpcrdma_create_data_internal *);
+int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
+int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+
+int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
+				struct rpcrdma_req *);
+int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
+				struct rpcrdma_rep *);
+
+/*
+ * Buffer calls - xprtrdma/verbs.c
+ */
+int rpcrdma_buffer_create(struct rpcrdma_buffer *, struct rpcrdma_ep *,
+				struct rpcrdma_ia *,
+				struct rpcrdma_create_data_internal *);
+void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
+
+struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
+void rpcrdma_buffer_put(struct rpcrdma_req *);
+void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
+void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
+
+int rpcrdma_register_internal(struct rpcrdma_ia *, void *, int,
+				struct ib_mr **, struct ib_sge *);
+int rpcrdma_deregister_internal(struct rpcrdma_ia *,
+				struct ib_mr *, struct ib_sge *);
+
+int rpcrdma_register_external(struct rpcrdma_mr_seg *,
+				int, int, struct rpcrdma_xprt *);
+int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
+				struct rpcrdma_xprt *, void *);
+
+/*
+ * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
+ */
+void rpcrdma_conn_func(struct rpcrdma_ep *);
+void rpcrdma_reply_handler(struct rpcrdma_rep *);
+
+/*
+ * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
+ */
+int rpcrdma_marshal_req(struct rpc_rqst *);
+
+#endif				/* _LINUX_SUNRPC_XPRT_RDMA_H */
diff -puN net/sunrpc/xprtsock.c~git-nfs net/sunrpc/xprtsock.c
--- a/net/sunrpc/xprtsock.c~git-nfs
+++ a/net/sunrpc/xprtsock.c
@@ -13,10 +13,14 @@
  *  (C) 1999 Trond Myklebust <trond.myklebust@fys.uio.no>
  *
  * IP socket transport implementation, (C) 2005 Chuck Lever <cel@netapp.com>
+ *
+ * IPv6 support contributed by Gilles Quillard, Bull Open Source, 2005.
+ *   <gilles.quillard@bull.net>
  */
 
 #include <linux/types.h>
 #include <linux/slab.h>
+#include <linux/module.h>
 #include <linux/capability.h>
 #include <linux/pagemap.h>
 #include <linux/errno.h>
@@ -28,6 +32,7 @@
 #include <linux/tcp.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/xprtsock.h>
 #include <linux/file.h>
 
 #include <net/sock.h>
@@ -260,14 +265,29 @@ struct sock_xprt {
 #define TCP_RCV_COPY_XID	(1UL << 2)
 #define TCP_RCV_COPY_DATA	(1UL << 3)
 
-static void xs_format_peer_addresses(struct rpc_xprt *xprt)
+static inline struct sockaddr *xs_addr(struct rpc_xprt *xprt)
+{
+	return (struct sockaddr *) &xprt->addr;
+}
+
+static inline struct sockaddr_in *xs_addr_in(struct rpc_xprt *xprt)
+{
+	return (struct sockaddr_in *) &xprt->addr;
+}
+
+static inline struct sockaddr_in6 *xs_addr_in6(struct rpc_xprt *xprt)
+{
+	return (struct sockaddr_in6 *) &xprt->addr;
+}
+
+static void xs_format_ipv4_peer_addresses(struct rpc_xprt *xprt)
 {
-	struct sockaddr_in *addr = (struct sockaddr_in *) &xprt->addr;
+	struct sockaddr_in *addr = xs_addr_in(xprt);
 	char *buf;
 
 	buf = kzalloc(20, GFP_KERNEL);
 	if (buf) {
-		snprintf(buf, 20, "%u.%u.%u.%u",
+		snprintf(buf, 20, NIPQUAD_FMT,
 				NIPQUAD(addr->sin_addr.s_addr));
 	}
 	xprt->address_strings[RPC_DISPLAY_ADDR] = buf;
@@ -279,26 +299,123 @@ static void xs_format_peer_addresses(str
 	}
 	xprt->address_strings[RPC_DISPLAY_PORT] = buf;
 
-	if (xprt->prot == IPPROTO_UDP)
-		xprt->address_strings[RPC_DISPLAY_PROTO] = "udp";
-	else
-		xprt->address_strings[RPC_DISPLAY_PROTO] = "tcp";
+	buf = kzalloc(8, GFP_KERNEL);
+	if (buf) {
+		if (xprt->prot == IPPROTO_UDP)
+			snprintf(buf, 8, "udp");
+		else
+			snprintf(buf, 8, "tcp");
+	}
+	xprt->address_strings[RPC_DISPLAY_PROTO] = buf;
 
 	buf = kzalloc(48, GFP_KERNEL);
 	if (buf) {
-		snprintf(buf, 48, "addr=%u.%u.%u.%u port=%u proto=%s",
+		snprintf(buf, 48, "addr="NIPQUAD_FMT" port=%u proto=%s",
 			NIPQUAD(addr->sin_addr.s_addr),
 			ntohs(addr->sin_port),
 			xprt->prot == IPPROTO_UDP ? "udp" : "tcp");
 	}
 	xprt->address_strings[RPC_DISPLAY_ALL] = buf;
+
+	buf = kzalloc(10, GFP_KERNEL);
+	if (buf) {
+		snprintf(buf, 10, "%02x%02x%02x%02x",
+				NIPQUAD(addr->sin_addr.s_addr));
+	}
+	xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = buf;
+
+	buf = kzalloc(8, GFP_KERNEL);
+	if (buf) {
+		snprintf(buf, 8, "%4hx",
+				ntohs(addr->sin_port));
+	}
+	xprt->address_strings[RPC_DISPLAY_HEX_PORT] = buf;
+
+	buf = kzalloc(30, GFP_KERNEL);
+	if (buf) {
+		snprintf(buf, 30, NIPQUAD_FMT".%u.%u",
+				NIPQUAD(addr->sin_addr.s_addr),
+				ntohs(addr->sin_port) >> 8,
+				ntohs(addr->sin_port) & 0xff);
+	}
+	xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR] = buf;
+
+	xprt->address_strings[RPC_DISPLAY_NETID] =
+		kstrdup(xprt->prot == IPPROTO_UDP ?
+			RPCBIND_NETID_UDP : RPCBIND_NETID_TCP, GFP_KERNEL);
+}
+
+static void xs_format_ipv6_peer_addresses(struct rpc_xprt *xprt)
+{
+	struct sockaddr_in6 *addr = xs_addr_in6(xprt);
+	char *buf;
+
+	buf = kzalloc(40, GFP_KERNEL);
+	if (buf) {
+		snprintf(buf, 40, NIP6_FMT,
+				NIP6(addr->sin6_addr));
+	}
+	xprt->address_strings[RPC_DISPLAY_ADDR] = buf;
+
+	buf = kzalloc(8, GFP_KERNEL);
+	if (buf) {
+		snprintf(buf, 8, "%u",
+				ntohs(addr->sin6_port));
+	}
+	xprt->address_strings[RPC_DISPLAY_PORT] = buf;
+
+	buf = kzalloc(8, GFP_KERNEL);
+	if (buf) {
+		if (xprt->prot == IPPROTO_UDP)
+			snprintf(buf, 8, "udp");
+		else
+			snprintf(buf, 8, "tcp");
+	}
+	xprt->address_strings[RPC_DISPLAY_PROTO] = buf;
+
+	buf = kzalloc(64, GFP_KERNEL);
+	if (buf) {
+		snprintf(buf, 64, "addr="NIP6_FMT" port=%u proto=%s",
+				NIP6(addr->sin6_addr),
+				ntohs(addr->sin6_port),
+				xprt->prot == IPPROTO_UDP ? "udp" : "tcp");
+	}
+	xprt->address_strings[RPC_DISPLAY_ALL] = buf;
+
+	buf = kzalloc(36, GFP_KERNEL);
+	if (buf) {
+		snprintf(buf, 36, NIP6_SEQFMT,
+				NIP6(addr->sin6_addr));
+	}
+	xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = buf;
+
+	buf = kzalloc(8, GFP_KERNEL);
+	if (buf) {
+		snprintf(buf, 8, "%4hx",
+				ntohs(addr->sin6_port));
+	}
+	xprt->address_strings[RPC_DISPLAY_HEX_PORT] = buf;
+
+	buf = kzalloc(50, GFP_KERNEL);
+	if (buf) {
+		snprintf(buf, 50, NIP6_FMT".%u.%u",
+				NIP6(addr->sin6_addr),
+				ntohs(addr->sin6_port) >> 8,
+				ntohs(addr->sin6_port) & 0xff);
+	}
+	xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR] = buf;
+
+	xprt->address_strings[RPC_DISPLAY_NETID] =
+		kstrdup(xprt->prot == IPPROTO_UDP ?
+			RPCBIND_NETID_UDP6 : RPCBIND_NETID_TCP6, GFP_KERNEL);
 }
 
 static void xs_free_peer_addresses(struct rpc_xprt *xprt)
 {
-	kfree(xprt->address_strings[RPC_DISPLAY_ADDR]);
-	kfree(xprt->address_strings[RPC_DISPLAY_PORT]);
-	kfree(xprt->address_strings[RPC_DISPLAY_ALL]);
+	int i;
+
+	for (i = 0; i < RPC_DISPLAY_MAX; i++)
+		kfree(xprt->address_strings[i]);
 }
 
 #define XS_SENDMSG_FLAGS	(MSG_DONTWAIT | MSG_NOSIGNAL)
@@ -463,19 +580,20 @@ static int xs_udp_send_request(struct rp
 
 	req->rq_xtime = jiffies;
 	status = xs_sendpages(transport->sock,
-			      (struct sockaddr *) &xprt->addr,
+			      xs_addr(xprt),
 			      xprt->addrlen, xdr,
 			      req->rq_bytes_sent);
 
 	dprintk("RPC:       xs_udp_send_request(%u) = %d\n",
 			xdr->len - req->rq_bytes_sent, status);
 
-	if (likely(status >= (int) req->rq_slen))
-		return 0;
-
-	/* Still some bytes left; set up for a retry later. */
-	if (status > 0)
+	if (status >= 0) {
+		task->tk_bytes_sent += status;
+		if (status >= req->rq_slen)
+			return 0;
+		/* Still some bytes left; set up for a retry later. */
 		status = -EAGAIN;
+	}
 
 	switch (status) {
 	case -ENETUNREACH:
@@ -523,7 +641,8 @@ static int xs_tcp_send_request(struct rp
 	struct rpc_xprt *xprt = req->rq_xprt;
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
 	struct xdr_buf *xdr = &req->rq_snd_buf;
-	int status, retry = 0;
+	int status;
+	unsigned int retry = 0;
 
 	xs_encode_tcp_record_marker(&req->rq_snd_buf);
 
@@ -661,6 +780,7 @@ static void xs_destroy(struct rpc_xprt *
 	xs_free_peer_addresses(xprt);
 	kfree(xprt->slot);
 	kfree(xprt);
+	module_put(THIS_MODULE);
 }
 
 static inline struct rpc_xprt *xprt_from_sock(struct sock *sk)
@@ -1139,14 +1259,23 @@ static unsigned short xs_get_random_port
  */
 static void xs_set_port(struct rpc_xprt *xprt, unsigned short port)
 {
-	struct sockaddr_in *sap = (struct sockaddr_in *) &xprt->addr;
+	struct sockaddr *addr = xs_addr(xprt);
 
 	dprintk("RPC:       setting port for xprt %p to %u\n", xprt, port);
 
-	sap->sin_port = htons(port);
+	switch (addr->sa_family) {
+	case AF_INET:
+		((struct sockaddr_in *)addr)->sin_port = htons(port);
+		break;
+	case AF_INET6:
+		((struct sockaddr_in6 *)addr)->sin6_port = htons(port);
+		break;
+	default:
+		BUG();
+	}
 }
 
-static int xs_bind(struct sock_xprt *transport, struct socket *sock)
+static int xs_bind4(struct sock_xprt *transport, struct socket *sock)
 {
 	struct sockaddr_in myaddr = {
 		.sin_family = AF_INET,
@@ -1174,8 +1303,42 @@ static int xs_bind(struct sock_xprt *tra
 		else
 			port--;
 	} while (err == -EADDRINUSE && port != transport->port);
-	dprintk("RPC:       xs_bind "NIPQUAD_FMT":%u: %s (%d)\n",
-		NIPQUAD(myaddr.sin_addr), port, err ? "failed" : "ok", err);
+	dprintk("RPC:       %s "NIPQUAD_FMT":%u: %s (%d)\n",
+			__FUNCTION__, NIPQUAD(myaddr.sin_addr),
+			port, err ? "failed" : "ok", err);
+	return err;
+}
+
+static int xs_bind6(struct sock_xprt *transport, struct socket *sock)
+{
+	struct sockaddr_in6 myaddr = {
+		.sin6_family = AF_INET6,
+	};
+	struct sockaddr_in6 *sa;
+	int err;
+	unsigned short port = transport->port;
+
+	if (!transport->xprt.resvport)
+		port = 0;
+	sa = (struct sockaddr_in6 *)&transport->addr;
+	myaddr.sin6_addr = sa->sin6_addr;
+	do {
+		myaddr.sin6_port = htons(port);
+		err = kernel_bind(sock, (struct sockaddr *) &myaddr,
+						sizeof(myaddr));
+		if (!transport->xprt.resvport)
+			break;
+		if (err == 0) {
+			transport->port = port;
+			break;
+		}
+		if (port <= xprt_min_resvport)
+			port = xprt_max_resvport;
+		else
+			port--;
+	} while (err == -EADDRINUSE && port != transport->port);
+	dprintk("RPC:       xs_bind6 "NIP6_FMT":%u: %s (%d)\n",
+		NIP6(myaddr.sin6_addr), port, err ? "failed" : "ok", err);
 	return err;
 }
 
@@ -1232,9 +1395,9 @@ static void xs_udp_connect_worker(struct
 		dprintk("RPC:       can't create UDP transport socket (%d).\n", -err);
 		goto out;
 	}
-	xs_reclassify_socket(sock);
+	xs_reclassify_socket4(sock);
 
-	if (xs_bind(transport, sock)) {
+	if (xs_bind4(transport, sock)) {
 		sock_release(sock);
 		goto out;
 	}
@@ -1242,29 +1405,48 @@ static void xs_udp_connect_worker(struct
 	dprintk("RPC:       worker connecting xprt %p to address: %s\n",
 			xprt, xprt->address_strings[RPC_DISPLAY_ALL]);
 
-	if (!transport->inet) {
-		struct sock *sk = sock->sk;
+	xs_udp_finish_connecting(xprt, sock);
+	status = 0;
+out:
+	xprt_wake_pending_tasks(xprt, status);
+	xprt_clear_connecting(xprt);
+}
 
-		write_lock_bh(&sk->sk_callback_lock);
+/**
+ * xs_udp_connect_worker6 - set up a UDP socket
+ * @work: RPC transport to connect
+ *
+ * Invoked by a work queue tasklet.
+ */
+static void xs_udp_connect_worker6(struct work_struct *work)
+{
+	struct sock_xprt *transport =
+		container_of(work, struct sock_xprt, connect_worker.work);
+	struct rpc_xprt *xprt = &transport->xprt;
+	struct socket *sock = transport->sock;
+	int err, status = -EIO;
 
-		sk->sk_user_data = xprt;
-		transport->old_data_ready = sk->sk_data_ready;
-		transport->old_state_change = sk->sk_state_change;
-		transport->old_write_space = sk->sk_write_space;
-		sk->sk_data_ready = xs_udp_data_ready;
-		sk->sk_write_space = xs_udp_write_space;
-		sk->sk_no_check = UDP_CSUM_NORCV;
-		sk->sk_allocation = GFP_ATOMIC;
+	if (xprt->shutdown || !xprt_bound(xprt))
+		goto out;
 
-		xprt_set_connected(xprt);
+	/* Start by resetting any existing state */
+	xs_close(xprt);
 
-		/* Reset to new socket */
-		transport->sock = sock;
-		transport->inet = sk;
+	if ((err = sock_create_kern(PF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) {
+		dprintk("RPC:       can't create UDP transport socket (%d).\n", -err);
+		goto out;
+	}
+	xs_reclassify_socket6(sock);
 
-		write_unlock_bh(&sk->sk_callback_lock);
+	if (xs_bind6(transport, sock) < 0) {
+		sock_release(sock);
+		goto out;
 	}
-	xs_udp_do_set_buffer_size(xprt);
+
+	dprintk("RPC:       worker connecting xprt %p to address: %s\n",
+			xprt, xprt->address_strings[RPC_DISPLAY_ALL]);
+
+	xs_udp_finish_connecting(xprt, sock);
 	status = 0;
 out:
 	xprt_wake_pending_tasks(xprt, status);
@@ -1295,13 +1477,52 @@ static void xs_tcp_reuse_connection(stru
 				result);
 }
 
+static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
+{
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+
+	if (!transport->inet) {
+		struct sock *sk = sock->sk;
+
+		write_lock_bh(&sk->sk_callback_lock);
+
+		sk->sk_user_data = xprt;
+		transport->old_data_ready = sk->sk_data_ready;
+		transport->old_state_change = sk->sk_state_change;
+		transport->old_write_space = sk->sk_write_space;
+		sk->sk_data_ready = xs_tcp_data_ready;
+		sk->sk_state_change = xs_tcp_state_change;
+		sk->sk_write_space = xs_tcp_write_space;
+		sk->sk_allocation = GFP_ATOMIC;
+
+		/* socket options */
+		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
+		sock_reset_flag(sk, SOCK_LINGER);
+		tcp_sk(sk)->linger2 = 0;
+		tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
+
+		xprt_clear_connected(xprt);
+
+		/* Reset to new socket */
+		transport->sock = sock;
+		transport->inet = sk;
+
+		write_unlock_bh(&sk->sk_callback_lock);
+	}
+
+	/* Tell the socket layer to start connecting... */
+	xprt->stat.connect_count++;
+	xprt->stat.connect_start = jiffies;
+	return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK);
+}
+
 /**
- * xs_tcp_connect_worker - connect a TCP socket to a remote endpoint
+ * xs_tcp_connect_worker4 - connect a TCP socket to a remote endpoint
  * @work: RPC transport to connect
  *
  * Invoked by a work queue tasklet.
  */
-static void xs_tcp_connect_worker(struct work_struct *work)
+static void xs_tcp_connect_worker4(struct work_struct *work)
 {
 	struct sock_xprt *transport =
 		container_of(work, struct sock_xprt, connect_worker.work);
@@ -1315,13 +1536,12 @@ static void xs_tcp_connect_worker(struct
 	if (!sock) {
 		/* start from scratch */
 		if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
-			dprintk("RPC:       can't create TCP transport "
-					"socket (%d).\n", -err);
+			dprintk("RPC:       can't create TCP transport socket (%d).\n", -err);
 			goto out;
 		}
-		xs_reclassify_socket(sock);
+		xs_reclassify_socket4(sock);
 
-		if (xs_bind(transport, sock)) {
+		if (xs_bind4(transport, sock) < 0) {
 			sock_release(sock);
 			goto out;
 		}
@@ -1332,43 +1552,70 @@ static void xs_tcp_connect_worker(struct
 	dprintk("RPC:       worker connecting xprt %p to address: %s\n",
 			xprt, xprt->address_strings[RPC_DISPLAY_ALL]);
 
-	if (!transport->inet) {
-		struct sock *sk = sock->sk;
-
-		write_lock_bh(&sk->sk_callback_lock);
+	status = xs_tcp_finish_connecting(xprt, sock);
+	dprintk("RPC:       %p connect status %d connected %d sock state %d\n",
+			xprt, -status, xprt_connected(xprt),
+			sock->sk->sk_state);
+	if (status < 0) {
+		switch (status) {
+			case -EINPROGRESS:
+			case -EALREADY:
+				goto out_clear;
+			case -ECONNREFUSED:
+			case -ECONNRESET:
+				/* retry with existing socket, after a delay */
+				break;
+			default:
+				/* get rid of existing socket, and retry */
+				xs_close(xprt);
+				break;
+		}
+	}
+out:
+	xprt_wake_pending_tasks(xprt, status);
+out_clear:
+	xprt_clear_connecting(xprt);
+}
 
-		sk->sk_user_data = xprt;
-		transport->old_data_ready = sk->sk_data_ready;
-		transport->old_state_change = sk->sk_state_change;
-		transport->old_write_space = sk->sk_write_space;
-		sk->sk_data_ready = xs_tcp_data_ready;
-		sk->sk_state_change = xs_tcp_state_change;
-		sk->sk_write_space = xs_tcp_write_space;
-		sk->sk_allocation = GFP_ATOMIC;
+/**
+ * xs_tcp_connect_worker6 - connect a TCP socket to a remote endpoint
+ * @work: RPC transport to connect
+ *
+ * Invoked by a work queue tasklet.
+ */
+static void xs_tcp_connect_worker6(struct work_struct *work)
+{
+	struct sock_xprt *transport =
+		container_of(work, struct sock_xprt, connect_worker.work);
+	struct rpc_xprt *xprt = &transport->xprt;
+	struct socket *sock = transport->sock;
+	int err, status = -EIO;
 
-		/* socket options */
-		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
-		sock_reset_flag(sk, SOCK_LINGER);
-		tcp_sk(sk)->linger2 = 0;
-		tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
+	if (xprt->shutdown || !xprt_bound(xprt))
+		goto out;
 
-		xprt_clear_connected(xprt);
+	if (!sock) {
+		/* start from scratch */
+		if ((err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
+			dprintk("RPC:       can't create TCP transport socket (%d).\n", -err);
+			goto out;
+		}
+		xs_reclassify_socket6(sock);
 
-		/* Reset to new socket */
-		transport->sock = sock;
-		transport->inet = sk;
+		if (xs_bind6(transport, sock) < 0) {
+			sock_release(sock);
+			goto out;
+		}
+	} else
+		/* "close" the socket, preserving the local port */
+		xs_tcp_reuse_connection(xprt);
 
-		write_unlock_bh(&sk->sk_callback_lock);
-	}
+	dprintk("RPC:       worker connecting xprt %p to address: %s\n",
+			xprt, xprt->address_strings[RPC_DISPLAY_ALL]);
 
-	/* Tell the socket layer to start connecting... */
-	xprt->stat.connect_count++;
-	xprt->stat.connect_start = jiffies;
-	status = kernel_connect(sock, (struct sockaddr *) &xprt->addr,
-			xprt->addrlen, O_NONBLOCK);
+	status = xs_tcp_finish_connecting(xprt, sock);
 	dprintk("RPC:       %p connect status %d connected %d sock state %d\n",
-			xprt, -status, xprt_connected(xprt),
-			sock->sk->sk_state);
+			xprt, -status, xprt_connected(xprt), sock->sk->sk_state);
 	if (status < 0) {
 		switch (status) {
 			case -EINPROGRESS:
@@ -1508,7 +1755,8 @@ static struct rpc_xprt_ops xs_tcp_ops = 
 	.print_stats		= xs_tcp_print_stats,
 };
 
-static struct rpc_xprt *xs_setup_xprt(struct rpc_xprtsock_create *args, unsigned int slot_table_size)
+static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args,
+				      unsigned int slot_table_size)
 {
 	struct rpc_xprt *xprt;
 	struct sock_xprt *new;
@@ -1549,8 +1797,9 @@ static struct rpc_xprt *xs_setup_xprt(st
  * @args: rpc transport creation arguments
  *
  */
-struct rpc_xprt *xs_setup_udp(struct rpc_xprtsock_create *args)
+struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
 {
+	struct sockaddr *addr = args->dstaddr;
 	struct rpc_xprt *xprt;
 	struct sock_xprt *transport;
 
@@ -1559,15 +1808,11 @@ struct rpc_xprt *xs_setup_udp(struct rpc
 		return xprt;
 	transport = container_of(xprt, struct sock_xprt, xprt);
 
-	if (ntohs(((struct sockaddr_in *)args->dstaddr)->sin_port) != 0)
-		xprt_set_bound(xprt);
-
 	xprt->prot = IPPROTO_UDP;
 	xprt->tsh_size = 0;
 	/* XXX: header size can vary due to auth type, IPv6, etc. */
 	xprt->max_payload = (1U << 16) - (MAX_HEADER << 3);
 
-	INIT_DELAYED_WORK(&transport->connect_worker, xs_udp_connect_worker);
 	xprt->bind_timeout = XS_BIND_TO;
 	xprt->connect_timeout = XS_UDP_CONN_TO;
 	xprt->reestablish_timeout = XS_UDP_REEST_TO;
@@ -1580,11 +1825,37 @@ struct rpc_xprt *xs_setup_udp(struct rpc
 	else
 		xprt_set_timeout(&xprt->timeout, 5, 5 * HZ);
 
-	xs_format_peer_addresses(xprt);
+	switch (addr->sa_family) {
+	case AF_INET:
+		if (((struct sockaddr_in *)addr)->sin_port != htons(0))
+			xprt_set_bound(xprt);
+
+		INIT_DELAYED_WORK(&transport->connect_worker,
+					xs_udp_connect_worker4);
+		xs_format_ipv4_peer_addresses(xprt);
+		break;
+	case AF_INET6:
+		if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
+			xprt_set_bound(xprt);
+
+		INIT_DELAYED_WORK(&transport->connect_worker,
+					xs_udp_connect_worker6);
+		xs_format_ipv6_peer_addresses(xprt);
+		break;
+	default:
+		kfree(xprt);
+		return ERR_PTR(-EAFNOSUPPORT);
+	}
+
 	dprintk("RPC:       set up transport to address %s\n",
 			xprt->address_strings[RPC_DISPLAY_ALL]);
 
-	return xprt;
+	if (try_module_get(THIS_MODULE))
+		return xprt;
+
+	kfree(xprt->slot);
+	kfree(xprt);
+	return ERR_PTR(-EINVAL);
 }
 
 /**
@@ -1592,8 +1863,9 @@ struct rpc_xprt *xs_setup_udp(struct rpc
  * @args: rpc transport creation arguments
  *
  */
-struct rpc_xprt *xs_setup_tcp(struct rpc_xprtsock_create *args)
+struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
 {
+	struct sockaddr *addr = args->dstaddr;
 	struct rpc_xprt *xprt;
 	struct sock_xprt *transport;
 
@@ -1602,14 +1874,10 @@ struct rpc_xprt *xs_setup_tcp(struct rpc
 		return xprt;
 	transport = container_of(xprt, struct sock_xprt, xprt);
 
-	if (ntohs(((struct sockaddr_in *)args->dstaddr)->sin_port) != 0)
-		xprt_set_bound(xprt);
-
 	xprt->prot = IPPROTO_TCP;
 	xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
 	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
 
-	INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker);
 	xprt->bind_timeout = XS_BIND_TO;
 	xprt->connect_timeout = XS_TCP_CONN_TO;
 	xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
@@ -1622,15 +1890,55 @@ struct rpc_xprt *xs_setup_tcp(struct rpc
 	else
 		xprt_set_timeout(&xprt->timeout, 2, 60 * HZ);
 
-	xs_format_peer_addresses(xprt);
+	switch (addr->sa_family) {
+	case AF_INET:
+		if (((struct sockaddr_in *)addr)->sin_port != htons(0))
+			xprt_set_bound(xprt);
+
+		INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker4);
+		xs_format_ipv4_peer_addresses(xprt);
+		break;
+	case AF_INET6:
+		if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
+			xprt_set_bound(xprt);
+
+		INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker6);
+		xs_format_ipv6_peer_addresses(xprt);
+		break;
+	default:
+		kfree(xprt);
+		return ERR_PTR(-EAFNOSUPPORT);
+	}
+
 	dprintk("RPC:       set up transport to address %s\n",
 			xprt->address_strings[RPC_DISPLAY_ALL]);
 
-	return xprt;
+	if (try_module_get(THIS_MODULE))
+		return xprt;
+
+	kfree(xprt->slot);
+	kfree(xprt);
+	return ERR_PTR(-EINVAL);
 }
 
+static struct xprt_class	xs_udp_transport = {
+	.list		= LIST_HEAD_INIT(xs_udp_transport.list),
+	.name		= "udp",
+	.owner		= THIS_MODULE,
+	.ident		= IPPROTO_UDP,
+	.setup		= xs_setup_udp,
+};
+
+static struct xprt_class	xs_tcp_transport = {
+	.list		= LIST_HEAD_INIT(xs_tcp_transport.list),
+	.name		= "tcp",
+	.owner		= THIS_MODULE,
+	.ident		= IPPROTO_TCP,
+	.setup		= xs_setup_tcp,
+};
+
 /**
- * init_socket_xprt - set up xprtsock's sysctls
+ * init_socket_xprt - set up xprtsock's sysctls, register with RPC client
  *
  */
 int init_socket_xprt(void)
@@ -1640,11 +1948,14 @@ int init_socket_xprt(void)
 		sunrpc_table_header = register_sysctl_table(sunrpc_table);
 #endif
 
+	xprt_register_transport(&xs_udp_transport);
+	xprt_register_transport(&xs_tcp_transport);
+
 	return 0;
 }
 
 /**
- * cleanup_socket_xprt - remove xprtsock's sysctls
+ * cleanup_socket_xprt - remove xprtsock's sysctls, unregister
  *
  */
 void cleanup_socket_xprt(void)
@@ -1655,4 +1966,7 @@ void cleanup_socket_xprt(void)
 		sunrpc_table_header = NULL;
 	}
 #endif
+
+	xprt_unregister_transport(&xs_udp_transport);
+	xprt_unregister_transport(&xs_tcp_transport);
 }
_