GIT bc387b1ceea4ba36bf354926013c709bf1dd6d12 git://git.linux-nfs.org/~bfields/linux.git#for-mm commit 2ad4189a26e9cc72408c1f3c572ce675c091993d Author: J. Bruce Fields Date: Tue Oct 2 14:18:12 2007 -0400 nfsd: remove IS_ISMNDLCK macro This macro is only used in one place; in this place it seems simpler to put open-code it and move the comment to where it's used. Signed-off-by: J. Bruce Fields commit 6602cca355722274209c5a7ff810be72b658f58a Author: Pavel Emelyanov Date: Mon Oct 1 14:41:15 2007 -0700 Rework /proc/locks via seq_files and seq_list helpers Currently /proc/locks is shown with a proc_read function, but its behavior is rather complex as it has to manually handle current offset and buffer length. On the other hand, files that show objects from lists can be easily reimplemented using the sequential files and the seq_list_XXX() helpers. This saves (as usually) 16 lines of code and more than 200 from the .text section. [akpm@linux-foundation.org: no externs in C] [akpm@linux-foundation.org: warning fixes] Signed-off-by: Pavel Emelyanov Cc: "J. Bruce Fields" Cc: Trond Myklebust Signed-off-by: Andrew Morton commit 5c8912b1dd2664048d4f69272d3d15bd17f1a138 Author: Matthias Kaehlcke Date: Tue Oct 2 11:21:34 2007 -0700 fs/locks.c: use list_for_each_entry() instead of list_for_each() fs/locks.c: use list_for_each_entry() instead of list_for_each() in posix_locks_deadlock() and get_locks_status() Signed-off-by: Matthias Kaehlcke Signed-off-by: Andrew Morton commit 686734b7089212057ac99ccfd1d876a029da212d Author: Pavel Emelyanov Date: Mon Oct 1 14:41:15 2007 -0700 NFS: clean up explicit check for mandatory locks The __mandatory_lock(inode) macro makes the same check, but makes the code more readable. Signed-off-by: Pavel Emelyanov Cc: Trond Myklebust Cc: "J. Bruce Fields" Signed-off-by: Andrew Morton commit 5d41de6f3bc297205a8c8cebd7d49335f873b987 Author: Pavel Emelyanov Date: Mon Oct 1 14:41:14 2007 -0700 AFS: clean up explicit check for mandatory locks The __mandatory_lock(inode) macro makes the same check, but makes the code more readable. Signed-off-by: Pavel Emelyanov Cc: David Howells Signed-off-by: Andrew Morton commit a68e3c30614a233fc07dab2bda97e8e89ba6d0a2 Author: Pavel Emelyanov Date: Mon Oct 1 14:41:13 2007 -0700 9PFS: clean up explicit check for mandatory locks The __mandatory_lock(inode) macro makes the same check, but makes the code more readable. Signed-off-by: Pavel Emelyanov Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Signed-off-by: Andrew Morton commit c1c45b13a2c12920a0440b769fa73789a652accf Author: Pavel Emelyanov Date: Mon Oct 1 14:41:13 2007 -0700 GFS2: clean up explicit check for mandatory locks The __mandatory_lock(inode) function makes the same check, but makes the code more readable. Signed-off-by: Pavel Emelyanov Cc: Steven Whitehouse Signed-off-by: Andrew Morton commit ce23fdc14a10684d2addd6a66a37e81dcf9de714 Author: Pavel Emelyanov Date: Mon Oct 1 14:41:11 2007 -0700 Cleanup macros for distinguishing mandatory locks The combination of S_ISGID bit set and S_IXGRP bit unset is used to mark the inode as "mandatory lockable" and there's a macro for this check called MANDATORY_LOCK(inode). However, fs/locks.c and some filesystems still perform the explicit i_mode checking. Besides, Andrew pointed out, that this macro is buggy itself, as it dereferences the inode arg twice. Convert this macro into static inline function and switch its users to it, making the code shorter and more readable. 
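For reference, the conversion amounts to replacing the macro with two static inline helpers along these lines (a sketch of the intended shape; the authoritative definitions are in the include/linux/fs.h hunk of this patch):

	/* sketch: the inode is "mandatory lockable" when S_ISGID is set and
	 * S_IXGRP is clear; unlike the old macro, the argument is evaluated
	 * only once */
	static inline int __mandatory_lock(struct inode *ino)
	{
		return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID;
	}

	/* sketch: the full check, including the per-mount IS_MANDLOCK() test */
	static inline int mandatory_lock(struct inode *ino)
	{
		return IS_MANDLOCK(ino) && __mandatory_lock(ino);
	}
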
The __mandatory_lock() helper is to be used in places where the IS_MANDLOCK() for superblock is already known to be true. Signed-off-by: Pavel Emelyanov Cc: Trond Myklebust Cc: "J. Bruce Fields" Cc: David Howells Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Cc: Steven Whitehouse Signed-off-by: Andrew Morton commit 3aaddf3550e4a67b022aff6550ea36e20b957f68 Author: Tom Tucker Date: Mon Oct 1 14:28:48 2007 -0500 knfsd: Support adding transports by writing portlist file Update the write handler for the portlist file to allow creating new listening endpoints on a transport. The general form of the string is: For example: tcp 2049 This is intended to support the creation of a listening endpoint for RDMA transports without adding #ifdef code to the nfssvc.c file. Signed-off-by: Tom Tucker commit 555996a64bbdf999db4d54e5c5250596011c96f5 Author: Tom Tucker Date: Mon Oct 1 14:28:45 2007 -0500 svc: Add /proc/sys/sunrpc/transport files Add a file that when read lists the set of registered svc transports. Signed-off-by: Tom Tucker commit d11ce39df9079fca3182939d03808dbc143af14b Author: Tom Tucker Date: Mon Oct 1 14:28:43 2007 -0500 svc: Add transport hdr size for defer/revisit Some transports have a header in front of the RPC header. The current defer/revisit processing considers only the iov_len and arg_len to determine how much to back up when saving the original request to revisit. Add a field to the rqstp structure to save the size of the transport header so svc_defer can correctly compute the start of a request. Signed-off-by: Tom Tucker commit 42d177f40901b9606463455d97824af3d8bd456a Author: Tom Tucker Date: Mon Oct 1 14:28:41 2007 -0500 svc: Move the xprt independent code to the svc_xprt.c file This functionally trivial patch moves all of the transport independent functions from the svcsock.c file to the transport independent svc_xprt.c file. Signed-off-by: Tom Tucker commit 7e3665dc18a8ef31f72c9231f53999e6eee618c0 Author: Tom Tucker Date: Mon Oct 1 14:28:39 2007 -0500 svc: Make svc_check_conn_limits xprt independent The svc_check_conn_limits function only manipulates xprt fields. Change references to svc_sock->sk_xprt to svc_xprt directly. Signed-off-by: Tom Tucker commit 44f096b047453728e4bbdd44cdf2281d1cdbfdd6 Author: Tom Tucker Date: Mon Oct 1 14:28:36 2007 -0500 svc: Removing remaining references to rq_sock in rqstp This functionally empty patch removes rq_sock and unamed union from rqstp structure. Signed-off-by: Tom Tucker commit dbd5bfe03471f2ae32430c4f483f0013711de4e9 Author: Tom Tucker Date: Mon Oct 1 14:28:34 2007 -0500 svc: Move common create logic to common code Move the code that adds a transport instance to the sv_tempsocks and sv_permsocks lists out of the transport specific functions and into core logic. The svc_addsock routine still manipulates sv_permsocks directly. This code may be removed when rpc.nfsd is modified to create transports by writing to the portlist file. Signed-off-by: Tom Tucker commit 2261d3f2f175b926c90f41527ddd78e483f18cf8 Author: Tom Tucker Date: Mon Oct 1 14:28:32 2007 -0500 svc: Make svc_age_temp_sockets svc_age_temp_transports This function is transport independent. Change it to use svc_xprt directly and change it's name to reflect this. Signed-off-by: Tom Tucker commit bf20a61bd5d4ae3d210e576e4113b410bf40e9f9 Author: Tom Tucker Date: Mon Oct 1 14:28:30 2007 -0500 svc: Make svc_recv transport neutral All of the transport field and functions used by svc_recv are now transport independent. 
Change the svc_recv function to use the svc_xprt structure directly instead of the transport specific svc_sock structure. Signed-off-by: Tom Tucker commit c5e9631b9fafa150c8e6154c1929e588e2f6b326 Author: Tom Tucker Date: Mon Oct 1 14:28:28 2007 -0500 svc: Make svc_sock_release svc_xprt_release The svc_sock_release function only touches transport independent fields. Change the function to manipulate svc_xprt directly instead of the transport dependent svc_sock structure. Signed-off-by: Tom Tucker commit fa63ddc2e9890da93ebe158f212a4182a94c9d71 Author: Tom Tucker Date: Mon Oct 1 14:28:25 2007 -0500 svc: Move the sockaddr information to svc_xprt Move the IP address fields to the svc_xprt structure. Note that this assumes that _all_ RPC transports must have IP based 4-tuples. This seems reasonable given the tight coupling with the portmapper etc... Thoughts? Signed-off-by: Tom Tucker commit 2def756732a6db1a2228bad967dfa95867621bf4 Author: Tom Tucker Date: Mon Oct 1 14:28:23 2007 -0500 svc: Make deferral processing xprt independent This functionally trivial patch moves the transport independent sk_deferred list to the svc_xprt structure and updates the svc_deferred_req structure to keep pointers to svc_xprt's directly. Signed-off-by: Tom Tucker commit db10883a4f6b292bc65dcc018c6b15c28293aa88 Author: Tom Tucker Date: Mon Oct 1 14:28:21 2007 -0500 svc: Move the authinfo cache to svc_xprt. Move the authinfo cache to svc_xprt. This allows both the TCP and RDMA transports to share this logic. A flag bit is used to determine if auth information is to be cached or not. Previously, this code looked at the transport protocol. I've also changed the spin_lock/unlock logic so that a lock is not taken for transports that are not caching auth info. Signed-off-by: Tom Tucker commit d522bbb2b5a93afcd2a8a2d757439031561e10c8 Author: Tom Tucker Date: Mon Oct 1 14:28:19 2007 -0500 svc: Remove sk_lastrecv With the implementation of the new mark and sweep algorithm for shutting down old connections, the sk_lastrecv field is no longer needed. Signed-off-by: Tom Tucker commit 4047f2427de886d86a1dcd74cbbbaf81557cee1e Author: Tom Tucker Date: Mon Oct 1 14:28:17 2007 -0500 svc: Change svc_sock_received to svc_xprt_received and export it All fields touched by svc_sock_received are now transport independent. Change it to use svc_xprt directly. This function is called from transport dependent code, so export it. Signed-off-by: Tom Tucker commit fa69c92bd83b2d70acd0875386f5e154d836225a Author: Tom Tucker Date: Mon Oct 1 14:28:14 2007 -0500 svc: Make svc_send transport neutral Move the sk_mutex field to the transport independent svc_xprt structure. Now all the fields that svc_send touches are transport neutral. Change the svc_send function to use the transport independent svc_xprt directly instead of the transport dependent svc_sock structure. Signed-off-by: Tom Tucker commit 381607c8f63f6519a4dd2d918f68814d6f4d95de Author: Tom Tucker Date: Mon Oct 1 14:28:12 2007 -0500 svc: Make the enqueue service transport neutral and export it. The svc_sock_enqueue function is now transport independent since all of the fields it touches have been moved to the transport independent svc_xprt structure. Change the function to use the svc_xprt structure directly instead of the transport specific svc_sock structure. Transport specific data-ready handlers need to call this function, so export it. 
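As an illustration, a socket data-ready callback ends up looking roughly like this once the enqueue routine is exported (a sketch only; the flag and field names follow the rest of this series and are not taken verbatim from the patch):

	static void svc_udp_data_ready(struct sock *sk, int count)
	{
		struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;

		if (svsk) {
			/* note that data arrived and hand the transport to a thread */
			set_bit(SK_DATA, &svsk->sk_xprt.xpt_flags);
			svc_xprt_enqueue(&svsk->sk_xprt);
		}
		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
			wake_up_interruptible(sk->sk_sleep);
	}
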
Signed-off-by: Tom Tucker commit 7632637565fbaacbabf0e3b83a6994296302d784 Author: Tom Tucker Date: Mon Oct 1 14:28:10 2007 -0500 svc: Move sk_reserved to svc_xprt This functionally trivial patch moves the sk_reserved field to the transport independent svc_xprt structure. Signed-off-by: Tom Tucker commit 8da329ec798cb724ad42262fa9a7fe398b51928f Author: Tom Tucker Date: Mon Oct 1 14:28:08 2007 -0500 svc: Make close transport independent Move sk_list and sk_ready to svc_xprt. This involves close because these lists are walked by svcs when closing all their transports. So I combined the moving of these lists to svc_xprt with making close transport independent. The svc_force_sock_close has been changed to svc_close_all and takes a list as an argument. This removes some svc internals knowledge from the svcs. This code races with module removal and transport addition. Signed-off-by: Tom Tucker commit fe263c5d21ccae28cc6457dbca813db34d0d1a01 Author: Tom Tucker Date: Mon Oct 1 14:28:05 2007 -0500 svc: Move sk_server and sk_pool to svc_xprt This is another incremental change that moves transport independent fields from svc_sock to the svc_xprt structure. The changes should be functionally null. Signed-off-by: Tom Tucker commit 8cea193afd7b8755af9b9ecee6b108cb94722e65 Author: Tom Tucker Date: Mon Oct 1 14:28:03 2007 -0500 svc: Move sk_flags to the svc_xprt structure This functionally trivial change moves the transport independent sk_flags field to the transport independent svc_xprt structure. Signed-off-by: Tom Tucker commit e992be5aef124a79d05b9bab3c545e12c0e1e037 Author: Tom Tucker Date: Mon Oct 1 14:28:01 2007 -0500 svc: Change sk_inuse to a kref Change the atomic_t reference count to a kref and move it to the transport independent svc_xprt structure. Change the reference count wrapper names to be generic. Signed-off-by: Tom Tucker commit 489bd5f81802d4baa85ae07a6c850165e209916a Author: Tom Tucker Date: Mon Oct 1 14:27:59 2007 -0500 svc: Change services to use new svc_create_xprt service Modify the various kernel RPC svcs to use the svc_create_xprt service. Signed-off-by: Tom Tucker commit 7a6849539bdc4f6b3452db0b85b607ed1a09503b Author: Tom Tucker Date: Mon Oct 1 14:27:56 2007 -0500 svc: Add a generic transport svc_create_xprt function The svc_create_xprt function is a transport independent version of the svc_makesock function. Since transport instance creation contains transport dependent and independent components, add an xpo_create transport function. The transport implementation of this function allocates the memory for the endpoint, implements the transport dependent initialization logic, and calls svc_xprt_init to initialize the transport independent field (svc_xprt) in its data structure. Signed-off-by: Tom Tucker commit 79a182a4124933dd502658372e3d38b6fc461ebf Author: Tom Tucker Date: Mon Oct 1 14:27:54 2007 -0500 svc: Add xpo_accept transport function Previously, the accept logic looked into the socket state to determine whether to call accept or recv when data-ready was indicated on an endpoint. Since some transports don't use sockets, this logic was changed to use a flag bit (SK_LISTENER) to identify listening endpoints. A transport function (xpo_accept) was added to allow each transport to define its own accept processing. A transport's initialization logic is responsible for setting the SK_LISTENER bit. I didn't see any way to do this in transport independent logic since the passive side of a UDP connection doesn't listen and always recv's.
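Sketched very roughly, the resulting dispatch looks something like the helper below (only xpo_accept, SK_LISTENER and svc_check_conn_limits are names fixed by this series; the other members and functions are illustrative, and error handling is omitted):

	/* sketch: per-endpoint dispatch inside svc_recv */
	static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
	{
		int len = 0;

		if (test_bit(SK_CLOSE, &xprt->xpt_flags)) {
			svc_delete_xprt(xprt);		/* endpoint being torn down */
		} else if (test_bit(SK_LISTENER, &xprt->xpt_flags)) {
			/* listening endpoint: let the transport accept the connection */
			struct svc_xprt *newxpt = xprt->xpt_ops->xpo_accept(xprt);
			if (newxpt)
				svc_check_conn_limits(rqstp->rq_server);
		} else {
			/* established (or UDP) endpoint: pull in a request */
			len = xprt->xpt_ops->xpo_recvfrom(rqstp);
		}
		return len;
	}
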
In the svc_recv function, if the SK_LISTENER bit is set, the transport xpo_accept function is called to handle accept processing. Note that all functions are defined even if they don't make sense for a given transport. For example, accept doesn't mean anything for UDP. The function is defined anyway and bug checks if called. The UDP transport should never set the SK_LISTENER bit. The code that poaches connections when the connection limit is hit was moved to a subroutine to make the accept logic path easier to follow. Since this is in the new connection path, it should not be a performance issue. Signed-off-by: Tom Tucker commit 1291d9c149717f3ddc86df6a609ff34fd9e43ce2 Author: Tom Tucker Date: Mon Oct 1 14:27:51 2007 -0500 svc: Move close processing to a single place Close handling was duplicated in the UDP and TCP recvfrom methods. This code has been moved to the transport independent svc_recv function. Signed-off-by: Tom Tucker commit 097ad803f8054c752db831f5ba1a9c8858606e65 Author: Tom Tucker Date: Mon Oct 1 14:27:49 2007 -0500 svc: Add a transport function that checks for write space In order to avoid blocking a service thread, the receive side checks to see if there is sufficient write space to reply to the request. Each transport has a different mechanism for determining if there is enough write space to reply. The code that checked for write space was coupled with code that checked for CLOSE and CONN. These checks have been broken out into separate statements to make the code easier to read. Signed-off-by: Tom Tucker commit 797c4d38ebd67e1a79c31c6390e0b7e8b0efcaa7 Author: Tom Tucker Date: Mon Oct 1 14:27:47 2007 -0500 svc: Add xpo_prep_reply_hdr Some transports add fields to the RPC header for replies, e.g. the TCP record length. This function is called when preparing the reply header to allow each transport to add whatever fields it requires. Signed-off-by: Tom Tucker commit fd4635aed0fb08c7d6c75fdde729e3bc676908aa Author: Tom Tucker Date: Mon Oct 1 14:27:45 2007 -0500 svc: Add per-transport delete functions Add transport specific xpo_detach and xpo_free functions. The xpo_detach function causes the transport to stop delivering data-ready events and enqueuing the transport for I/O. The xpo_free function frees all resources associated with the particular transport instance. Signed-off-by: Tom Tucker commit 82fd56366d3dc2f5543f385a40625cf7db1e769b Author: Tom Tucker Date: Mon Oct 1 14:27:42 2007 -0500 svc: Add transport specific xpo_release function The svc_sock_release function releases pages allocated to a thread. For UDP, this also returns the receive skb to the stack. For RDMA it will post a receive WR and bump the client credit count. Signed-off-by: Tom Tucker commit c854768f825eaab3c9fccacec7c513d5f019b3fc Author: Tom Tucker Date: Mon Oct 1 14:27:40 2007 -0500 svc: Move sk_sendto and sk_recvfrom to svc_xprt_class The sk_sendto and sk_recvfrom are function pointers that allow svc_sock to be used for both UDP and TCP. Move these function pointers to the svc_xprt_ops structure. Signed-off-by: Tom Tucker commit 85eba208284186cab9d3aa79df89bc256b3d7382 Author: Tom Tucker Date: Mon Oct 1 14:27:38 2007 -0500 svc: Add a max payload value to the transport The svc_max_payload function currently looks at the socket type to determine the max payload. Add a max payload value to svc_xprt_class so it can be returned directly.
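With that field in place, svc_max_payload can return the class value directly, roughly as below (a sketch; xcl_max_payload and xpt_class are the names assumed here for the new per-class field and the endpoint's back-pointer to its class):

	u32 svc_max_payload(const struct svc_rqst *rqstp)
	{
		u32 max = rqstp->rq_xprt->xpt_class->xcl_max_payload;

		/* the service itself may impose a smaller limit */
		if (rqstp->rq_server->sv_max_payload < max)
			max = rqstp->rq_server->sv_max_payload;
		return max;
	}
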
Signed-off-by: Tom Tucker commit a6b20ecc631dbb81748a4d41119b7de430037116 Author: Tom Tucker Date: Mon Oct 1 14:27:35 2007 -0500 svc: Change the svc_sock in the rqstp structure to a transport The rqstp structure contains a pointer to the transport for the RPC request. This functionally trivial patch adds an unnamed union with pointers to both svc_sock and svc_xprt. Ultimately the union will be removed and only the rq_xprt field will remain. This allows incrementally extracting transport independent interfaces without one gigundo patch. Signed-off-by: Tom Tucker commit e3f4b6b3f66577096804931f9463ab2858acc83d Author: Tom Tucker Date: Mon Oct 1 14:27:33 2007 -0500 svc: Make svc_sock the tcp/udp transport Make TCP and UDP svc_sock transports, and register them with the svc transport core. A transport type (svc_sock) has an svc_xprt as its first member, and calls svc_xprt_init to initialize this field. Signed-off-by: Tom Tucker commit 8ac94da36334e5e63f8f4e8b23c281f421ada0f1 Author: Tom Tucker Date: Mon Oct 1 14:27:31 2007 -0500 svc: Add an svc transport class The transport class (svc_xprt_class) represents a type of transport, e.g. udp, tcp, rdma. A transport class has a unique name and a set of transport operations kept in the svc_xprt_ops structure. A transport class can be dynamically registered and unregistered. The svc_xprt_class represents the module that implements the transport type and keeps reference counts on the module to avoid unloading while there are active users. The endpoint (svc_xprt) is a generic, transport independent endpoint that can be used to send and receive data for an RPC service. It inherits its operations from the transport class. A transport driver module registers and unregisters itself with svc sunrpc by calling svc_reg_xprt_class, and svc_unreg_xprt_class respectively. Signed-off-by: Tom Tucker commit 4f0d70bc62d429e6215b5076aa118f07cb3b9730 Author: J. Bruce Fields Date: Sun Sep 30 22:18:55 2007 -0400 Documentation: move locks.txt in filesystems/ This documentation (about file locking) belongs in filesystems/. Signed-off-by: J. Bruce Fields commit 20bcf326fb7af86e6cd3747d7da32d68188dfe2c Author: J. Bruce Fields Date: Fri Sep 28 16:45:51 2007 -0400 knfsd: query filesystem for NFSv4 getattr of FATTR4_MAXNAME Without this we always return 2^32-1 as the maximum name length. Thanks to Andreas Gruenbacher for bug report and testing. Signed-off-by: J. Bruce Fields Cc: Andreas Gruenbacher commit 063875f0ad420d5e47ea8fe658ec1d775284bb91 Author: J. Bruce Fields Date: Wed Sep 12 20:35:15 2007 -0400 knfsd: nfsv4 delegation recall should take reference on client It's not enough to take a reference on the delegation object itself; we need to ensure that the rpc_client won't go away just as we're about to make an rpc call. Signed-off-by: J. Bruce Fields commit 8580d1cff1dd4396510ac13b827df5dfaffe7b2c Author: J. Bruce Fields Date: Wed Sep 12 08:43:59 2007 -0400 knfsd: don't shutdown callbacks until nfsv4 client is freed If a callback still holds a reference on the client, then it may be about to perform an rpc call, so it isn't safe to call rpc_shutdown(). (Though rpc_shutdown() does wait for any outstanding rpc's, it can't know if a new rpc is about to be issued with that client.) So, wait to shut down the rpc_client until the reference count on the client has gone to zero. Signed-off-by: J. Bruce Fields commit 1f6ac201b821c45fb745ed0f5e5c4721da064f61 Author: J.
Bruce Fields Date: Wed Sep 12 18:56:12 2007 -0400 knfsd: let nfsd manage timing out its own leases Currently there's a race that can cause an oops in generic_setlease. (In detail: nfsd, when it removes a lease, does so by calling vfs_setlease() with F_UNLCK and a pointer to the fl_flock field, which in turn points to nfsd's existing lease; but the first thing the setlease code does is call time_out_leases(). If the lease happens to already be beyond the lease break time, that will free the lease and (in nfsd's release_private callback) set fl_flock to NULL, leading to a NULL deference soon after in vfs_setlease().) There are probably other things to fix here too, but it seems inherently racy to allow either locks.c or nfsd to time out this lease. Instead just set the fl_break_time to 0 (preventing locks.c from ever timing out this lock) and leave it up to nfsd's laundromat thread to deal with it. Signed-off-by: J. Bruce Fields commit 14de4d794f93da7e117f546317cbf0e3e0d95a3e Author: J. Bruce Fields Date: Tue Sep 25 11:57:19 2007 -0400 locks: add warning about mandatory locking races The mandatory file locking implementation has long-standing races that probably render it useless. I know of no plans to fix them. Till we do, we should at least warn people. Signed-off-by: J. Bruce Fields commit 27e031db5f6ef886a3f8a947a5e02e07331a42e3 Author: J. Bruce Fields Date: Mon Sep 24 18:52:09 2007 -0400 Documentation: move mandatory locking documentation to filesystems/ Shouldn't this mandatory-locking documentation be in the Documentation/filesystems directory? Give it a more descriptive name while we're at it, and update 00-INDEX with a more inclusive description of Documentation/filesystems (which has already talked about more than just individual filesystems). Signed-off-by: J. Bruce Fields Acked-by: Randy Dunlap commit 89dab77633d96474dc88ad471957a48160482d71 Author: Dr. David Alan Gilbert Date: Sat Aug 25 16:09:27 2007 +0100 knfsd: Add source address to sunrpc svc errors This patch adds the address of the client that caused an error in sunrpc/svc.c so that you get errors that look like: svc: 192.168.66.28, port=709: unknown version (3 for prog 100003, nfsd) I've seen machines which get bunches of unknown version or similar errors from time to time, and while the recent patch to add the service helps to find which service has the wrong version it doesn't help find the potentially bad client. The patch is against a checkout of Linus's git tree made on 2007-08-24. One observation is that the svc_print_addr function prints to a buffer which in this case makes life a little more complex; it just feels as if there must be lots of places that print a connection address - is there a better function to use anywhere? I think actually there are a few places with semi duplicated code; e.g. one_sock_name switches on the address family but only currently has IPV4; I wonder how many other places are similar. Signed-off-by: Dave Gilbert Cc: Randy Dunlap Signed-off-by: J. Bruce Fields commit a5b51d88131cab6c1b99ce5da934028cf3927cc2 Author: Peter Staubach Date: Thu Aug 16 12:10:07 2007 -0400 knfsd: 64 bit ino support for NFS server Modify the NFS server code to support 64 bit ino's, as appropriate for the system and the NFS protocol version. The gist of the changes is to query the underlying file system for attributes and not just to use the cached attributes in the inode. 
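Concretely, the encode paths query the filesystem along these lines instead of reading inode->i_ino (a sketch only; fhp stands for the svc_fh being encoded):

	struct kstat stat;
	u64 ino = 0;

	/* ask the filesystem for authoritative attributes; stat.ino is a u64,
	 * wide enough even where the in-core inode->i_ino is a 32-bit long */
	if (vfs_getattr(fhp->fh_export->ex_mnt, fhp->fh_dentry, &stat) == 0)
		ino = stat.ino;
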
For this specific purpose, the inode only contains an ino field which is an unsigned long, which is large enough on 64 bit platforms, but is not large enough on 32 bit platforms. I haven't been able to find any reason why ->getattr can't be called while i_mutex is held. The specification indicates that i_mutex is not required to be held in order to invoke ->getattr, but it doesn't say that i_mutex can't be held while invoking ->getattr. I also haven't come to any conclusions regarding the value of lease_get_mtime() and whether it should or should not be invoked by fill_post_wcc() too. I chose not to change this because I thought that it was safer to leave well enough alone. If we decide to make a change, it can be done separately. Signed-off-by: Peter Staubach Signed-off-by: J. Bruce Fields commit b5a162085cd708ce643cbd5abfe8c9992255cbc2 Author: J. Bruce Fields Date: Thu Aug 9 20:16:22 2007 -0400 svcgss: move init code into separate function We've let svcauth_gss_accept() get much too long and hairy. The RPC_GSS_PROC_INIT and RPC_GSS_PROC_CONTINUE_INIT cases share very little with the other cases, so it's very natural to split them off into a separate function. This will also nicely isolate the piece of code we need to parametrize to authenticating gss-protected NFSv4 callbacks on behalf of the NFS client. Signed-off-by: J. Bruce Fields commit 0050cb82d8184eb09c08350d4a887dcef5836d35 Author: J. Bruce Fields Date: Thu Aug 9 18:34:32 2007 -0400 knfsd: remove code duplication in nfsd4_setclientid() Each branch of this if-then-else has a bunch of duplicated code that we could just put at the end. Signed-off-by: "J. Bruce Fields" commit 64954a225e2dfe36957327b5e1e4936470b8f6db Author: Andrew Morton Date: Thu Aug 9 00:53:50 2007 -0700 nfsd warning fix fs/nfsd/nfsctl.c: In function 'write_filehandle': fs/nfsd/nfsctl.c:301: warning: 'maxsize' may be used uninitialized in this function Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: "J. Bruce Fields" commit 1489e2a1e9b921a545f26b581bf0924458a81ba6 Author: J. Bruce Fields Date: Tue Oct 24 18:33:17 2006 -0400 knfsd: fix callback rpc cred It doesn't make sense to make the callback with credentials that the client made the setclientid with. Instead the spec requires that the callback occur with the credentials the client authenticated *to*. It probably doesn't matter what we use for auth_unix, and some more infrastructure will be needed for auth_gss, so let's just remove the cred lookup for now. Signed-off-by: J. Bruce Fields commit 953eca0a59309c45b497d7e5e828eb87a3201efa Author: J. Bruce Fields Date: Wed Aug 1 15:30:59 2007 -0400 knfsd: move nfsv4 slab creation/destruction to module init/exit We have some slabs that the nfs4 server uses to store state objects. We're currently creating and destroying those slabs whenever the server is brought up or down. That seems excessive; may as well just do that in module initialization and exit. Also add some minor header cleanup. (Thanks to Andrew Morton for that and a compile fix.) Signed-off-by: "J. Bruce Fields" commit 0720c30a16b0b8f96daff3851dac4a1bbe0e5b09 Author: J. Bruce Fields Date: Fri Jul 27 18:06:50 2007 -0400 knfsd: spawn kernel thread to probe callback channel We want to allow gss on the callback channel, so people using krb5 can still get the benefits of delegations. But looking up the rpc credential can take some time in that case. And we shouldn't delay the response to setclientid_confirm while we wait.
It may be inefficient, but for now the simplest solution is just to spawn a new thread as necessary for the purpose. (Thanks to Adrian Bunk for catching a missing static here.) Signed-off-by: "J. Bruce Fields" Cc: Adrian Bunk commit bb52ba85c4caf3052ed6a1ee832646e93164835c Author: J. Bruce Fields Date: Fri Jul 27 16:36:45 2007 -0400 knfsd: nfs4 name->id mapping not correctly parsing negative downcall Note that qword_get() returns length or -1, not an -ERROR. Signed-off-by: "J. Bruce Fields" commit dd86d2fa3d5ce1975c3fac7efaecc2b9f1987179 Author: J. Bruce Fields Date: Fri Jul 27 16:10:37 2007 -0400 knfsd: demote some printk()s to dprintk()s To quote a recent mail from Andrew Morton: Look: if there's a way in which an unprivileged user can trigger a printk we fix it, end of story. OK. I assume that goes double for printk()s that might be triggered by random hosts on the internet. So, disable some printk()s that look like they could be triggered by malfunctioning or malicious clients. For now, just downgrade them to dprintk()s. Signed-off-by: "J. Bruce Fields" commit b9d17cb50e440d15ed1310aa94b60057e205462e Author: J. Bruce Fields Date: Thu Jul 26 17:04:54 2007 -0400 knfsd: cleanup of nfsd4 cmp_* functions Benny Halevy suggested renaming cmp_* to same_* to make the meaning of the return value clearer. Fix some nearby style deviations while we're at it, including a small swath of creative indentation in nfs4_preprocess_seqid_op(). Signed-off-by: "J. Bruce Fields" commit 13a207c62ebf5dd95818d05828c790fbf4d9dca3 Author: J. Bruce Fields Date: Tue Jul 24 21:38:18 2007 -0400 knfsd: delete code made redundant by map_new_errors I moved this check into map_new_errors, but forgot to delete the original. Oops. Signed-off-by: "J. Bruce Fields" commit ad65be75d86a9d9f7a99504af8e78aeaf5b9d2f6 Author: Christoph Hellwig Date: Wed Mar 7 15:26:25 2007 +0000 nfsd: fix horrible indentation in nfsd_setattr Signed-off-by: Christoph Hellwig commit ea530dfdcf3428446356aed29c56ce9b44fbf18b Author: J. Bruce Fields Date: Thu Jul 12 15:30:32 2007 -0400 nfsd: remove unused cache_for_each macro This macro is unused. Signed-off-by: "J. Bruce Fields" commit ab182f36fbfcd89a36d69cbdd2fd29c3b4f7307b Author: J. Bruce Fields Date: Fri Jun 22 17:26:32 2007 -0400 nfsd: tone down inaccurate dprintk The nfserr_dropit happens routinely on upcalls (so a kmalloc failure is almost never the actual cause), but I occasionally get a complant from some tester that's worried because they ran across this message after turning on debugging to research some unrelated problem. Signed-off-by: "J. Bruce Fields" commit 99abc6a91d00839e28f9620ce23be8e6a20d7828 Author: Pavel Emelyanov Date: Thu Sep 20 12:45:02 2007 +0400 locks: Fix potential OOPS in generic_setlease() This code is run under lock_kernel(), which is dropped during sleeping operations, so the following race is possible: CPU1: CPU2: vfs_setlease(); vfs_setlease(); lock_kernel(); lock_kernel(); /* spin */ generic_setlease(): ... for (before = ...) /* here we found some lease after * which we will insert the new one */ fl = locks_alloc_lock(); /* go to sleep in this allocation and * drop the BKL */ generic_setlease(): ... for (before = ...) /* here we find the "before" pointing * at the one we found on CPU1 */ ->fl_change(my_before, arg); lease_modify(); locks_free_lock(); /* and we freed it */ ... unlock_kernel(); locks_insert_lock(before, fl); /* OOPS! 
We have just tried to add the lease * at the tail of already removed one */ The similar races are already handled in other code - all the allocations are performed before any checks/updates. Thanks to Kamalesh Babulal for testing and for a bug report on an earlier version. Signed-off-by: Pavel Emelyanov Signed-off-by: J. Bruce Fields Cc: Kamalesh Babulal commit 3279d0b110df5356bca6b3495af2a52a5bc54bb6 Author: Pavel Emelyanov Date: Wed Sep 19 16:44:07 2007 +0400 Use list_first_entry in locks_wake_up_blocks This routine deletes all the elements from the list with the "while (!list_empty())" loop, and we already have a list_first_entry() macro to help it look nicer :) Signed-off-by: Pavel Emelyanov commit be0ee4e871fbd9de9cddbc50e5fb1dc3cc499258 Author: J. Bruce Fields Date: Wed Sep 12 15:45:07 2007 -0400 locks: fix flock_lock_file() comment This comment wasn't updated when lease support was added, and it makes essentially the same mistake that the code made before a recent bugfix. Signed-off-by: J. Bruce Fields commit 1a27e6ff3f607218931f04eca740ca0b04b52e90 Author: Pavel Emelyanov Date: Tue Sep 11 16:38:13 2007 +0400 Memory shortage can result in inconsistent flocks state When the flock_lock_file() is called to change the flock from F_RDLCK to F_WRLCK or vice versa the existing flock can be removed without appropriate warning. Look: for_each_lock(inode, before) { struct file_lock *fl = *before; if (IS_POSIX(fl)) break; if (IS_LEASE(fl)) continue; if (filp != fl->fl_file) continue; if (request->fl_type == fl->fl_type) goto out; found = 1; locks_delete_lock(before); <<<<<< ! break; } if after this point the subsequent locks_alloc_lock() will fail the return code will be -ENOMEM, but the existing lock is already removed. This is a known feature that such "re-locking" is not atomic, but in the racy case the file should stay locked (although by some other process), but in this case the file will be unlocked. The proposal is to prepare the lock in advance keeping no chance to fail in the future code. Found during making the flocks pid-namespaces aware. (Note: Thanks to Reuben Farrelly for finding a bug in an earlier version of this patch.) Signed-off-by: Pavel Emelyanov Signed-off-by: J. Bruce Fields Cc: Reuben Farrelly commit 1854f8a4f08011f7b353bf32c3337aa4bde8bb11 Author: J. Bruce Fields Date: Tue Nov 14 16:54:36 2006 -0500 locks: kill redundant local variable There's no need for another variable local to this loop; we can use the variable (of the same name!) already declared at the top of the function, and not used till later (at which point it's initialized, so this is safe). Signed-off-by: J. Bruce Fields commit d024dd770f8739ab9085da578e12c9a7c605b96a Author: J. Bruce Fields Date: Thu May 10 19:02:07 2007 -0400 locks: reverse order of posix_locks_conflict() arguments The first argument to posix_locks_conflict() is meant to be a lock request, and the second a lock from an inode's lock request. It doesn't really make a difference which order you call them in, since the only asymmetric test in posix_lock_conflict() is the check whether the second argument is a posix lock--and every caller already does that check for some reason. But may as well fix posix_test_lock() to call posix_locks_conflict() with the arguments in the same order as everywhere else. Signed-off-by: "J. 
Bruce Fields" Signed-off-by: Andrew Morton --- Documentation/00-INDEX | 6 Documentation/filesystems/00-INDEX | 4 Documentation/filesystems/locks.txt | 67 Documentation/filesystems/mandatory-locking.txt | 171 ++ Documentation/locks.txt | 67 Documentation/mandatory.txt | 152 - fs/9p/vfs_file.c | 2 fs/afs/flock.c | 3 fs/gfs2/ops_file.c | 4 fs/lockd/svc.c | 21 fs/locks.c | 192 +- fs/nfs/callback.c | 4 fs/nfs/file.c | 3 fs/nfsd/nfs3xdr.c | 59 fs/nfsd/nfs4callback.c | 87 - fs/nfsd/nfs4idmap.c | 8 fs/nfsd/nfs4proc.c | 4 fs/nfsd/nfs4state.c | 202 +- fs/nfsd/nfs4xdr.c | 22 fs/nfsd/nfsctl.c | 23 fs/nfsd/nfssvc.c | 16 fs/nfsd/nfsxdr.c | 4 fs/nfsd/vfs.c | 56 fs/proc/proc_misc.c | 19 fs/read_write.c | 2 include/linux/fs.h | 22 include/linux/nfsd/nfsd.h | 18 include/linux/nfsd/nfsfh.h | 42 include/linux/nfsd/xdr4.h | 4 include/linux/sunrpc/cache.h | 10 include/linux/sunrpc/debug.h | 2 include/linux/sunrpc/svc.h | 7 include/linux/sunrpc/svc_xprt.h | 86 + include/linux/sunrpc/svcsock.h | 43 net/sunrpc/Makefile | 3 net/sunrpc/auth_gss/svcauth_gss.c | 144 - net/sunrpc/sunrpc_syms.c | 5 net/sunrpc/svc.c | 59 net/sunrpc/svc_xprt.c | 954 +++++++++++ net/sunrpc/svcauth_unix.c | 54 net/sunrpc/svcsock.c | 1162 +++----------- net/sunrpc/sysctl.c | 37 42 files changed, 2131 insertions(+), 1719 deletions(-) diff -puN Documentation/00-INDEX~git-nfsd Documentation/00-INDEX --- a/Documentation/00-INDEX~git-nfsd +++ a/Documentation/00-INDEX @@ -145,7 +145,7 @@ fb/ feature-removal-schedule.txt - list of files and features that are going to be removed. filesystems/ - - directory with info on the various filesystems that Linux supports. + - info on the vfs and the various filesystems that Linux supports. firmware_class/ - request_firmware() hotplug interface info. floppy.txt @@ -230,8 +230,6 @@ local_ops.txt - semantics and behavior of local atomic operations. lockdep-design.txt - documentation on the runtime locking correctness validator. -locks.txt - - info on file locking implementations, flock() vs. fcntl(), etc. logo.gif - full colour GIF image of Linux logo (penguin - Tux). logo.txt @@ -240,8 +238,6 @@ m68k/ - directory with info about Linux on Motorola 68k architecture. magic-number.txt - list of magic numbers used to mark/protect kernel data structures. -mandatory.txt - - info on the Linux implementation of Sys V mandatory file locking. mca.txt - info on supporting Micro Channel Architecture (e.g. PS/2) systems. md.txt diff -puN Documentation/filesystems/00-INDEX~git-nfsd Documentation/filesystems/00-INDEX --- a/Documentation/filesystems/00-INDEX~git-nfsd +++ a/Documentation/filesystems/00-INDEX @@ -52,6 +52,10 @@ isofs.txt - info and mount options for the ISO 9660 (CDROM) filesystem. jfs.txt - info and mount options for the JFS filesystem. +locks.txt + - info on file locking implementations, flock() vs. fcntl(), etc. +mandatory-locking.txt + - info on the Linux implementation of Sys V mandatory file locking. ncpfs.txt - info on Novell Netware(tm) filesystem using NCP protocol. ntfs.txt diff -puN /dev/null Documentation/filesystems/locks.txt --- /dev/null +++ a/Documentation/filesystems/locks.txt @@ -0,0 +1,67 @@ + File Locking Release Notes + + Andy Walker + + 12 May 1997 + + +1. What's New? +-------------- + +1.1 Broken Flock Emulation +-------------------------- + +The old flock(2) emulation in the kernel was swapped for proper BSD +compatible flock(2) support in the 1.3.x series of kernels. 
With the +release of the 2.1.x kernel series, support for the old emulation has +been totally removed, so that we don't need to carry this baggage +forever. + +This should not cause problems for anybody, since everybody using a +2.1.x kernel should have updated their C library to a suitable version +anyway (see the file "Documentation/Changes".) + +1.2 Allow Mixed Locks Again +--------------------------- + +1.2.1 Typical Problems - Sendmail +--------------------------------- +Because sendmail was unable to use the old flock() emulation, many sendmail +installations use fcntl() instead of flock(). This is true of Slackware 3.0 +for example. This gave rise to some other subtle problems if sendmail was +configured to rebuild the alias file. Sendmail tried to lock the aliases.dir +file with fcntl() at the same time as the GDBM routines tried to lock this +file with flock(). With pre 1.3.96 kernels this could result in deadlocks that, +over time, or under a very heavy mail load, would eventually cause the kernel +to lock solid with deadlocked processes. + + +1.2.2 The Solution +------------------ +The solution I have chosen, after much experimentation and discussion, +is to make flock() and fcntl() locks oblivious to each other. Both can +exists, and neither will have any effect on the other. + +I wanted the two lock styles to be cooperative, but there were so many +race and deadlock conditions that the current solution was the only +practical one. It puts us in the same position as, for example, SunOS +4.1.x and several other commercial Unices. The only OS's that support +cooperative flock()/fcntl() are those that emulate flock() using +fcntl(), with all the problems that implies. + + +1.3 Mandatory Locking As A Mount Option +--------------------------------------- + +Mandatory locking, as described in 'Documentation/filesystems/mandatory.txt' +was prior to this release a general configuration option that was valid for +all mounted filesystems. This had a number of inherent dangers, not the +least of which was the ability to freeze an NFS server by asking it to read +a file for which a mandatory lock existed. + +From this release of the kernel, mandatory locking can be turned on and off +on a per-filesystem basis, using the mount options 'mand' and 'nomand'. +The default is to disallow mandatory locking. The intention is that +mandatory locking only be enabled on a local filesystem as the specific need +arises. + diff -puN /dev/null Documentation/filesystems/mandatory-locking.txt --- /dev/null +++ a/Documentation/filesystems/mandatory-locking.txt @@ -0,0 +1,171 @@ + Mandatory File Locking For The Linux Operating System + + Andy Walker + + 15 April 1996 + (Updated September 2007) + +0. Why you should avoid mandatory locking +----------------------------------------- + +The Linux implementation is prey to a number of difficult-to-fix race +conditions which in practice make it not dependable: + + - The write system call checks for a mandatory lock only once + at its start. It is therefore possible for a lock request to + be granted after this check but before the data is modified. + A process may then see file data change even while a mandatory + lock was held. + - Similarly, an exclusive lock may be granted on a file after + the kernel has decided to proceed with a read, but before the + read has actually completed, and the reading process may see + the file data in a state which should not have been visible + to it. 
+ - Similar races make the claimed mutual exclusion between lock + and mmap similarly unreliable. + +1. What is mandatory locking? +------------------------------ + +Mandatory locking is kernel enforced file locking, as opposed to the more usual +cooperative file locking used to guarantee sequential access to files among +processes. File locks are applied using the flock() and fcntl() system calls +(and the lockf() library routine which is a wrapper around fcntl().) It is +normally a process' responsibility to check for locks on a file it wishes to +update, before applying its own lock, updating the file and unlocking it again. +The most commonly used example of this (and in the case of sendmail, the most +troublesome) is access to a user's mailbox. The mail user agent and the mail +transfer agent must guard against updating the mailbox at the same time, and +prevent reading the mailbox while it is being updated. + +In a perfect world all processes would use and honour a cooperative, or +"advisory" locking scheme. However, the world isn't perfect, and there's +a lot of poorly written code out there. + +In trying to address this problem, the designers of System V UNIX came up +with a "mandatory" locking scheme, whereby the operating system kernel would +block attempts by a process to write to a file that another process holds a +"read" -or- "shared" lock on, and block attempts to both read and write to a +file that a process holds a "write " -or- "exclusive" lock on. + +The System V mandatory locking scheme was intended to have as little impact as +possible on existing user code. The scheme is based on marking individual files +as candidates for mandatory locking, and using the existing fcntl()/lockf() +interface for applying locks just as if they were normal, advisory locks. + +Note 1: In saying "file" in the paragraphs above I am actually not telling +the whole truth. System V locking is based on fcntl(). The granularity of +fcntl() is such that it allows the locking of byte ranges in files, in addition +to entire files, so the mandatory locking rules also have byte level +granularity. + +Note 2: POSIX.1 does not specify any scheme for mandatory locking, despite +borrowing the fcntl() locking scheme from System V. The mandatory locking +scheme is defined by the System V Interface Definition (SVID) Version 3. + +2. Marking a file for mandatory locking +--------------------------------------- + +A file is marked as a candidate for mandatory locking by setting the group-id +bit in its file mode but removing the group-execute bit. This is an otherwise +meaningless combination, and was chosen by the System V implementors so as not +to break existing user programs. + +Note that the group-id bit is usually automatically cleared by the kernel when +a setgid file is written to. This is a security measure. The kernel has been +modified to recognize the special case of a mandatory lock candidate and to +refrain from clearing this bit. Similarly the kernel has been modified not +to run mandatory lock candidates with setgid privileges. + +3. Available implementations +---------------------------- + +I have considered the implementations of mandatory locking available with +SunOS 4.1.x, Solaris 2.x and HP-UX 9.x. + +Generally I have tried to make the most sense out of the behaviour exhibited +by these three reference systems. There are many anomalies. + +All the reference systems reject all calls to open() for a file on which +another process has outstanding mandatory locks. 
This is in direct +contravention of SVID 3, which states that only calls to open() with the +O_TRUNC flag set should be rejected. The Linux implementation follows the SVID +definition, which is the "Right Thing", since only calls with O_TRUNC can +modify the contents of the file. + +HP-UX even disallows open() with O_TRUNC for a file with advisory locks, not +just mandatory locks. That would appear to contravene POSIX.1. + +mmap() is another interesting case. All the operating systems mentioned +prevent mandatory locks from being applied to an mmap()'ed file, but HP-UX +also disallows advisory locks for such a file. SVID actually specifies the +paranoid HP-UX behaviour. + +In my opinion only MAP_SHARED mappings should be immune from locking, and then +only from mandatory locks - that is what is currently implemented. + +SunOS is so hopeless that it doesn't even honour the O_NONBLOCK flag for +mandatory locks, so reads and writes to locked files always block when they +should return EAGAIN. + +I'm afraid that this is such an esoteric area that the semantics described +below are just as valid as any others, so long as the main points seem to +agree. + +4. Semantics +------------ + +1. Mandatory locks can only be applied via the fcntl()/lockf() locking + interface - in other words the System V/POSIX interface. BSD style + locks using flock() never result in a mandatory lock. + +2. If a process has locked a region of a file with a mandatory read lock, then + other processes are permitted to read from that region. If any of these + processes attempts to write to the region it will block until the lock is + released, unless the process has opened the file with the O_NONBLOCK + flag in which case the system call will return immediately with the error + status EAGAIN. + +3. If a process has locked a region of a file with a mandatory write lock, all + attempts to read or write to that region block until the lock is released, + unless a process has opened the file with the O_NONBLOCK flag in which case + the system call will return immediately with the error status EAGAIN. + +4. Calls to open() with O_TRUNC, or to creat(), on a existing file that has + any mandatory locks owned by other processes will be rejected with the + error status EAGAIN. + +5. Attempts to apply a mandatory lock to a file that is memory mapped and + shared (via mmap() with MAP_SHARED) will be rejected with the error status + EAGAIN. + +6. Attempts to create a shared memory map of a file (via mmap() with MAP_SHARED) + that has any mandatory locks in effect will be rejected with the error status + EAGAIN. + +5. Which system calls are affected? +----------------------------------- + +Those which modify a file's contents, not just the inode. That gives read(), +write(), readv(), writev(), open(), creat(), mmap(), truncate() and +ftruncate(). truncate() and ftruncate() are considered to be "write" actions +for the purposes of mandatory locking. + +The affected region is usually defined as stretching from the current position +for the total number of bytes read or written. For the truncate calls it is +defined as the bytes of a file removed or added (we must also consider bytes +added, as a lock can specify just "the whole file", rather than a specific +range of bytes.) + +Note 3: I may have overlooked some system calls that need mandatory lock +checking in my eagerness to get this code out the door. Please let me know, or +better still fix the system calls yourself and submit a patch to me or Linus. + +6. Warning! 
+----------- + +Not even root can override a mandatory lock, so runaway processes can wreak +havoc if they lock crucial files. The way around it is to change the file +permissions (remove the setgid bit) before trying to read or write to it. +Of course, that might be a bit tricky if the system is hung :-( + diff -puN Documentation/locks.txt~git-nfsd /dev/null --- a/Documentation/locks.txt +++ /dev/null @@ -1,67 +0,0 @@ - File Locking Release Notes - - Andy Walker - - 12 May 1997 - - -1. What's New? --------------- - -1.1 Broken Flock Emulation --------------------------- - -The old flock(2) emulation in the kernel was swapped for proper BSD -compatible flock(2) support in the 1.3.x series of kernels. With the -release of the 2.1.x kernel series, support for the old emulation has -been totally removed, so that we don't need to carry this baggage -forever. - -This should not cause problems for anybody, since everybody using a -2.1.x kernel should have updated their C library to a suitable version -anyway (see the file "Documentation/Changes".) - -1.2 Allow Mixed Locks Again ---------------------------- - -1.2.1 Typical Problems - Sendmail ---------------------------------- -Because sendmail was unable to use the old flock() emulation, many sendmail -installations use fcntl() instead of flock(). This is true of Slackware 3.0 -for example. This gave rise to some other subtle problems if sendmail was -configured to rebuild the alias file. Sendmail tried to lock the aliases.dir -file with fcntl() at the same time as the GDBM routines tried to lock this -file with flock(). With pre 1.3.96 kernels this could result in deadlocks that, -over time, or under a very heavy mail load, would eventually cause the kernel -to lock solid with deadlocked processes. - - -1.2.2 The Solution ------------------- -The solution I have chosen, after much experimentation and discussion, -is to make flock() and fcntl() locks oblivious to each other. Both can -exists, and neither will have any effect on the other. - -I wanted the two lock styles to be cooperative, but there were so many -race and deadlock conditions that the current solution was the only -practical one. It puts us in the same position as, for example, SunOS -4.1.x and several other commercial Unices. The only OS's that support -cooperative flock()/fcntl() are those that emulate flock() using -fcntl(), with all the problems that implies. - - -1.3 Mandatory Locking As A Mount Option ---------------------------------------- - -Mandatory locking, as described in 'Documentation/mandatory.txt' was prior -to this release a general configuration option that was valid for all -mounted filesystems. This had a number of inherent dangers, not the least -of which was the ability to freeze an NFS server by asking it to read a -file for which a mandatory lock existed. - -From this release of the kernel, mandatory locking can be turned on and off -on a per-filesystem basis, using the mount options 'mand' and 'nomand'. -The default is to disallow mandatory locking. The intention is that -mandatory locking only be enabled on a local filesystem as the specific need -arises. - diff -puN Documentation/mandatory.txt~git-nfsd /dev/null --- a/Documentation/mandatory.txt +++ /dev/null @@ -1,152 +0,0 @@ - Mandatory File Locking For The Linux Operating System - - Andy Walker - - 15 April 1996 - - -1. What is mandatory locking? 
------------------------------- - -Mandatory locking is kernel enforced file locking, as opposed to the more usual -cooperative file locking used to guarantee sequential access to files among -processes. File locks are applied using the flock() and fcntl() system calls -(and the lockf() library routine which is a wrapper around fcntl().) It is -normally a process' responsibility to check for locks on a file it wishes to -update, before applying its own lock, updating the file and unlocking it again. -The most commonly used example of this (and in the case of sendmail, the most -troublesome) is access to a user's mailbox. The mail user agent and the mail -transfer agent must guard against updating the mailbox at the same time, and -prevent reading the mailbox while it is being updated. - -In a perfect world all processes would use and honour a cooperative, or -"advisory" locking scheme. However, the world isn't perfect, and there's -a lot of poorly written code out there. - -In trying to address this problem, the designers of System V UNIX came up -with a "mandatory" locking scheme, whereby the operating system kernel would -block attempts by a process to write to a file that another process holds a -"read" -or- "shared" lock on, and block attempts to both read and write to a -file that a process holds a "write " -or- "exclusive" lock on. - -The System V mandatory locking scheme was intended to have as little impact as -possible on existing user code. The scheme is based on marking individual files -as candidates for mandatory locking, and using the existing fcntl()/lockf() -interface for applying locks just as if they were normal, advisory locks. - -Note 1: In saying "file" in the paragraphs above I am actually not telling -the whole truth. System V locking is based on fcntl(). The granularity of -fcntl() is such that it allows the locking of byte ranges in files, in addition -to entire files, so the mandatory locking rules also have byte level -granularity. - -Note 2: POSIX.1 does not specify any scheme for mandatory locking, despite -borrowing the fcntl() locking scheme from System V. The mandatory locking -scheme is defined by the System V Interface Definition (SVID) Version 3. - -2. Marking a file for mandatory locking ---------------------------------------- - -A file is marked as a candidate for mandatory locking by setting the group-id -bit in its file mode but removing the group-execute bit. This is an otherwise -meaningless combination, and was chosen by the System V implementors so as not -to break existing user programs. - -Note that the group-id bit is usually automatically cleared by the kernel when -a setgid file is written to. This is a security measure. The kernel has been -modified to recognize the special case of a mandatory lock candidate and to -refrain from clearing this bit. Similarly the kernel has been modified not -to run mandatory lock candidates with setgid privileges. - -3. Available implementations ----------------------------- - -I have considered the implementations of mandatory locking available with -SunOS 4.1.x, Solaris 2.x and HP-UX 9.x. - -Generally I have tried to make the most sense out of the behaviour exhibited -by these three reference systems. There are many anomalies. - -All the reference systems reject all calls to open() for a file on which -another process has outstanding mandatory locks. This is in direct -contravention of SVID 3, which states that only calls to open() with the -O_TRUNC flag set should be rejected. 
The Linux implementation follows the SVID -definition, which is the "Right Thing", since only calls with O_TRUNC can -modify the contents of the file. - -HP-UX even disallows open() with O_TRUNC for a file with advisory locks, not -just mandatory locks. That would appear to contravene POSIX.1. - -mmap() is another interesting case. All the operating systems mentioned -prevent mandatory locks from being applied to an mmap()'ed file, but HP-UX -also disallows advisory locks for such a file. SVID actually specifies the -paranoid HP-UX behaviour. - -In my opinion only MAP_SHARED mappings should be immune from locking, and then -only from mandatory locks - that is what is currently implemented. - -SunOS is so hopeless that it doesn't even honour the O_NONBLOCK flag for -mandatory locks, so reads and writes to locked files always block when they -should return EAGAIN. - -I'm afraid that this is such an esoteric area that the semantics described -below are just as valid as any others, so long as the main points seem to -agree. - -4. Semantics ------------- - -1. Mandatory locks can only be applied via the fcntl()/lockf() locking - interface - in other words the System V/POSIX interface. BSD style - locks using flock() never result in a mandatory lock. - -2. If a process has locked a region of a file with a mandatory read lock, then - other processes are permitted to read from that region. If any of these - processes attempts to write to the region it will block until the lock is - released, unless the process has opened the file with the O_NONBLOCK - flag in which case the system call will return immediately with the error - status EAGAIN. - -3. If a process has locked a region of a file with a mandatory write lock, all - attempts to read or write to that region block until the lock is released, - unless a process has opened the file with the O_NONBLOCK flag in which case - the system call will return immediately with the error status EAGAIN. - -4. Calls to open() with O_TRUNC, or to creat(), on a existing file that has - any mandatory locks owned by other processes will be rejected with the - error status EAGAIN. - -5. Attempts to apply a mandatory lock to a file that is memory mapped and - shared (via mmap() with MAP_SHARED) will be rejected with the error status - EAGAIN. - -6. Attempts to create a shared memory map of a file (via mmap() with MAP_SHARED) - that has any mandatory locks in effect will be rejected with the error status - EAGAIN. - -5. Which system calls are affected? ------------------------------------ - -Those which modify a file's contents, not just the inode. That gives read(), -write(), readv(), writev(), open(), creat(), mmap(), truncate() and -ftruncate(). truncate() and ftruncate() are considered to be "write" actions -for the purposes of mandatory locking. - -The affected region is usually defined as stretching from the current position -for the total number of bytes read or written. For the truncate calls it is -defined as the bytes of a file removed or added (we must also consider bytes -added, as a lock can specify just "the whole file", rather than a specific -range of bytes.) - -Note 3: I may have overlooked some system calls that need mandatory lock -checking in my eagerness to get this code out the door. Please let me know, or -better still fix the system calls yourself and submit a patch to me or Linus. - -6. Warning! ------------ - -Not even root can override a mandatory lock, so runaway processes can wreak -havoc if they lock crucial files. 
The way around it is to change the file -permissions (remove the setgid bit) before trying to read or write to it. -Of course, that might be a bit tricky if the system is hung :-( - diff -puN fs/9p/vfs_file.c~git-nfsd fs/9p/vfs_file.c --- a/fs/9p/vfs_file.c~git-nfsd +++ a/fs/9p/vfs_file.c @@ -105,7 +105,7 @@ static int v9fs_file_lock(struct file *f P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); /* No mandatory locks */ - if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) + if (__mandatory_lock(inode)) return -ENOLCK; if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { diff -puN fs/afs/flock.c~git-nfsd fs/afs/flock.c --- a/fs/afs/flock.c~git-nfsd +++ a/fs/afs/flock.c @@ -524,8 +524,7 @@ int afs_lock(struct file *file, int cmd, (long long) fl->fl_start, (long long) fl->fl_end); /* AFS doesn't support mandatory locks */ - if ((vnode->vfs_inode.i_mode & (S_ISGID | S_IXGRP)) == S_ISGID && - fl->fl_type != F_UNLCK) + if (__mandatory_lock(&vnode->vfs_inode) && fl->fl_type != F_UNLCK) return -ENOLCK; if (IS_GETLK(cmd)) diff -puN fs/gfs2/ops_file.c~git-nfsd fs/gfs2/ops_file.c --- a/fs/gfs2/ops_file.c~git-nfsd +++ a/fs/gfs2/ops_file.c @@ -535,7 +535,7 @@ static int gfs2_lock(struct file *file, if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; - if ((ip->i_inode.i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) + if (__mandatory_lock(&ip->i_inode)) return -ENOLCK; if (sdp->sd_args.ar_localflocks) { @@ -636,7 +636,7 @@ static int gfs2_flock(struct file *file, if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; - if ((ip->i_inode.i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) + if (__mandatory_lock(&ip->i_inode)) return -ENOLCK; if (sdp->sd_args.ar_localflocks) diff -puN fs/lockd/svc.c~git-nfsd fs/lockd/svc.c --- a/fs/lockd/svc.c~git-nfsd +++ a/fs/lockd/svc.c @@ -219,13 +219,12 @@ lockd(struct svc_rqst *rqstp) module_put_and_exit(0); } - -static int find_socket(struct svc_serv *serv, int proto) +static int find_xprt(struct svc_serv *serv, char *proto) { - struct svc_sock *svsk; + struct svc_xprt *xprt; int found = 0; - list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) - if (svsk->sk_sk->sk_protocol == proto) { + list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) + if (strcmp(xprt->xpt_class->xcl_name, proto) == 0) { found = 1; break; } @@ -243,13 +242,13 @@ static int make_socks(struct svc_serv *s int err = 0; if (proto == IPPROTO_UDP || nlm_udpport) - if (!find_socket(serv, IPPROTO_UDP)) - err = svc_makesock(serv, IPPROTO_UDP, nlm_udpport, - SVC_SOCK_DEFAULTS); + if (!find_xprt(serv, "udp")) + err = svc_create_xprt(serv, "udp", nlm_udpport, + SVC_SOCK_DEFAULTS); if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) - if (!find_socket(serv, IPPROTO_TCP)) - err = svc_makesock(serv, IPPROTO_TCP, nlm_tcpport, - SVC_SOCK_DEFAULTS); + if (!find_xprt(serv, "tcp")) + err = svc_create_xprt(serv, "tcp", nlm_tcpport, + SVC_SOCK_DEFAULTS); if (err >= 0) { warned = 0; diff -puN fs/locks.c~git-nfsd fs/locks.c --- a/fs/locks.c~git-nfsd +++ a/fs/locks.c @@ -534,7 +534,9 @@ static void locks_insert_block(struct fi static void locks_wake_up_blocks(struct file_lock *blocker) { while (!list_empty(&blocker->fl_block)) { - struct file_lock *waiter = list_entry(blocker->fl_block.next, + struct file_lock *waiter; + + waiter = list_first_entry(&blocker->fl_block, struct file_lock, fl_block); __locks_delete_block(waiter); if (waiter->fl_lmops && waiter->fl_lmops->fl_notify) @@ -668,7 +670,7 @@ posix_test_lock(struct file *filp, struc for (cfl = filp->f_path.dentry->d_inode->i_flock; cfl; cfl = 
cfl->fl_next) { if (!IS_POSIX(cfl)) continue; - if (posix_locks_conflict(cfl, fl)) + if (posix_locks_conflict(fl, cfl)) break; } if (cfl) @@ -698,13 +700,12 @@ EXPORT_SYMBOL(posix_test_lock); static int posix_locks_deadlock(struct file_lock *caller_fl, struct file_lock *block_fl) { - struct list_head *tmp; + struct file_lock *fl; next_task: if (posix_same_owner(caller_fl, block_fl)) return 1; - list_for_each(tmp, &blocked_list) { - struct file_lock *fl = list_entry(tmp, struct file_lock, fl_link); + list_for_each_entry(fl, &blocked_list, fl_link) { if (posix_same_owner(fl, block_fl)) { fl = fl->fl_next; block_fl = fl; @@ -715,8 +716,7 @@ next_task: } /* Try to create a FLOCK lock on filp. We always insert new FLOCK locks - * at the head of the list, but that's secret knowledge known only to - * flock_lock_file and posix_lock_file. + * after any leases, but before any posix locks. * * Note that if called with an FL_EXISTS argument, the caller may determine * whether or not a lock was successfully freed by testing the return @@ -733,6 +733,15 @@ static int flock_lock_file(struct file * lock_kernel(); if (request->fl_flags & FL_ACCESS) goto find_conflict; + + if (request->fl_type != F_UNLCK) { + error = -ENOMEM; + new_fl = locks_alloc_lock(); + if (new_fl == NULL) + goto out; + error = 0; + } + for_each_lock(inode, before) { struct file_lock *fl = *before; if (IS_POSIX(fl)) @@ -754,10 +763,6 @@ static int flock_lock_file(struct file * goto out; } - error = -ENOMEM; - new_fl = locks_alloc_lock(); - if (new_fl == NULL) - goto out; /* * If a higher-priority process was blocked on the old file lock, * give it the opportunity to lock the file. @@ -819,7 +824,7 @@ static int __posix_lock_file(struct inod lock_kernel(); if (request->fl_type != F_UNLCK) { for_each_lock(inode, before) { - struct file_lock *fl = *before; + fl = *before; if (!IS_POSIX(fl)) continue; if (!posix_locks_conflict(request, fl)) @@ -1113,7 +1118,7 @@ int locks_mandatory_area(int read_write, * If we've been sleeping someone might have * changed the permissions behind our back. */ - if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) + if (__mandatory_lock(inode)) continue; } @@ -1337,6 +1342,7 @@ int fcntl_getlease(struct file *filp) int generic_setlease(struct file *filp, long arg, struct file_lock **flp) { struct file_lock *fl, **before, **my_before = NULL, *lease; + struct file_lock *new_fl = NULL; struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; int error, rdlease_count = 0, wrlease_count = 0; @@ -1363,6 +1369,11 @@ int generic_setlease(struct file *filp, || (atomic_read(&inode->i_count) > 1))) goto out; + error = -ENOMEM; + new_fl = locks_alloc_lock(); + if (new_fl == NULL) + goto out; + /* * At this point, we know that if there is an exclusive * lease on this file, then we hold it on this filp @@ -1405,18 +1416,15 @@ int generic_setlease(struct file *filp, if (!leases_enable) goto out; - error = -ENOMEM; - fl = locks_alloc_lock(); - if (fl == NULL) - goto out; - - locks_copy_lock(fl, lease); + locks_copy_lock(new_fl, lease); + locks_insert_lock(before, new_fl); - locks_insert_lock(before, fl); + *flp = new_fl; + return 0; - *flp = fl; - error = 0; out: + if (new_fl != NULL) + locks_free_lock(new_fl); return error; } EXPORT_SYMBOL(generic_setlease); @@ -1752,9 +1760,7 @@ int fcntl_setlk(unsigned int fd, struct /* Don't allow mandatory locks on files that may be memory mapped * and shared. 
*/ - if (IS_MANDLOCK(inode) && - (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID && - mapping_writably_mapped(filp->f_mapping)) { + if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) { error = -EAGAIN; goto out; } @@ -1878,9 +1884,7 @@ int fcntl_setlk64(unsigned int fd, struc /* Don't allow mandatory locks on files that may be memory mapped * and shared. */ - if (IS_MANDLOCK(inode) && - (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID && - mapping_writably_mapped(filp->f_mapping)) { + if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) { error = -EAGAIN; goto out; } @@ -2062,138 +2066,114 @@ int vfs_cancel_lock(struct file *filp, s EXPORT_SYMBOL_GPL(vfs_cancel_lock); -static void lock_get_status(char* out, struct file_lock *fl, int id, char *pfx) +#ifdef CONFIG_PROC_FS +#include + +static void lock_get_status(struct seq_file *f, struct file_lock *fl, + int id, char *pfx) { struct inode *inode = NULL; if (fl->fl_file != NULL) inode = fl->fl_file->f_path.dentry->d_inode; - out += sprintf(out, "%d:%s ", id, pfx); + seq_printf(f, "%d:%s ", id, pfx); if (IS_POSIX(fl)) { - out += sprintf(out, "%6s %s ", + seq_printf(f, "%6s %s ", (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ", (inode == NULL) ? "*NOINODE*" : - (IS_MANDLOCK(inode) && - (inode->i_mode & (S_IXGRP | S_ISGID)) == S_ISGID) ? - "MANDATORY" : "ADVISORY "); + mandatory_lock(inode) ? "MANDATORY" : "ADVISORY "); } else if (IS_FLOCK(fl)) { if (fl->fl_type & LOCK_MAND) { - out += sprintf(out, "FLOCK MSNFS "); + seq_printf(f, "FLOCK MSNFS "); } else { - out += sprintf(out, "FLOCK ADVISORY "); + seq_printf(f, "FLOCK ADVISORY "); } } else if (IS_LEASE(fl)) { - out += sprintf(out, "LEASE "); + seq_printf(f, "LEASE "); if (fl->fl_type & F_INPROGRESS) - out += sprintf(out, "BREAKING "); + seq_printf(f, "BREAKING "); else if (fl->fl_file) - out += sprintf(out, "ACTIVE "); + seq_printf(f, "ACTIVE "); else - out += sprintf(out, "BREAKER "); + seq_printf(f, "BREAKER "); } else { - out += sprintf(out, "UNKNOWN UNKNOWN "); + seq_printf(f, "UNKNOWN UNKNOWN "); } if (fl->fl_type & LOCK_MAND) { - out += sprintf(out, "%s ", + seq_printf(f, "%s ", (fl->fl_type & LOCK_READ) ? (fl->fl_type & LOCK_WRITE) ? "RW " : "READ " : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE "); } else { - out += sprintf(out, "%s ", + seq_printf(f, "%s ", (fl->fl_type & F_INPROGRESS) ? (fl->fl_type & F_UNLCK) ? "UNLCK" : "READ " : (fl->fl_type & F_WRLCK) ? 
"WRITE" : "READ "); } if (inode) { #ifdef WE_CAN_BREAK_LSLK_NOW - out += sprintf(out, "%d %s:%ld ", fl->fl_pid, + seq_printf(f, "%d %s:%ld ", fl->fl_pid, inode->i_sb->s_id, inode->i_ino); #else /* userspace relies on this representation of dev_t ;-( */ - out += sprintf(out, "%d %02x:%02x:%ld ", fl->fl_pid, + seq_printf(f, "%d %02x:%02x:%ld ", fl->fl_pid, MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino); #endif } else { - out += sprintf(out, "%d :0 ", fl->fl_pid); + seq_printf(f, "%d :0 ", fl->fl_pid); } if (IS_POSIX(fl)) { if (fl->fl_end == OFFSET_MAX) - out += sprintf(out, "%Ld EOF\n", fl->fl_start); + seq_printf(f, "%Ld EOF\n", fl->fl_start); else - out += sprintf(out, "%Ld %Ld\n", fl->fl_start, - fl->fl_end); + seq_printf(f, "%Ld %Ld\n", fl->fl_start, fl->fl_end); } else { - out += sprintf(out, "0 EOF\n"); + seq_printf(f, "0 EOF\n"); } } -static void move_lock_status(char **p, off_t* pos, off_t offset) +static int locks_show(struct seq_file *f, void *v) { - int len; - len = strlen(*p); - if(*pos >= offset) { - /* the complete line is valid */ - *p += len; - *pos += len; - return; - } - if(*pos+len > offset) { - /* use the second part of the line */ - int i = offset-*pos; - memmove(*p,*p+i,len-i); - *p += len-i; - *pos += len; - return; - } - /* discard the complete line */ - *pos += len; -} + struct file_lock *fl, *bfl; -/** - * get_locks_status - reports lock usage in /proc/locks - * @buffer: address in userspace to write into - * @start: ? - * @offset: how far we are through the buffer - * @length: how much to read - */ + fl = list_entry(v, struct file_lock, fl_link); + + lock_get_status(f, fl, (long)f->private, ""); + + list_for_each_entry(bfl, &fl->fl_block, fl_block) + lock_get_status(f, bfl, (long)f->private, " ->"); -int get_locks_status(char *buffer, char **start, off_t offset, int length) -{ - struct list_head *tmp; - char *q = buffer; - off_t pos = 0; - int i = 0; + f->private++; + return 0; +} +static void *locks_start(struct seq_file *f, loff_t *pos) +{ lock_kernel(); - list_for_each(tmp, &file_lock_list) { - struct list_head *btmp; - struct file_lock *fl = list_entry(tmp, struct file_lock, fl_link); - lock_get_status(q, fl, ++i, ""); - move_lock_status(&q, &pos, offset); - - if(pos >= offset+length) - goto done; - - list_for_each(btmp, &fl->fl_block) { - struct file_lock *bfl = list_entry(btmp, - struct file_lock, fl_block); - lock_get_status(q, bfl, i, " ->"); - move_lock_status(&q, &pos, offset); + f->private = (void *)1; + return seq_list_start(&file_lock_list, *pos); +} - if(pos >= offset+length) - goto done; - } - } -done: +static void *locks_next(struct seq_file *f, void *v, loff_t *pos) +{ + return seq_list_next(v, &file_lock_list, pos); +} + +static void locks_stop(struct seq_file *f, void *v) +{ unlock_kernel(); - *start = buffer; - if(q-buffer < length) - return (q-buffer); - return length; } +struct seq_operations locks_seq_operations = { + .start = locks_start, + .next = locks_next, + .stop = locks_stop, + .show = locks_show, +}; +#endif + /** * lock_may_read - checks that the region is free of locks * @inode: the inode that is being read diff -puN fs/nfs/callback.c~git-nfsd fs/nfs/callback.c --- a/fs/nfs/callback.c~git-nfsd +++ a/fs/nfs/callback.c @@ -123,8 +123,8 @@ int nfs_callback_up(void) if (!serv) goto out_err; - ret = svc_makesock(serv, IPPROTO_TCP, nfs_callback_set_tcpport, - SVC_SOCK_ANONYMOUS); + ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport, + SVC_SOCK_ANONYMOUS); if (ret <= 0) goto out_destroy; nfs_callback_tcpport = 
ret; diff -puN fs/nfs/file.c~git-nfsd fs/nfs/file.c --- a/fs/nfs/file.c~git-nfsd +++ a/fs/nfs/file.c @@ -577,8 +577,7 @@ static int nfs_lock(struct file *filp, i nfs_inc_stats(inode, NFSIOS_VFSLOCK); /* No mandatory locks over NFS */ - if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID && - fl->fl_type != F_UNLCK) + if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) return -ENOLCK; if (IS_GETLK(cmd)) diff -puN fs/nfsd/nfs3xdr.c~git-nfsd fs/nfsd/nfs3xdr.c --- a/fs/nfsd/nfs3xdr.c~git-nfsd +++ a/fs/nfsd/nfs3xdr.c @@ -174,9 +174,6 @@ static __be32 * encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat) { - struct dentry *dentry = fhp->fh_dentry; - struct timespec time; - *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]); *p++ = htonl((u32) stat->mode); *p++ = htonl((u32) stat->nlink); @@ -191,10 +188,9 @@ encode_fattr3(struct svc_rqst *rqstp, __ *p++ = htonl((u32) MAJOR(stat->rdev)); *p++ = htonl((u32) MINOR(stat->rdev)); p = encode_fsid(p, fhp); - p = xdr_encode_hyper(p, (u64) stat->ino); + p = xdr_encode_hyper(p, stat->ino); p = encode_time3(p, &stat->atime); - lease_get_mtime(dentry->d_inode, &time); - p = encode_time3(p, &time); + p = encode_time3(p, &stat->mtime); p = encode_time3(p, &stat->ctime); return p; @@ -203,31 +199,9 @@ encode_fattr3(struct svc_rqst *rqstp, __ static __be32 * encode_saved_post_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) { - struct inode *inode = fhp->fh_dentry->d_inode; - /* Attributes to follow */ *p++ = xdr_one; - - *p++ = htonl(nfs3_ftypes[(fhp->fh_post_mode & S_IFMT) >> 12]); - *p++ = htonl((u32) fhp->fh_post_mode); - *p++ = htonl((u32) fhp->fh_post_nlink); - *p++ = htonl((u32) nfsd_ruid(rqstp, fhp->fh_post_uid)); - *p++ = htonl((u32) nfsd_rgid(rqstp, fhp->fh_post_gid)); - if (S_ISLNK(fhp->fh_post_mode) && fhp->fh_post_size > NFS3_MAXPATHLEN) { - p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN); - } else { - p = xdr_encode_hyper(p, (u64) fhp->fh_post_size); - } - p = xdr_encode_hyper(p, ((u64)fhp->fh_post_blocks) << 9); - *p++ = fhp->fh_post_rdev[0]; - *p++ = fhp->fh_post_rdev[1]; - p = encode_fsid(p, fhp); - p = xdr_encode_hyper(p, (u64) inode->i_ino); - p = encode_time3(p, &fhp->fh_post_atime); - p = encode_time3(p, &fhp->fh_post_mtime); - p = encode_time3(p, &fhp->fh_post_ctime); - - return p; + return encode_fattr3(rqstp, p, fhp, &fhp->fh_post_attr); } /* @@ -246,6 +220,7 @@ encode_post_op_attr(struct svc_rqst *rqs err = vfs_getattr(fhp->fh_export->ex_mnt, dentry, &stat); if (!err) { *p++ = xdr_one; /* attributes follow */ + lease_get_mtime(dentry->d_inode, &stat.mtime); return encode_fattr3(rqstp, p, fhp, &stat); } } @@ -284,6 +259,23 @@ encode_wcc_data(struct svc_rqst *rqstp, return encode_post_op_attr(rqstp, p, fhp); } +/* + * Fill in the post_op attr for the wcc data + */ +void fill_post_wcc(struct svc_fh *fhp) +{ + int err; + + if (fhp->fh_post_saved) + printk("nfsd: inode locked twice during operation.\n"); + + err = vfs_getattr(fhp->fh_export->ex_mnt, fhp->fh_dentry, + &fhp->fh_post_attr); + if (err) + fhp->fh_post_saved = 0; + else + fhp->fh_post_saved = 1; +} /* * XDR decode functions @@ -643,8 +635,11 @@ int nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_attrstat *resp) { - if (resp->status == 0) + if (resp->status == 0) { + lease_get_mtime(resp->fh.fh_dentry->d_inode, + &resp->stat.mtime); p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat); + } return xdr_ressize_check(rqstp, p); } @@ -802,7 +797,7 @@ nfs3svc_encode_readdirres(struct svc_rqs static __be32 * 
encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, - int namlen, ino_t ino) + int namlen, u64 ino) { *p++ = xdr_one; /* mark entry present */ p = xdr_encode_hyper(p, ino); /* file id */ @@ -873,7 +868,7 @@ compose_entry_fh(struct nfsd3_readdirres #define NFS3_ENTRYPLUS_BAGGAGE (1 + 21 + 1 + (NFS3_FHSIZE >> 2)) static int encode_entry(struct readdir_cd *ccd, const char *name, int namlen, - loff_t offset, ino_t ino, unsigned int d_type, int plus) + loff_t offset, u64 ino, unsigned int d_type, int plus) { struct nfsd3_readdirres *cd = container_of(ccd, struct nfsd3_readdirres, common); diff -puN fs/nfsd/nfs4callback.c~git-nfsd fs/nfsd/nfs4callback.c --- a/fs/nfsd/nfs4callback.c~git-nfsd +++ a/fs/nfsd/nfs4callback.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -343,26 +344,28 @@ static struct rpc_version * nfs_cb_versi &nfs_cb_version4, }; -/* - * Use the SETCLIENTID credential - */ -static struct rpc_cred * -nfsd4_lookupcred(struct nfs4_client *clp, int taskflags) +/* Reference counting, callback cleanup, etc., all look racy as heck. + * And why is cb_set an atomic? */ + +static int do_probe_callback(void *data) { - struct auth_cred acred; - struct rpc_clnt *clnt = clp->cl_callback.cb_client; - struct rpc_cred *ret; + struct nfs4_client *clp = data; + struct nfs4_callback *cb = &clp->cl_callback; + struct rpc_message msg = { + .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], + .rpc_argp = clp, + }; + int status; - get_group_info(clp->cl_cred.cr_group_info); - acred.uid = clp->cl_cred.cr_uid; - acred.gid = clp->cl_cred.cr_gid; - acred.group_info = clp->cl_cred.cr_group_info; - - dprintk("NFSD: looking up %s cred\n", - clnt->cl_auth->au_ops->au_name); - ret = rpcauth_lookup_credcache(clnt->cl_auth, &acred, taskflags); - put_group_info(clp->cl_cred.cr_group_info); - return ret; + status = rpc_call_sync(cb->cb_client, &msg, RPC_TASK_SOFT); + + if (status) { + rpc_shutdown_client(cb->cb_client); + cb->cb_client = NULL; + } else + atomic_set(&cb->cb_set, 1); + put_nfs4_client(clp); + return 0; } /* @@ -390,11 +393,7 @@ nfsd4_probe_callback(struct nfs4_client .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... 
*/ .flags = (RPC_CLNT_CREATE_NOPING), }; - struct rpc_message msg = { - .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], - .rpc_argp = clp, - }; - int status; + struct task_struct *t; if (atomic_read(&cb->cb_set)) return; @@ -426,16 +425,11 @@ nfsd4_probe_callback(struct nfs4_client /* the task holds a reference to the nfs4_client struct */ atomic_inc(&clp->cl_count); - msg.rpc_cred = nfsd4_lookupcred(clp,0); - if (IS_ERR(msg.rpc_cred)) - goto out_release_clp; - status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_ASYNC, &nfs4_cb_null_ops, NULL); - put_rpccred(msg.rpc_cred); + t = kthread_run(do_probe_callback, clp, "nfs4_cb_probe"); - if (status != 0) { - dprintk("NFSD: asynchronous NFSPROC4_CB_NULL failed!\n"); + if (IS_ERR(t)) goto out_release_clp; - } + return; out_release_clp: @@ -447,30 +441,6 @@ out_err: (int)clp->cl_name.len, clp->cl_name.data); } -static void -nfs4_cb_null(struct rpc_task *task, void *dummy) -{ - struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp; - struct nfs4_callback *cb = &clp->cl_callback; - __be32 addr = htonl(cb->cb_addr); - - dprintk("NFSD: nfs4_cb_null task->tk_status %d\n", task->tk_status); - - if (task->tk_status < 0) { - dprintk("NFSD: callback establishment to client %.*s failed\n", - (int)clp->cl_name.len, clp->cl_name.data); - goto out; - } - atomic_set(&cb->cb_set, 1); - dprintk("NFSD: callback set to client %u.%u.%u.%u\n", NIPQUAD(addr)); -out: - put_nfs4_client(clp); -} - -static const struct rpc_call_ops nfs4_cb_null_ops = { - .rpc_call_done = nfs4_cb_null, -}; - /* * called with dp->dl_count inc'ed. * nfs4_lock_state() may or may not have been called. @@ -491,10 +461,6 @@ nfsd4_cb_recall(struct nfs4_delegation * if ((!atomic_read(&clp->cl_callback.cb_set)) || !clnt) return; - msg.rpc_cred = nfsd4_lookupcred(clp, 0); - if (IS_ERR(msg.rpc_cred)) - goto out; - cbr->cbr_trunc = 0; /* XXX need to implement truncate optimization */ cbr->cbr_dp = dp; @@ -515,13 +481,12 @@ nfsd4_cb_recall(struct nfs4_delegation * status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT); } out_put_cred: - put_rpccred(msg.rpc_cred); -out: if (status == -EIO) atomic_set(&clp->cl_callback.cb_set, 0); /* Success or failure, now we're either waiting for lease expiration * or deleg_return. 
*/ dprintk("NFSD: nfs4_cb_recall: dp %p dl_flock %p dl_count %d\n",dp, dp->dl_flock, atomic_read(&dp->dl_count)); + put_nfs4_client(clp); nfs4_put_delegation(dp); return; } diff -puN fs/nfsd/nfs4idmap.c~git-nfsd fs/nfsd/nfs4idmap.c --- a/fs/nfsd/nfs4idmap.c~git-nfsd +++ a/fs/nfsd/nfs4idmap.c @@ -207,6 +207,7 @@ idtoname_parse(struct cache_detail *cd, { struct ent ent, *res; char *buf1, *bp; + int len; int error = -EINVAL; if (buf[buflen - 1] != '\n') @@ -248,10 +249,11 @@ idtoname_parse(struct cache_detail *cd, goto out; /* Name */ - error = qword_get(&buf, buf1, PAGE_SIZE); - if (error == -EINVAL) + error = -EINVAL; + len = qword_get(&buf, buf1, PAGE_SIZE); + if (len < 0) goto out; - if (error == -ENOENT) + if (len == 0) set_bit(CACHE_NEGATIVE, &ent.h.flags); else { if (error >= IDMAP_NAMESZ) { diff -puN fs/nfsd/nfs4proc.c~git-nfsd fs/nfsd/nfs4proc.c --- a/fs/nfsd/nfs4proc.c~git-nfsd +++ a/fs/nfsd/nfs4proc.c @@ -238,12 +238,12 @@ nfsd4_open(struct svc_rqst *rqstp, struc break; case NFS4_OPEN_CLAIM_DELEGATE_PREV: open->op_stateowner->so_confirmed = 1; - printk("NFSD: unsupported OPEN claim type %d\n", + dprintk("NFSD: unsupported OPEN claim type %d\n", open->op_claim_type); status = nfserr_notsupp; goto out; default: - printk("NFSD: Invalid OPEN claim type %d\n", + dprintk("NFSD: Invalid OPEN claim type %d\n", open->op_claim_type); status = nfserr_inval; goto out; diff -puN fs/nfsd/nfs4state.c~git-nfsd fs/nfsd/nfs4state.c --- a/fs/nfsd/nfs4state.c~git-nfsd +++ a/fs/nfsd/nfs4state.c @@ -358,9 +358,22 @@ alloc_client(struct xdr_netobj name) return clp; } +static void +shutdown_callback_client(struct nfs4_client *clp) +{ + struct rpc_clnt *clnt = clp->cl_callback.cb_client; + + /* shutdown rpc client, ending any outstanding recall rpcs */ + if (clnt) { + clp->cl_callback.cb_client = NULL; + rpc_shutdown_client(clnt); + } +} + static inline void free_client(struct nfs4_client *clp) { + shutdown_callback_client(clp); if (clp->cl_cred.cr_group_info) put_group_info(clp->cl_cred.cr_group_info); kfree(clp->cl_name.data); @@ -375,18 +388,6 @@ put_nfs4_client(struct nfs4_client *clp) } static void -shutdown_callback_client(struct nfs4_client *clp) -{ - struct rpc_clnt *clnt = clp->cl_callback.cb_client; - - /* shutdown rpc client, ending any outstanding recall rpcs */ - if (clnt) { - clp->cl_callback.cb_client = NULL; - rpc_shutdown_client(clnt); - } -} - -static void expire_client(struct nfs4_client *clp) { struct nfs4_stateowner *sop; @@ -396,8 +397,6 @@ expire_client(struct nfs4_client *clp) dprintk("NFSD: expire_client cl_count %d\n", atomic_read(&clp->cl_count)); - shutdown_callback_client(clp); - INIT_LIST_HEAD(&reaplist); spin_lock(&recall_lock); while (!list_empty(&clp->cl_delegations)) { @@ -462,26 +461,28 @@ copy_cred(struct svc_cred *target, struc } static inline int -same_name(const char *n1, const char *n2) { +same_name(const char *n1, const char *n2) +{ return 0 == memcmp(n1, n2, HEXDIR_LEN); } static int -cmp_verf(nfs4_verifier *v1, nfs4_verifier *v2) { - return(!memcmp(v1->data,v2->data,sizeof(v1->data))); +same_verf(nfs4_verifier *v1, nfs4_verifier *v2) +{ + return 0 == memcmp(v1->data, v2->data, sizeof(v1->data)); } static int -cmp_clid(clientid_t * cl1, clientid_t * cl2) { - return((cl1->cl_boot == cl2->cl_boot) && - (cl1->cl_id == cl2->cl_id)); +same_clid(clientid_t *cl1, clientid_t *cl2) +{ + return (cl1->cl_boot == cl2->cl_boot) && (cl1->cl_id == cl2->cl_id); } /* XXX what about NGROUP */ static int -cmp_creds(struct svc_cred *cr1, struct svc_cred *cr2){ - return(cr1->cr_uid == 
cr2->cr_uid); - +same_creds(struct svc_cred *cr1, struct svc_cred *cr2) +{ + return cr1->cr_uid == cr2->cr_uid; } static void @@ -507,7 +508,7 @@ check_name(struct xdr_netobj name) { if (name.len == 0) return 0; if (name.len > NFS4_OPAQUE_LIMIT) { - printk("NFSD: check_name: name too long(%d)!\n", name.len); + dprintk("NFSD: check_name: name too long(%d)!\n", name.len); return 0; } return 1; @@ -546,7 +547,7 @@ find_confirmed_client(clientid_t *clid) unsigned int idhashval = clientid_hashval(clid->cl_id); list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) { - if (cmp_clid(&clp->cl_clientid, clid)) + if (same_clid(&clp->cl_clientid, clid)) return clp; } return NULL; @@ -559,7 +560,7 @@ find_unconfirmed_client(clientid_t *clid unsigned int idhashval = clientid_hashval(clid->cl_id); list_for_each_entry(clp, &unconf_id_hashtbl[idhashval], cl_idhash) { - if (cmp_clid(&clp->cl_clientid, clid)) + if (same_clid(&clp->cl_clientid, clid)) return clp; } return NULL; @@ -753,7 +754,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp * or different ip_address */ status = nfserr_clid_inuse; - if (!cmp_creds(&conf->cl_cred, &rqstp->rq_cred) + if (!same_creds(&conf->cl_cred, &rqstp->rq_cred) || conf->cl_addr != sin->sin_addr.s_addr) { dprintk("NFSD: setclientid: string in use by client" "at %u.%u.%u.%u\n", NIPQUAD(conf->cl_addr)); @@ -772,14 +773,8 @@ nfsd4_setclientid(struct svc_rqst *rqstp new = create_client(clname, dname); if (new == NULL) goto out; - copy_verf(new, &clverifier); - new->cl_addr = sin->sin_addr.s_addr; - copy_cred(&new->cl_cred,&rqstp->rq_cred); gen_clid(new); - gen_confirm(new); - gen_callback(new, setclid); - add_to_unconfirmed(new, strhashval); - } else if (cmp_verf(&conf->cl_verifier, &clverifier)) { + } else if (same_verf(&conf->cl_verifier, &clverifier)) { /* * CASE 1: * cl_name match, confirmed, principal match @@ -804,13 +799,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp new = create_client(clname, dname); if (new == NULL) goto out; - copy_verf(new,&conf->cl_verifier); - new->cl_addr = sin->sin_addr.s_addr; - copy_cred(&new->cl_cred,&rqstp->rq_cred); copy_clid(new, conf); - gen_confirm(new); - gen_callback(new, setclid); - add_to_unconfirmed(new,strhashval); } else if (!unconf) { /* * CASE 2: @@ -823,14 +812,8 @@ nfsd4_setclientid(struct svc_rqst *rqstp new = create_client(clname, dname); if (new == NULL) goto out; - copy_verf(new,&clverifier); - new->cl_addr = sin->sin_addr.s_addr; - copy_cred(&new->cl_cred,&rqstp->rq_cred); gen_clid(new); - gen_confirm(new); - gen_callback(new, setclid); - add_to_unconfirmed(new, strhashval); - } else if (!cmp_verf(&conf->cl_confirm, &unconf->cl_confirm)) { + } else if (!same_verf(&conf->cl_confirm, &unconf->cl_confirm)) { /* * CASE3: * confirmed found (name, principal match) @@ -850,19 +833,19 @@ nfsd4_setclientid(struct svc_rqst *rqstp new = create_client(clname, dname); if (new == NULL) goto out; - copy_verf(new,&clverifier); - new->cl_addr = sin->sin_addr.s_addr; - copy_cred(&new->cl_cred,&rqstp->rq_cred); gen_clid(new); - gen_confirm(new); - gen_callback(new, setclid); - add_to_unconfirmed(new, strhashval); } else { /* No cases hit !!! 
*/ status = nfserr_inval; goto out; } + copy_verf(new, &clverifier); + new->cl_addr = sin->sin_addr.s_addr; + copy_cred(&new->cl_cred, &rqstp->rq_cred); + gen_confirm(new); + gen_callback(new, setclid); + add_to_unconfirmed(new, strhashval); setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; setclid->se_clientid.cl_id = new->cl_clientid.cl_id; memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data)); @@ -910,16 +893,16 @@ nfsd4_setclientid_confirm(struct svc_rqs goto out; if ((conf && unconf) && - (cmp_verf(&unconf->cl_confirm, &confirm)) && - (cmp_verf(&conf->cl_verifier, &unconf->cl_verifier)) && + (same_verf(&unconf->cl_confirm, &confirm)) && + (same_verf(&conf->cl_verifier, &unconf->cl_verifier)) && (same_name(conf->cl_recdir,unconf->cl_recdir)) && - (!cmp_verf(&conf->cl_confirm, &unconf->cl_confirm))) { + (!same_verf(&conf->cl_confirm, &unconf->cl_confirm))) { /* CASE 1: * unconf record that matches input clientid and input confirm. * conf record that matches input clientid. * conf and unconf records match names, verifiers */ - if (!cmp_creds(&conf->cl_cred, &unconf->cl_cred)) + if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) status = nfserr_clid_inuse; else { /* XXX: We just turn off callbacks until we can handle @@ -933,7 +916,7 @@ nfsd4_setclientid_confirm(struct svc_rqs } } else if ((conf && !unconf) || ((conf && unconf) && - (!cmp_verf(&conf->cl_verifier, &unconf->cl_verifier) || + (!same_verf(&conf->cl_verifier, &unconf->cl_verifier) || !same_name(conf->cl_recdir, unconf->cl_recdir)))) { /* CASE 2: * conf record that matches input clientid. @@ -941,18 +924,18 @@ nfsd4_setclientid_confirm(struct svc_rqs * unconf->cl_name or unconf->cl_verifier don't match the * conf record. */ - if (!cmp_creds(&conf->cl_cred,&rqstp->rq_cred)) + if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) status = nfserr_clid_inuse; else status = nfs_ok; } else if (!conf && unconf - && cmp_verf(&unconf->cl_confirm, &confirm)) { + && same_verf(&unconf->cl_confirm, &confirm)) { /* CASE 3: * conf record not found. * unconf record found. 
* unconf->cl_confirm matches input confirm */ - if (!cmp_creds(&unconf->cl_cred, &rqstp->rq_cred)) { + if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred)) { status = nfserr_clid_inuse; } else { unsigned int hash = @@ -967,8 +950,8 @@ nfsd4_setclientid_confirm(struct svc_rqs conf = unconf; status = nfs_ok; } - } else if ((!conf || (conf && !cmp_verf(&conf->cl_confirm, &confirm))) - && (!unconf || (unconf && !cmp_verf(&unconf->cl_confirm, + } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm))) + && (!unconf || (unconf && !same_verf(&unconf->cl_confirm, &confirm)))) { /* CASE 4: * conf record not found, or if conf, conf->cl_confirm does not @@ -1019,7 +1002,7 @@ nfsd4_free_slab(struct kmem_cache **slab *slab = NULL; } -static void +void nfsd4_free_slabs(void) { nfsd4_free_slab(&stateowner_slab); @@ -1207,10 +1190,12 @@ move_to_close_lru(struct nfs4_stateowner } static int -cmp_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner, clientid_t *clid) { - return ((sop->so_owner.len == owner->len) && - !memcmp(sop->so_owner.data, owner->data, owner->len) && - (sop->so_client->cl_clientid.cl_id == clid->cl_id)); +same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner, + clientid_t *clid) +{ + return (sop->so_owner.len == owner->len) && + 0 == memcmp(sop->so_owner.data, owner->data, owner->len) && + (sop->so_client->cl_clientid.cl_id == clid->cl_id); } static struct nfs4_stateowner * @@ -1219,7 +1204,7 @@ find_openstateowner_str(unsigned int has struct nfs4_stateowner *so = NULL; list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { - if (cmp_owner_str(so, &open->op_owner, &open->op_clientid)) + if (same_owner_str(so, &open->op_owner, &open->op_clientid)) return so; } return NULL; @@ -1360,6 +1345,7 @@ void nfsd_break_deleg_cb(struct file_loc * lock) we know the server hasn't removed the lease yet, we know * it's safe to take a reference: */ atomic_inc(&dp->dl_count); + atomic_inc(&dp->dl_client->cl_count); spin_lock(&recall_lock); list_add_tail(&dp->dl_recall_lru, &del_recall_lru); @@ -1368,8 +1354,12 @@ void nfsd_break_deleg_cb(struct file_loc /* only place dl_time is set. protected by lock_kernel*/ dp->dl_time = get_seconds(); - /* XXX need to merge NFSD_LEASE_TIME with fs/locks.c:lease_break_time */ - fl->fl_break_time = jiffies + NFSD_LEASE_TIME * HZ; + /* + * We don't want the locks code to timeout the lease for us; + * we'll remove it ourself if the delegation isn't returned + * in time. 
+ */ + fl->fl_break_time = 0; t = kthread_run(do_recall, dp, "%s", "nfs4_cb_recall"); if (IS_ERR(t)) { @@ -1378,6 +1368,7 @@ void nfsd_break_deleg_cb(struct file_loc printk(KERN_INFO "NFSD: Callback thread failed for " "for client (clientid %08x/%08x)\n", clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); + put_nfs4_client(dp->dl_client); nfs4_put_delegation(dp); } } @@ -1738,7 +1729,7 @@ out: if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && flag == NFS4_OPEN_DELEGATE_NONE && open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) - printk("NFSD: WARNING: refusing delegation reclaim\n"); + dprintk("NFSD: WARNING: refusing delegation reclaim\n"); open->op_delegate_type = flag; } @@ -2044,7 +2035,7 @@ static inline int io_during_grace_disallowed(struct inode *inode, int flags) { return nfs4_in_grace() && (flags & (RD_STATE | WR_STATE)) - && MANDATORY_LOCK(inode); + && mandatory_lock(inode); } /* @@ -2147,7 +2138,7 @@ nfs4_preprocess_seqid_op(struct svc_fh * *sopp = NULL; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { - printk("NFSD: preprocess_seqid_op: magic stateid!\n"); + dprintk("NFSD: preprocess_seqid_op: magic stateid!\n"); return nfserr_bad_stateid; } @@ -2181,25 +2172,24 @@ nfs4_preprocess_seqid_op(struct svc_fh * lkflg = setlkflg(lock->lk_type); if (lock->lk_is_new) { - if (!sop->so_is_open_owner) - return nfserr_bad_stateid; - if (!cmp_clid(&clp->cl_clientid, lockclid)) + if (!sop->so_is_open_owner) + return nfserr_bad_stateid; + if (!same_clid(&clp->cl_clientid, lockclid)) return nfserr_bad_stateid; - /* stp is the open stateid */ - status = nfs4_check_openmode(stp, lkflg); - if (status) - return status; - } else { - /* stp is the lock stateid */ - status = nfs4_check_openmode(stp->st_openstp, lkflg); - if (status) - return status; + /* stp is the open stateid */ + status = nfs4_check_openmode(stp, lkflg); + if (status) + return status; + } else { + /* stp is the lock stateid */ + status = nfs4_check_openmode(stp->st_openstp, lkflg); + if (status) + return status; } - } if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp)) { - printk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n"); + dprintk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n"); return nfserr_bad_stateid; } @@ -2215,22 +2205,22 @@ nfs4_preprocess_seqid_op(struct svc_fh * goto check_replay; if (sop->so_confirmed && flags & CONFIRM) { - printk("NFSD: preprocess_seqid_op: expected" + dprintk("NFSD: preprocess_seqid_op: expected" " unconfirmed stateowner!\n"); return nfserr_bad_stateid; } if (!sop->so_confirmed && !(flags & CONFIRM)) { - printk("NFSD: preprocess_seqid_op: stateowner not" + dprintk("NFSD: preprocess_seqid_op: stateowner not" " confirmed yet!\n"); return nfserr_bad_stateid; } if (stateid->si_generation > stp->st_stateid.si_generation) { - printk("NFSD: preprocess_seqid_op: future stateid?!\n"); + dprintk("NFSD: preprocess_seqid_op: future stateid?!\n"); return nfserr_bad_stateid; } if (stateid->si_generation < stp->st_stateid.si_generation) { - printk("NFSD: preprocess_seqid_op: old stateid!\n"); + dprintk("NFSD: preprocess_seqid_op: old stateid!\n"); return nfserr_old_stateid; } renew_client(sop->so_client); @@ -2242,7 +2232,7 @@ check_replay: /* indicate replay to calling function */ return nfserr_replay_me; } - printk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n", + dprintk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n", sop->so_seqid, seqid); *sopp = NULL; return nfserr_bad_seqid; @@ -2561,7 +2551,7 @@ find_lockstateowner_str(struct inode *in struct 
nfs4_stateowner *op; list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) { - if (cmp_owner_str(op, owner, clid)) + if (same_owner_str(op, owner, clid)) return op; } return NULL; @@ -2855,7 +2845,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, stru file_lock.fl_type = F_WRLCK; break; default: - printk("NFSD: nfs4_lockt: bad lock type!\n"); + dprintk("NFSD: nfs4_lockt: bad lock type!\n"); status = nfserr_inval; goto out; } @@ -3025,7 +3015,7 @@ nfsd4_release_lockowner(struct svc_rqst INIT_LIST_HEAD(&matches); for (i = 0; i < LOCK_HASH_SIZE; i++) { list_for_each_entry(sop, &lock_ownerid_hashtbl[i], so_idhash) { - if (!cmp_owner_str(sop, owner, clid)) + if (!same_owner_str(sop, owner, clid)) continue; list_for_each_entry(stp, &sop->so_stateids, st_perstateowner) { @@ -3149,11 +3139,14 @@ nfs4_check_open_reclaim(clientid_t *clid /* initialization to perform at module load time: */ -void +int nfs4_state_init(void) { - int i; + int i, status; + status = nfsd4_init_slabs(); + if (status) + return status; for (i = 0; i < CLIENT_HASH_SIZE; i++) { INIT_LIST_HEAD(&conf_id_hashtbl[i]); INIT_LIST_HEAD(&conf_str_hashtbl[i]); @@ -3182,6 +3175,7 @@ nfs4_state_init(void) for (i = 0; i < CLIENT_HASH_SIZE; i++) INIT_LIST_HEAD(&reclaim_str_hashtbl[i]); reclaim_str_hashtbl_size = 0; + return 0; } static void @@ -3242,20 +3236,15 @@ __nfs4_state_start(void) set_max_delegations(); } -int +void nfs4_state_start(void) { - int status; - if (nfs4_init) - return 0; - status = nfsd4_init_slabs(); - if (status) - return status; + return; nfsd4_load_reboot_recovery_data(); __nfs4_state_start(); nfs4_init = 1; - return 0; + return; } int @@ -3313,7 +3302,6 @@ nfs4_state_shutdown(void) nfs4_lock_state(); nfs4_release_reclaim(); __nfs4_state_shutdown(); - nfsd4_free_slabs(); nfs4_unlock_state(); } diff -puN fs/nfsd/nfs4xdr.c~git-nfsd fs/nfsd/nfs4xdr.c --- a/fs/nfsd/nfs4xdr.c~git-nfsd +++ a/fs/nfsd/nfs4xdr.c @@ -1479,7 +1479,8 @@ nfsd4_encode_fattr(struct svc_fh *fhp, s err = vfs_getattr(exp->ex_mnt, dentry, &stat); if (err) goto out_nfserr; - if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL)) || + if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | + FATTR4_WORD0_MAXNAME)) || (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL))) { err = vfs_statfs(dentry, &statfs); @@ -1683,7 +1684,7 @@ out_acl: if (bmval0 & FATTR4_WORD0_FILEID) { if ((buflen -= 8) < 0) goto out_resource; - WRITE64((u64) stat.ino); + WRITE64(stat.ino); } if (bmval0 & FATTR4_WORD0_FILES_AVAIL) { if ((buflen -= 8) < 0) @@ -1725,7 +1726,7 @@ out_acl: if (bmval0 & FATTR4_WORD0_MAXNAME) { if ((buflen -= 4) < 0) goto out_resource; - WRITE32(~(u32) 0); + WRITE32(statfs.f_namelen); } if (bmval0 & FATTR4_WORD0_MAXREAD) { if ((buflen -= 8) < 0) @@ -1825,16 +1826,15 @@ out_acl: WRITE32(stat.mtime.tv_nsec); } if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { - struct dentry *mnt_pnt, *mnt_root; - if ((buflen -= 8) < 0) goto out_resource; - mnt_root = exp->ex_mnt->mnt_root; - if (mnt_root->d_inode == dentry->d_inode) { - mnt_pnt = exp->ex_mnt->mnt_mountpoint; - WRITE64((u64) mnt_pnt->d_inode->i_ino); - } else - WRITE64((u64) stat.ino); + if (exp->ex_mnt->mnt_root->d_inode == dentry->d_inode) { + err = vfs_getattr(exp->ex_mnt->mnt_parent, + exp->ex_mnt->mnt_mountpoint, &stat); + if (err) + goto out_nfserr; + } + WRITE64(stat.ino); } *attrlenp = htonl((char *)p - (char *)attrlenp - 4); *countp = p - buffer; diff -puN fs/nfsd/nfsctl.c~git-nfsd fs/nfsd/nfsctl.c --- a/fs/nfsd/nfsctl.c~git-nfsd +++ 
a/fs/nfsd/nfsctl.c @@ -298,7 +298,7 @@ static ssize_t write_filehandle(struct f * qword quoting is used, so filehandle will be \x.... */ char *dname, *path; - int maxsize; + int uninitialized_var(maxsize); char *mesg = buf; int len; struct auth_domain *dom; @@ -554,6 +554,22 @@ static ssize_t write_ports(struct file * kfree(toclose); return len; } + /* + * Add a transport listener by writing it's transport name + */ + if (isalnum(buf[0])) { + int err; + char transport[16]; + int port; + if (sscanf(buf, "%15s %4d", transport, &port) == 2) { + err = nfsd_create_serv(); + if (!err) + err = svc_create_xprt(nfsd_serv, + transport, port, + SVC_SOCK_ANONYMOUS); + return err < 0 ? err : 0; + } + } return -EINVAL; } @@ -679,11 +695,13 @@ static int __init init_nfsd(void) int retval; printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n"); + retval = nfs4_state_init(); /* nfs4 locking state */ + if (retval) + return retval; nfsd_stat_init(); /* Statistics */ nfsd_cache_init(); /* RPC reply cache */ nfsd_export_init(); /* Exports table */ nfsd_lockd_init(); /* lockd->nfsd callbacks */ - nfs4_state_init(); /* NFSv4 locking state */ nfsd_idmap_init(); /* Name to ID mapping */ if (proc_mkdir("fs/nfs", NULL)) { struct proc_dir_entry *entry; @@ -712,6 +730,7 @@ static void __exit exit_nfsd(void) nfsd_stat_shutdown(); nfsd_lockd_shutdown(); nfsd_idmap_shutdown(); + nfsd4_free_slabs(); unregister_filesystem(&nfsd_fs_type); } diff -puN fs/nfsd/nfssvc.c~git-nfsd fs/nfsd/nfssvc.c --- a/fs/nfsd/nfssvc.c~git-nfsd +++ a/fs/nfsd/nfssvc.c @@ -155,8 +155,8 @@ static int killsig; /* signal that was u static void nfsd_last_thread(struct svc_serv *serv) { /* When last nfsd thread exits we need to do some clean-up */ - struct svc_sock *svsk; - list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) + struct svc_xprt *xprt; + list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) lockd_down(); nfsd_serv = NULL; nfsd_racache_shutdown(); @@ -236,7 +236,7 @@ static int nfsd_init_socks(int port) error = lockd_up(IPPROTO_UDP); if (error >= 0) { - error = svc_makesock(nfsd_serv, IPPROTO_UDP, port, + error = svc_create_xprt(nfsd_serv, "udp", port, SVC_SOCK_DEFAULTS); if (error < 0) lockd_down(); @@ -247,7 +247,7 @@ static int nfsd_init_socks(int port) #ifdef CONFIG_NFSD_TCP error = lockd_up(IPPROTO_TCP); if (error >= 0) { - error = svc_makesock(nfsd_serv, IPPROTO_TCP, port, + error = svc_create_xprt(nfsd_serv, "tcp", port, SVC_SOCK_DEFAULTS); if (error < 0) lockd_down(); @@ -349,9 +349,7 @@ nfsd_svc(unsigned short port, int nrserv error = nfsd_racache_init(2*nrservs); if (error<0) goto out; - error = nfs4_state_start(); - if (error<0) - goto out; + nfs4_state_start(); nfsd_reset_versions(); @@ -546,10 +544,8 @@ nfsd_dispatch(struct svc_rqst *rqstp, __ /* Now call the procedure handler, and encode NFS status. 
*/ nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); nfserr = map_new_errors(rqstp->rq_vers, nfserr); - if (nfserr == nfserr_jukebox && rqstp->rq_vers == 2) - nfserr = nfserr_dropit; if (nfserr == nfserr_dropit) { - dprintk("nfsd: Dropping request due to malloc failure!\n"); + dprintk("nfsd: Dropping request; may be revisited later\n"); nfsd_cache_update(rqstp, RC_NOCACHE, NULL); return 0; } diff -puN fs/nfsd/nfsxdr.c~git-nfsd fs/nfsd/nfsxdr.c --- a/fs/nfsd/nfsxdr.c~git-nfsd +++ a/fs/nfsd/nfsxdr.c @@ -523,6 +523,10 @@ nfssvc_encode_entry(void *ccdv, const ch cd->common.err = nfserr_toosmall; return -EINVAL; } + if (ino > ~((u32) 0)) { + cd->common.err = nfserr_fbig; + return -EINVAL; + } *p++ = xdr_one; /* mark entry present */ *p++ = htonl((u32) ino); /* file id */ p = xdr_encode_array(p, name, namlen);/* name length & name */ diff -puN fs/nfsd/vfs.c~git-nfsd fs/nfsd/vfs.c --- a/fs/nfsd/vfs.c~git-nfsd +++ a/fs/nfsd/vfs.c @@ -61,12 +61,6 @@ #define NFSDDBG_FACILITY NFSDDBG_FILEOP -/* We must ignore files (but only files) which might have mandatory - * locks on them because there is no way to know if the accesser has - * the lock. - */ -#define IS_ISMNDLK(i) (S_ISREG((i)->i_mode) && MANDATORY_LOCK(i)) - /* * This is a cache of readahead params that help us choose the proper * readahead strategy. Initially, we set all readahead parameters to 0 @@ -295,7 +289,8 @@ nfsd_setattr(struct svc_rqst *rqstp, str if (!iap->ia_valid) goto out; - /* NFSv2 does not differentiate between "set-[ac]time-to-now" + /* + * NFSv2 does not differentiate between "set-[ac]time-to-now" * which only requires access, and "set-[ac]time-to-X" which * requires ownership. * So if it looks like it might be "set both to the same time which @@ -308,25 +303,33 @@ nfsd_setattr(struct svc_rqst *rqstp, str */ #define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET) #define MAX_TOUCH_TIME_ERROR (30*60) - if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET - && iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec - ) { - /* Looks probable. Now just make sure time is in the right ballpark. - * Solaris, at least, doesn't seem to care what the time request is. - * We require it be within 30 minutes of now. - */ - time_t delta = iap->ia_atime.tv_sec - get_seconds(); - if (delta<0) delta = -delta; - if (delta < MAX_TOUCH_TIME_ERROR && - inode_change_ok(inode, iap) != 0) { - /* turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME - * this will cause notify_change to set these times to "now" + if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET && + iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec) { + /* + * Looks probable. + * + * Now just make sure time is in the right ballpark. + * Solaris, at least, doesn't seem to care what the time + * request is. We require it be within 30 minutes of now. */ - iap->ia_valid &= ~BOTH_TIME_SET; - } + time_t delta = iap->ia_atime.tv_sec - get_seconds(); + if (delta < 0) + delta = -delta; + if (delta < MAX_TOUCH_TIME_ERROR && + inode_change_ok(inode, iap) != 0) { + /* + * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME. + * This will cause notify_change to set these times + * to "now" + */ + iap->ia_valid &= ~BOTH_TIME_SET; + } } - /* The size case is special. It changes the file as well as the attributes. */ + /* + * The size case is special. + * It changes the file as well as the attributes. 
+ */ if (iap->ia_valid & ATTR_SIZE) { if (iap->ia_size < inode->i_size) { err = nfsd_permission(rqstp, fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE); @@ -680,7 +683,12 @@ nfsd_open(struct svc_rqst *rqstp, struct err = nfserr_perm; if (IS_APPEND(inode) && (access & MAY_WRITE)) goto out; - if (IS_ISMNDLK(inode)) + /* + * We must ignore files (but only files) which might have mandatory + * locks on them because there is no way to know if the accesser has + * the lock. + */ + if (S_ISREG((inode)->i_mode) && mandatory_lock(inode)) goto out; if (!inode->i_fop) diff -puN fs/proc/proc_misc.c~git-nfsd fs/proc/proc_misc.c --- a/fs/proc/proc_misc.c~git-nfsd +++ a/fs/proc/proc_misc.c @@ -66,7 +66,6 @@ extern int get_stram_list(char *); extern int get_filesystem_list(char *); extern int get_exec_domain_list(char *); extern int get_dma_list(char *); -extern int get_locks_status (char *, char **, off_t, int); static int proc_calc_metrics(char *page, char **start, off_t off, int count, int *eof, int len) @@ -617,16 +616,18 @@ static int cmdline_read_proc(char *page, return proc_calc_metrics(page, start, off, count, eof, len); } -static int locks_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int locks_open(struct inode *inode, struct file *filp) { - int len = get_locks_status(page, start, off, count); - - if (len < count) - *eof = 1; - return len; + return seq_open(filp, &locks_seq_operations); } +static const struct file_operations proc_locks_operations = { + .open = locks_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static int execdomains_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -684,7 +685,6 @@ void __init proc_misc_init(void) #endif {"filesystems", filesystems_read_proc}, {"cmdline", cmdline_read_proc}, - {"locks", locks_read_proc}, {"execdomains", execdomains_read_proc}, {NULL,} }; @@ -702,6 +702,7 @@ void __init proc_misc_init(void) entry->proc_fops = &proc_kmsg_operations; } #endif + create_seq_entry("locks", 0, &proc_locks_operations); create_seq_entry("devices", 0, &proc_devinfo_operations); create_seq_entry("cpuinfo", 0, &proc_cpuinfo_operations); #ifdef CONFIG_BLOCK diff -puN fs/read_write.c~git-nfsd fs/read_write.c --- a/fs/read_write.c~git-nfsd +++ a/fs/read_write.c @@ -205,7 +205,7 @@ int rw_verify_area(int read_write, struc if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) goto Einval; - if (unlikely(inode->i_flock && MANDATORY_LOCK(inode))) { + if (unlikely(inode->i_flock && mandatory_lock(inode))) { int retval = locks_mandatory_area( read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, inode, file, pos, count); diff -puN include/linux/fs.h~git-nfsd include/linux/fs.h --- a/include/linux/fs.h~git-nfsd +++ a/include/linux/fs.h @@ -883,6 +883,7 @@ extern int vfs_setlease(struct file *, l extern int lease_modify(struct file_lock **, int); extern int lock_may_read(struct inode *, loff_t start, unsigned long count); extern int lock_may_write(struct inode *, loff_t start, unsigned long count); +extern struct seq_operations locks_seq_operations; struct fasync_struct { int magic; @@ -1371,12 +1372,25 @@ extern int locks_mandatory_area(int, str * Candidates for mandatory locking have the setgid bit set * but no group execute bit - an otherwise meaningless combination. 
*/ -#define MANDATORY_LOCK(inode) \ - (IS_MANDLOCK(inode) && ((inode)->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) + +static inline int __mandatory_lock(struct inode *ino) +{ + return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID; +} + +/* + * ... and these candidates should be on MS_MANDLOCK mounted fs, + * otherwise these will be advisory locks + */ + +static inline int mandatory_lock(struct inode *ino) +{ + return IS_MANDLOCK(ino) && __mandatory_lock(ino); +} static inline int locks_verify_locked(struct inode *inode) { - if (MANDATORY_LOCK(inode)) + if (mandatory_lock(inode)) return locks_mandatory_locked(inode); return 0; } @@ -1387,7 +1401,7 @@ static inline int locks_verify_truncate( struct file *filp, loff_t size) { - if (inode->i_flock && MANDATORY_LOCK(inode)) + if (inode->i_flock && mandatory_lock(inode)) return locks_mandatory_area( FLOCK_VERIFY_WRITE, inode, filp, size < inode->i_size ? size : inode->i_size, diff -puN include/linux/nfsd/nfsd.h~git-nfsd include/linux/nfsd/nfsd.h --- a/include/linux/nfsd/nfsd.h~git-nfsd +++ a/include/linux/nfsd/nfsd.h @@ -153,19 +153,21 @@ extern int nfsd_max_blksize; */ #ifdef CONFIG_NFSD_V4 extern unsigned int max_delegations; -void nfs4_state_init(void); -int nfs4_state_start(void); +int nfs4_state_init(void); +void nfsd4_free_slabs(void); +void nfs4_state_start(void); void nfs4_state_shutdown(void); time_t nfs4_lease_time(void); void nfs4_reset_lease(time_t leasetime); int nfs4_reset_recoverydir(char *recdir); #else -static inline void nfs4_state_init(void){}; -static inline int nfs4_state_start(void){return 0;} -static inline void nfs4_state_shutdown(void){} -static inline time_t nfs4_lease_time(void){return 0;} -static inline void nfs4_reset_lease(time_t leasetime){} -static inline int nfs4_reset_recoverydir(char *recdir) {return 0;} +static inline int nfs4_state_init(void) { return 0; } +static inline void nfsd4_free_slabs(void) { } +static inline void nfs4_state_start(void) { } +static inline void nfs4_state_shutdown(void) { } +static inline time_t nfs4_lease_time(void) { return 0; } +static inline void nfs4_reset_lease(time_t leasetime) { } +static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } #endif /* diff -puN include/linux/nfsd/nfsfh.h~git-nfsd include/linux/nfsd/nfsfh.h --- a/include/linux/nfsd/nfsfh.h~git-nfsd +++ a/include/linux/nfsd/nfsfh.h @@ -150,17 +150,7 @@ typedef struct svc_fh { struct timespec fh_pre_ctime; /* ctime before oper */ /* Post-op attributes saved in fh_unlock */ - umode_t fh_post_mode; /* i_mode */ - nlink_t fh_post_nlink; /* i_nlink */ - uid_t fh_post_uid; /* i_uid */ - gid_t fh_post_gid; /* i_gid */ - __u64 fh_post_size; /* i_size */ - unsigned long fh_post_blocks; /* i_blocks */ - unsigned long fh_post_blksize;/* i_blksize */ - __be32 fh_post_rdev[2];/* i_rdev */ - struct timespec fh_post_atime; /* i_atime */ - struct timespec fh_post_mtime; /* i_mtime */ - struct timespec fh_post_ctime; /* i_ctime */ + struct kstat fh_post_attr; /* full attrs after operation */ #endif /* CONFIG_NFSD_V3 */ } svc_fh; @@ -297,36 +287,12 @@ fill_pre_wcc(struct svc_fh *fhp) if (!fhp->fh_pre_saved) { fhp->fh_pre_mtime = inode->i_mtime; fhp->fh_pre_ctime = inode->i_ctime; - fhp->fh_pre_size = inode->i_size; - fhp->fh_pre_saved = 1; + fhp->fh_pre_size = inode->i_size; + fhp->fh_pre_saved = 1; } } -/* - * Fill in the post_op attr for the wcc data - */ -static inline void -fill_post_wcc(struct svc_fh *fhp) -{ - struct inode *inode = fhp->fh_dentry->d_inode; - - if (fhp->fh_post_saved) - printk("nfsd: inode locked twice 
during operation.\n"); - - fhp->fh_post_mode = inode->i_mode; - fhp->fh_post_nlink = inode->i_nlink; - fhp->fh_post_uid = inode->i_uid; - fhp->fh_post_gid = inode->i_gid; - fhp->fh_post_size = inode->i_size; - fhp->fh_post_blksize = BLOCK_SIZE; - fhp->fh_post_blocks = inode->i_blocks; - fhp->fh_post_rdev[0] = htonl((u32)imajor(inode)); - fhp->fh_post_rdev[1] = htonl((u32)iminor(inode)); - fhp->fh_post_atime = inode->i_atime; - fhp->fh_post_mtime = inode->i_mtime; - fhp->fh_post_ctime = inode->i_ctime; - fhp->fh_post_saved = 1; -} +extern void fill_post_wcc(struct svc_fh *); #else #define fill_pre_wcc(ignored) #define fill_post_wcc(notused) diff -puN include/linux/nfsd/xdr4.h~git-nfsd include/linux/nfsd/xdr4.h --- a/include/linux/nfsd/xdr4.h~git-nfsd +++ a/include/linux/nfsd/xdr4.h @@ -428,8 +428,8 @@ set_change_info(struct nfsd4_change_info cinfo->atomic = 1; cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; - cinfo->after_ctime_sec = fhp->fh_post_ctime.tv_sec; - cinfo->after_ctime_nsec = fhp->fh_post_ctime.tv_nsec; + cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; + cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; } int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *); diff -puN include/linux/sunrpc/cache.h~git-nfsd include/linux/sunrpc/cache.h --- a/include/linux/sunrpc/cache.h~git-nfsd +++ a/include/linux/sunrpc/cache.h @@ -136,16 +136,6 @@ sunrpc_cache_update(struct cache_detail struct cache_head *new, struct cache_head *old, int hash); -#define cache_for_each(pos, detail, index, member) \ - for (({read_lock(&(detail)->hash_lock); index = (detail)->hash_size;}) ; \ - ({if (index==0)read_unlock(&(detail)->hash_lock); index--;}); \ - ) \ - for (pos = container_of((detail)->hash_table[index], typeof(*pos), member); \ - &pos->member; \ - pos = container_of(pos->member.next, typeof(*pos), member)) - - - extern void cache_clean_deferred(void *owner); static inline struct cache_head *cache_get(struct cache_head *h) diff -puN include/linux/sunrpc/debug.h~git-nfsd include/linux/sunrpc/debug.h --- a/include/linux/sunrpc/debug.h~git-nfsd +++ a/include/linux/sunrpc/debug.h @@ -20,7 +20,7 @@ #define RPCDBG_BIND 0x0020 #define RPCDBG_SCHED 0x0040 #define RPCDBG_TRANS 0x0080 -#define RPCDBG_SVCSOCK 0x0100 +#define RPCDBG_SVCXPRT 0x0100 #define RPCDBG_SVCDSP 0x0200 #define RPCDBG_MISC 0x0400 #define RPCDBG_CACHE 0x0800 diff -puN include/linux/sunrpc/svc.h~git-nfsd include/linux/sunrpc/svc.h --- a/include/linux/sunrpc/svc.h~git-nfsd +++ a/include/linux/sunrpc/svc.h @@ -204,7 +204,7 @@ union svc_addr_u { struct svc_rqst { struct list_head rq_list; /* idle list */ struct list_head rq_all; /* all threads list */ - struct svc_sock * rq_sock; /* socket */ + struct svc_xprt * rq_xprt; /* transport ptr */ struct sockaddr_storage rq_addr; /* peer address */ size_t rq_addrlen; @@ -214,9 +214,10 @@ struct svc_rqst { struct auth_ops * rq_authop; /* authentication flavour */ u32 rq_flavor; /* pseudoflavor */ struct svc_cred rq_cred; /* auth info */ - struct sk_buff * rq_skbuff; /* fast recv inet buffer */ + void * rq_xprt_ctxt; /* transport specific context ptr */ struct svc_deferred_req*rq_deferred; /* deferred request we are replaying */ + size_t rq_xprt_hlen; /* xprt header len */ struct xdr_buf rq_arg; struct xdr_buf rq_res; struct page * rq_pages[RPCSVC_MAXPAGES]; @@ -317,7 +318,7 @@ static inline void svc_free_res_pages(st struct svc_deferred_req { u32 prot; /* protocol (UDP or TCP) */ - struct svc_sock *svsk; + struct 
svc_xprt *xprt; struct sockaddr_storage addr; /* where reply must go */ size_t addrlen; union svc_addr_u daddr; /* where reply must come from */ diff -puN /dev/null include/linux/sunrpc/svc_xprt.h --- /dev/null +++ a/include/linux/sunrpc/svc_xprt.h @@ -0,0 +1,86 @@ +/* + * linux/include/linux/sunrpc/svc_xprt.h + * + * RPC server transport I/O + */ + +#ifndef SUNRPC_SVC_XPRT_H +#define SUNRPC_SVC_XPRT_H + +#include +#include + +struct svc_xprt_ops { + struct svc_xprt *(*xpo_create)(struct svc_serv *, + struct sockaddr *, + int); + struct svc_xprt *(*xpo_accept)(struct svc_xprt *); + int (*xpo_has_wspace)(struct svc_xprt *); + int (*xpo_recvfrom)(struct svc_rqst *); + void (*xpo_prep_reply_hdr)(struct svc_rqst *); + int (*xpo_sendto)(struct svc_rqst *); + void (*xpo_release)(struct svc_rqst *); + void (*xpo_detach)(struct svc_xprt *); + void (*xpo_free)(struct svc_xprt *); +}; + +struct svc_xprt_class { + const char *xcl_name; + struct module *xcl_owner; + struct svc_xprt_ops *xcl_ops; + struct list_head xcl_list; + u32 xcl_max_payload; +}; + +struct svc_xprt { + struct svc_xprt_class *xpt_class; + struct svc_xprt_ops xpt_ops; + u32 xpt_max_payload; + struct kref xpt_ref; + struct list_head xpt_list; + struct list_head xpt_ready; + unsigned long xpt_flags; +#define XPT_BUSY 0 /* enqueued/receiving */ +#define XPT_CONN 1 /* conn pending */ +#define XPT_CLOSE 2 /* dead or dying */ +#define XPT_DATA 3 /* data pending */ +#define XPT_TEMP 4 /* connected transport */ +#define XPT_DEAD 6 /* transport closed */ +#define XPT_CHNGBUF 7 /* need to change snd/rcv buf sizes */ +#define XPT_DEFERRED 8 /* deferred request pending */ +#define XPT_OLD 9 /* used for xprt aging mark+sweep */ +#define XPT_DETACHED 10 /* detached from tempsocks list */ +#define XPT_LISTENER 11 /* listening endpoint */ +#define XPT_CACHE_AUTH 12 /* cache auth info */ + + struct svc_pool *xpt_pool; /* current pool iff queued */ + struct svc_serv *xpt_server; /* service for transport */ + atomic_t xpt_reserved; /* space on outq that is rsvd */ + struct mutex xpt_mutex; /* to serialize sending data */ + spinlock_t xpt_lock; /* protects sk_deferred + * and xpt_auth_cache */ + void *xpt_auth_cache;/* auth cache */ + struct list_head xpt_deferred; /* deferred requests that need + * to be revisted */ + struct sockaddr_storage xpt_local; /* local address */ + struct sockaddr_storage xpt_remote; /* remote peer's address */ + int xpt_remotelen; /* length of address */ +}; + +int svc_reg_xprt_class(struct svc_xprt_class *); +int svc_unreg_xprt_class(struct svc_xprt_class *); +void svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *, + struct svc_serv *); +int svc_create_xprt(struct svc_serv *, char *, unsigned short, int); +void svc_xprt_received(struct svc_xprt *); +void svc_xprt_enqueue(struct svc_xprt *xprt); +int svc_port_is_privileged(struct sockaddr *sin); +void svc_xprt_put(struct svc_xprt *xprt); +static inline void svc_xprt_get(struct svc_xprt *xprt) +{ + kref_get(&xprt->xpt_ref); +} +void svc_delete_xprt(struct svc_xprt *xprt); +void svc_close_xprt(struct svc_xprt *xprt); +int svc_print_xprts(char *buf, int maxlen); +#endif /* SUNRPC_SVC_XPRT_H */ diff -puN include/linux/sunrpc/svcsock.h~git-nfsd include/linux/sunrpc/svcsock.h --- a/include/linux/sunrpc/svcsock.h~git-nfsd +++ a/include/linux/sunrpc/svcsock.h @@ -10,42 +10,16 @@ #define SUNRPC_SVCSOCK_H #include +#include /* * RPC server socket. 
*/ struct svc_sock { - struct list_head sk_ready; /* list of ready sockets */ - struct list_head sk_list; /* list of all sockets */ + struct svc_xprt sk_xprt; struct socket * sk_sock; /* berkeley socket layer */ struct sock * sk_sk; /* INET layer */ - struct svc_pool * sk_pool; /* current pool iff queued */ - struct svc_serv * sk_server; /* service for this socket */ - atomic_t sk_inuse; /* use count */ - unsigned long sk_flags; -#define SK_BUSY 0 /* enqueued/receiving */ -#define SK_CONN 1 /* conn pending */ -#define SK_CLOSE 2 /* dead or dying */ -#define SK_DATA 3 /* data pending */ -#define SK_TEMP 4 /* temp (TCP) socket */ -#define SK_DEAD 6 /* socket closed */ -#define SK_CHNGBUF 7 /* need to change snd/rcv buffer sizes */ -#define SK_DEFERRED 8 /* request on sk_deferred */ -#define SK_OLD 9 /* used for temp socket aging mark+sweep */ -#define SK_DETACHED 10 /* detached from tempsocks list */ - - atomic_t sk_reserved; /* space on outq that is reserved */ - - spinlock_t sk_lock; /* protects sk_deferred and - * sk_info_authunix */ - struct list_head sk_deferred; /* deferred requests that need to - * be revisted */ - struct mutex sk_mutex; /* to serialize sending data */ - - int (*sk_recvfrom)(struct svc_rqst *rqstp); - int (*sk_sendto)(struct svc_rqst *rqstp); - /* We keep the old state_change and data_ready CB's here */ void (*sk_ostate)(struct sock *); void (*sk_odata)(struct sock *, int bytes); @@ -54,21 +28,12 @@ struct svc_sock { /* private TCP part */ int sk_reclen; /* length of record */ int sk_tcplen; /* current read length */ - time_t sk_lastrecv; /* time of last received request */ - - /* cache of various info for TCP sockets */ - void *sk_info_authunix; - - struct sockaddr_storage sk_local; /* local address */ - struct sockaddr_storage sk_remote; /* remote peer's address */ - int sk_remotelen; /* length of address */ }; /* * Function prototypes. */ -int svc_makesock(struct svc_serv *, int, unsigned short, int flags); -void svc_force_close_socket(struct svc_sock *); +void svc_close_all(struct list_head *); int svc_recv(struct svc_rqst *, long); int svc_send(struct svc_rqst *); void svc_drop(struct svc_rqst *); @@ -78,6 +43,8 @@ int svc_addsock(struct svc_serv *serv, int fd, char *name_return, int *proto); +void svc_init_xprt_sock(void); +void svc_cleanup_xprt_sock(void); /* * svc_makesock socket characteristics diff -puN net/sunrpc/Makefile~git-nfsd net/sunrpc/Makefile --- a/net/sunrpc/Makefile~git-nfsd +++ a/net/sunrpc/Makefile @@ -11,6 +11,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprt auth.o auth_null.o auth_unix.o \ svc.o svcsock.o svcauth.o svcauth_unix.o \ rpcb_clnt.o timer.o xdr.o \ - sunrpc_syms.o cache.o rpc_pipe.o + sunrpc_syms.o cache.o rpc_pipe.o \ + svc_xprt.o sunrpc-$(CONFIG_PROC_FS) += stats.o sunrpc-$(CONFIG_SYSCTL) += sysctl.o diff -puN net/sunrpc/auth_gss/svcauth_gss.c~git-nfsd net/sunrpc/auth_gss/svcauth_gss.c --- a/net/sunrpc/auth_gss/svcauth_gss.c~git-nfsd +++ a/net/sunrpc/auth_gss/svcauth_gss.c @@ -631,7 +631,8 @@ svc_safe_putnetobj(struct kvec *resv, st return 0; } -/* Verify the checksum on the header and return SVC_OK on success. +/* + * Verify the checksum on the header and return SVC_OK on success. * Otherwise, return SVC_DROP (in the case of a bad sequence number) * or return SVC_DENIED and indicate error in authp. 
*/ @@ -961,6 +962,78 @@ gss_write_init_verf(struct svc_rqst *rqs } /* + * Having read the cred already and found we're in the context + * initiation case, read the verifier and initiate (or check the results + * of) upcalls to userspace for help with context initiation. If + * the upcall results are available, write the verifier and result. + * Otherwise, drop the request pending an answer to the upcall. + */ +static int svcauth_gss_handle_init(struct svc_rqst *rqstp, + struct rpc_gss_wire_cred *gc, __be32 *authp) +{ + struct kvec *argv = &rqstp->rq_arg.head[0]; + struct kvec *resv = &rqstp->rq_res.head[0]; + struct xdr_netobj tmpobj; + struct rsi *rsip, rsikey; + + /* Read the verifier; should be NULL: */ + *authp = rpc_autherr_badverf; + if (argv->iov_len < 2 * 4) + return SVC_DENIED; + if (svc_getnl(argv) != RPC_AUTH_NULL) + return SVC_DENIED; + if (svc_getnl(argv) != 0) + return SVC_DENIED; + + /* Martial context handle and token for upcall: */ + *authp = rpc_autherr_badcred; + if (gc->gc_proc == RPC_GSS_PROC_INIT && gc->gc_ctx.len != 0) + return SVC_DENIED; + memset(&rsikey, 0, sizeof(rsikey)); + if (dup_netobj(&rsikey.in_handle, &gc->gc_ctx)) + return SVC_DROP; + *authp = rpc_autherr_badverf; + if (svc_safe_getnetobj(argv, &tmpobj)) { + kfree(rsikey.in_handle.data); + return SVC_DENIED; + } + if (dup_netobj(&rsikey.in_token, &tmpobj)) { + kfree(rsikey.in_handle.data); + return SVC_DROP; + } + + /* Perform upcall, or find upcall result: */ + rsip = rsi_lookup(&rsikey); + rsi_free(&rsikey); + if (!rsip) + return SVC_DROP; + switch (cache_check(&rsi_cache, &rsip->h, &rqstp->rq_chandle)) { + case -EAGAIN: + case -ETIMEDOUT: + case -ENOENT: + /* No upcall result: */ + return SVC_DROP; + case 0: + /* Got an answer to the upcall; use it: */ + if (gss_write_init_verf(rqstp, rsip)) + return SVC_DROP; + if (resv->iov_len + 4 > PAGE_SIZE) + return SVC_DROP; + svc_putnl(resv, RPC_SUCCESS); + if (svc_safe_putnetobj(resv, &rsip->out_handle)) + return SVC_DROP; + if (resv->iov_len + 3 * 4 > PAGE_SIZE) + return SVC_DROP; + svc_putnl(resv, rsip->major_status); + svc_putnl(resv, rsip->minor_status); + svc_putnl(resv, GSS_SEQ_WIN); + if (svc_safe_putnetobj(resv, &rsip->out_token)) + return SVC_DROP; + } + return SVC_COMPLETE; +} + +/* * Accept an rpcsec packet. * If context establishment, punt to user space * If data exchange, verify/decrypt @@ -974,11 +1047,9 @@ svcauth_gss_accept(struct svc_rqst *rqst struct kvec *argv = &rqstp->rq_arg.head[0]; struct kvec *resv = &rqstp->rq_res.head[0]; u32 crlen; - struct xdr_netobj tmpobj; struct gss_svc_data *svcdata = rqstp->rq_auth_data; struct rpc_gss_wire_cred *gc; struct rsc *rsci = NULL; - struct rsi *rsip, rsikey; __be32 *rpcstart; __be32 *reject_stat = resv->iov_base + resv->iov_len; int ret; @@ -1023,30 +1094,14 @@ svcauth_gss_accept(struct svc_rqst *rqst if ((gc->gc_proc != RPC_GSS_PROC_DATA) && (rqstp->rq_proc != 0)) goto auth_err; - /* - * We've successfully parsed the credential. Let's check out the - * verifier. An AUTH_NULL verifier is allowed (and required) for - * INIT and CONTINUE_INIT requests. AUTH_RPCSEC_GSS is required for - * PROC_DATA and PROC_DESTROY. - * - * AUTH_NULL verifier is 0 (AUTH_NULL), 0 (length). - * AUTH_RPCSEC_GSS verifier is: - * 6 (AUTH_RPCSEC_GSS), length, checksum. - * checksum is calculated over rpcheader from xid up to here. 
- */ *authp = rpc_autherr_badverf; switch (gc->gc_proc) { case RPC_GSS_PROC_INIT: case RPC_GSS_PROC_CONTINUE_INIT: - if (argv->iov_len < 2 * 4) - goto auth_err; - if (svc_getnl(argv) != RPC_AUTH_NULL) - goto auth_err; - if (svc_getnl(argv) != 0) - goto auth_err; - break; + return svcauth_gss_handle_init(rqstp, gc, authp); case RPC_GSS_PROC_DATA: case RPC_GSS_PROC_DESTROY: + /* Look up the context, and check the verifier: */ *authp = rpcsec_gsserr_credproblem; rsci = gss_svc_searchbyctx(&gc->gc_ctx); if (!rsci) @@ -1067,51 +1122,6 @@ svcauth_gss_accept(struct svc_rqst *rqst /* now act upon the command: */ switch (gc->gc_proc) { - case RPC_GSS_PROC_INIT: - case RPC_GSS_PROC_CONTINUE_INIT: - *authp = rpc_autherr_badcred; - if (gc->gc_proc == RPC_GSS_PROC_INIT && gc->gc_ctx.len != 0) - goto auth_err; - memset(&rsikey, 0, sizeof(rsikey)); - if (dup_netobj(&rsikey.in_handle, &gc->gc_ctx)) - goto drop; - *authp = rpc_autherr_badverf; - if (svc_safe_getnetobj(argv, &tmpobj)) { - kfree(rsikey.in_handle.data); - goto auth_err; - } - if (dup_netobj(&rsikey.in_token, &tmpobj)) { - kfree(rsikey.in_handle.data); - goto drop; - } - - rsip = rsi_lookup(&rsikey); - rsi_free(&rsikey); - if (!rsip) { - goto drop; - } - switch(cache_check(&rsi_cache, &rsip->h, &rqstp->rq_chandle)) { - case -EAGAIN: - case -ETIMEDOUT: - case -ENOENT: - goto drop; - case 0: - if (gss_write_init_verf(rqstp, rsip)) - goto drop; - if (resv->iov_len + 4 > PAGE_SIZE) - goto drop; - svc_putnl(resv, RPC_SUCCESS); - if (svc_safe_putnetobj(resv, &rsip->out_handle)) - goto drop; - if (resv->iov_len + 3 * 4 > PAGE_SIZE) - goto drop; - svc_putnl(resv, rsip->major_status); - svc_putnl(resv, rsip->minor_status); - svc_putnl(resv, GSS_SEQ_WIN); - if (svc_safe_putnetobj(resv, &rsip->out_token)) - goto drop; - } - goto complete; case RPC_GSS_PROC_DESTROY: if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; @@ -1158,7 +1168,7 @@ svcauth_gss_accept(struct svc_rqst *rqst goto out; } auth_err: - /* Restore write pointer to original value: */ + /* Restore write pointer to its original value: */ xdr_ressize_check(rqstp, reject_stat); ret = SVC_DENIED; goto out; diff -puN net/sunrpc/sunrpc_syms.c~git-nfsd net/sunrpc/sunrpc_syms.c --- a/net/sunrpc/sunrpc_syms.c~git-nfsd +++ a/net/sunrpc/sunrpc_syms.c @@ -72,7 +72,6 @@ EXPORT_SYMBOL(svc_drop); EXPORT_SYMBOL(svc_process); EXPORT_SYMBOL(svc_recv); EXPORT_SYMBOL(svc_wake_up); -EXPORT_SYMBOL(svc_makesock); EXPORT_SYMBOL(svc_reserve); EXPORT_SYMBOL(svc_auth_register); EXPORT_SYMBOL(auth_domain_lookup); @@ -151,7 +150,8 @@ init_sunrpc(void) #endif cache_register(&ip_map_cache); cache_register(&unix_gid_cache); - init_socket_xprt(); + svc_init_xprt_sock(); /* svc sock transport */ + init_socket_xprt(); /* clnt sock transport */ rpcauth_init_module(); out: return err; @@ -162,6 +162,7 @@ cleanup_sunrpc(void) { rpcauth_remove_module(); cleanup_socket_xprt(); + svc_cleanup_xprt_sock(); unregister_rpc_pipefs(); rpc_destroy_mempool(); if (cache_unregister(&ip_map_cache)) diff -puN net/sunrpc/svc.c~git-nfsd net/sunrpc/svc.c --- a/net/sunrpc/svc.c~git-nfsd +++ a/net/sunrpc/svc.c @@ -458,9 +458,6 @@ svc_create_pooled(struct svc_program *pr void svc_destroy(struct svc_serv *serv) { - struct svc_sock *svsk; - struct svc_sock *tmp; - dprintk("svc: svc_destroy(%s, %d)\n", serv->sv_program->pg_name, serv->sv_nrthreads); @@ -475,14 +472,12 @@ svc_destroy(struct svc_serv *serv) del_timer_sync(&serv->sv_temptimer); - list_for_each_entry_safe(svsk, tmp, &serv->sv_tempsocks, sk_list) - 
svc_force_close_socket(svsk); + svc_close_all(&serv->sv_tempsocks); if (serv->sv_shutdown) serv->sv_shutdown(serv); - list_for_each_entry_safe(svsk, tmp, &serv->sv_permsocks, sk_list) - svc_force_close_socket(svsk); + svc_close_all(&serv->sv_permsocks); BUG_ON(!list_empty(&serv->sv_permsocks)); BUG_ON(!list_empty(&serv->sv_tempsocks)); @@ -777,6 +772,30 @@ svc_register(struct svc_serv *serv, int } /* + * Printk the given error with the address of the client that caused it. + */ +static int +__attribute__ ((format (printf, 2, 3))) +svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) +{ + va_list args; + int r; + char buf[RPC_MAX_ADDRBUFLEN]; + + if (!net_ratelimit()) + return 0; + + printk(KERN_WARNING "svc: %s: ", + svc_print_addr(rqstp, buf, sizeof(buf))); + + va_start(args, fmt); + r = vprintk(fmt, args); + va_end(args); + + return r; +} + +/* * Process the RPC request. */ int @@ -815,9 +834,9 @@ svc_process(struct svc_rqst *rqstp) rqstp->rq_res.tail[0].iov_len = 0; /* Will be turned off only in gss privacy case: */ rqstp->rq_splice_ok = 1; - /* tcp needs a space for the record length... */ - if (rqstp->rq_prot == IPPROTO_TCP) - svc_putnl(resv, 0); + + /* Setup reply header */ + rqstp->rq_xprt->xpt_ops.xpo_prep_reply_hdr(rqstp); rqstp->rq_xid = svc_getu32(argv); svc_putu32(resv, rqstp->rq_xid); @@ -963,14 +982,13 @@ svc_process(struct svc_rqst *rqstp) return 0; err_short_len: - if (net_ratelimit()) - printk("svc: short len %Zd, dropping request\n", argv->iov_len); + svc_printk(rqstp, "short len %Zd, dropping request\n", + argv->iov_len); goto dropit; /* drop request */ err_bad_dir: - if (net_ratelimit()) - printk("svc: bad direction %d, dropping request\n", dir); + svc_printk(rqstp, "bad direction %d, dropping request\n", dir); serv->sv_stats->rpcbadfmt++; goto dropit; /* drop request */ @@ -1000,8 +1018,7 @@ err_bad_prog: goto sendit; err_bad_vers: - if (net_ratelimit()) - printk("svc: unknown version (%d for prog %d, %s)\n", + svc_printk(rqstp, "unknown version (%d for prog %d, %s)\n", vers, prog, progp->pg_name); serv->sv_stats->rpcbadfmt++; @@ -1011,16 +1028,14 @@ err_bad_vers: goto sendit; err_bad_proc: - if (net_ratelimit()) - printk("svc: unknown procedure (%d)\n", proc); + svc_printk(rqstp, "unknown procedure (%d)\n", proc); serv->sv_stats->rpcbadfmt++; svc_putnl(resv, RPC_PROC_UNAVAIL); goto sendit; err_garbage: - if (net_ratelimit()) - printk("svc: failed to decode args\n"); + svc_printk(rqstp, "failed to decode args\n"); rpc_stat = rpc_garbage_args; err_bad: @@ -1034,10 +1049,8 @@ err_bad: */ u32 svc_max_payload(const struct svc_rqst *rqstp) { - int max = RPCSVC_MAXPAYLOAD_TCP; + int max = rqstp->rq_xprt->xpt_max_payload; - if (rqstp->rq_sock->sk_sock->type == SOCK_DGRAM) - max = RPCSVC_MAXPAYLOAD_UDP; if (rqstp->rq_server->sv_max_payload < max) max = rqstp->rq_server->sv_max_payload; return max; diff -puN /dev/null net/sunrpc/svc_xprt.c --- /dev/null +++ a/net/sunrpc/svc_xprt.c @@ -0,0 +1,954 @@ +/* + * linux/net/sunrpc/svc_xprt.c + * + * Author: Tom Tucker + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt); +static int svc_deferred_recv(struct svc_rqst *rqstp); +static struct cache_deferred_req *svc_defer(struct cache_req 
*req); +static void svc_age_temp_xprts(unsigned long closure); +/* apparently the "standard" is that clients close + * idle connections after 5 minutes, servers after + * 6 minutes + * http://www.connectathon.org/talks96/nfstcp.pdf + */ +static int svc_conn_age_period = 6*60; + +/* List of registered transport classes */ +static spinlock_t svc_xprt_class_lock = SPIN_LOCK_UNLOCKED; +static LIST_HEAD(svc_xprt_class_list); + +int svc_reg_xprt_class(struct svc_xprt_class *xcl) +{ + struct svc_xprt_class *cl; + int res = -EEXIST; + + dprintk("svc: Adding svc transport class '%s'\n", + xcl->xcl_name); + + INIT_LIST_HEAD(&xcl->xcl_list); + spin_lock(&svc_xprt_class_lock); + list_for_each_entry(cl, &svc_xprt_class_list, xcl_list) { + if (xcl == cl) + goto out; + } + list_add_tail(&xcl->xcl_list, &svc_xprt_class_list); + res = 0; +out: + spin_unlock(&svc_xprt_class_lock); + return res; +} +EXPORT_SYMBOL_GPL(svc_reg_xprt_class); + +int svc_unreg_xprt_class(struct svc_xprt_class *xcl) +{ + struct svc_xprt_class *cl; + int res = 0; + + dprintk("svc: Removing svc transport class '%s'\n", xcl->xcl_name); + + spin_lock(&svc_xprt_class_lock); + list_for_each_entry(cl, &svc_xprt_class_list, xcl_list) { + if (xcl == cl) { + list_del_init(&cl->xcl_list); + goto out; + } + } + res = -ENOENT; + out: + spin_unlock(&svc_xprt_class_lock); + return res; +} +EXPORT_SYMBOL_GPL(svc_unreg_xprt_class); + +/* + * Format the transport list for printing + */ +int svc_print_xprts(char *buf, int maxlen) +{ + struct list_head *le; + char tmpstr[80]; + int len = 0; + buf[0] = '\0'; + + spin_lock(&svc_xprt_class_lock); + list_for_each(le, &svc_xprt_class_list) { + int slen; + struct svc_xprt_class *xcl = + list_entry(le, struct svc_xprt_class, xcl_list); + + sprintf(tmpstr, "%s %d\n", xcl->xcl_name, xcl->xcl_max_payload); + slen = strlen(tmpstr); + if (len + slen > maxlen) + break; + len += slen; + strcat(buf, tmpstr); + } + spin_unlock(&svc_xprt_class_lock); + + return len; +} + +static inline void svc_xprt_free(struct kref *kref) +{ + struct svc_xprt *xprt = + container_of(kref, struct svc_xprt, xpt_ref); + struct module *owner = xprt->xpt_class->xcl_owner; + BUG_ON(atomic_read(&kref->refcount)); + xprt->xpt_ops.xpo_free(xprt); + if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags) + && xprt->xpt_auth_cache != NULL) + svcauth_unix_info_release(xprt->xpt_auth_cache); + module_put(owner); +} + +void svc_xprt_put(struct svc_xprt *xprt) +{ + kref_put(&xprt->xpt_ref, svc_xprt_free); +} +EXPORT_SYMBOL_GPL(svc_xprt_put); + +/* + * Called by transport drivers to initialize the transport independent + * portion of the transport instance. 
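
A minimal sketch (not part of the patch) of how a transport provider plugs into the registration interface above. The "foo" transport, its svc_foo_* handlers and the payload limit are hypothetical placeholders assumed to be defined elsewhere in the module; only the structure layouts and the svc_reg_xprt_class()/svc_unreg_xprt_class() calls come from this patch:

	#include <linux/module.h>
	#include <linux/sunrpc/svc.h>
	#include <linux/sunrpc/svc_xprt.h>

	/* Hypothetical per-transport handlers, defined elsewhere in the module. */
	struct svc_xprt *svc_foo_create(struct svc_serv *, struct sockaddr *, int);
	struct svc_xprt *svc_foo_accept(struct svc_xprt *);
	int  svc_foo_has_wspace(struct svc_xprt *);
	int  svc_foo_recvfrom(struct svc_rqst *);
	void svc_foo_prep_reply_hdr(struct svc_rqst *);
	int  svc_foo_sendto(struct svc_rqst *);
	void svc_foo_release(struct svc_rqst *);
	void svc_foo_detach(struct svc_xprt *);
	void svc_foo_free(struct svc_xprt *);

	static struct svc_xprt_ops svc_foo_ops = {
		.xpo_create		= svc_foo_create,
		.xpo_accept		= svc_foo_accept,
		.xpo_has_wspace		= svc_foo_has_wspace,
		.xpo_recvfrom		= svc_foo_recvfrom,
		.xpo_prep_reply_hdr	= svc_foo_prep_reply_hdr,
		.xpo_sendto		= svc_foo_sendto,
		.xpo_release		= svc_foo_release,
		.xpo_detach		= svc_foo_detach,
		.xpo_free		= svc_foo_free,
	};

	static struct svc_xprt_class svc_foo_class = {
		.xcl_name	 = "foo",
		.xcl_owner	 = THIS_MODULE,
		.xcl_ops	 = &svc_foo_ops,
		.xcl_max_payload = 32 * 1024,	/* placeholder limit */
	};

	static int __init svc_foo_module_init(void)
	{
		/* Make the class visible to svc_create_xprt() lookups by name. */
		return svc_reg_xprt_class(&svc_foo_class);
	}

	static void __exit svc_foo_module_exit(void)
	{
		svc_unreg_xprt_class(&svc_foo_class);
	}

	module_init(svc_foo_module_init);
	module_exit(svc_foo_module_exit);

Once registered, core code can instantiate a listening endpoint for the class by name, e.g. svc_create_xprt(serv, "foo", port, flags); the class's xpo_create() handler is then expected to allocate its transport instance and call svc_xprt_init() on the embedded svc_xprt, as described below.
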
+ */ +void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xpt, + struct svc_serv *serv) +{ + xpt->xpt_class = xcl; + xpt->xpt_ops = *xcl->xcl_ops; + xpt->xpt_max_payload = xcl->xcl_max_payload; + kref_init(&xpt->xpt_ref); + xpt->xpt_server = serv; + INIT_LIST_HEAD(&xpt->xpt_list); + INIT_LIST_HEAD(&xpt->xpt_ready); + INIT_LIST_HEAD(&xpt->xpt_deferred); + mutex_init(&xpt->xpt_mutex); + spin_lock_init(&xpt->xpt_lock); +} +EXPORT_SYMBOL_GPL(svc_xprt_init); + +int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, + int flags) +{ + struct svc_xprt_class *xcl; + int ret = -ENOENT; + struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_addr.s_addr = INADDR_ANY, + .sin_port = htons(port), + }; + dprintk("svc: creating transport %s[%d]\n", xprt_name, port); + spin_lock(&svc_xprt_class_lock); + list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) { + if (strcmp(xprt_name, xcl->xcl_name) == 0) { + spin_unlock(&svc_xprt_class_lock); + if (try_module_get(xcl->xcl_owner)) { + struct svc_xprt *newxprt; + ret = 0; + newxprt = xcl->xcl_ops->xpo_create + (serv, (struct sockaddr *)&sin, flags); + if (IS_ERR(newxprt)) { + module_put(xcl->xcl_owner); + ret = PTR_ERR(newxprt); + } else { + clear_bit(XPT_TEMP, + &newxprt->xpt_flags); + spin_lock_bh(&serv->sv_lock); + list_add(&newxprt->xpt_list, + &serv->sv_permsocks); + spin_unlock_bh(&serv->sv_lock); + } + } + goto out; + } + } + spin_unlock(&svc_xprt_class_lock); + dprintk("svc: transport %s not found\n", xprt_name); + out: + return ret; +} +EXPORT_SYMBOL_GPL(svc_create_xprt); + +/* + * Queue up an idle server thread. Must have pool->sp_lock held. + * Note: this is really a stack rather than a queue, so that we only + * use as many different threads as we need, and the rest don't pollute + * the cache. + */ +static inline void +svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp) +{ + list_add(&rqstp->rq_list, &pool->sp_threads); +} + +/* + * Dequeue an nfsd thread. Must have pool->sp_lock held. + */ +static inline void +svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp) +{ + list_del(&rqstp->rq_list); +} + +/* + * Queue up a transport with data pending. If there are idle nfsd + * processes, wake 'em up. + * + */ +void +svc_xprt_enqueue(struct svc_xprt *xprt) +{ + struct svc_serv *serv = xprt->xpt_server; + struct svc_pool *pool; + struct svc_rqst *rqstp; + int cpu; + + if (!(xprt->xpt_flags & + ((1<xpt_flags)) + return; + + cpu = get_cpu(); + pool = svc_pool_for_cpu(xprt->xpt_server, cpu); + put_cpu(); + + spin_lock_bh(&pool->sp_lock); + + if (!list_empty(&pool->sp_threads) && + !list_empty(&pool->sp_sockets)) + printk(KERN_ERR + "svc_xprt_enqueue: threads and xprt both waiting??\n"); + + if (test_bit(XPT_DEAD, &xprt->xpt_flags)) { + /* Don't enqueue dead transports */ + dprintk("svc: transport %p is dead, not enqueued\n", xprt); + goto out_unlock; + } + + /* Mark transport as busy. It will remain in this state until the + * server has processed all pending data and put the transport back + * on the idle list. We update XPT_BUSY atomically because + * it also guards against trying to enqueue the svc_sock twice. 
+ */ + if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) { + /* Don't enqueue transport while already enqueued */ + dprintk("svc: transport %p busy, not enqueued\n", xprt); + goto out_unlock; + } + BUG_ON(xprt->xpt_pool != NULL); + xprt->xpt_pool = pool; + + /* Handle pending connection */ + if (test_bit(XPT_CONN, &xprt->xpt_flags)) + goto process; + + /* Handle close in-progress */ + if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) + goto process; + + /* Check if we have space to reply to a request */ + if (!xprt->xpt_ops.xpo_has_wspace(xprt)) { + /* Don't enqueue while not enough space for reply */ + dprintk("svc: no write space, transport %p not enqueued\n", xprt); + xprt->xpt_pool = NULL; + clear_bit(XPT_BUSY, &xprt->xpt_flags); + goto out_unlock; + } + + process: + if (!list_empty(&pool->sp_threads)) { + rqstp = list_entry(pool->sp_threads.next, + struct svc_rqst, + rq_list); + dprintk("svc: transport %p served by daemon %p\n", + xprt, rqstp); + svc_thread_dequeue(pool, rqstp); + if (rqstp->rq_xprt) + printk(KERN_ERR + "svc_xprt_enqueue: server %p, rq_xprt=%p!\n", + rqstp, rqstp->rq_xprt); + rqstp->rq_xprt = xprt; + svc_xprt_get(xprt); + rqstp->rq_reserved = serv->sv_max_mesg; + atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); + BUG_ON(xprt->xpt_pool != pool); + wake_up(&rqstp->rq_wait); + } else { + dprintk("svc: transport %p put into queue\n", xprt); + list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); + BUG_ON(xprt->xpt_pool != pool); + } + +out_unlock: + spin_unlock_bh(&pool->sp_lock); +} +EXPORT_SYMBOL_GPL(svc_xprt_enqueue); + +/* + * Dequeue the first transport. Must be called with the pool->sp_lock held. + */ +static inline struct svc_xprt * +svc_xprt_dequeue(struct svc_pool *pool) +{ + struct svc_xprt *xprt; + + if (list_empty(&pool->sp_sockets)) + return NULL; + + xprt = list_entry(pool->sp_sockets.next, + struct svc_xprt, xpt_ready); + list_del_init(&xprt->xpt_ready); + + dprintk("svc: transport %p dequeued, inuse=%d\n", + xprt, atomic_read(&xprt->xpt_ref.refcount)); + + return xprt; +} + +/* + * Having read something from a transport, check whether it + * needs to be re-enqueued. + * Note: XPT_DATA only gets cleared when a read-attempt finds + * no (or insufficient) data. + */ +void +svc_xprt_received(struct svc_xprt *xprt) +{ + xprt->xpt_pool = NULL; + clear_bit(XPT_BUSY, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); +} +EXPORT_SYMBOL_GPL(svc_xprt_received); + +/** + * svc_reserve - change the space reserved for the reply to a request. + * @rqstp: The request in question + * @space: new max space to reserve + * + * Each request reserves some space on the output queue of the transport + * to make sure the reply fits. This function reduces that reserved + * space to be the amount of space used already, plus @space. + * + */ +void svc_reserve(struct svc_rqst *rqstp, int space) +{ + space += rqstp->rq_res.head[0].iov_len; + + if (space < rqstp->rq_reserved) { + struct svc_xprt *xprt = rqstp->rq_xprt; + atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved); + rqstp->rq_reserved = space; + + svc_xprt_enqueue(xprt); + } +} + +static void +svc_xprt_release(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt = rqstp->rq_xprt; + + rqstp->rq_xprt->xpt_ops.xpo_release(rqstp); + + svc_free_res_pages(rqstp); + rqstp->rq_res.page_len = 0; + rqstp->rq_res.page_base = 0; + + /* Reset response buffer and release + * the reservation. + * But first, check that enough space was reserved + * for the reply, otherwise we have a bug! 
+ */ + if ((rqstp->rq_res.len) > rqstp->rq_reserved) + printk(KERN_ERR "RPC request reserved %d but used %d\n", + rqstp->rq_reserved, + rqstp->rq_res.len); + + rqstp->rq_res.head[0].iov_len = 0; + svc_reserve(rqstp, 0); + rqstp->rq_xprt = NULL; + + svc_xprt_put(xprt); +} + +/* + * External function to wake up a server waiting for data + * This really only makes sense for services like lockd + * which have exactly one thread anyway. + */ +void +svc_wake_up(struct svc_serv *serv) +{ + struct svc_rqst *rqstp; + unsigned int i; + struct svc_pool *pool; + + for (i = 0; i < serv->sv_nrpools; i++) { + pool = &serv->sv_pools[i]; + + spin_lock_bh(&pool->sp_lock); + if (!list_empty(&pool->sp_threads)) { + rqstp = list_entry(pool->sp_threads.next, + struct svc_rqst, + rq_list); + dprintk("svc: daemon %p woken up.\n", rqstp); + /* + svc_thread_dequeue(pool, rqstp); + rqstp->rq_xprt = NULL; + */ + wake_up(&rqstp->rq_wait); + } + spin_unlock_bh(&pool->sp_lock); + } +} + +static void +svc_check_conn_limits(struct svc_serv *serv) +{ + char buf[RPC_MAX_ADDRBUFLEN]; + + /* make sure that we don't have too many active connections. + * If we have, something must be dropped. + * + * There's no point in trying to do random drop here for + * DoS prevention. The NFS clients does 1 reconnect in 15 + * seconds. An attacker can easily beat that. + * + * The only somewhat efficient mechanism would be if drop + * old connections from the same IP first. + */ + if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) { + struct svc_xprt *xprt = NULL; + spin_lock_bh(&serv->sv_lock); + if (!list_empty(&serv->sv_tempsocks)) { + if (net_ratelimit()) { + /* Try to help the admin */ + printk(KERN_NOTICE "%s: too many open " + "connections, consider increasing the " + "number of nfsd threads\n", + serv->sv_name); + printk(KERN_NOTICE + "%s: last connection from %s\n", + serv->sv_name, buf); + } + /* + * Always select the oldest connection. It's not fair, + * but so is life + */ + xprt = list_entry(serv->sv_tempsocks.prev, + struct svc_xprt, + xpt_list); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_get(xprt); + } + spin_unlock_bh(&serv->sv_lock); + + if (xprt) { + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); + } + } +} + +static inline void svc_copy_addr(struct svc_rqst *rqstp, struct svc_xprt *xprt) +{ + struct sockaddr *sin; + + /* sock_recvmsg doesn't fill in the name/namelen, so we must.. + */ + memcpy(&rqstp->rq_addr, &xprt->xpt_remote, xprt->xpt_remotelen); + rqstp->rq_addrlen = xprt->xpt_remotelen; + + /* Destination address in request is needed for binding the + * source address in RPC callbacks later. + */ + sin = (struct sockaddr *)&xprt->xpt_local; + switch (sin->sa_family) { + case AF_INET: + rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr; + break; + case AF_INET6: + rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr; + break; + } +} + +/* + * Receive the next request on any transport. This code is carefully + * organised not to touch any cachelines in the shared svc_serv + * structure, only cachelines in the local svc_pool. 
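
Seen from a service thread, svc_recv() below reduces to a simple contract: it blocks for up to the given timeout, returns -EINTR when the thread should shut down, -EAGAIN when there is nothing to process, and otherwise the length of a request ready for svc_process(), which sends the reply itself. A minimal sketch of such a loop, simplified from the pattern existing services follow rather than taken from this patch; the function name is a placeholder and the usual svc_rqst/thread setup is assumed to have happened already:

	#include <linux/sunrpc/svc.h>
	#include <linux/sunrpc/svcsock.h>

	static int example_service_thread(struct svc_rqst *rqstp)
	{
		int err;

		for (;;) {
			/* Block for up to 30s waiting for a request on any transport. */
			err = svc_recv(rqstp, 30 * HZ);
			if (err == -EINTR)
				break;		/* signalled: shut the thread down */
			if (err < 0)
				continue;	/* -EAGAIN and friends: nothing yet */
			/* Decode, dispatch, and send the reply. */
			svc_process(rqstp);
		}
		return 0;
	}
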
+ */ +int +svc_recv(struct svc_rqst *rqstp, long timeout) +{ + struct svc_xprt *xprt = NULL; + struct svc_serv *serv = rqstp->rq_server; + struct svc_pool *pool = rqstp->rq_pool; + int len, i; + int pages; + struct xdr_buf *arg; + DECLARE_WAITQUEUE(wait, current); + + dprintk("svc: server %p waiting for data (to = %ld)\n", + rqstp, timeout); + + if (rqstp->rq_xprt) + printk(KERN_ERR + "svc_recv: service %p, transport not NULL!\n", + rqstp); + if (waitqueue_active(&rqstp->rq_wait)) + printk(KERN_ERR + "svc_recv: service %p, wait queue active!\n", + rqstp); + + + /* now allocate needed pages. If we get a failure, sleep briefly */ + pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE; + for (i = 0; i < pages ; i++) + while (rqstp->rq_pages[i] == NULL) { + struct page *p = alloc_page(GFP_KERNEL); + if (!p) + schedule_timeout_uninterruptible(msecs_to_jiffies(500)); + rqstp->rq_pages[i] = p; + } + rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */ + BUG_ON(pages >= RPCSVC_MAXPAGES); + + /* Make arg->head point to first page and arg->pages point to rest */ + arg = &rqstp->rq_arg; + arg->head[0].iov_base = page_address(rqstp->rq_pages[0]); + arg->head[0].iov_len = PAGE_SIZE; + arg->pages = rqstp->rq_pages + 1; + arg->page_base = 0; + /* save at least one page for response */ + arg->page_len = (pages-2)*PAGE_SIZE; + arg->len = (pages-1)*PAGE_SIZE; + arg->tail[0].iov_len = 0; + + try_to_freeze(); + cond_resched(); + if (signalled()) + return -EINTR; + + spin_lock_bh(&pool->sp_lock); + if ((xprt = svc_xprt_dequeue(pool)) != NULL) { + rqstp->rq_xprt = xprt; + svc_xprt_get(xprt); + rqstp->rq_reserved = serv->sv_max_mesg; + atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); + } else { + /* No data pending. Go to sleep */ + svc_thread_enqueue(pool, rqstp); + + /* + * We have to be able to interrupt this wait + * to bring down the daemons ... + */ + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&rqstp->rq_wait, &wait); + spin_unlock_bh(&pool->sp_lock); + + schedule_timeout(timeout); + + try_to_freeze(); + + spin_lock_bh(&pool->sp_lock); + remove_wait_queue(&rqstp->rq_wait, &wait); + + if (!(xprt = rqstp->rq_xprt)) { + svc_thread_dequeue(pool, rqstp); + spin_unlock_bh(&pool->sp_lock); + dprintk("svc: server %p, no data yet\n", rqstp); + return signalled()? 
-EINTR : -EAGAIN; + } + } + spin_unlock_bh(&pool->sp_lock); + + len = 0; + if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) { + dprintk("svc_recv: found XPT_CLOSE\n"); + svc_delete_xprt(xprt); + } else if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) { + struct svc_xprt *newxpt; + newxpt = xprt->xpt_ops.xpo_accept(xprt); + if (newxpt) { + svc_xprt_received(newxpt); + /* + * We know this module_get will succeed because the + * listener holds a reference too + */ + __module_get(newxpt->xpt_class->xcl_owner); + svc_check_conn_limits(xprt->xpt_server); + spin_lock_bh(&serv->sv_lock); + set_bit(XPT_TEMP, &newxpt->xpt_flags); + list_add(&newxpt->xpt_list, &serv->sv_tempsocks); + serv->sv_tmpcnt++; + if (serv->sv_temptimer.function == NULL) { + /* setup timer to age temp transports */ + setup_timer(&serv->sv_temptimer, svc_age_temp_xprts, + (unsigned long)serv); + mod_timer(&serv->sv_temptimer, + jiffies + svc_conn_age_period * HZ); + } + spin_unlock_bh(&serv->sv_lock); + } + svc_xprt_received(xprt); + } else { + dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", + rqstp, pool->sp_id, xprt, + atomic_read(&xprt->xpt_ref.refcount)); + + if ((rqstp->rq_deferred = svc_deferred_dequeue(xprt))) { + svc_xprt_received(xprt); + len = svc_deferred_recv(rqstp); + } else + len = xprt->xpt_ops.xpo_recvfrom(rqstp); + svc_copy_addr(rqstp, xprt); + dprintk("svc: got len=%d\n", len); + } + + /* No data, incomplete (TCP) read, or accept() */ + if (len == 0 || len == -EAGAIN) { + rqstp->rq_res.len = 0; + svc_xprt_release(rqstp); + return -EAGAIN; + } + clear_bit(XPT_OLD, &xprt->xpt_flags); + + rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp)); + rqstp->rq_chandle.defer = svc_defer; + + if (serv->sv_stats) + serv->sv_stats->netcnt++; + return len; +} + +/* + * Drop request + */ +void +svc_drop(struct svc_rqst *rqstp) +{ + dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt); + svc_xprt_release(rqstp); +} + +/* + * Return reply to client. + */ +int +svc_send(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt; + int len; + struct xdr_buf *xb; + + if ((xprt = rqstp->rq_xprt) == NULL) { + printk(KERN_WARNING "NULL transport pointer in %s:%d\n", + __FILE__, __LINE__); + return -EFAULT; + } + + /* release the receive skb before sending the reply */ + rqstp->rq_xprt->xpt_ops.xpo_release(rqstp); + + /* calculate over-all length */ + xb = & rqstp->rq_res; + xb->len = xb->head[0].iov_len + + xb->page_len + + xb->tail[0].iov_len; + + /* Grab mutex to serialize outgoing data. */ + mutex_lock(&xprt->xpt_mutex); + if (test_bit(XPT_DEAD, &xprt->xpt_flags)) + len = -ENOTCONN; + else + len = xprt->xpt_ops.xpo_sendto(rqstp); + mutex_unlock(&xprt->xpt_mutex); + svc_xprt_release(rqstp); + + if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) + return 0; + return len; +} + +/* + * Timer function to close old temporary transports, using + * a mark-and-sweep algorithm. + */ +static void +svc_age_temp_xprts(unsigned long closure) +{ + struct svc_serv *serv = (struct svc_serv *)closure; + struct svc_xprt *xprt; + struct list_head *le, *next; + LIST_HEAD(to_be_aged); + + dprintk("svc_age_temp_xprts\n"); + + if (!spin_trylock_bh(&serv->sv_lock)) { + /* busy, try again 1 sec later */ + dprintk("svc_age_temp_xprts: busy\n"); + mod_timer(&serv->sv_temptimer, jiffies + HZ); + return; + } + + list_for_each_safe(le, next, &serv->sv_tempsocks) { + xprt = list_entry(le, struct svc_xprt, xpt_list); + + /* First time through, just mark it OLD. Second time + * through, close it. 
*/ + if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags)) + continue; + if (atomic_read(&xprt->xpt_ref.refcount) > 1 + || test_bit(XPT_BUSY, &xprt->xpt_flags)) + continue; + svc_xprt_get(xprt); + list_move(le, &to_be_aged); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + set_bit(XPT_DETACHED, &xprt->xpt_flags); + } + spin_unlock_bh(&serv->sv_lock); + + while (!list_empty(&to_be_aged)) { + le = to_be_aged.next; + /* fiddling the xpt_list node is safe 'cos we're XPT_DETACHED */ + list_del_init(le); + xprt = list_entry(le, struct svc_xprt, xpt_list); + + dprintk("queuing xprt %p for closing\n", xprt); + + /* a thread will dequeue and close it soon */ + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); + } + + mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); +} + +/* + * Remove a dead transport + */ +void +svc_delete_xprt(struct svc_xprt *xprt) +{ + struct svc_serv *serv; + + dprintk("svc: svc_delete_xprt(%p)\n", xprt); + + serv = xprt->xpt_server; + + xprt->xpt_ops.xpo_detach(xprt); + + spin_lock_bh(&serv->sv_lock); + + if (!test_and_set_bit(XPT_DETACHED, &xprt->xpt_flags)) + list_del_init(&xprt->xpt_list); + /* + * We used to delete the transport from whichever list + * it's sk_xprt.xpt_ready node was on, but we don't actually + * need to. This is because the only time we're called + * while still attached to a queue, the queue itself + * is about to be destroyed (in svc_destroy). + */ + if (!test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) { + BUG_ON(atomic_read(&xprt->xpt_ref.refcount) < 2); + svc_xprt_put(xprt); + if (test_bit(XPT_TEMP, &xprt->xpt_flags)) + serv->sv_tmpcnt--; + } + + spin_unlock_bh(&serv->sv_lock); +} + +void svc_close_xprt(struct svc_xprt *xprt) +{ + set_bit(XPT_CLOSE, &xprt->xpt_flags); + if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) + /* someone else will have to effect the close */ + return; + + svc_xprt_get(xprt); + svc_delete_xprt(xprt); + clear_bit(XPT_BUSY, &xprt->xpt_flags); + svc_xprt_put(xprt); +} + +void svc_close_all(struct list_head *xprt_list) +{ + struct svc_xprt *xprt; + struct svc_xprt *tmp; + + list_for_each_entry_safe(xprt, tmp, xprt_list, xpt_list) { + set_bit(XPT_CLOSE, &xprt->xpt_flags); + if (test_bit(XPT_BUSY, &xprt->xpt_flags)) { + /* Waiting to be processed, but no threads left, + * So just remove it from the waiting list + */ + list_del_init(&xprt->xpt_ready); + clear_bit(XPT_BUSY, &xprt->xpt_flags); + } + svc_close_xprt(xprt); + } +} + +int svc_port_is_privileged(struct sockaddr *sin) +{ + switch (sin->sa_family) { + case AF_INET: + return ntohs(((struct sockaddr_in *)sin)->sin_port) + < PROT_SOCK; + case AF_INET6: + return ntohs(((struct sockaddr_in6 *)sin)->sin6_port) + < PROT_SOCK; + default: + return 0; + } +} + +/* + * Handle defer and revisit of requests + */ + +static void svc_revisit(struct cache_deferred_req *dreq, int too_many) +{ + struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle); + struct svc_xprt *xprt = dr->xprt; + + if (too_many) { + svc_xprt_put(xprt); + kfree(dr); + return; + } + dprintk("revisit queued\n"); + dr->xprt = NULL; + spin_lock(&xprt->xpt_lock); + list_add(&dr->handle.recent, &xprt->xpt_deferred); + spin_unlock(&xprt->xpt_lock); + set_bit(XPT_DEFERRED, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); +} + +/* + * Save the request off for later processing. The request buffer looks + * like this: + * + * + * + * This code can only handle requests that consist of an xprt-header + * and rpc-header. 
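
The deferral code below copies the saved request starting rq_xprt_hlen bytes before rq_arg.head[0], so transports with no header of their own lose nothing (TCP and UDP set rq_xprt_hlen to 0 in svc_recvfrom()), while a transport that does carry a private header only needs to record its size from xpo_recvfrom(). A hedged sketch of the latter, in which the foo transport, foo_read_message() and FOO_HDR_LEN are all hypothetical and a single-page message is assumed:

	#define FOO_HDR_LEN 24		/* placeholder transport header size */
	int foo_read_message(struct svc_rqst *rqstp, void *buf);	/* placeholder */

	static int svc_foo_recvfrom(struct svc_rqst *rqstp)
	{
		void *buf = page_address(rqstp->rq_pages[0]);
		int len;

		len = foo_read_message(rqstp, buf);	/* placeholder receive */
		if (len <= 0)
			return len;

		/*
		 * The first FOO_HDR_LEN bytes are the transport's own header.
		 * Point head[0] at the RPC header that follows it, and record
		 * the transport header size so svc_defer() can back up over it
		 * when saving <xprt-header><rpc-header> for a later revisit.
		 */
		rqstp->rq_arg.head[0].iov_base = buf + FOO_HDR_LEN;
		rqstp->rq_arg.head[0].iov_len  = len - FOO_HDR_LEN;
		rqstp->rq_arg.page_len         = 0;
		rqstp->rq_arg.len              = len - FOO_HDR_LEN;
		rqstp->rq_xprt_hlen            = FOO_HDR_LEN;

		return len - FOO_HDR_LEN;
	}
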
+ */ +static struct cache_deferred_req * +svc_defer(struct cache_req *req) +{ + struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); + struct svc_deferred_req *dr; + + if (rqstp->rq_arg.page_len) + return NULL; /* if more than a page, give up FIXME */ + if (rqstp->rq_deferred) { + dr = rqstp->rq_deferred; + rqstp->rq_deferred = NULL; + } else { + int skip; + int size; + /* FIXME maybe discard if size too large */ + size = sizeof(struct svc_deferred_req) + rqstp->rq_arg.len + + rqstp->rq_xprt_hlen; + dr = kmalloc(size, GFP_KERNEL); + if (dr == NULL) + return NULL; + + dr->handle.owner = rqstp->rq_server; + dr->prot = rqstp->rq_prot; + memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen); + dr->addrlen = rqstp->rq_addrlen; + dr->daddr = rqstp->rq_daddr; + dr->argslen = (rqstp->rq_arg.len + rqstp->rq_xprt_hlen) >> 2; + + /* back up head to the start of the buffer and copy */ + skip = (rqstp->rq_arg.len + rqstp->rq_xprt_hlen) - + rqstp->rq_arg.head[0].iov_len; + memcpy(dr->args, rqstp->rq_arg.head[0].iov_base - skip, + dr->argslen << 2); + } + svc_xprt_get(rqstp->rq_xprt); + dr->xprt = rqstp->rq_xprt; + + dr->handle.revisit = svc_revisit; + return &dr->handle; +} + +/* + * recv data from a deferred request into an active one + */ +static int svc_deferred_recv(struct svc_rqst *rqstp) +{ + struct svc_deferred_req *dr = rqstp->rq_deferred; + + rqstp->rq_arg.head[0].iov_base = dr->args; + rqstp->rq_arg.head[0].iov_len = dr->argslen<<2; + rqstp->rq_arg.page_len = 0; + rqstp->rq_arg.len = dr->argslen<<2; + rqstp->rq_prot = dr->prot; + memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen); + rqstp->rq_addrlen = dr->addrlen; + rqstp->rq_daddr = dr->daddr; + rqstp->rq_respages = rqstp->rq_pages; + return dr->argslen<<2; +} + + +static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) +{ + struct svc_deferred_req *dr = NULL; + + if (!test_bit(XPT_DEFERRED, &xprt->xpt_flags)) + return NULL; + spin_lock(&xprt->xpt_lock); + clear_bit(XPT_DEFERRED, &xprt->xpt_flags); + if (!list_empty(&xprt->xpt_deferred)) { + dr = list_entry(xprt->xpt_deferred.next, + struct svc_deferred_req, + handle.recent); + list_del_init(&dr->handle.recent); + set_bit(XPT_DEFERRED, &xprt->xpt_flags); + } + spin_unlock(&xprt->xpt_lock); + return dr; +} diff -puN net/sunrpc/svcauth_unix.c~git-nfsd net/sunrpc/svcauth_unix.c --- a/net/sunrpc/svcauth_unix.c~git-nfsd +++ a/net/sunrpc/svcauth_unix.c @@ -384,41 +384,45 @@ void svcauth_unix_purge(void) static inline struct ip_map * ip_map_cached_get(struct svc_rqst *rqstp) { - struct ip_map *ipm; - struct svc_sock *svsk = rqstp->rq_sock; - spin_lock(&svsk->sk_lock); - ipm = svsk->sk_info_authunix; - if (ipm != NULL) { - if (!cache_valid(&ipm->h)) { - /* - * The entry has been invalidated since it was - * remembered, e.g. by a second mount from the - * same IP address. - */ - svsk->sk_info_authunix = NULL; - spin_unlock(&svsk->sk_lock); - cache_put(&ipm->h, &ip_map_cache); - return NULL; + struct ip_map *ipm = NULL; + struct svc_xprt *xprt = rqstp->rq_xprt; + + if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) { + spin_lock(&xprt->xpt_lock); + ipm = xprt->xpt_auth_cache; + if (ipm != NULL) { + if (!cache_valid(&ipm->h)) { + /* + * The entry has been invalidated since it was + * remembered, e.g. by a second mount from the + * same IP address. 
+ */ + xprt->xpt_auth_cache = NULL; + spin_unlock(&xprt->xpt_lock); + cache_put(&ipm->h, &ip_map_cache); + return NULL; + } + cache_get(&ipm->h); } - cache_get(&ipm->h); + spin_unlock(&xprt->xpt_lock); } - spin_unlock(&svsk->sk_lock); return ipm; } static inline void ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm) { - struct svc_sock *svsk = rqstp->rq_sock; + struct svc_xprt *xprt = rqstp->rq_xprt; - spin_lock(&svsk->sk_lock); - if (svsk->sk_sock->type == SOCK_STREAM && - svsk->sk_info_authunix == NULL) { - /* newly cached, keep the reference */ - svsk->sk_info_authunix = ipm; - ipm = NULL; + if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) { + spin_lock(&xprt->xpt_lock); + if (xprt->xpt_auth_cache == NULL) { + /* newly cached, keep the reference */ + xprt->xpt_auth_cache = ipm; + ipm = NULL; + } + spin_unlock(&xprt->xpt_lock); } - spin_unlock(&svsk->sk_lock); if (ipm) cache_put(&ipm->h, &ip_map_cache); } diff -puN net/sunrpc/svcsock.c~git-nfsd net/sunrpc/svcsock.c --- a/net/sunrpc/svcsock.c~git-nfsd +++ a/net/sunrpc/svcsock.c @@ -5,7 +5,7 @@ * * The server scheduling algorithm does not always distribute the load * evenly when servicing a single client. May need to modify the - * svc_sock_enqueue procedure... + * svc_xprt_enqueue procedure... * * TCP support is largely untested and may be a little slow. The problem * is that we currently do two separate recvfrom's, one for the 4-byte @@ -51,51 +51,44 @@ /* SMP locking strategy: * * svc_pool->sp_lock protects most of the fields of that pool. - * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. + * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. * when both need to be taken (rare), svc_serv->sv_lock is first. * BKL protects svc_serv->sv_nrthread. * svc_sock->sk_lock protects the svc_sock->sk_deferred list * and the ->sk_info_authunix cache. - * svc_sock->sk_flags.SK_BUSY prevents a svc_sock being enqueued multiply. + * svc_sock->sk_xprt.xpt_flags.XPT_BUSY prevents a svc_sock being + * enqueued multiply. * * Some flags can be set to certain values at any time * providing that certain rules are followed: * - * SK_CONN, SK_DATA, can be set or cleared at any time. - * after a set, svc_sock_enqueue must be called. + * XPT_CONN, XPT_DATA, can be set or cleared at any time. + * after a set, svc_xprt_enqueue must be called. * after a clear, the socket must be read/accepted * if this succeeds, it must be set again. - * SK_CLOSE can set at any time. It is never cleared. - * sk_inuse contains a bias of '1' until SK_DEAD is set. - * so when sk_inuse hits zero, we know the socket is dead + * XPT_CLOSE can set at any time. It is never cleared. + * xpt_ref contains a bias of '1' until XPT_DEAD is set. + * so when xprt_ref hits zero, we know the transport is dead * and no-one is using it. - * SK_DEAD can only be set while SK_BUSY is held which ensures + * XPT_DEAD can only be set while XPT_BUSY is held which ensures * no other thread will be using the socket or will try to - * set SK_DEAD. + * set XPT_DEAD. 
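
In practice the flag rules above boil down to one idiom: a transport signals new work by setting the flag and then enqueuing itself, which is safe at any time. A tiny sketch of that idiom, mirroring what svc_udp_data_ready() and svc_tcp_data_ready() below do; the callback name and its svc_xprt argument are hypothetical simplifications (the real socket callbacks receive a struct sock *):

	static void svc_foo_data_ready(struct svc_xprt *xprt)
	{
		/* Mark data pending, then (re)queue the transport so an
		 * idle nfsd thread can pick it up. */
		set_bit(XPT_DATA, &xprt->xpt_flags);
		svc_xprt_enqueue(xprt);
	}
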
* */ -#define RPCDBG_FACILITY RPCDBG_SVCSOCK +#define RPCDBG_FACILITY RPCDBG_SVCXPRT static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, int *errp, int flags); -static void svc_delete_socket(struct svc_sock *svsk); static void svc_udp_data_ready(struct sock *, int); static int svc_udp_recvfrom(struct svc_rqst *); static int svc_udp_sendto(struct svc_rqst *); -static void svc_close_socket(struct svc_sock *svsk); +static void svc_sock_detach(struct svc_xprt *); +static void svc_sock_free(struct svc_xprt *); -static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk); -static int svc_deferred_recv(struct svc_rqst *rqstp); -static struct cache_deferred_req *svc_defer(struct cache_req *req); - -/* apparently the "standard" is that clients close - * idle connections after 5 minutes, servers after - * 6 minutes - * http://www.connectathon.org/talks96/nfstcp.pdf - */ -static int svc_conn_age_period = 6*60; +static struct svc_xprt * +svc_create_socket(struct svc_serv *, int, struct sockaddr *, int, int); #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key svc_key[2]; @@ -162,40 +155,21 @@ char *svc_print_addr(struct svc_rqst *rq EXPORT_SYMBOL_GPL(svc_print_addr); /* - * Queue up an idle server thread. Must have pool->sp_lock held. - * Note: this is really a stack rather than a queue, so that we only - * use as many different threads as we need, and the rest don't pollute - * the cache. - */ -static inline void -svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp) -{ - list_add(&rqstp->rq_list, &pool->sp_threads); -} - -/* - * Dequeue an nfsd thread. Must have pool->sp_lock held. - */ -static inline void -svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp) -{ - list_del(&rqstp->rq_list); -} - -/* * Release an skbuff after use */ -static inline void +static void svc_release_skb(struct svc_rqst *rqstp) { - struct sk_buff *skb = rqstp->rq_skbuff; + struct sk_buff *skb = rqstp->rq_xprt_ctxt; struct svc_deferred_req *dr = rqstp->rq_deferred; if (skb) { - rqstp->rq_skbuff = NULL; + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + rqstp->rq_xprt_ctxt = NULL; dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); - skb_free_datagram(rqstp->rq_sock->sk_sk, skb); + skb_free_datagram(svsk->sk_sk, skb); } if (dr) { rqstp->rq_deferred = NULL; @@ -219,237 +193,6 @@ svc_sock_wspace(struct svc_sock *svsk) return wspace; } -/* - * Queue up a socket with data pending. If there are idle nfsd - * processes, wake 'em up. - * - */ -static void -svc_sock_enqueue(struct svc_sock *svsk) -{ - struct svc_serv *serv = svsk->sk_server; - struct svc_pool *pool; - struct svc_rqst *rqstp; - int cpu; - - if (!(svsk->sk_flags & - ( (1<sk_flags)) - return; - - cpu = get_cpu(); - pool = svc_pool_for_cpu(svsk->sk_server, cpu); - put_cpu(); - - spin_lock_bh(&pool->sp_lock); - - if (!list_empty(&pool->sp_threads) && - !list_empty(&pool->sp_sockets)) - printk(KERN_ERR - "svc_sock_enqueue: threads and sockets both waiting??\n"); - - if (test_bit(SK_DEAD, &svsk->sk_flags)) { - /* Don't enqueue dead sockets */ - dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk); - goto out_unlock; - } - - /* Mark socket as busy. It will remain in this state until the - * server has processed all pending data and put the socket back - * on the idle list. We update SK_BUSY atomically because - * it also guards against trying to enqueue the svc_sock twice. 
- */ - if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) { - /* Don't enqueue socket while already enqueued */ - dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk); - goto out_unlock; - } - BUG_ON(svsk->sk_pool != NULL); - svsk->sk_pool = pool; - - set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); - if (((atomic_read(&svsk->sk_reserved) + serv->sv_max_mesg)*2 - > svc_sock_wspace(svsk)) - && !test_bit(SK_CLOSE, &svsk->sk_flags) - && !test_bit(SK_CONN, &svsk->sk_flags)) { - /* Don't enqueue while not enough space for reply */ - dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n", - svsk->sk_sk, atomic_read(&svsk->sk_reserved)+serv->sv_max_mesg, - svc_sock_wspace(svsk)); - svsk->sk_pool = NULL; - clear_bit(SK_BUSY, &svsk->sk_flags); - goto out_unlock; - } - clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); - - - if (!list_empty(&pool->sp_threads)) { - rqstp = list_entry(pool->sp_threads.next, - struct svc_rqst, - rq_list); - dprintk("svc: socket %p served by daemon %p\n", - svsk->sk_sk, rqstp); - svc_thread_dequeue(pool, rqstp); - if (rqstp->rq_sock) - printk(KERN_ERR - "svc_sock_enqueue: server %p, rq_sock=%p!\n", - rqstp, rqstp->rq_sock); - rqstp->rq_sock = svsk; - atomic_inc(&svsk->sk_inuse); - rqstp->rq_reserved = serv->sv_max_mesg; - atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); - BUG_ON(svsk->sk_pool != pool); - wake_up(&rqstp->rq_wait); - } else { - dprintk("svc: socket %p put into queue\n", svsk->sk_sk); - list_add_tail(&svsk->sk_ready, &pool->sp_sockets); - BUG_ON(svsk->sk_pool != pool); - } - -out_unlock: - spin_unlock_bh(&pool->sp_lock); -} - -/* - * Dequeue the first socket. Must be called with the pool->sp_lock held. - */ -static inline struct svc_sock * -svc_sock_dequeue(struct svc_pool *pool) -{ - struct svc_sock *svsk; - - if (list_empty(&pool->sp_sockets)) - return NULL; - - svsk = list_entry(pool->sp_sockets.next, - struct svc_sock, sk_ready); - list_del_init(&svsk->sk_ready); - - dprintk("svc: socket %p dequeued, inuse=%d\n", - svsk->sk_sk, atomic_read(&svsk->sk_inuse)); - - return svsk; -} - -/* - * Having read something from a socket, check whether it - * needs to be re-enqueued. - * Note: SK_DATA only gets cleared when a read-attempt finds - * no (or insufficient) data. - */ -static inline void -svc_sock_received(struct svc_sock *svsk) -{ - svsk->sk_pool = NULL; - clear_bit(SK_BUSY, &svsk->sk_flags); - svc_sock_enqueue(svsk); -} - - -/** - * svc_reserve - change the space reserved for the reply to a request. - * @rqstp: The request in question - * @space: new max space to reserve - * - * Each request reserves some space on the output queue of the socket - * to make sure the reply fits. This function reduces that reserved - * space to be the amount of space used already, plus @space. - * - */ -void svc_reserve(struct svc_rqst *rqstp, int space) -{ - space += rqstp->rq_res.head[0].iov_len; - - if (space < rqstp->rq_reserved) { - struct svc_sock *svsk = rqstp->rq_sock; - atomic_sub((rqstp->rq_reserved - space), &svsk->sk_reserved); - rqstp->rq_reserved = space; - - svc_sock_enqueue(svsk); - } -} - -/* - * Release a socket after use. - */ -static inline void -svc_sock_put(struct svc_sock *svsk) -{ - if (atomic_dec_and_test(&svsk->sk_inuse)) { - BUG_ON(! 
test_bit(SK_DEAD, &svsk->sk_flags)); - - dprintk("svc: releasing dead socket\n"); - if (svsk->sk_sock->file) - sockfd_put(svsk->sk_sock); - else - sock_release(svsk->sk_sock); - if (svsk->sk_info_authunix != NULL) - svcauth_unix_info_release(svsk->sk_info_authunix); - kfree(svsk); - } -} - -static void -svc_sock_release(struct svc_rqst *rqstp) -{ - struct svc_sock *svsk = rqstp->rq_sock; - - svc_release_skb(rqstp); - - svc_free_res_pages(rqstp); - rqstp->rq_res.page_len = 0; - rqstp->rq_res.page_base = 0; - - - /* Reset response buffer and release - * the reservation. - * But first, check that enough space was reserved - * for the reply, otherwise we have a bug! - */ - if ((rqstp->rq_res.len) > rqstp->rq_reserved) - printk(KERN_ERR "RPC request reserved %d but used %d\n", - rqstp->rq_reserved, - rqstp->rq_res.len); - - rqstp->rq_res.head[0].iov_len = 0; - svc_reserve(rqstp, 0); - rqstp->rq_sock = NULL; - - svc_sock_put(svsk); -} - -/* - * External function to wake up a server waiting for data - * This really only makes sense for services like lockd - * which have exactly one thread anyway. - */ -void -svc_wake_up(struct svc_serv *serv) -{ - struct svc_rqst *rqstp; - unsigned int i; - struct svc_pool *pool; - - for (i = 0; i < serv->sv_nrpools; i++) { - pool = &serv->sv_pools[i]; - - spin_lock_bh(&pool->sp_lock); - if (!list_empty(&pool->sp_threads)) { - rqstp = list_entry(pool->sp_threads.next, - struct svc_rqst, - rq_list); - dprintk("svc: daemon %p woken up.\n", rqstp); - /* - svc_thread_dequeue(pool, rqstp); - rqstp->rq_sock = NULL; - */ - wake_up(&rqstp->rq_wait); - } - spin_unlock_bh(&pool->sp_lock); - } -} - union svc_pktinfo_u { struct in_pktinfo pkti; struct in6_pktinfo pkti6; @@ -459,7 +202,9 @@ union svc_pktinfo_u { static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) { - switch (rqstp->rq_sock->sk_sk->sk_family) { + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + switch (svsk->sk_sk->sk_family) { case AF_INET: { struct in_pktinfo *pki = CMSG_DATA(cmh); @@ -492,7 +237,8 @@ static void svc_set_cmsg_data(struct svc static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) { - struct svc_sock *svsk = rqstp->rq_sock; + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); struct socket *sock = svsk->sk_sock; int slen; union { @@ -565,7 +311,7 @@ svc_sendto(struct svc_rqst *rqstp, struc } out: dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n", - rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, + svsk, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf))); return len; @@ -602,7 +348,7 @@ svc_sock_names(char *buf, struct svc_ser if (!serv) return 0; spin_lock_bh(&serv->sv_lock); - list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) { + list_for_each_entry(svsk, &serv->sv_permsocks, sk_xprt.xpt_list) { int onelen = one_sock_name(buf+len, svsk); if (toclose && strcmp(toclose, buf+len) == 0) closesk = svsk; @@ -614,7 +360,7 @@ svc_sock_names(char *buf, struct svc_ser /* Should unregister with portmap, but you cannot * unregister just one protocol... 
*/ - svc_close_socket(closesk); + svc_close_xprt(&closesk->sk_xprt); else if (toclose) return -ENOENT; return len; @@ -641,37 +387,21 @@ svc_recv_available(struct svc_sock *svsk static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, int buflen) { - struct svc_sock *svsk = rqstp->rq_sock; + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; - struct sockaddr *sin; int len; + /* TCP/UDP have no transport header */ + rqstp->rq_xprt_hlen = 0; + len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen, msg.msg_flags); - /* sock_recvmsg doesn't fill in the name/namelen, so we must.. - */ - memcpy(&rqstp->rq_addr, &svsk->sk_remote, svsk->sk_remotelen); - rqstp->rq_addrlen = svsk->sk_remotelen; - - /* Destination address in request is needed for binding the - * source address in RPC callbacks later. - */ - sin = (struct sockaddr *)&svsk->sk_local; - switch (sin->sa_family) { - case AF_INET: - rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr; - break; - case AF_INET6: - rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr; - break; - } - dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", svsk, iov[0].iov_base, iov[0].iov_len, len); - return len; } @@ -711,9 +441,10 @@ svc_udp_data_ready(struct sock *sk, int if (svsk) { dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n", - svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags)); - set_bit(SK_DATA, &svsk->sk_flags); - svc_sock_enqueue(svsk); + svsk, sk, count, + test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); } if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible(sk->sk_sleep); @@ -729,8 +460,8 @@ svc_write_space(struct sock *sk) if (svsk) { dprintk("svc: socket %p(inet %p), write_space busy=%d\n", - svsk, sk, test_bit(SK_BUSY, &svsk->sk_flags)); - svc_sock_enqueue(svsk); + svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); + svc_xprt_enqueue(&svsk->sk_xprt); } if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) { @@ -743,7 +474,9 @@ svc_write_space(struct sock *sk) static inline void svc_udp_get_dest_address(struct svc_rqst *rqstp, struct cmsghdr *cmh) { - switch (rqstp->rq_sock->sk_sk->sk_family) { + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + switch (svsk->sk_sk->sk_family) { case AF_INET: { struct in_pktinfo *pki = CMSG_DATA(cmh); rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr; @@ -763,8 +496,9 @@ static inline void svc_udp_get_dest_addr static int svc_udp_recvfrom(struct svc_rqst *rqstp) { - struct svc_sock *svsk = rqstp->rq_sock; - struct svc_serv *serv = svsk->sk_server; + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + struct svc_serv *serv = svsk->sk_xprt.xpt_server; struct sk_buff *skb; union { struct cmsghdr hdr; @@ -779,7 +513,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) .msg_flags = MSG_DONTWAIT, }; - if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) + if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) /* udp sockets need large rcvbuf as all pending * requests are still in that buffer. 
sndbuf must * also be large enough that there is enough space @@ -792,17 +526,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) (serv->sv_nrthreads+3) * serv->sv_max_mesg, (serv->sv_nrthreads+3) * serv->sv_max_mesg); - if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { - svc_sock_received(svsk); - return svc_deferred_recv(rqstp); - } - - if (test_bit(SK_CLOSE, &svsk->sk_flags)) { - svc_delete_socket(svsk); - return 0; - } - - clear_bit(SK_DATA, &svsk->sk_flags); + clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); skb = NULL; err = kernel_recvmsg(svsk->sk_sock, &msg, NULL, 0, 0, MSG_PEEK | MSG_DONTWAIT); @@ -813,9 +537,9 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) if (err != -EAGAIN) { /* possibly an icmp error */ dprintk("svc: recvfrom returned error %d\n", -err); - set_bit(SK_DATA, &svsk->sk_flags); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); } - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); return -EAGAIN; } rqstp->rq_addrlen = sizeof(rqstp->rq_addr); @@ -825,12 +549,12 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) need that much accuracy */ } svsk->sk_sk->sk_stamp = skb->tstamp; - set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */ + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */ /* * Maybe more packets - kick another thread ASAP. */ - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); len = skb->len - sizeof(struct udphdr); rqstp->rq_arg.len = len; @@ -867,7 +591,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) skb_free_datagram(svsk->sk_sk, skb); return 0; } - rqstp->rq_skbuff = skb; + rqstp->rq_xprt_ctxt = skb; } rqstp->rq_arg.page_base = 0; @@ -901,26 +625,83 @@ svc_udp_sendto(struct svc_rqst *rqstp) } static void -svc_udp_init(struct svc_sock *svsk) +svc_udp_prep_reply_hdr(struct svc_rqst *rqstp) +{ +} + +static int +svc_udp_has_wspace(struct svc_xprt *xprt) +{ + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct svc_serv *serv = svsk->sk_xprt.xpt_server; + int required; + + /* + * Set the SOCK_NOSPACE flag before checking the available + * sock space. 
+ */ + set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg; + if (required*2 > sock_wspace(svsk->sk_sk)) + return 0; + clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + return 1; +} + +static struct svc_xprt * +svc_udp_accept(struct svc_xprt *xprt) +{ + BUG(); + return NULL; +} + +static struct svc_xprt * +svc_udp_create(struct svc_serv *serv, struct sockaddr *sa, int flags) +{ + return svc_create_socket(serv, IPPROTO_UDP, sa, + sizeof(struct sockaddr_in), flags); +} + +static struct svc_xprt_ops svc_udp_ops = { + .xpo_create = svc_udp_create, + .xpo_recvfrom = svc_udp_recvfrom, + .xpo_sendto = svc_udp_sendto, + .xpo_release = svc_release_skb, + .xpo_detach = svc_sock_detach, + .xpo_free = svc_sock_free, + .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr, + .xpo_has_wspace = svc_udp_has_wspace, + .xpo_accept = svc_udp_accept, +}; + +static struct svc_xprt_class svc_udp_class = { + .xcl_name = "udp", + .xcl_owner = THIS_MODULE, + .xcl_ops = &svc_udp_ops, + .xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP, +}; + +static void +svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) { int one = 1; mm_segment_t oldfs; + svc_xprt_init(&svc_udp_class, &svsk->sk_xprt, serv); + clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); svsk->sk_sk->sk_data_ready = svc_udp_data_ready; svsk->sk_sk->sk_write_space = svc_write_space; - svsk->sk_recvfrom = svc_udp_recvfrom; - svsk->sk_sendto = svc_udp_sendto; /* initialise setting must have enough space to * receive and respond to one request. * svc_udp_recvfrom will re-adjust if necessary */ svc_sock_setbufsize(svsk->sk_sock, - 3 * svsk->sk_server->sv_max_mesg, - 3 * svsk->sk_server->sv_max_mesg); + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg, + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg); - set_bit(SK_DATA, &svsk->sk_flags); /* might have come in before data_ready set up */ - set_bit(SK_CHNGBUF, &svsk->sk_flags); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* might have come in before data_ready set up */ + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); oldfs = get_fs(); set_fs(KERNEL_DS); @@ -954,8 +735,8 @@ svc_tcp_listen_data_ready(struct sock *s */ if (sk->sk_state == TCP_LISTEN) { if (svsk) { - set_bit(SK_CONN, &svsk->sk_flags); - svc_sock_enqueue(svsk); + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); } else printk("svc: socket %p: no user data\n", sk); } @@ -978,8 +759,8 @@ svc_tcp_state_change(struct sock *sk) if (!svsk) printk("svc: socket %p: no user data\n", sk); else { - set_bit(SK_CLOSE, &svsk->sk_flags); - svc_sock_enqueue(svsk); + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); } if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible_all(sk->sk_sleep); @@ -993,36 +774,23 @@ svc_tcp_data_ready(struct sock *sk, int dprintk("svc: socket %p TCP data ready (svsk %p)\n", sk, sk->sk_user_data); if (svsk) { - set_bit(SK_DATA, &svsk->sk_flags); - svc_sock_enqueue(svsk); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); } if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible(sk->sk_sleep); } -static inline int svc_port_is_privileged(struct sockaddr *sin) -{ - switch (sin->sa_family) { - case AF_INET: - return ntohs(((struct sockaddr_in *)sin)->sin_port) - < PROT_SOCK; - case AF_INET6: - return ntohs(((struct sockaddr_in6 *)sin)->sin6_port) - < PROT_SOCK; - default: - return 0; - } -} - /* * Accept a TCP connection */ -static void -svc_tcp_accept(struct 
svc_sock *svsk) +static struct svc_xprt * +svc_tcp_accept(struct svc_xprt *xprt) { + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); struct sockaddr_storage addr; struct sockaddr *sin = (struct sockaddr *) &addr; - struct svc_serv *serv = svsk->sk_server; + struct svc_serv *serv = svsk->sk_xprt.xpt_server; struct socket *sock = svsk->sk_sock; struct socket *newsock; struct svc_sock *newsvsk; @@ -1031,9 +799,9 @@ svc_tcp_accept(struct svc_sock *svsk) dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); if (!sock) - return; + return NULL; - clear_bit(SK_CONN, &svsk->sk_flags); + clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); err = kernel_accept(sock, &newsock, O_NONBLOCK); if (err < 0) { if (err == -ENOMEM) @@ -1042,11 +810,11 @@ svc_tcp_accept(struct svc_sock *svsk) else if (err != -EAGAIN && net_ratelimit()) printk(KERN_WARNING "%s: accept failed (err %d)!\n", serv->sv_name, -err); - return; + return NULL; } - set_bit(SK_CONN, &svsk->sk_flags); - svc_sock_enqueue(svsk); + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); err = kernel_getpeername(newsock, sin, &slen); if (err < 0) { @@ -1077,70 +845,23 @@ svc_tcp_accept(struct svc_sock *svsk) if (!(newsvsk = svc_setup_socket(serv, newsock, &err, (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY)))) goto failed; - memcpy(&newsvsk->sk_remote, sin, slen); - newsvsk->sk_remotelen = slen; + memcpy(&newsvsk->sk_xprt.xpt_remote, sin, slen); + newsvsk->sk_xprt.xpt_remotelen = slen; err = kernel_getsockname(newsock, sin, &slen); if (unlikely(err < 0)) { dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err); slen = offsetof(struct sockaddr, sa_data); } - memcpy(&newsvsk->sk_local, sin, slen); - - svc_sock_received(newsvsk); - - /* make sure that we don't have too many active connections. - * If we have, something must be dropped. - * - * There's no point in trying to do random drop here for - * DoS prevention. The NFS clients does 1 reconnect in 15 - * seconds. An attacker can easily beat that. - * - * The only somewhat efficient mechanism would be if drop - * old connections from the same IP first. But right now - * we don't even record the client IP in svc_sock. - */ - if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) { - struct svc_sock *svsk = NULL; - spin_lock_bh(&serv->sv_lock); - if (!list_empty(&serv->sv_tempsocks)) { - if (net_ratelimit()) { - /* Try to help the admin */ - printk(KERN_NOTICE "%s: too many open TCP " - "sockets, consider increasing the " - "number of nfsd threads\n", - serv->sv_name); - printk(KERN_NOTICE - "%s: last TCP connect from %s\n", - serv->sv_name, __svc_print_addr(sin, - buf, sizeof(buf))); - } - /* - * Always select the oldest socket. 
It's not fair, - * but so is life - */ - svsk = list_entry(serv->sv_tempsocks.prev, - struct svc_sock, - sk_list); - set_bit(SK_CLOSE, &svsk->sk_flags); - atomic_inc(&svsk->sk_inuse); - } - spin_unlock_bh(&serv->sv_lock); - - if (svsk) { - svc_sock_enqueue(svsk); - svc_sock_put(svsk); - } - - } + memcpy(&newsvsk->sk_xprt.xpt_local, sin, slen); if (serv->sv_stats) serv->sv_stats->nettcpconn++; - return; + return &newsvsk->sk_xprt; failed: sock_release(newsock); - return; + return NULL; } /* @@ -1149,34 +870,19 @@ failed: static int svc_tcp_recvfrom(struct svc_rqst *rqstp) { - struct svc_sock *svsk = rqstp->rq_sock; - struct svc_serv *serv = svsk->sk_server; + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + struct svc_serv *serv = svsk->sk_xprt.xpt_server; int len; struct kvec *vec; int pnum, vlen; dprintk("svc: tcp_recv %p data %d conn %d close %d\n", - svsk, test_bit(SK_DATA, &svsk->sk_flags), - test_bit(SK_CONN, &svsk->sk_flags), - test_bit(SK_CLOSE, &svsk->sk_flags)); - - if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { - svc_sock_received(svsk); - return svc_deferred_recv(rqstp); - } + svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags), + test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags), + test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); - if (test_bit(SK_CLOSE, &svsk->sk_flags)) { - svc_delete_socket(svsk); - return 0; - } - - if (svsk->sk_sk->sk_state == TCP_LISTEN) { - svc_tcp_accept(svsk); - svc_sock_received(svsk); - return 0; - } - - if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) + if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) /* sndbuf needs to have room for one request * per thread, otherwise we can stall even when the * network isn't a bottleneck. @@ -1193,7 +899,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) (serv->sv_nrthreads+3) * serv->sv_max_mesg, 3 * serv->sv_max_mesg); - clear_bit(SK_DATA, &svsk->sk_flags); + clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* Receive data. If we haven't got the record length yet, get * the next four bytes. 
Otherwise try to gobble up as much as @@ -1212,7 +918,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) if (len < want) { dprintk("svc: short recvfrom while reading record length (%d of %lu)\n", len, want); - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); return -EAGAIN; /* record header not complete */ } @@ -1248,11 +954,11 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) if (len < svsk->sk_reclen) { dprintk("svc: incomplete TCP record (%d of %d)\n", len, svsk->sk_reclen); - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); return -EAGAIN; /* record not complete */ } len = svsk->sk_reclen; - set_bit(SK_DATA, &svsk->sk_flags); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); vec = rqstp->rq_vec; vec[0] = rqstp->rq_arg.head[0]; @@ -1281,30 +987,30 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; } - rqstp->rq_skbuff = NULL; + rqstp->rq_xprt_ctxt = NULL; rqstp->rq_prot = IPPROTO_TCP; /* Reset TCP read info */ svsk->sk_reclen = 0; svsk->sk_tcplen = 0; - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); if (serv->sv_stats) serv->sv_stats->nettcpcnt++; return len; err_delete: - svc_delete_socket(svsk); + svc_delete_xprt(&svsk->sk_xprt); return -EAGAIN; error: if (len == -EAGAIN) { dprintk("RPC: TCP recvfrom got EAGAIN\n"); - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); } else { printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", - svsk->sk_server->sv_name, -len); + svsk->sk_xprt.xpt_server->sv_name, -len); goto err_delete; } @@ -1328,35 +1034,103 @@ svc_tcp_sendto(struct svc_rqst *rqstp) reclen = htonl(0x80000000|((xbufp->len ) - 4)); memcpy(xbufp->head[0].iov_base, &reclen, 4); - if (test_bit(SK_DEAD, &rqstp->rq_sock->sk_flags)) + if (test_bit(XPT_DEAD, &rqstp->rq_xprt->xpt_flags)) return -ENOTCONN; sent = svc_sendto(rqstp, &rqstp->rq_res); if (sent != xbufp->len) { printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", - rqstp->rq_sock->sk_server->sv_name, + rqstp->rq_xprt->xpt_server->sv_name, (sent<0)?"got error":"sent only", sent, xbufp->len); - set_bit(SK_CLOSE, &rqstp->rq_sock->sk_flags); - svc_sock_enqueue(rqstp->rq_sock); + set_bit(XPT_CLOSE, &rqstp->rq_xprt->xpt_flags); + svc_xprt_enqueue(rqstp->rq_xprt); sent = -EAGAIN; } return sent; } +/* + * Setup response header. TCP has a 4B record length field. + */ +static void +svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp) +{ + struct kvec *resv = &rqstp->rq_res.head[0]; + + /* tcp needs a space for the record length... */ + svc_putnl(resv, 0); +} + +static int +svc_tcp_has_wspace(struct svc_xprt *xprt) +{ + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct svc_serv *serv = svsk->sk_xprt.xpt_server; + int required; + + /* + * Set the SOCK_NOSPACE flag before checking the available + * sock space. 
+ */ + set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg; + if (required*2 > sk_stream_wspace(svsk->sk_sk)) + return 0; + clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + return 1; +} + +static struct svc_xprt * +svc_tcp_create(struct svc_serv *serv, struct sockaddr *sa, int flags) +{ + return svc_create_socket(serv, IPPROTO_TCP, sa, + sizeof(struct sockaddr_in), flags); +} + +static struct svc_xprt_ops svc_tcp_ops = { + .xpo_create = svc_tcp_create, + .xpo_recvfrom = svc_tcp_recvfrom, + .xpo_sendto = svc_tcp_sendto, + .xpo_release = svc_release_skb, + .xpo_detach = svc_sock_detach, + .xpo_free = svc_sock_free, + .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr, + .xpo_has_wspace = svc_tcp_has_wspace, + .xpo_accept = svc_tcp_accept, +}; + +static struct svc_xprt_class svc_tcp_class = { + .xcl_name = "tcp", + .xcl_owner = THIS_MODULE, + .xcl_ops = &svc_tcp_ops, + .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, +}; + +void svc_init_xprt_sock(void) +{ + svc_reg_xprt_class(&svc_tcp_class); + svc_reg_xprt_class(&svc_udp_class); +} + +void svc_cleanup_xprt_sock(void) +{ + svc_unreg_xprt_class(&svc_tcp_class); + svc_unreg_xprt_class(&svc_udp_class); +} + static void -svc_tcp_init(struct svc_sock *svsk) +svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) { struct sock *sk = svsk->sk_sk; struct tcp_sock *tp = tcp_sk(sk); - svsk->sk_recvfrom = svc_tcp_recvfrom; - svsk->sk_sendto = svc_tcp_sendto; - + svc_xprt_init(&svc_tcp_class, &svsk->sk_xprt, serv); + set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); if (sk->sk_state == TCP_LISTEN) { dprintk("setting up TCP socket for listening\n"); sk->sk_data_ready = svc_tcp_listen_data_ready; - set_bit(SK_CONN, &svsk->sk_flags); + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); } else { dprintk("setting up TCP socket for reading\n"); sk->sk_state_change = svc_tcp_state_change; @@ -1373,13 +1147,13 @@ svc_tcp_init(struct svc_sock *svsk) * svc_tcp_recvfrom will re-adjust if necessary */ svc_sock_setbufsize(svsk->sk_sock, - 3 * svsk->sk_server->sv_max_mesg, - 3 * svsk->sk_server->sv_max_mesg); + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg, + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg); - set_bit(SK_CHNGBUF, &svsk->sk_flags); - set_bit(SK_DATA, &svsk->sk_flags); + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); if (sk->sk_state != TCP_ESTABLISHED) - set_bit(SK_CLOSE, &svsk->sk_flags); + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); } } @@ -1395,229 +1169,15 @@ svc_sock_update_bufs(struct svc_serv *se spin_lock_bh(&serv->sv_lock); list_for_each(le, &serv->sv_permsocks) { struct svc_sock *svsk = - list_entry(le, struct svc_sock, sk_list); - set_bit(SK_CHNGBUF, &svsk->sk_flags); + list_entry(le, struct svc_sock, sk_xprt.xpt_list); + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); } list_for_each(le, &serv->sv_tempsocks) { struct svc_sock *svsk = - list_entry(le, struct svc_sock, sk_list); - set_bit(SK_CHNGBUF, &svsk->sk_flags); - } - spin_unlock_bh(&serv->sv_lock); -} - -/* - * Receive the next request on any socket. This code is carefully - * organised not to touch any cachelines in the shared svc_serv - * structure, only cachelines in the local svc_pool. 
- */ -int -svc_recv(struct svc_rqst *rqstp, long timeout) -{ - struct svc_sock *svsk = NULL; - struct svc_serv *serv = rqstp->rq_server; - struct svc_pool *pool = rqstp->rq_pool; - int len, i; - int pages; - struct xdr_buf *arg; - DECLARE_WAITQUEUE(wait, current); - - dprintk("svc: server %p waiting for data (to = %ld)\n", - rqstp, timeout); - - if (rqstp->rq_sock) - printk(KERN_ERR - "svc_recv: service %p, socket not NULL!\n", - rqstp); - if (waitqueue_active(&rqstp->rq_wait)) - printk(KERN_ERR - "svc_recv: service %p, wait queue active!\n", - rqstp); - - - /* now allocate needed pages. If we get a failure, sleep briefly */ - pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE; - for (i=0; i < pages ; i++) - while (rqstp->rq_pages[i] == NULL) { - struct page *p = alloc_page(GFP_KERNEL); - if (!p) - schedule_timeout_uninterruptible(msecs_to_jiffies(500)); - rqstp->rq_pages[i] = p; - } - rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */ - BUG_ON(pages >= RPCSVC_MAXPAGES); - - /* Make arg->head point to first page and arg->pages point to rest */ - arg = &rqstp->rq_arg; - arg->head[0].iov_base = page_address(rqstp->rq_pages[0]); - arg->head[0].iov_len = PAGE_SIZE; - arg->pages = rqstp->rq_pages + 1; - arg->page_base = 0; - /* save at least one page for response */ - arg->page_len = (pages-2)*PAGE_SIZE; - arg->len = (pages-1)*PAGE_SIZE; - arg->tail[0].iov_len = 0; - - try_to_freeze(); - cond_resched(); - if (signalled()) - return -EINTR; - - spin_lock_bh(&pool->sp_lock); - if ((svsk = svc_sock_dequeue(pool)) != NULL) { - rqstp->rq_sock = svsk; - atomic_inc(&svsk->sk_inuse); - rqstp->rq_reserved = serv->sv_max_mesg; - atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); - } else { - /* No data pending. Go to sleep */ - svc_thread_enqueue(pool, rqstp); - - /* - * We have to be able to interrupt this wait - * to bring down the daemons ... - */ - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&rqstp->rq_wait, &wait); - spin_unlock_bh(&pool->sp_lock); - - schedule_timeout(timeout); - - try_to_freeze(); - - spin_lock_bh(&pool->sp_lock); - remove_wait_queue(&rqstp->rq_wait, &wait); - - if (!(svsk = rqstp->rq_sock)) { - svc_thread_dequeue(pool, rqstp); - spin_unlock_bh(&pool->sp_lock); - dprintk("svc: server %p, no data yet\n", rqstp); - return signalled()? -EINTR : -EAGAIN; - } - } - spin_unlock_bh(&pool->sp_lock); - - dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n", - rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse)); - len = svsk->sk_recvfrom(rqstp); - dprintk("svc: got len=%d\n", len); - - /* No data, incomplete (TCP) read, or accept() */ - if (len == 0 || len == -EAGAIN) { - rqstp->rq_res.len = 0; - svc_sock_release(rqstp); - return -EAGAIN; - } - svsk->sk_lastrecv = get_seconds(); - clear_bit(SK_OLD, &svsk->sk_flags); - - rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp)); - rqstp->rq_chandle.defer = svc_defer; - - if (serv->sv_stats) - serv->sv_stats->netcnt++; - return len; -} - -/* - * Drop request - */ -void -svc_drop(struct svc_rqst *rqstp) -{ - dprintk("svc: socket %p dropped request\n", rqstp->rq_sock); - svc_sock_release(rqstp); -} - -/* - * Return reply to client. 
- */ -int -svc_send(struct svc_rqst *rqstp) -{ - struct svc_sock *svsk; - int len; - struct xdr_buf *xb; - - if ((svsk = rqstp->rq_sock) == NULL) { - printk(KERN_WARNING "NULL socket pointer in %s:%d\n", - __FILE__, __LINE__); - return -EFAULT; - } - - /* release the receive skb before sending the reply */ - svc_release_skb(rqstp); - - /* calculate over-all length */ - xb = & rqstp->rq_res; - xb->len = xb->head[0].iov_len + - xb->page_len + - xb->tail[0].iov_len; - - /* Grab svsk->sk_mutex to serialize outgoing data. */ - mutex_lock(&svsk->sk_mutex); - if (test_bit(SK_DEAD, &svsk->sk_flags)) - len = -ENOTCONN; - else - len = svsk->sk_sendto(rqstp); - mutex_unlock(&svsk->sk_mutex); - svc_sock_release(rqstp); - - if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) - return 0; - return len; -} - -/* - * Timer function to close old temporary sockets, using - * a mark-and-sweep algorithm. - */ -static void -svc_age_temp_sockets(unsigned long closure) -{ - struct svc_serv *serv = (struct svc_serv *)closure; - struct svc_sock *svsk; - struct list_head *le, *next; - LIST_HEAD(to_be_aged); - - dprintk("svc_age_temp_sockets\n"); - - if (!spin_trylock_bh(&serv->sv_lock)) { - /* busy, try again 1 sec later */ - dprintk("svc_age_temp_sockets: busy\n"); - mod_timer(&serv->sv_temptimer, jiffies + HZ); - return; - } - - list_for_each_safe(le, next, &serv->sv_tempsocks) { - svsk = list_entry(le, struct svc_sock, sk_list); - - if (!test_and_set_bit(SK_OLD, &svsk->sk_flags)) - continue; - if (atomic_read(&svsk->sk_inuse) > 1 || test_bit(SK_BUSY, &svsk->sk_flags)) - continue; - atomic_inc(&svsk->sk_inuse); - list_move(le, &to_be_aged); - set_bit(SK_CLOSE, &svsk->sk_flags); - set_bit(SK_DETACHED, &svsk->sk_flags); + list_entry(le, struct svc_sock, sk_xprt.xpt_list); + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); } spin_unlock_bh(&serv->sv_lock); - - while (!list_empty(&to_be_aged)) { - le = to_be_aged.next; - /* fiddling the sk_list node is safe 'cos we're SK_DETACHED */ - list_del_init(le); - svsk = list_entry(le, struct svc_sock, sk_list); - - dprintk("queuing svsk %p for closing, %lu seconds old\n", - svsk, get_seconds() - svsk->sk_lastrecv); - - /* a thread will dequeue and close it soon */ - svc_sock_enqueue(svsk); - svc_sock_put(svsk); - } - - mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); } /* @@ -1631,7 +1191,6 @@ static struct svc_sock *svc_setup_socket struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); - int is_temporary = flags & SVC_SOCK_TEMPORARY; dprintk("svc: svc_setup_socket %p\n", sock); if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { @@ -1651,44 +1210,19 @@ static struct svc_sock *svc_setup_socket return NULL; } - set_bit(SK_BUSY, &svsk->sk_flags); + set_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags); inet->sk_user_data = svsk; svsk->sk_sock = sock; svsk->sk_sk = inet; svsk->sk_ostate = inet->sk_state_change; svsk->sk_odata = inet->sk_data_ready; svsk->sk_owspace = inet->sk_write_space; - svsk->sk_server = serv; - atomic_set(&svsk->sk_inuse, 1); - svsk->sk_lastrecv = get_seconds(); - spin_lock_init(&svsk->sk_lock); - INIT_LIST_HEAD(&svsk->sk_deferred); - INIT_LIST_HEAD(&svsk->sk_ready); - mutex_init(&svsk->sk_mutex); /* Initialize the socket */ if (sock->type == SOCK_DGRAM) - svc_udp_init(svsk); + svc_udp_init(svsk, serv); else - svc_tcp_init(svsk); - - spin_lock_bh(&serv->sv_lock); - if (is_temporary) { - set_bit(SK_TEMP, &svsk->sk_flags); - list_add(&svsk->sk_list, &serv->sv_tempsocks); - serv->sv_tmpcnt++; - if 
(serv->sv_temptimer.function == NULL) { - /* setup timer to age temp sockets */ - setup_timer(&serv->sv_temptimer, svc_age_temp_sockets, - (unsigned long)serv); - mod_timer(&serv->sv_temptimer, - jiffies + svc_conn_age_period * HZ); - } - } else { - clear_bit(SK_TEMP, &svsk->sk_flags); - list_add(&svsk->sk_list, &serv->sv_permsocks); - } - spin_unlock_bh(&serv->sv_lock); + svc_tcp_init(svsk, serv); dprintk("svc: svc_setup_socket created %p (inet %p)\n", svsk, svsk->sk_sk); @@ -1717,9 +1251,15 @@ int svc_addsock(struct svc_serv *serv, else { svsk = svc_setup_socket(serv, so, &err, SVC_SOCK_DEFAULTS); if (svsk) { - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); err = 0; } + if (so->sk->sk_protocol == IPPROTO_TCP) + set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); + clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags); + spin_lock_bh(&serv->sv_lock); + list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks); + spin_unlock_bh(&serv->sv_lock); } if (err) { sockfd_put(so); @@ -1733,8 +1273,9 @@ EXPORT_SYMBOL_GPL(svc_addsock); /* * Create socket for RPC service. */ -static int svc_create_socket(struct svc_serv *serv, int protocol, - struct sockaddr *sin, int len, int flags) +static struct svc_xprt * +svc_create_socket(struct svc_serv *serv, int protocol, + struct sockaddr *sin, int len, int flags) { struct svc_sock *svsk; struct socket *sock; @@ -1749,13 +1290,13 @@ static int svc_create_socket(struct svc_ if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) { printk(KERN_WARNING "svc: only UDP and TCP " "sockets supported\n"); - return -EINVAL; + return ERR_PTR(-EINVAL); } type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; error = sock_create_kern(sin->sa_family, type, protocol, &sock); if (error < 0) - return error; + return ERR_PTR(error); svc_reclassify_socket(sock); @@ -1771,197 +1312,48 @@ static int svc_create_socket(struct svc_ } if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) { - svc_sock_received(svsk); - return ntohs(inet_sk(svsk->sk_sk)->sport); + if (protocol == IPPROTO_TCP) + set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); + svc_xprt_received(&svsk->sk_xprt); + return (struct svc_xprt *)svsk; } bummer: dprintk("svc: svc_create_socket error = %d\n", -error); sock_release(sock); - return error; + return ERR_PTR(error); } /* - * Remove a dead socket + * Detach the svc_sock from the socket so that no + * more callbacks occur. */ static void -svc_delete_socket(struct svc_sock *svsk) +svc_sock_detach(struct svc_xprt *xprt) { - struct svc_serv *serv; - struct sock *sk; + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct sock *sk = svsk->sk_sk; - dprintk("svc: svc_delete_socket(%p)\n", svsk); - - serv = svsk->sk_server; - sk = svsk->sk_sk; + dprintk("svc: svc_sock_detach(%p)\n", svsk); + /* put back the old socket callbacks */ sk->sk_state_change = svsk->sk_ostate; sk->sk_data_ready = svsk->sk_odata; sk->sk_write_space = svsk->sk_owspace; - - spin_lock_bh(&serv->sv_lock); - - if (!test_and_set_bit(SK_DETACHED, &svsk->sk_flags)) - list_del_init(&svsk->sk_list); - /* - * We used to delete the svc_sock from whichever list - * it's sk_ready node was on, but we don't actually - * need to. This is because the only time we're called - * while still attached to a queue, the queue itself - * is about to be destroyed (in svc_destroy). 
- */ - if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) { - BUG_ON(atomic_read(&svsk->sk_inuse)<2); - atomic_dec(&svsk->sk_inuse); - if (test_bit(SK_TEMP, &svsk->sk_flags)) - serv->sv_tmpcnt--; - } - - spin_unlock_bh(&serv->sv_lock); -} - -static void svc_close_socket(struct svc_sock *svsk) -{ - set_bit(SK_CLOSE, &svsk->sk_flags); - if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) - /* someone else will have to effect the close */ - return; - - atomic_inc(&svsk->sk_inuse); - svc_delete_socket(svsk); - clear_bit(SK_BUSY, &svsk->sk_flags); - svc_sock_put(svsk); -} - -void svc_force_close_socket(struct svc_sock *svsk) -{ - set_bit(SK_CLOSE, &svsk->sk_flags); - if (test_bit(SK_BUSY, &svsk->sk_flags)) { - /* Waiting to be processed, but no threads left, - * So just remove it from the waiting list - */ - list_del_init(&svsk->sk_ready); - clear_bit(SK_BUSY, &svsk->sk_flags); - } - svc_close_socket(svsk); -} - -/** - * svc_makesock - Make a socket for nfsd and lockd - * @serv: RPC server structure - * @protocol: transport protocol to use - * @port: port to use - * @flags: requested socket characteristics - * - */ -int svc_makesock(struct svc_serv *serv, int protocol, unsigned short port, - int flags) -{ - struct sockaddr_in sin = { - .sin_family = AF_INET, - .sin_addr.s_addr = INADDR_ANY, - .sin_port = htons(port), - }; - - dprintk("svc: creating socket proto = %d\n", protocol); - return svc_create_socket(serv, protocol, (struct sockaddr *) &sin, - sizeof(sin), flags); -} - -/* - * Handle defer and revisit of requests - */ - -static void svc_revisit(struct cache_deferred_req *dreq, int too_many) -{ - struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle); - struct svc_sock *svsk; - - if (too_many) { - svc_sock_put(dr->svsk); - kfree(dr); - return; - } - dprintk("revisit queued\n"); - svsk = dr->svsk; - dr->svsk = NULL; - spin_lock(&svsk->sk_lock); - list_add(&dr->handle.recent, &svsk->sk_deferred); - spin_unlock(&svsk->sk_lock); - set_bit(SK_DEFERRED, &svsk->sk_flags); - svc_sock_enqueue(svsk); - svc_sock_put(svsk); -} - -static struct cache_deferred_req * -svc_defer(struct cache_req *req) -{ - struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); - int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len); - struct svc_deferred_req *dr; - - if (rqstp->rq_arg.page_len) - return NULL; /* if more than a page, give up FIXME */ - if (rqstp->rq_deferred) { - dr = rqstp->rq_deferred; - rqstp->rq_deferred = NULL; - } else { - int skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; - /* FIXME maybe discard if size too large */ - dr = kmalloc(size, GFP_KERNEL); - if (dr == NULL) - return NULL; - - dr->handle.owner = rqstp->rq_server; - dr->prot = rqstp->rq_prot; - memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen); - dr->addrlen = rqstp->rq_addrlen; - dr->daddr = rqstp->rq_daddr; - dr->argslen = rqstp->rq_arg.len >> 2; - memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2); - } - atomic_inc(&rqstp->rq_sock->sk_inuse); - dr->svsk = rqstp->rq_sock; - - dr->handle.revisit = svc_revisit; - return &dr->handle; } /* - * recv data from a deferred request into an active one + * Free the svc_sock's socket resources and the svc_sock itself. 
*/ -static int svc_deferred_recv(struct svc_rqst *rqstp) -{ - struct svc_deferred_req *dr = rqstp->rq_deferred; - - rqstp->rq_arg.head[0].iov_base = dr->args; - rqstp->rq_arg.head[0].iov_len = dr->argslen<<2; - rqstp->rq_arg.page_len = 0; - rqstp->rq_arg.len = dr->argslen<<2; - rqstp->rq_prot = dr->prot; - memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen); - rqstp->rq_addrlen = dr->addrlen; - rqstp->rq_daddr = dr->daddr; - rqstp->rq_respages = rqstp->rq_pages; - return dr->argslen<<2; -} - - -static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk) +static void +svc_sock_free(struct svc_xprt *xprt) { - struct svc_deferred_req *dr = NULL; + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + dprintk("svc: svc_sock_free(%p)\n", svsk); - if (!test_bit(SK_DEFERRED, &svsk->sk_flags)) - return NULL; - spin_lock(&svsk->sk_lock); - clear_bit(SK_DEFERRED, &svsk->sk_flags); - if (!list_empty(&svsk->sk_deferred)) { - dr = list_entry(svsk->sk_deferred.next, - struct svc_deferred_req, - handle.recent); - list_del_init(&dr->handle.recent); - set_bit(SK_DEFERRED, &svsk->sk_flags); - } - spin_unlock(&svsk->sk_lock); - return dr; + if (svsk->sk_sock->file) + sockfd_put(svsk->sk_sock); + else + sock_release(svsk->sk_sock); + kfree(svsk); } diff -puN net/sunrpc/sysctl.c~git-nfsd net/sunrpc/sysctl.c --- a/net/sunrpc/sysctl.c~git-nfsd +++ a/net/sunrpc/sysctl.c @@ -18,6 +18,7 @@ #include #include #include +#include /* * Declare the debug flags here @@ -27,6 +28,8 @@ unsigned int nfs_debug; unsigned int nfsd_debug; unsigned int nlm_debug; +char xprt_buf[128]; + #ifdef RPC_DEBUG static struct ctl_table_header *sunrpc_table_header; @@ -48,6 +51,32 @@ rpc_unregister_sysctl(void) } } +static int proc_do_xprt(ctl_table *table, int write, struct file *file, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char tmpbuf[sizeof(xprt_buf)]; + int len; + if ((*ppos && !write) || !*lenp) { + *lenp = 0; + return 0; + } + if (write) + return -EINVAL; + else { + + len = svc_print_xprts(tmpbuf, sizeof(tmpbuf)); + if (!access_ok(VERIFY_WRITE, buffer, len)) + return -EFAULT; + + if (__copy_to_user(buffer, tmpbuf, len)) + return -EFAULT; + } + + *lenp -= len; + *ppos += len; + return 0; +} + static int proc_dodebug(ctl_table *table, int write, struct file *file, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -145,6 +174,14 @@ static ctl_table debug_table[] = { .mode = 0644, .proc_handler = &proc_dodebug }, + { + .ctl_name = CTL_TRANSPORTS, + .procname = "transports", + .data = xprt_buf, + .maxlen = sizeof(xprt_buf), + .mode = 0444, + .proc_handler = &proc_do_xprt, + }, { .ctl_name = 0 } }; _
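
As an illustration of the container_of() idiom the new xpo_* callbacks rely on throughout this patch (e.g. container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt)), here is a minimal, self-contained userspace sketch. The structure names are simplified stand-ins for svc_xprt/svc_sock, not the kernel definitions.

/*
 * Recover the transport-specific container from a pointer to the
 * generic transport object embedded inside it.
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct xprt {			/* stand-in for struct svc_xprt */
	unsigned long flags;
};

struct sock_xprt {		/* stand-in for struct svc_sock */
	int fd;
	struct xprt base;	/* embedded generic transport */
};

static void handle(struct xprt *x)
{
	/* generic code passes only &base; recover the whole object */
	struct sock_xprt *sx = container_of(x, struct sock_xprt, base);
	printf("fd = %d\n", sx->fd);
}

int main(void)
{
	struct sock_xprt s = { .fd = 42 };
	handle(&s.base);	/* callbacks see only the generic part */
	return 0;
}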
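
The new xpo_has_wspace callbacks gate dispatch on available send space: the transport claims space only when the socket's free write space is more than twice the outstanding reservation plus one maximum-sized message. A minimal sketch of that arithmetic, with plain integers standing in for xpt_reserved and sock_wspace()/sk_stream_wspace():

#include <stdio.h>

/* mirror "required = reserved + max_mesg; if (required*2 > wspace) ..." */
static int has_wspace(int reserved, int max_mesg, int wspace)
{
	int required = reserved + max_mesg;

	if (required * 2 > wspace)
		return 0;	/* not enough room, keep SOCK_NOSPACE set */
	return 1;
}

int main(void)
{
	/* 32 KiB already reserved, 32 KiB max reply, 96 KiB free -> no room */
	printf("%d\n", has_wspace(32 << 10, 32 << 10, 96 << 10));
	/* same reservation with 160 KiB free -> room available */
	printf("%d\n", has_wspace(32 << 10, 32 << 10, 160 << 10));
	return 0;
}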
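
The svc_xprt_class/svc_xprt_ops tables registered by svc_init_xprt_sock() are what svc_print_xprts() later walks for /proc/sys/sunrpc/transports. A rough, self-contained sketch of that registration pattern; the names here (xprt_class, register_class, print_classes) are invented for the example and are not the sunrpc API.

#include <stdio.h>

struct xprt_ops {
	int (*has_wspace)(void *xprt);
};

struct xprt_class {
	const char *name;
	size_t max_payload;
	const struct xprt_ops *ops;
	struct xprt_class *next;
};

static struct xprt_class *classes;	/* singly linked registry */

static void register_class(struct xprt_class *xcl)
{
	xcl->next = classes;
	classes = xcl;
}

/* roughly the shape of a "transports" dump: one "name max_payload" per line */
static int print_classes(char *buf, size_t len)
{
	size_t used = 0;
	for (struct xprt_class *c = classes; c && used < len; c = c->next)
		used += snprintf(buf + used, len - used, "%s %zu\n",
				 c->name, c->max_payload);
	return (int)used;
}

static int always_has_space(void *xprt) { (void)xprt; return 1; }

static const struct xprt_ops tcp_ops = { .has_wspace = always_has_space };
static struct xprt_class tcp_class = {
	.name = "tcp", .max_payload = 1 << 20, .ops = &tcp_ops,
};

int main(void)
{
	char buf[128];

	register_class(&tcp_class);
	print_classes(buf, sizeof(buf));
	fputs(buf, stdout);
	return 0;
}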