GIT bc387b1ceea4ba36bf354926013c709bf1dd6d12 git://git.linux-nfs.org/~bfields/linux.git#for-mm commit 2ad4189a26e9cc72408c1f3c572ce675c091993d Author: J. Bruce Fields Date: Tue Oct 2 14:18:12 2007 -0400 nfsd: remove IS_ISMNDLCK macro This macro is only used in one place; in this place it seems simpler to put open-code it and move the comment to where it's used. Signed-off-by: J. Bruce Fields commit 6602cca355722274209c5a7ff810be72b658f58a Author: Pavel Emelyanov Date: Mon Oct 1 14:41:15 2007 -0700 Rework /proc/locks via seq_files and seq_list helpers Currently /proc/locks is shown with a proc_read function, but its behavior is rather complex as it has to manually handle current offset and buffer length. On the other hand, files that show objects from lists can be easily reimplemented using the sequential files and the seq_list_XXX() helpers. This saves (as usually) 16 lines of code and more than 200 from the .text section. [akpm@linux-foundation.org: no externs in C] [akpm@linux-foundation.org: warning fixes] Signed-off-by: Pavel Emelyanov Cc: "J. Bruce Fields" Cc: Trond Myklebust Signed-off-by: Andrew Morton commit 5c8912b1dd2664048d4f69272d3d15bd17f1a138 Author: Matthias Kaehlcke Date: Tue Oct 2 11:21:34 2007 -0700 fs/locks.c: use list_for_each_entry() instead of list_for_each() fs/locks.c: use list_for_each_entry() instead of list_for_each() in posix_locks_deadlock() and get_locks_status() Signed-off-by: Matthias Kaehlcke Signed-off-by: Andrew Morton commit 686734b7089212057ac99ccfd1d876a029da212d Author: Pavel Emelyanov Date: Mon Oct 1 14:41:15 2007 -0700 NFS: clean up explicit check for mandatory locks The __mandatory_lock(inode) macro makes the same check, but makes the code more readable. Signed-off-by: Pavel Emelyanov Cc: Trond Myklebust Cc: "J. Bruce Fields" Signed-off-by: Andrew Morton commit 5d41de6f3bc297205a8c8cebd7d49335f873b987 Author: Pavel Emelyanov Date: Mon Oct 1 14:41:14 2007 -0700 AFS: clean up explicit check for mandatory locks The __mandatory_lock(inode) macro makes the same check, but makes the code more readable. Signed-off-by: Pavel Emelyanov Cc: David Howells Signed-off-by: Andrew Morton commit a68e3c30614a233fc07dab2bda97e8e89ba6d0a2 Author: Pavel Emelyanov Date: Mon Oct 1 14:41:13 2007 -0700 9PFS: clean up explicit check for mandatory locks The __mandatory_lock(inode) macro makes the same check, but makes the code more readable. Signed-off-by: Pavel Emelyanov Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Signed-off-by: Andrew Morton commit c1c45b13a2c12920a0440b769fa73789a652accf Author: Pavel Emelyanov Date: Mon Oct 1 14:41:13 2007 -0700 GFS2: clean up explicit check for mandatory locks The __mandatory_lock(inode) function makes the same check, but makes the code more readable. Signed-off-by: Pavel Emelyanov Cc: Steven Whitehouse Signed-off-by: Andrew Morton commit ce23fdc14a10684d2addd6a66a37e81dcf9de714 Author: Pavel Emelyanov Date: Mon Oct 1 14:41:11 2007 -0700 Cleanup macros for distinguishing mandatory locks The combination of S_ISGID bit set and S_IXGRP bit unset is used to mark the inode as "mandatory lockable" and there's a macro for this check called MANDATORY_LOCK(inode). However, fs/locks.c and some filesystems still perform the explicit i_mode checking. Besides, Andrew pointed out, that this macro is buggy itself, as it dereferences the inode arg twice. Convert this macro into static inline function and switch its users to it, making the code shorter and more readable. 
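For reference, the conversion amounts to replacing the macro with two static inline helpers along these lines (a sketch of the intended shape; the authoritative definitions are in the include/linux/fs.h hunk of this patch):

	/* sketch: the inode is "mandatory lockable" when S_ISGID is set and
	 * S_IXGRP is clear; unlike the old macro, the argument is evaluated
	 * only once */
	static inline int __mandatory_lock(struct inode *ino)
	{
		return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID;
	}

	/* sketch: the full check, including the per-mount IS_MANDLOCK() test */
	static inline int mandatory_lock(struct inode *ino)
	{
		return IS_MANDLOCK(ino) && __mandatory_lock(ino);
	}
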
The __mandatory_lock() helper is to be used in places where the IS_MANDLOCK() for superblock is already known to be true. Signed-off-by: Pavel Emelyanov Cc: Trond Myklebust Cc: "J. Bruce Fields" Cc: David Howells Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Cc: Steven Whitehouse Signed-off-by: Andrew Morton commit 3aaddf3550e4a67b022aff6550ea36e20b957f68 Author: Tom Tucker Date: Mon Oct 1 14:28:48 2007 -0500 knfsd: Support adding transports by writing portlist file Update the write handler for the portlist file to allow creating new listening endpoints on a transport. The general form of the string is: For example: tcp 2049 This is intended to support the creation of a listening endpoint for RDMA transports without adding #ifdef code to the nfssvc.c file. Signed-off-by: Tom Tucker commit 555996a64bbdf999db4d54e5c5250596011c96f5 Author: Tom Tucker Date: Mon Oct 1 14:28:45 2007 -0500 svc: Add /proc/sys/sunrpc/transport files Add a file that when read lists the set of registered svc transports. Signed-off-by: Tom Tucker commit d11ce39df9079fca3182939d03808dbc143af14b Author: Tom Tucker Date: Mon Oct 1 14:28:43 2007 -0500 svc: Add transport hdr size for defer/revisit Some transports have a header in front of the RPC header. The current defer/revisit processing considers only the iov_len and arg_len to determine how much to back up when saving the original request to revisit. Add a field to the rqstp structure to save the size of the transport header so svc_defer can correctly compute the start of a request. Signed-off-by: Tom Tucker commit 42d177f40901b9606463455d97824af3d8bd456a Author: Tom Tucker Date: Mon Oct 1 14:28:41 2007 -0500 svc: Move the xprt independent code to the svc_xprt.c file This functionally trivial patch moves all of the transport independent functions from the svcsock.c file to the transport independent svc_xprt.c file. Signed-off-by: Tom Tucker commit 7e3665dc18a8ef31f72c9231f53999e6eee618c0 Author: Tom Tucker Date: Mon Oct 1 14:28:39 2007 -0500 svc: Make svc_check_conn_limits xprt independent The svc_check_conn_limits function only manipulates xprt fields. Change references to svc_sock->sk_xprt to svc_xprt directly. Signed-off-by: Tom Tucker commit 44f096b047453728e4bbdd44cdf2281d1cdbfdd6 Author: Tom Tucker Date: Mon Oct 1 14:28:36 2007 -0500 svc: Removing remaining references to rq_sock in rqstp This functionally empty patch removes rq_sock and unamed union from rqstp structure. Signed-off-by: Tom Tucker commit dbd5bfe03471f2ae32430c4f483f0013711de4e9 Author: Tom Tucker Date: Mon Oct 1 14:28:34 2007 -0500 svc: Move common create logic to common code Move the code that adds a transport instance to the sv_tempsocks and sv_permsocks lists out of the transport specific functions and into core logic. The svc_addsock routine still manipulates sv_permsocks directly. This code may be removed when rpc.nfsd is modified to create transports by writing to the portlist file. Signed-off-by: Tom Tucker commit 2261d3f2f175b926c90f41527ddd78e483f18cf8 Author: Tom Tucker Date: Mon Oct 1 14:28:32 2007 -0500 svc: Make svc_age_temp_sockets svc_age_temp_transports This function is transport independent. Change it to use svc_xprt directly and change it's name to reflect this. Signed-off-by: Tom Tucker commit bf20a61bd5d4ae3d210e576e4113b410bf40e9f9 Author: Tom Tucker Date: Mon Oct 1 14:28:30 2007 -0500 svc: Make svc_recv transport neutral All of the transport field and functions used by svc_recv are now transport independent. 
Change the svc_recv function to use the svc_xprt structure directly instead of the transport specific svc_sock structure. Signed-off-by: Tom Tucker commit c5e9631b9fafa150c8e6154c1929e588e2f6b326 Author: Tom Tucker Date: Mon Oct 1 14:28:28 2007 -0500 svc: Make svc_sock_release svc_xprt_release The svc_sock_release function only touches transport independent fields. Change the function to manipulate svc_xprt directly instead of the transport dependent svc_sock structure. Signed-off-by: Tom Tucker commit fa63ddc2e9890da93ebe158f212a4182a94c9d71 Author: Tom Tucker Date: Mon Oct 1 14:28:25 2007 -0500 svc: Move the sockaddr information to svc_xprt Move the IP address fields to the svc_xprt structure. Note that this assumes that _all_ RPC transports must have IP based 4-tuples. This seems reasonable given the tight coupling with the portmapper etc... Thoughts? Signed-off-by: Tom Tucker commit 2def756732a6db1a2228bad967dfa95867621bf4 Author: Tom Tucker Date: Mon Oct 1 14:28:23 2007 -0500 svc: Make deferral processing xprt independent This functionally trivial patch moves the transport independent sk_deferred list to the svc_xprt structure and updates the svc_deferred_req structure to keep pointers to svc_xprt's directly. Signed-off-by: Tom Tucker commit db10883a4f6b292bc65dcc018c6b15c28293aa88 Author: Tom Tucker Date: Mon Oct 1 14:28:21 2007 -0500 svc: Move the authinfo cache to svc_xprt. Move the authinfo cache to svc_xprt. This allows both the TCP and RDMA transports to share this logic. A flag bit is used to determine if auth information is to be cached or not. Previously, this code looked at the transport protocol. I've also changed the spin_lock/unlock logic so that a lock is not taken for transports that are not caching auth info. Signed-off-by: Tom Tucker commit d522bbb2b5a93afcd2a8a2d757439031561e10c8 Author: Tom Tucker Date: Mon Oct 1 14:28:19 2007 -0500 svc: Remove sk_lastrecv With the implementation of the new mark and sweep algorithm for shutting down old connections, the sk_lastrecv field is no longer needed. Signed-off-by: Tom Tucker commit 4047f2427de886d86a1dcd74cbbbaf81557cee1e Author: Tom Tucker Date: Mon Oct 1 14:28:17 2007 -0500 svc: Change svc_sock_received to svc_xprt_received and export it All fields touched by svc_sock_received are now transport independent. Change it to use svc_xprt directly. This function is called from transport dependent code, so export it. Signed-off-by: Tom Tucker commit fa69c92bd83b2d70acd0875386f5e154d836225a Author: Tom Tucker Date: Mon Oct 1 14:28:14 2007 -0500 svc: Make svc_send transport neutral Move the sk_mutex field to the transport independent svc_xprt structure. Now all the fields that svc_send touches are transport neutral. Change the svc_send function to use the transport independent svc_xprt directly instead of the transport dependent svc_sock structure. Signed-off-by: Tom Tucker commit 381607c8f63f6519a4dd2d918f68814d6f4d95de Author: Tom Tucker Date: Mon Oct 1 14:28:12 2007 -0500 svc: Make the enqueue service transport neutral and export it. The svc_sock_enqueue function is now transport independent since all of the fields it touches have been moved to the transport independent svc_xprt structure. Change the function to use the svc_xprt structure directly instead of the transport specific svc_sock structure. Transport specific data-ready handlers need to call this function, so export it. 
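As an illustration, a socket data-ready callback ends up looking roughly like this once the enqueue routine is exported (a sketch only; the flag and field names follow the rest of this series and are not taken verbatim from the patch):

	static void svc_udp_data_ready(struct sock *sk, int count)
	{
		struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;

		if (svsk) {
			/* note that data arrived and hand the transport to a thread */
			set_bit(SK_DATA, &svsk->sk_xprt.xpt_flags);
			svc_xprt_enqueue(&svsk->sk_xprt);
		}
		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
			wake_up_interruptible(sk->sk_sleep);
	}
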
Signed-off-by: Tom Tucker commit 7632637565fbaacbabf0e3b83a6994296302d784 Author: Tom Tucker Date: Mon Oct 1 14:28:10 2007 -0500 svc: Move sk_reserved to svc_xprt This functionally trivial patch moves the sk_reserved field to the transport independent svc_xprt structure. Signed-off-by: Tom Tucker commit 8da329ec798cb724ad42262fa9a7fe398b51928f Author: Tom Tucker Date: Mon Oct 1 14:28:08 2007 -0500 svc: Make close transport independent Move sk_list and sk_ready to svc_xprt. This involves close because these lists are walked by svcs when closing all their transports. So I combined the moving of these lists to svc_xprt with making close transport independent. The svc_force_sock_close has been changed to svc_close_all and takes a list as an argument. This removes some svc internals knowledge from the svcs. This code races with module removal and transport addition. Signed-off-by: Tom Tucker commit fe263c5d21ccae28cc6457dbca813db34d0d1a01 Author: Tom Tucker Date: Mon Oct 1 14:28:05 2007 -0500 svc: Move sk_server and sk_pool to svc_xprt This is another incremental change that moves transport independent fields from svc_sock to the svc_xprt structure. The changes should be functionally null. Signed-off-by: Tom Tucker commit 8cea193afd7b8755af9b9ecee6b108cb94722e65 Author: Tom Tucker Date: Mon Oct 1 14:28:03 2007 -0500 svc: Move sk_flags to the svc_xprt structure This functionally trivial change moves the transport independent sk_flags field to the transport independent svc_xprt structure. Signed-off-by: Tom Tucker commit e992be5aef124a79d05b9bab3c545e12c0e1e037 Author: Tom Tucker Date: Mon Oct 1 14:28:01 2007 -0500 svc: Change sk_inuse to a kref Change the atomic_t reference count to a kref and move it to the transport independent svc_xprt structure. Change the reference count wrapper names to be generic. Signed-off-by: Tom Tucker commit 489bd5f81802d4baa85ae07a6c850165e209916a Author: Tom Tucker Date: Mon Oct 1 14:27:59 2007 -0500 svc: Change services to use new svc_create_xprt service Modify the various kernel RPC svcs to use the svc_create_xprt service. Signed-off-by: Tom Tucker commit 7a6849539bdc4f6b3452db0b85b607ed1a09503b Author: Tom Tucker Date: Mon Oct 1 14:27:56 2007 -0500 svc: Add a generic transport svc_create_xprt function The svc_create_xprt function is a transport independent version of the svc_makesock function. Since transport instance creation contains transport dependent and independent components, add an xpo_create transport function. The transport implementation of this function allocates the memory for the endpoint, implements the transport dependent initialization logic, and calls svc_xprt_init to initialize the transport independent field (svc_xprt) in its data structure. Signed-off-by: Tom Tucker commit 79a182a4124933dd502658372e3d38b6fc461ebf Author: Tom Tucker Date: Mon Oct 1 14:27:54 2007 -0500 svc: Add xpo_accept transport function Previously, the accept logic looked into the socket state to determine whether to call accept or recv when data-ready was indicated on an endpoint. Since some transports don't use sockets, this logic was changed to use a flag bit (SK_LISTENER) to identify listening endpoints. A transport function (xpo_accept) was added to allow each transport to define its own accept processing. A transport's initialization logic is responsible for setting the SK_LISTENER bit. I didn't see any way to do this in transport independent logic since the passive side of a UDP connection doesn't listen and always recv's.
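Sketched very roughly, the resulting dispatch looks something like the helper below (only xpo_accept, SK_LISTENER and svc_check_conn_limits are names fixed by this series; the other members and functions are illustrative, and error handling is omitted):

	/* sketch: per-endpoint dispatch inside svc_recv */
	static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
	{
		int len = 0;

		if (test_bit(SK_CLOSE, &xprt->xpt_flags)) {
			svc_delete_xprt(xprt);		/* endpoint being torn down */
		} else if (test_bit(SK_LISTENER, &xprt->xpt_flags)) {
			/* listening endpoint: let the transport accept the connection */
			struct svc_xprt *newxpt = xprt->xpt_ops->xpo_accept(xprt);
			if (newxpt)
				svc_check_conn_limits(rqstp->rq_server);
		} else {
			/* established (or UDP) endpoint: pull in a request */
			len = xprt->xpt_ops->xpo_recvfrom(rqstp);
		}
		return len;
	}
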
In the svc_recv function, if the SK_LISTENER bit is set, the transport xpo_accept function is called to handle accept processing. Note that all functions are defined even if they don't make sense for a given transport. For example, accept doesn't mean anything for UDP. The function is defined anyway and bug checks if called. The UDP transport should never set the SK_LISTENER bit. The code that poaches connections when the connection limit is hit was moved to a subroutine to make the accept logic path easier to follow. Since this is in the new connection path, it should not be a performance issue. Signed-off-by: Tom Tucker commit 1291d9c149717f3ddc86df6a609ff34fd9e43ce2 Author: Tom Tucker Date: Mon Oct 1 14:27:51 2007 -0500 svc: Move close processing to a single place Close handling was duplicated in the UDP and TCP recvfrom methods. This code has been moved to the transport independent svc_recv function. Signed-off-by: Tom Tucker commit 097ad803f8054c752db831f5ba1a9c8858606e65 Author: Tom Tucker Date: Mon Oct 1 14:27:49 2007 -0500 svc: Add a transport function that checks for write space In order to avoid blocking a service thread, the receive side checks to see if there is sufficient write space to reply to the request. Each transport has a different mechanism for determining if there is enough write space to reply. The code that checked for write space was coupled with code that checked for CLOSE and CONN. These checks have been broken out into separate statements to make the code easier to read. Signed-off-by: Tom Tucker commit 797c4d38ebd67e1a79c31c6390e0b7e8b0efcaa7 Author: Tom Tucker Date: Mon Oct 1 14:27:47 2007 -0500 svc: Add xpo_prep_reply_hdr Some transports add fields to the RPC header for replies, e.g. the TCP record length. This function is called when preparing the reply header to allow each transport to add whatever fields it requires. Signed-off-by: Tom Tucker commit fd4635aed0fb08c7d6c75fdde729e3bc676908aa Author: Tom Tucker Date: Mon Oct 1 14:27:45 2007 -0500 svc: Add per-transport delete functions Add transport specific xpo_detach and xpo_free functions. The xpo_detach function causes the transport to stop delivering data-ready events and enqueuing the transport for I/O. The xpo_free function frees all resources associated with the particular transport instance. Signed-off-by: Tom Tucker commit 82fd56366d3dc2f5543f385a40625cf7db1e769b Author: Tom Tucker Date: Mon Oct 1 14:27:42 2007 -0500 svc: Add transport specific xpo_release function The svc_sock_release function releases pages allocated to a thread. For UDP, this also returns the receive skb to the stack. For RDMA it will post a receive WR and bump the client credit count. Signed-off-by: Tom Tucker commit c854768f825eaab3c9fccacec7c513d5f019b3fc Author: Tom Tucker Date: Mon Oct 1 14:27:40 2007 -0500 svc: Move sk_sendto and sk_recvfrom to svc_xprt_class The sk_sendto and sk_recvfrom are function pointers that allow svc_sock to be used for both UDP and TCP. Move these function pointers to the svc_xprt_ops structure. Signed-off-by: Tom Tucker commit 85eba208284186cab9d3aa79df89bc256b3d7382 Author: Tom Tucker Date: Mon Oct 1 14:27:38 2007 -0500 svc: Add a max payload value to the transport The svc_max_payload function currently looks at the socket type to determine the max payload. Add a max payload value to svc_xprt_class so it can be returned directly.
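With that field in place, svc_max_payload can return the class value directly, roughly as below (a sketch; xcl_max_payload and xpt_class are the names assumed here for the new per-class field and the endpoint's back-pointer to its class):

	u32 svc_max_payload(const struct svc_rqst *rqstp)
	{
		u32 max = rqstp->rq_xprt->xpt_class->xcl_max_payload;

		/* the service itself may impose a smaller limit */
		if (rqstp->rq_server->sv_max_payload < max)
			max = rqstp->rq_server->sv_max_payload;
		return max;
	}
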
Signed-off-by: Tom Tucker commit a6b20ecc631dbb81748a4d41119b7de430037116 Author: Tom Tucker Date: Mon Oct 1 14:27:35 2007 -0500 svc: Change the svc_sock in the rqstp structure to a transport The rqstp structure contains a pointer to the transport for the RPC request. This functionally trivial patch adds an unnamed union with pointers to both svc_sock and svc_xprt. Ultimately the union will be removed and only the rq_xprt field will remain. This allows incrementally extracting transport independent interfaces without one gigundo patch. Signed-off-by: Tom Tucker commit e3f4b6b3f66577096804931f9463ab2858acc83d Author: Tom Tucker Date: Mon Oct 1 14:27:33 2007 -0500 svc: Make svc_sock the tcp/udp transport Make TCP and UDP svc_sock transports, and register them with the svc transport core. A transport type (svc_sock) has an svc_xprt as its first member, and calls svc_xprt_init to initialize this field. Signed-off-by: Tom Tucker commit 8ac94da36334e5e63f8f4e8b23c281f421ada0f1 Author: Tom Tucker Date: Mon Oct 1 14:27:31 2007 -0500 svc: Add an svc transport class The transport class (svc_xprt_class) represents a type of transport, e.g. udp, tcp, rdma. A transport class has a unique name and a set of transport operations kept in the svc_xprt_ops structure. A transport class can be dynamically registered and unregistered. The svc_xprt_class represents the module that implements the transport type and keeps reference counts on the module to avoid unloading while there are active users. The endpoint (svc_xprt) is a generic, transport independent endpoint that can be used to send and receive data for an RPC service. It inherits its operations from the transport class. A transport driver module registers and unregisters itself with svc sunrpc by calling svc_reg_xprt_class, and svc_unreg_xprt_class respectively. Signed-off-by: Tom Tucker commit 4f0d70bc62d429e6215b5076aa118f07cb3b9730 Author: J. Bruce Fields Date: Sun Sep 30 22:18:55 2007 -0400 Documentation: move locks.txt in filesystems/ This documentation (about file locking) belongs in filesystems/. Signed-off-by: J. Bruce Fields commit 20bcf326fb7af86e6cd3747d7da32d68188dfe2c Author: J. Bruce Fields Date: Fri Sep 28 16:45:51 2007 -0400 knfsd: query filesystem for NFSv4 getattr of FATTR4_MAXNAME Without this we always return 2^32-1 as the maximum name length. Thanks to Andreas Gruenbacher for bug report and testing. Signed-off-by: J. Bruce Fields Cc: Andreas Gruenbacher commit 063875f0ad420d5e47ea8fe658ec1d775284bb91 Author: J. Bruce Fields Date: Wed Sep 12 20:35:15 2007 -0400 knfsd: nfsv4 delegation recall should take reference on client It's not enough to take a reference on the delegation object itself; we need to ensure that the rpc_client won't go away just as we're about to make an rpc call. Signed-off-by: J. Bruce Fields commit 8580d1cff1dd4396510ac13b827df5dfaffe7b2c Author: J. Bruce Fields Date: Wed Sep 12 08:43:59 2007 -0400 knfsd: don't shutdown callbacks until nfsv4 client is freed If a callback still holds a reference on the client, then it may be about to perform an rpc call, so it isn't safe to call rpc_shutdown(). (Though rpc_shutdown() does wait for any outstanding rpc's, it can't know if a new rpc is about to be issued with that client.) So, wait to shut down the rpc_client until the reference count on the client has gone to zero. Signed-off-by: J. Bruce Fields commit 1f6ac201b821c45fb745ed0f5e5c4721da064f61 Author: J.
Bruce Fields Date: Wed Sep 12 18:56:12 2007 -0400 knfsd: let nfsd manage timing out its own leases Currently there's a race that can cause an oops in generic_setlease. (In detail: nfsd, when it removes a lease, does so by calling vfs_setlease() with F_UNLCK and a pointer to the fl_flock field, which in turn points to nfsd's existing lease; but the first thing the setlease code does is call time_out_leases(). If the lease happens to already be beyond the lease break time, that will free the lease and (in nfsd's release_private callback) set fl_flock to NULL, leading to a NULL deference soon after in vfs_setlease().) There are probably other things to fix here too, but it seems inherently racy to allow either locks.c or nfsd to time out this lease. Instead just set the fl_break_time to 0 (preventing locks.c from ever timing out this lock) and leave it up to nfsd's laundromat thread to deal with it. Signed-off-by: J. Bruce Fields commit 14de4d794f93da7e117f546317cbf0e3e0d95a3e Author: J. Bruce Fields Date: Tue Sep 25 11:57:19 2007 -0400 locks: add warning about mandatory locking races The mandatory file locking implementation has long-standing races that probably render it useless. I know of no plans to fix them. Till we do, we should at least warn people. Signed-off-by: J. Bruce Fields commit 27e031db5f6ef886a3f8a947a5e02e07331a42e3 Author: J. Bruce Fields Date: Mon Sep 24 18:52:09 2007 -0400 Documentation: move mandatory locking documentation to filesystems/ Shouldn't this mandatory-locking documentation be in the Documentation/filesystems directory? Give it a more descriptive name while we're at it, and update 00-INDEX with a more inclusive description of Documentation/filesystems (which has already talked about more than just individual filesystems). Signed-off-by: J. Bruce Fields Acked-by: Randy Dunlap commit 89dab77633d96474dc88ad471957a48160482d71 Author: Dr. David Alan Gilbert Date: Sat Aug 25 16:09:27 2007 +0100 knfsd: Add source address to sunrpc svc errors This patch adds the address of the client that caused an error in sunrpc/svc.c so that you get errors that look like: svc: 192.168.66.28, port=709: unknown version (3 for prog 100003, nfsd) I've seen machines which get bunches of unknown version or similar errors from time to time, and while the recent patch to add the service helps to find which service has the wrong version it doesn't help find the potentially bad client. The patch is against a checkout of Linus's git tree made on 2007-08-24. One observation is that the svc_print_addr function prints to a buffer which in this case makes life a little more complex; it just feels as if there must be lots of places that print a connection address - is there a better function to use anywhere? I think actually there are a few places with semi duplicated code; e.g. one_sock_name switches on the address family but only currently has IPV4; I wonder how many other places are similar. Signed-off-by: Dave Gilbert Cc: Randy Dunlap Signed-off-by: J. Bruce Fields commit a5b51d88131cab6c1b99ce5da934028cf3927cc2 Author: Peter Staubach Date: Thu Aug 16 12:10:07 2007 -0400 knfsd: 64 bit ino support for NFS server Modify the NFS server code to support 64 bit ino's, as appropriate for the system and the NFS protocol version. The gist of the changes is to query the underlying file system for attributes and not just to use the cached attributes in the inode. 
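Concretely, the encode paths query the filesystem along these lines instead of reading inode->i_ino (a sketch only; fhp stands for the svc_fh being encoded):

	struct kstat stat;
	u64 ino = 0;

	/* ask the filesystem for authoritative attributes; stat.ino is a u64,
	 * wide enough even where the in-core inode->i_ino is a 32-bit long */
	if (vfs_getattr(fhp->fh_export->ex_mnt, fhp->fh_dentry, &stat) == 0)
		ino = stat.ino;
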
For this specific purpose, the inode only contains an ino field which is an unsigned long, which is large enough on 64 bit platforms, but is not large enough on 32 bit platforms. I haven't been able to find any reason why ->getattr can't be called while i_mutex is held. The specification indicates that i_mutex is not required to be held in order to invoke ->getattr, but it doesn't say that i_mutex can't be held while invoking ->getattr. I also haven't come to any conclusions regarding the value of lease_get_mtime() and whether it should or should not be invoked by fill_post_wcc() too. I chose not to change this because I thought that it was safer to leave well enough alone. If we decide to make a change, it can be done separately. Signed-off-by: Peter Staubach Signed-off-by: J. Bruce Fields commit b5a162085cd708ce643cbd5abfe8c9992255cbc2 Author: J. Bruce Fields Date: Thu Aug 9 20:16:22 2007 -0400 svcgss: move init code into separate function We've let svcauth_gss_accept() get much too long and hairy. The RPC_GSS_PROC_INIT and RPC_GSS_PROC_CONTINUE_INIT cases share very little with the other cases, so it's very natural to split them off into a separate function. This will also nicely isolate the piece of code we need to parametrize to authenticating gss-protected NFSv4 callbacks on behalf of the NFS client. Signed-off-by: J. Bruce Fields commit 0050cb82d8184eb09c08350d4a887dcef5836d35 Author: J. Bruce Fields Date: Thu Aug 9 18:34:32 2007 -0400 knfsd: remove code duplication in nfsd4_setclientid() Each branch of this if-then-else has a bunch of duplicated code that we could just put at the end. Signed-off-by: "J. Bruce Fields" commit 64954a225e2dfe36957327b5e1e4936470b8f6db Author: Andrew Morton Date: Thu Aug 9 00:53:50 2007 -0700 nfsd warning fix fs/nfsd/nfsctl.c: In function 'write_filehandle': fs/nfsd/nfsctl.c:301: warning: 'maxsize' may be used uninitialized in this function Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: "J. Bruce Fields" commit 1489e2a1e9b921a545f26b581bf0924458a81ba6 Author: J. Bruce Fields Date: Tue Oct 24 18:33:17 2006 -0400 knfsd: fix callback rpc cred It doesn't make sense to make the callback with credentials that the client made the setclientid with. Instead the spec requires that the callback occur with the credentials the client authenticated *to*. It probably doesn't matter what we use for auth_unix, and some more infrastructure will be needed for auth_gss, so let's just remove the cred lookup for now. Signed-off-by: J. Bruce Fields commit 953eca0a59309c45b497d7e5e828eb87a3201efa Author: J. Bruce Fields Date: Wed Aug 1 15:30:59 2007 -0400 knfsd: move nfsv4 slab creation/destruction to module init/exit We have some slabs that the nfs4 server uses to store state objects. We're currently creating and destroying those slabs whenever the server is brought up or down. That seems excessive; may as well just do that in module initialization and exit. Also add some minor header cleanup. (Thanks to Andrew Morton for that and a compile fix.) Signed-off-by: "J. Bruce Fields" commit 0720c30a16b0b8f96daff3851dac4a1bbe0e5b09 Author: J. Bruce Fields Date: Fri Jul 27 18:06:50 2007 -0400 knfsd: spawn kernel thread to probe callback channel We want to allow gss on the callback channel, so people using krb5 can still get the benefits of delegations. But looking up the rpc credential can take some time in that case. And we shouldn't delay the response to setclientid_confirm while we wait.
It may be inefficient, but for now the simplest solution is just to spawn a new thread as necessary for the purpose. (Thanks to Adrian Bunk for catching a missing static here.) Signed-off-by: "J. Bruce Fields" Cc: Adrian Bunk commit bb52ba85c4caf3052ed6a1ee832646e93164835c Author: J. Bruce Fields Date: Fri Jul 27 16:36:45 2007 -0400 knfsd: nfs4 name->id mapping not correctly parsing negative downcall Note that qword_get() returns length or -1, not an -ERROR. Signed-off-by: "J. Bruce Fields" commit dd86d2fa3d5ce1975c3fac7efaecc2b9f1987179 Author: J. Bruce Fields Date: Fri Jul 27 16:10:37 2007 -0400 knfsd: demote some printk()s to dprintk()s To quote a recent mail from Andrew Morton: Look: if there's a way in which an unprivileged user can trigger a printk we fix it, end of story. OK. I assume that goes double for printk()s that might be triggered by random hosts on the internet. So, disable some printk()s that look like they could be triggered by malfunctioning or malicious clients. For now, just downgrade them to dprintk()s. Signed-off-by: "J. Bruce Fields" commit b9d17cb50e440d15ed1310aa94b60057e205462e Author: J. Bruce Fields Date: Thu Jul 26 17:04:54 2007 -0400 knfsd: cleanup of nfsd4 cmp_* functions Benny Halevy suggested renaming cmp_* to same_* to make the meaning of the return value clearer. Fix some nearby style deviations while we're at it, including a small swath of creative indentation in nfs4_preprocess_seqid_op(). Signed-off-by: "J. Bruce Fields" commit 13a207c62ebf5dd95818d05828c790fbf4d9dca3 Author: J. Bruce Fields Date: Tue Jul 24 21:38:18 2007 -0400 knfsd: delete code made redundant by map_new_errors I moved this check into map_new_errors, but forgot to delete the original. Oops. Signed-off-by: "J. Bruce Fields" commit ad65be75d86a9d9f7a99504af8e78aeaf5b9d2f6 Author: Christoph Hellwig Date: Wed Mar 7 15:26:25 2007 +0000 nfsd: fix horrible indentation in nfsd_setattr Signed-off-by: Christoph Hellwig commit ea530dfdcf3428446356aed29c56ce9b44fbf18b Author: J. Bruce Fields Date: Thu Jul 12 15:30:32 2007 -0400 nfsd: remove unused cache_for_each macro This macro is unused. Signed-off-by: "J. Bruce Fields" commit ab182f36fbfcd89a36d69cbdd2fd29c3b4f7307b Author: J. Bruce Fields Date: Fri Jun 22 17:26:32 2007 -0400 nfsd: tone down inaccurate dprintk The nfserr_dropit happens routinely on upcalls (so a kmalloc failure is almost never the actual cause), but I occasionally get a complant from some tester that's worried because they ran across this message after turning on debugging to research some unrelated problem. Signed-off-by: "J. Bruce Fields" commit 99abc6a91d00839e28f9620ce23be8e6a20d7828 Author: Pavel Emelyanov Date: Thu Sep 20 12:45:02 2007 +0400 locks: Fix potential OOPS in generic_setlease() This code is run under lock_kernel(), which is dropped during sleeping operations, so the following race is possible: CPU1: CPU2: vfs_setlease(); vfs_setlease(); lock_kernel(); lock_kernel(); /* spin */ generic_setlease(): ... for (before = ...) /* here we found some lease after * which we will insert the new one */ fl = locks_alloc_lock(); /* go to sleep in this allocation and * drop the BKL */ generic_setlease(): ... for (before = ...) /* here we find the "before" pointing * at the one we found on CPU1 */ ->fl_change(my_before, arg); lease_modify(); locks_free_lock(); /* and we freed it */ ... unlock_kernel(); locks_insert_lock(before, fl); /* OOPS! 
We have just tried to add the lease * at the tail of already removed one */ The similar races are already handled in other code - all the allocations are performed before any checks/updates. Thanks to Kamalesh Babulal for testing and for a bug report on an earlier version. Signed-off-by: Pavel Emelyanov Signed-off-by: J. Bruce Fields Cc: Kamalesh Babulal commit 3279d0b110df5356bca6b3495af2a52a5bc54bb6 Author: Pavel Emelyanov Date: Wed Sep 19 16:44:07 2007 +0400 Use list_first_entry in locks_wake_up_blocks This routine deletes all the elements from the list with the "while (!list_empty())" loop, and we already have a list_first_entry() macro to help it look nicer :) Signed-off-by: Pavel Emelyanov commit be0ee4e871fbd9de9cddbc50e5fb1dc3cc499258 Author: J. Bruce Fields Date: Wed Sep 12 15:45:07 2007 -0400 locks: fix flock_lock_file() comment This comment wasn't updated when lease support was added, and it makes essentially the same mistake that the code made before a recent bugfix. Signed-off-by: J. Bruce Fields commit 1a27e6ff3f607218931f04eca740ca0b04b52e90 Author: Pavel Emelyanov Date: Tue Sep 11 16:38:13 2007 +0400 Memory shortage can result in inconsistent flocks state When the flock_lock_file() is called to change the flock from F_RDLCK to F_WRLCK or vice versa the existing flock can be removed without appropriate warning. Look: for_each_lock(inode, before) { struct file_lock *fl = *before; if (IS_POSIX(fl)) break; if (IS_LEASE(fl)) continue; if (filp != fl->fl_file) continue; if (request->fl_type == fl->fl_type) goto out; found = 1; locks_delete_lock(before); <<<<<< ! break; } if after this point the subsequent locks_alloc_lock() will fail the return code will be -ENOMEM, but the existing lock is already removed. This is a known feature that such "re-locking" is not atomic, but in the racy case the file should stay locked (although by some other process), but in this case the file will be unlocked. The proposal is to prepare the lock in advance keeping no chance to fail in the future code. Found during making the flocks pid-namespaces aware. (Note: Thanks to Reuben Farrelly for finding a bug in an earlier version of this patch.) Signed-off-by: Pavel Emelyanov Signed-off-by: J. Bruce Fields Cc: Reuben Farrelly commit 1854f8a4f08011f7b353bf32c3337aa4bde8bb11 Author: J. Bruce Fields Date: Tue Nov 14 16:54:36 2006 -0500 locks: kill redundant local variable There's no need for another variable local to this loop; we can use the variable (of the same name!) already declared at the top of the function, and not used till later (at which point it's initialized, so this is safe). Signed-off-by: J. Bruce Fields commit d024dd770f8739ab9085da578e12c9a7c605b96a Author: J. Bruce Fields Date: Thu May 10 19:02:07 2007 -0400 locks: reverse order of posix_locks_conflict() arguments The first argument to posix_locks_conflict() is meant to be a lock request, and the second a lock from an inode's lock request. It doesn't really make a difference which order you call them in, since the only asymmetric test in posix_lock_conflict() is the check whether the second argument is a posix lock--and every caller already does that check for some reason. But may as well fix posix_test_lock() to call posix_locks_conflict() with the arguments in the same order as everywhere else. Signed-off-by: "J. 
Bruce Fields" Signed-off-by: Andrew Morton --- Documentation/00-INDEX | 6 Documentation/filesystems/00-INDEX | 4 Documentation/filesystems/locks.txt | 67 Documentation/filesystems/mandatory-locking.txt | 171 ++ Documentation/locks.txt | 67 Documentation/mandatory.txt | 152 - fs/9p/vfs_file.c | 2 fs/afs/flock.c | 3 fs/gfs2/ops_file.c | 4 fs/lockd/svc.c | 21 fs/locks.c | 192 +- fs/nfs/callback.c | 4 fs/nfs/file.c | 3 fs/nfsd/nfs3xdr.c | 59 fs/nfsd/nfs4callback.c | 87 - fs/nfsd/nfs4idmap.c | 8 fs/nfsd/nfs4proc.c | 4 fs/nfsd/nfs4state.c | 202 +- fs/nfsd/nfs4xdr.c | 22 fs/nfsd/nfsctl.c | 23 fs/nfsd/nfssvc.c | 16 fs/nfsd/nfsxdr.c | 4 fs/nfsd/vfs.c | 56 fs/proc/proc_misc.c | 19 fs/read_write.c | 2 include/linux/fs.h | 22 include/linux/nfsd/nfsd.h | 18 include/linux/nfsd/nfsfh.h | 42 include/linux/nfsd/xdr4.h | 4 include/linux/sunrpc/cache.h | 10 include/linux/sunrpc/debug.h | 2 include/linux/sunrpc/svc.h | 7 include/linux/sunrpc/svc_xprt.h | 86 + include/linux/sunrpc/svcsock.h | 43 net/sunrpc/Makefile | 3 net/sunrpc/auth_gss/svcauth_gss.c | 144 - net/sunrpc/sunrpc_syms.c | 5 net/sunrpc/svc.c | 59 net/sunrpc/svc_xprt.c | 954 +++++++++++ net/sunrpc/svcauth_unix.c | 54 net/sunrpc/svcsock.c | 1162 +++----------- net/sunrpc/sysctl.c | 37 42 files changed, 2131 insertions(+), 1719 deletions(-) diff -puN Documentation/00-INDEX~git-nfsd Documentation/00-INDEX --- a/Documentation/00-INDEX~git-nfsd +++ a/Documentation/00-INDEX @@ -145,7 +145,7 @@ fb/ feature-removal-schedule.txt - list of files and features that are going to be removed. filesystems/ - - directory with info on the various filesystems that Linux supports. + - info on the vfs and the various filesystems that Linux supports. firmware_class/ - request_firmware() hotplug interface info. floppy.txt @@ -230,8 +230,6 @@ local_ops.txt - semantics and behavior of local atomic operations. lockdep-design.txt - documentation on the runtime locking correctness validator. -locks.txt - - info on file locking implementations, flock() vs. fcntl(), etc. logo.gif - full colour GIF image of Linux logo (penguin - Tux). logo.txt @@ -240,8 +238,6 @@ m68k/ - directory with info about Linux on Motorola 68k architecture. magic-number.txt - list of magic numbers used to mark/protect kernel data structures. -mandatory.txt - - info on the Linux implementation of Sys V mandatory file locking. mca.txt - info on supporting Micro Channel Architecture (e.g. PS/2) systems. md.txt diff -puN Documentation/filesystems/00-INDEX~git-nfsd Documentation/filesystems/00-INDEX --- a/Documentation/filesystems/00-INDEX~git-nfsd +++ a/Documentation/filesystems/00-INDEX @@ -52,6 +52,10 @@ isofs.txt - info and mount options for the ISO 9660 (CDROM) filesystem. jfs.txt - info and mount options for the JFS filesystem. +locks.txt + - info on file locking implementations, flock() vs. fcntl(), etc. +mandatory-locking.txt + - info on the Linux implementation of Sys V mandatory file locking. ncpfs.txt - info on Novell Netware(tm) filesystem using NCP protocol. ntfs.txt diff -puN /dev/null Documentation/filesystems/locks.txt --- /dev/null +++ a/Documentation/filesystems/locks.txt @@ -0,0 +1,67 @@ + File Locking Release Notes + + Andy Walker + + 12 May 1997 + + +1. What's New? +-------------- + +1.1 Broken Flock Emulation +-------------------------- + +The old flock(2) emulation in the kernel was swapped for proper BSD +compatible flock(2) support in the 1.3.x series of kernels. 
With the +release of the 2.1.x kernel series, support for the old emulation has +been totally removed, so that we don't need to carry this baggage +forever. + +This should not cause problems for anybody, since everybody using a +2.1.x kernel should have updated their C library to a suitable version +anyway (see the file "Documentation/Changes".) + +1.2 Allow Mixed Locks Again +--------------------------- + +1.2.1 Typical Problems - Sendmail +--------------------------------- +Because sendmail was unable to use the old flock() emulation, many sendmail +installations use fcntl() instead of flock(). This is true of Slackware 3.0 +for example. This gave rise to some other subtle problems if sendmail was +configured to rebuild the alias file. Sendmail tried to lock the aliases.dir +file with fcntl() at the same time as the GDBM routines tried to lock this +file with flock(). With pre 1.3.96 kernels this could result in deadlocks that, +over time, or under a very heavy mail load, would eventually cause the kernel +to lock solid with deadlocked processes. + + +1.2.2 The Solution +------------------ +The solution I have chosen, after much experimentation and discussion, +is to make flock() and fcntl() locks oblivious to each other. Both can +exists, and neither will have any effect on the other. + +I wanted the two lock styles to be cooperative, but there were so many +race and deadlock conditions that the current solution was the only +practical one. It puts us in the same position as, for example, SunOS +4.1.x and several other commercial Unices. The only OS's that support +cooperative flock()/fcntl() are those that emulate flock() using +fcntl(), with all the problems that implies. + + +1.3 Mandatory Locking As A Mount Option +--------------------------------------- + +Mandatory locking, as described in 'Documentation/filesystems/mandatory.txt' +was prior to this release a general configuration option that was valid for +all mounted filesystems. This had a number of inherent dangers, not the +least of which was the ability to freeze an NFS server by asking it to read +a file for which a mandatory lock existed. + +From this release of the kernel, mandatory locking can be turned on and off +on a per-filesystem basis, using the mount options 'mand' and 'nomand'. +The default is to disallow mandatory locking. The intention is that +mandatory locking only be enabled on a local filesystem as the specific need +arises. + diff -puN /dev/null Documentation/filesystems/mandatory-locking.txt --- /dev/null +++ a/Documentation/filesystems/mandatory-locking.txt @@ -0,0 +1,171 @@ + Mandatory File Locking For The Linux Operating System + + Andy Walker + + 15 April 1996 + (Updated September 2007) + +0. Why you should avoid mandatory locking +----------------------------------------- + +The Linux implementation is prey to a number of difficult-to-fix race +conditions which in practice make it not dependable: + + - The write system call checks for a mandatory lock only once + at its start. It is therefore possible for a lock request to + be granted after this check but before the data is modified. + A process may then see file data change even while a mandatory + lock was held. + - Similarly, an exclusive lock may be granted on a file after + the kernel has decided to proceed with a read, but before the + read has actually completed, and the reading process may see + the file data in a state which should not have been visible + to it. 
+ - Similar races make the claimed mutual exclusion between lock + and mmap similarly unreliable. + +1. What is mandatory locking? +------------------------------ + +Mandatory locking is kernel enforced file locking, as opposed to the more usual +cooperative file locking used to guarantee sequential access to files among +processes. File locks are applied using the flock() and fcntl() system calls +(and the lockf() library routine which is a wrapper around fcntl().) It is +normally a process' responsibility to check for locks on a file it wishes to +update, before applying its own lock, updating the file and unlocking it again. +The most commonly used example of this (and in the case of sendmail, the most +troublesome) is access to a user's mailbox. The mail user agent and the mail +transfer agent must guard against updating the mailbox at the same time, and +prevent reading the mailbox while it is being updated. + +In a perfect world all processes would use and honour a cooperative, or +"advisory" locking scheme. However, the world isn't perfect, and there's +a lot of poorly written code out there. + +In trying to address this problem, the designers of System V UNIX came up +with a "mandatory" locking scheme, whereby the operating system kernel would +block attempts by a process to write to a file that another process holds a +"read" -or- "shared" lock on, and block attempts to both read and write to a +file that a process holds a "write " -or- "exclusive" lock on. + +The System V mandatory locking scheme was intended to have as little impact as +possible on existing user code. The scheme is based on marking individual files +as candidates for mandatory locking, and using the existing fcntl()/lockf() +interface for applying locks just as if they were normal, advisory locks. + +Note 1: In saying "file" in the paragraphs above I am actually not telling +the whole truth. System V locking is based on fcntl(). The granularity of +fcntl() is such that it allows the locking of byte ranges in files, in addition +to entire files, so the mandatory locking rules also have byte level +granularity. + +Note 2: POSIX.1 does not specify any scheme for mandatory locking, despite +borrowing the fcntl() locking scheme from System V. The mandatory locking +scheme is defined by the System V Interface Definition (SVID) Version 3. + +2. Marking a file for mandatory locking +--------------------------------------- + +A file is marked as a candidate for mandatory locking by setting the group-id +bit in its file mode but removing the group-execute bit. This is an otherwise +meaningless combination, and was chosen by the System V implementors so as not +to break existing user programs. + +Note that the group-id bit is usually automatically cleared by the kernel when +a setgid file is written to. This is a security measure. The kernel has been +modified to recognize the special case of a mandatory lock candidate and to +refrain from clearing this bit. Similarly the kernel has been modified not +to run mandatory lock candidates with setgid privileges. + +3. Available implementations +---------------------------- + +I have considered the implementations of mandatory locking available with +SunOS 4.1.x, Solaris 2.x and HP-UX 9.x. + +Generally I have tried to make the most sense out of the behaviour exhibited +by these three reference systems. There are many anomalies. + +All the reference systems reject all calls to open() for a file on which +another process has outstanding mandatory locks. 
This is in direct +contravention of SVID 3, which states that only calls to open() with the +O_TRUNC flag set should be rejected. The Linux implementation follows the SVID +definition, which is the "Right Thing", since only calls with O_TRUNC can +modify the contents of the file. + +HP-UX even disallows open() with O_TRUNC for a file with advisory locks, not +just mandatory locks. That would appear to contravene POSIX.1. + +mmap() is another interesting case. All the operating systems mentioned +prevent mandatory locks from being applied to an mmap()'ed file, but HP-UX +also disallows advisory locks for such a file. SVID actually specifies the +paranoid HP-UX behaviour. + +In my opinion only MAP_SHARED mappings should be immune from locking, and then +only from mandatory locks - that is what is currently implemented. + +SunOS is so hopeless that it doesn't even honour the O_NONBLOCK flag for +mandatory locks, so reads and writes to locked files always block when they +should return EAGAIN. + +I'm afraid that this is such an esoteric area that the semantics described +below are just as valid as any others, so long as the main points seem to +agree. + +4. Semantics +------------ + +1. Mandatory locks can only be applied via the fcntl()/lockf() locking + interface - in other words the System V/POSIX interface. BSD style + locks using flock() never result in a mandatory lock. + +2. If a process has locked a region of a file with a mandatory read lock, then + other processes are permitted to read from that region. If any of these + processes attempts to write to the region it will block until the lock is + released, unless the process has opened the file with the O_NONBLOCK + flag in which case the system call will return immediately with the error + status EAGAIN. + +3. If a process has locked a region of a file with a mandatory write lock, all + attempts to read or write to that region block until the lock is released, + unless a process has opened the file with the O_NONBLOCK flag in which case + the system call will return immediately with the error status EAGAIN. + +4. Calls to open() with O_TRUNC, or to creat(), on a existing file that has + any mandatory locks owned by other processes will be rejected with the + error status EAGAIN. + +5. Attempts to apply a mandatory lock to a file that is memory mapped and + shared (via mmap() with MAP_SHARED) will be rejected with the error status + EAGAIN. + +6. Attempts to create a shared memory map of a file (via mmap() with MAP_SHARED) + that has any mandatory locks in effect will be rejected with the error status + EAGAIN. + +5. Which system calls are affected? +----------------------------------- + +Those which modify a file's contents, not just the inode. That gives read(), +write(), readv(), writev(), open(), creat(), mmap(), truncate() and +ftruncate(). truncate() and ftruncate() are considered to be "write" actions +for the purposes of mandatory locking. + +The affected region is usually defined as stretching from the current position +for the total number of bytes read or written. For the truncate calls it is +defined as the bytes of a file removed or added (we must also consider bytes +added, as a lock can specify just "the whole file", rather than a specific +range of bytes.) + +Note 3: I may have overlooked some system calls that need mandatory lock +checking in my eagerness to get this code out the door. Please let me know, or +better still fix the system calls yourself and submit a patch to me or Linus. + +6. Warning! 
+----------- + +Not even root can override a mandatory lock, so runaway processes can wreak +havoc if they lock crucial files. The way around it is to change the file +permissions (remove the setgid bit) before trying to read or write to it. +Of course, that might be a bit tricky if the system is hung :-( + diff -puN Documentation/locks.txt~git-nfsd /dev/null --- a/Documentation/locks.txt +++ /dev/null @@ -1,67 +0,0 @@ - File Locking Release Notes - - Andy Walker - - 12 May 1997 - - -1. What's New? --------------- - -1.1 Broken Flock Emulation --------------------------- - -The old flock(2) emulation in the kernel was swapped for proper BSD -compatible flock(2) support in the 1.3.x series of kernels. With the -release of the 2.1.x kernel series, support for the old emulation has -been totally removed, so that we don't need to carry this baggage -forever. - -This should not cause problems for anybody, since everybody using a -2.1.x kernel should have updated their C library to a suitable version -anyway (see the file "Documentation/Changes".) - -1.2 Allow Mixed Locks Again ---------------------------- - -1.2.1 Typical Problems - Sendmail ---------------------------------- -Because sendmail was unable to use the old flock() emulation, many sendmail -installations use fcntl() instead of flock(). This is true of Slackware 3.0 -for example. This gave rise to some other subtle problems if sendmail was -configured to rebuild the alias file. Sendmail tried to lock the aliases.dir -file with fcntl() at the same time as the GDBM routines tried to lock this -file with flock(). With pre 1.3.96 kernels this could result in deadlocks that, -over time, or under a very heavy mail load, would eventually cause the kernel -to lock solid with deadlocked processes. - - -1.2.2 The Solution ------------------- -The solution I have chosen, after much experimentation and discussion, -is to make flock() and fcntl() locks oblivious to each other. Both can -exists, and neither will have any effect on the other. - -I wanted the two lock styles to be cooperative, but there were so many -race and deadlock conditions that the current solution was the only -practical one. It puts us in the same position as, for example, SunOS -4.1.x and several other commercial Unices. The only OS's that support -cooperative flock()/fcntl() are those that emulate flock() using -fcntl(), with all the problems that implies. - - -1.3 Mandatory Locking As A Mount Option ---------------------------------------- - -Mandatory locking, as described in 'Documentation/mandatory.txt' was prior -to this release a general configuration option that was valid for all -mounted filesystems. This had a number of inherent dangers, not the least -of which was the ability to freeze an NFS server by asking it to read a -file for which a mandatory lock existed. - -From this release of the kernel, mandatory locking can be turned on and off -on a per-filesystem basis, using the mount options 'mand' and 'nomand'. -The default is to disallow mandatory locking. The intention is that -mandatory locking only be enabled on a local filesystem as the specific need -arises. - diff -puN Documentation/mandatory.txt~git-nfsd /dev/null --- a/Documentation/mandatory.txt +++ /dev/null @@ -1,152 +0,0 @@ - Mandatory File Locking For The Linux Operating System - - Andy Walker - - 15 April 1996 - - -1. What is mandatory locking? 
------------------------------- - -Mandatory locking is kernel enforced file locking, as opposed to the more usual -cooperative file locking used to guarantee sequential access to files among -processes. File locks are applied using the flock() and fcntl() system calls -(and the lockf() library routine which is a wrapper around fcntl().) It is -normally a process' responsibility to check for locks on a file it wishes to -update, before applying its own lock, updating the file and unlocking it again. -The most commonly used example of this (and in the case of sendmail, the most -troublesome) is access to a user's mailbox. The mail user agent and the mail -transfer agent must guard against updating the mailbox at the same time, and -prevent reading the mailbox while it is being updated. - -In a perfect world all processes would use and honour a cooperative, or -"advisory" locking scheme. However, the world isn't perfect, and there's -a lot of poorly written code out there. - -In trying to address this problem, the designers of System V UNIX came up -with a "mandatory" locking scheme, whereby the operating system kernel would -block attempts by a process to write to a file that another process holds a -"read" -or- "shared" lock on, and block attempts to both read and write to a -file that a process holds a "write " -or- "exclusive" lock on. - -The System V mandatory locking scheme was intended to have as little impact as -possible on existing user code. The scheme is based on marking individual files -as candidates for mandatory locking, and using the existing fcntl()/lockf() -interface for applying locks just as if they were normal, advisory locks. - -Note 1: In saying "file" in the paragraphs above I am actually not telling -the whole truth. System V locking is based on fcntl(). The granularity of -fcntl() is such that it allows the locking of byte ranges in files, in addition -to entire files, so the mandatory locking rules also have byte level -granularity. - -Note 2: POSIX.1 does not specify any scheme for mandatory locking, despite -borrowing the fcntl() locking scheme from System V. The mandatory locking -scheme is defined by the System V Interface Definition (SVID) Version 3. - -2. Marking a file for mandatory locking ---------------------------------------- - -A file is marked as a candidate for mandatory locking by setting the group-id -bit in its file mode but removing the group-execute bit. This is an otherwise -meaningless combination, and was chosen by the System V implementors so as not -to break existing user programs. - -Note that the group-id bit is usually automatically cleared by the kernel when -a setgid file is written to. This is a security measure. The kernel has been -modified to recognize the special case of a mandatory lock candidate and to -refrain from clearing this bit. Similarly the kernel has been modified not -to run mandatory lock candidates with setgid privileges. - -3. Available implementations ----------------------------- - -I have considered the implementations of mandatory locking available with -SunOS 4.1.x, Solaris 2.x and HP-UX 9.x. - -Generally I have tried to make the most sense out of the behaviour exhibited -by these three reference systems. There are many anomalies. - -All the reference systems reject all calls to open() for a file on which -another process has outstanding mandatory locks. This is in direct -contravention of SVID 3, which states that only calls to open() with the -O_TRUNC flag set should be rejected. 
The Linux implementation follows the SVID -definition, which is the "Right Thing", since only calls with O_TRUNC can -modify the contents of the file. - -HP-UX even disallows open() with O_TRUNC for a file with advisory locks, not -just mandatory locks. That would appear to contravene POSIX.1. - -mmap() is another interesting case. All the operating systems mentioned -prevent mandatory locks from being applied to an mmap()'ed file, but HP-UX -also disallows advisory locks for such a file. SVID actually specifies the -paranoid HP-UX behaviour. - -In my opinion only MAP_SHARED mappings should be immune from locking, and then -only from mandatory locks - that is what is currently implemented. - -SunOS is so hopeless that it doesn't even honour the O_NONBLOCK flag for -mandatory locks, so reads and writes to locked files always block when they -should return EAGAIN. - -I'm afraid that this is such an esoteric area that the semantics described -below are just as valid as any others, so long as the main points seem to -agree. - -4. Semantics ------------- - -1. Mandatory locks can only be applied via the fcntl()/lockf() locking - interface - in other words the System V/POSIX interface. BSD style - locks using flock() never result in a mandatory lock. - -2. If a process has locked a region of a file with a mandatory read lock, then - other processes are permitted to read from that region. If any of these - processes attempts to write to the region it will block until the lock is - released, unless the process has opened the file with the O_NONBLOCK - flag in which case the system call will return immediately with the error - status EAGAIN. - -3. If a process has locked a region of a file with a mandatory write lock, all - attempts to read or write to that region block until the lock is released, - unless a process has opened the file with the O_NONBLOCK flag in which case - the system call will return immediately with the error status EAGAIN. - -4. Calls to open() with O_TRUNC, or to creat(), on a existing file that has - any mandatory locks owned by other processes will be rejected with the - error status EAGAIN. - -5. Attempts to apply a mandatory lock to a file that is memory mapped and - shared (via mmap() with MAP_SHARED) will be rejected with the error status - EAGAIN. - -6. Attempts to create a shared memory map of a file (via mmap() with MAP_SHARED) - that has any mandatory locks in effect will be rejected with the error status - EAGAIN. - -5. Which system calls are affected? ------------------------------------ - -Those which modify a file's contents, not just the inode. That gives read(), -write(), readv(), writev(), open(), creat(), mmap(), truncate() and -ftruncate(). truncate() and ftruncate() are considered to be "write" actions -for the purposes of mandatory locking. - -The affected region is usually defined as stretching from the current position -for the total number of bytes read or written. For the truncate calls it is -defined as the bytes of a file removed or added (we must also consider bytes -added, as a lock can specify just "the whole file", rather than a specific -range of bytes.) - -Note 3: I may have overlooked some system calls that need mandatory lock -checking in my eagerness to get this code out the door. Please let me know, or -better still fix the system calls yourself and submit a patch to me or Linus. - -6. Warning! ------------ - -Not even root can override a mandatory lock, so runaway processes can wreak -havoc if they lock crucial files. 
The way around it is to change the file -permissions (remove the setgid bit) before trying to read or write to it. -Of course, that might be a bit tricky if the system is hung :-( - diff -puN fs/9p/vfs_file.c~git-nfsd fs/9p/vfs_file.c --- a/fs/9p/vfs_file.c~git-nfsd +++ a/fs/9p/vfs_file.c @@ -105,7 +105,7 @@ static int v9fs_file_lock(struct file *f P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); /* No mandatory locks */ - if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) + if (__mandatory_lock(inode)) return -ENOLCK; if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { diff -puN fs/afs/flock.c~git-nfsd fs/afs/flock.c --- a/fs/afs/flock.c~git-nfsd +++ a/fs/afs/flock.c @@ -524,8 +524,7 @@ int afs_lock(struct file *file, int cmd, (long long) fl->fl_start, (long long) fl->fl_end); /* AFS doesn't support mandatory locks */ - if ((vnode->vfs_inode.i_mode & (S_ISGID | S_IXGRP)) == S_ISGID && - fl->fl_type != F_UNLCK) + if (__mandatory_lock(&vnode->vfs_inode) && fl->fl_type != F_UNLCK) return -ENOLCK; if (IS_GETLK(cmd)) diff -puN fs/gfs2/ops_file.c~git-nfsd fs/gfs2/ops_file.c --- a/fs/gfs2/ops_file.c~git-nfsd +++ a/fs/gfs2/ops_file.c @@ -535,7 +535,7 @@ static int gfs2_lock(struct file *file, if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; - if ((ip->i_inode.i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) + if (__mandatory_lock(&ip->i_inode)) return -ENOLCK; if (sdp->sd_args.ar_localflocks) { @@ -636,7 +636,7 @@ static int gfs2_flock(struct file *file, if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; - if ((ip->i_inode.i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) + if (__mandatory_lock(&ip->i_inode)) return -ENOLCK; if (sdp->sd_args.ar_localflocks) diff -puN fs/lockd/svc.c~git-nfsd fs/lockd/svc.c --- a/fs/lockd/svc.c~git-nfsd +++ a/fs/lockd/svc.c @@ -219,13 +219,12 @@ lockd(struct svc_rqst *rqstp) module_put_and_exit(0); } - -static int find_socket(struct svc_serv *serv, int proto) +static int find_xprt(struct svc_serv *serv, char *proto) { - struct svc_sock *svsk; + struct svc_xprt *xprt; int found = 0; - list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) - if (svsk->sk_sk->sk_protocol == proto) { + list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) + if (strcmp(xprt->xpt_class->xcl_name, proto) == 0) { found = 1; break; } @@ -243,13 +242,13 @@ static int make_socks(struct svc_serv *s int err = 0; if (proto == IPPROTO_UDP || nlm_udpport) - if (!find_socket(serv, IPPROTO_UDP)) - err = svc_makesock(serv, IPPROTO_UDP, nlm_udpport, - SVC_SOCK_DEFAULTS); + if (!find_xprt(serv, "udp")) + err = svc_create_xprt(serv, "udp", nlm_udpport, + SVC_SOCK_DEFAULTS); if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) - if (!find_socket(serv, IPPROTO_TCP)) - err = svc_makesock(serv, IPPROTO_TCP, nlm_tcpport, - SVC_SOCK_DEFAULTS); + if (!find_xprt(serv, "tcp")) + err = svc_create_xprt(serv, "tcp", nlm_tcpport, + SVC_SOCK_DEFAULTS); if (err >= 0) { warned = 0; diff -puN fs/locks.c~git-nfsd fs/locks.c --- a/fs/locks.c~git-nfsd +++ a/fs/locks.c @@ -534,7 +534,9 @@ static void locks_insert_block(struct fi static void locks_wake_up_blocks(struct file_lock *blocker) { while (!list_empty(&blocker->fl_block)) { - struct file_lock *waiter = list_entry(blocker->fl_block.next, + struct file_lock *waiter; + + waiter = list_first_entry(&blocker->fl_block, struct file_lock, fl_block); __locks_delete_block(waiter); if (waiter->fl_lmops && waiter->fl_lmops->fl_notify) @@ -668,7 +670,7 @@ posix_test_lock(struct file *filp, struc for (cfl = filp->f_path.dentry->d_inode->i_flock; cfl; cfl = 
cfl->fl_next) { if (!IS_POSIX(cfl)) continue; - if (posix_locks_conflict(cfl, fl)) + if (posix_locks_conflict(fl, cfl)) break; } if (cfl) @@ -698,13 +700,12 @@ EXPORT_SYMBOL(posix_test_lock); static int posix_locks_deadlock(struct file_lock *caller_fl, struct file_lock *block_fl) { - struct list_head *tmp; + struct file_lock *fl; next_task: if (posix_same_owner(caller_fl, block_fl)) return 1; - list_for_each(tmp, &blocked_list) { - struct file_lock *fl = list_entry(tmp, struct file_lock, fl_link); + list_for_each_entry(fl, &blocked_list, fl_link) { if (posix_same_owner(fl, block_fl)) { fl = fl->fl_next; block_fl = fl; @@ -715,8 +716,7 @@ next_task: } /* Try to create a FLOCK lock on filp. We always insert new FLOCK locks - * at the head of the list, but that's secret knowledge known only to - * flock_lock_file and posix_lock_file. + * after any leases, but before any posix locks. * * Note that if called with an FL_EXISTS argument, the caller may determine * whether or not a lock was successfully freed by testing the return @@ -733,6 +733,15 @@ static int flock_lock_file(struct file * lock_kernel(); if (request->fl_flags & FL_ACCESS) goto find_conflict; + + if (request->fl_type != F_UNLCK) { + error = -ENOMEM; + new_fl = locks_alloc_lock(); + if (new_fl == NULL) + goto out; + error = 0; + } + for_each_lock(inode, before) { struct file_lock *fl = *before; if (IS_POSIX(fl)) @@ -754,10 +763,6 @@ static int flock_lock_file(struct file * goto out; } - error = -ENOMEM; - new_fl = locks_alloc_lock(); - if (new_fl == NULL) - goto out; /* * If a higher-priority process was blocked on the old file lock, * give it the opportunity to lock the file. @@ -819,7 +824,7 @@ static int __posix_lock_file(struct inod lock_kernel(); if (request->fl_type != F_UNLCK) { for_each_lock(inode, before) { - struct file_lock *fl = *before; + fl = *before; if (!IS_POSIX(fl)) continue; if (!posix_locks_conflict(request, fl)) @@ -1113,7 +1118,7 @@ int locks_mandatory_area(int read_write, * If we've been sleeping someone might have * changed the permissions behind our back. */ - if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) + if (__mandatory_lock(inode)) continue; } @@ -1337,6 +1342,7 @@ int fcntl_getlease(struct file *filp) int generic_setlease(struct file *filp, long arg, struct file_lock **flp) { struct file_lock *fl, **before, **my_before = NULL, *lease; + struct file_lock *new_fl = NULL; struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; int error, rdlease_count = 0, wrlease_count = 0; @@ -1363,6 +1369,11 @@ int generic_setlease(struct file *filp, || (atomic_read(&inode->i_count) > 1))) goto out; + error = -ENOMEM; + new_fl = locks_alloc_lock(); + if (new_fl == NULL) + goto out; + /* * At this point, we know that if there is an exclusive * lease on this file, then we hold it on this filp @@ -1405,18 +1416,15 @@ int generic_setlease(struct file *filp, if (!leases_enable) goto out; - error = -ENOMEM; - fl = locks_alloc_lock(); - if (fl == NULL) - goto out; - - locks_copy_lock(fl, lease); + locks_copy_lock(new_fl, lease); + locks_insert_lock(before, new_fl); - locks_insert_lock(before, fl); + *flp = new_fl; + return 0; - *flp = fl; - error = 0; out: + if (new_fl != NULL) + locks_free_lock(new_fl); return error; } EXPORT_SYMBOL(generic_setlease); @@ -1752,9 +1760,7 @@ int fcntl_setlk(unsigned int fd, struct /* Don't allow mandatory locks on files that may be memory mapped * and shared. 
*/ - if (IS_MANDLOCK(inode) && - (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID && - mapping_writably_mapped(filp->f_mapping)) { + if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) { error = -EAGAIN; goto out; } @@ -1878,9 +1884,7 @@ int fcntl_setlk64(unsigned int fd, struc /* Don't allow mandatory locks on files that may be memory mapped * and shared. */ - if (IS_MANDLOCK(inode) && - (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID && - mapping_writably_mapped(filp->f_mapping)) { + if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) { error = -EAGAIN; goto out; } @@ -2062,138 +2066,114 @@ int vfs_cancel_lock(struct file *filp, s EXPORT_SYMBOL_GPL(vfs_cancel_lock); -static void lock_get_status(char* out, struct file_lock *fl, int id, char *pfx) +#ifdef CONFIG_PROC_FS +#include + +static void lock_get_status(struct seq_file *f, struct file_lock *fl, + int id, char *pfx) { struct inode *inode = NULL; if (fl->fl_file != NULL) inode = fl->fl_file->f_path.dentry->d_inode; - out += sprintf(out, "%d:%s ", id, pfx); + seq_printf(f, "%d:%s ", id, pfx); if (IS_POSIX(fl)) { - out += sprintf(out, "%6s %s ", + seq_printf(f, "%6s %s ", (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ", (inode == NULL) ? "*NOINODE*" : - (IS_MANDLOCK(inode) && - (inode->i_mode & (S_IXGRP | S_ISGID)) == S_ISGID) ? - "MANDATORY" : "ADVISORY "); + mandatory_lock(inode) ? "MANDATORY" : "ADVISORY "); } else if (IS_FLOCK(fl)) { if (fl->fl_type & LOCK_MAND) { - out += sprintf(out, "FLOCK MSNFS "); + seq_printf(f, "FLOCK MSNFS "); } else { - out += sprintf(out, "FLOCK ADVISORY "); + seq_printf(f, "FLOCK ADVISORY "); } } else if (IS_LEASE(fl)) { - out += sprintf(out, "LEASE "); + seq_printf(f, "LEASE "); if (fl->fl_type & F_INPROGRESS) - out += sprintf(out, "BREAKING "); + seq_printf(f, "BREAKING "); else if (fl->fl_file) - out += sprintf(out, "ACTIVE "); + seq_printf(f, "ACTIVE "); else - out += sprintf(out, "BREAKER "); + seq_printf(f, "BREAKER "); } else { - out += sprintf(out, "UNKNOWN UNKNOWN "); + seq_printf(f, "UNKNOWN UNKNOWN "); } if (fl->fl_type & LOCK_MAND) { - out += sprintf(out, "%s ", + seq_printf(f, "%s ", (fl->fl_type & LOCK_READ) ? (fl->fl_type & LOCK_WRITE) ? "RW " : "READ " : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE "); } else { - out += sprintf(out, "%s ", + seq_printf(f, "%s ", (fl->fl_type & F_INPROGRESS) ? (fl->fl_type & F_UNLCK) ? "UNLCK" : "READ " : (fl->fl_type & F_WRLCK) ? 
"WRITE" : "READ "); } if (inode) { #ifdef WE_CAN_BREAK_LSLK_NOW - out += sprintf(out, "%d %s:%ld ", fl->fl_pid, + seq_printf(f, "%d %s:%ld ", fl->fl_pid, inode->i_sb->s_id, inode->i_ino); #else /* userspace relies on this representation of dev_t ;-( */ - out += sprintf(out, "%d %02x:%02x:%ld ", fl->fl_pid, + seq_printf(f, "%d %02x:%02x:%ld ", fl->fl_pid, MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino); #endif } else { - out += sprintf(out, "%d :0 ", fl->fl_pid); + seq_printf(f, "%d :0 ", fl->fl_pid); } if (IS_POSIX(fl)) { if (fl->fl_end == OFFSET_MAX) - out += sprintf(out, "%Ld EOF\n", fl->fl_start); + seq_printf(f, "%Ld EOF\n", fl->fl_start); else - out += sprintf(out, "%Ld %Ld\n", fl->fl_start, - fl->fl_end); + seq_printf(f, "%Ld %Ld\n", fl->fl_start, fl->fl_end); } else { - out += sprintf(out, "0 EOF\n"); + seq_printf(f, "0 EOF\n"); } } -static void move_lock_status(char **p, off_t* pos, off_t offset) +static int locks_show(struct seq_file *f, void *v) { - int len; - len = strlen(*p); - if(*pos >= offset) { - /* the complete line is valid */ - *p += len; - *pos += len; - return; - } - if(*pos+len > offset) { - /* use the second part of the line */ - int i = offset-*pos; - memmove(*p,*p+i,len-i); - *p += len-i; - *pos += len; - return; - } - /* discard the complete line */ - *pos += len; -} + struct file_lock *fl, *bfl; -/** - * get_locks_status - reports lock usage in /proc/locks - * @buffer: address in userspace to write into - * @start: ? - * @offset: how far we are through the buffer - * @length: how much to read - */ + fl = list_entry(v, struct file_lock, fl_link); + + lock_get_status(f, fl, (long)f->private, ""); + + list_for_each_entry(bfl, &fl->fl_block, fl_block) + lock_get_status(f, bfl, (long)f->private, " ->"); -int get_locks_status(char *buffer, char **start, off_t offset, int length) -{ - struct list_head *tmp; - char *q = buffer; - off_t pos = 0; - int i = 0; + f->private++; + return 0; +} +static void *locks_start(struct seq_file *f, loff_t *pos) +{ lock_kernel(); - list_for_each(tmp, &file_lock_list) { - struct list_head *btmp; - struct file_lock *fl = list_entry(tmp, struct file_lock, fl_link); - lock_get_status(q, fl, ++i, ""); - move_lock_status(&q, &pos, offset); - - if(pos >= offset+length) - goto done; - - list_for_each(btmp, &fl->fl_block) { - struct file_lock *bfl = list_entry(btmp, - struct file_lock, fl_block); - lock_get_status(q, bfl, i, " ->"); - move_lock_status(&q, &pos, offset); + f->private = (void *)1; + return seq_list_start(&file_lock_list, *pos); +} - if(pos >= offset+length) - goto done; - } - } -done: +static void *locks_next(struct seq_file *f, void *v, loff_t *pos) +{ + return seq_list_next(v, &file_lock_list, pos); +} + +static void locks_stop(struct seq_file *f, void *v) +{ unlock_kernel(); - *start = buffer; - if(q-buffer < length) - return (q-buffer); - return length; } +struct seq_operations locks_seq_operations = { + .start = locks_start, + .next = locks_next, + .stop = locks_stop, + .show = locks_show, +}; +#endif + /** * lock_may_read - checks that the region is free of locks * @inode: the inode that is being read diff -puN fs/nfs/callback.c~git-nfsd fs/nfs/callback.c --- a/fs/nfs/callback.c~git-nfsd +++ a/fs/nfs/callback.c @@ -123,8 +123,8 @@ int nfs_callback_up(void) if (!serv) goto out_err; - ret = svc_makesock(serv, IPPROTO_TCP, nfs_callback_set_tcpport, - SVC_SOCK_ANONYMOUS); + ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport, + SVC_SOCK_ANONYMOUS); if (ret <= 0) goto out_destroy; nfs_callback_tcpport = 
ret; diff -puN fs/nfs/file.c~git-nfsd fs/nfs/file.c --- a/fs/nfs/file.c~git-nfsd +++ a/fs/nfs/file.c @@ -577,8 +577,7 @@ static int nfs_lock(struct file *filp, i nfs_inc_stats(inode, NFSIOS_VFSLOCK); /* No mandatory locks over NFS */ - if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID && - fl->fl_type != F_UNLCK) + if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) return -ENOLCK; if (IS_GETLK(cmd)) diff -puN fs/nfsd/nfs3xdr.c~git-nfsd fs/nfsd/nfs3xdr.c --- a/fs/nfsd/nfs3xdr.c~git-nfsd +++ a/fs/nfsd/nfs3xdr.c @@ -174,9 +174,6 @@ static __be32 * encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat) { - struct dentry *dentry = fhp->fh_dentry; - struct timespec time; - *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]); *p++ = htonl((u32) stat->mode); *p++ = htonl((u32) stat->nlink); @@ -191,10 +188,9 @@ encode_fattr3(struct svc_rqst *rqstp, __ *p++ = htonl((u32) MAJOR(stat->rdev)); *p++ = htonl((u32) MINOR(stat->rdev)); p = encode_fsid(p, fhp); - p = xdr_encode_hyper(p, (u64) stat->ino); + p = xdr_encode_hyper(p, stat->ino); p = encode_time3(p, &stat->atime); - lease_get_mtime(dentry->d_inode, &time); - p = encode_time3(p, &time); + p = encode_time3(p, &stat->mtime); p = encode_time3(p, &stat->ctime); return p; @@ -203,31 +199,9 @@ encode_fattr3(struct svc_rqst *rqstp, __ static __be32 * encode_saved_post_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) { - struct inode *inode = fhp->fh_dentry->d_inode; - /* Attributes to follow */ *p++ = xdr_one; - - *p++ = htonl(nfs3_ftypes[(fhp->fh_post_mode & S_IFMT) >> 12]); - *p++ = htonl((u32) fhp->fh_post_mode); - *p++ = htonl((u32) fhp->fh_post_nlink); - *p++ = htonl((u32) nfsd_ruid(rqstp, fhp->fh_post_uid)); - *p++ = htonl((u32) nfsd_rgid(rqstp, fhp->fh_post_gid)); - if (S_ISLNK(fhp->fh_post_mode) && fhp->fh_post_size > NFS3_MAXPATHLEN) { - p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN); - } else { - p = xdr_encode_hyper(p, (u64) fhp->fh_post_size); - } - p = xdr_encode_hyper(p, ((u64)fhp->fh_post_blocks) << 9); - *p++ = fhp->fh_post_rdev[0]; - *p++ = fhp->fh_post_rdev[1]; - p = encode_fsid(p, fhp); - p = xdr_encode_hyper(p, (u64) inode->i_ino); - p = encode_time3(p, &fhp->fh_post_atime); - p = encode_time3(p, &fhp->fh_post_mtime); - p = encode_time3(p, &fhp->fh_post_ctime); - - return p; + return encode_fattr3(rqstp, p, fhp, &fhp->fh_post_attr); } /* @@ -246,6 +220,7 @@ encode_post_op_attr(struct svc_rqst *rqs err = vfs_getattr(fhp->fh_export->ex_mnt, dentry, &stat); if (!err) { *p++ = xdr_one; /* attributes follow */ + lease_get_mtime(dentry->d_inode, &stat.mtime); return encode_fattr3(rqstp, p, fhp, &stat); } } @@ -284,6 +259,23 @@ encode_wcc_data(struct svc_rqst *rqstp, return encode_post_op_attr(rqstp, p, fhp); } +/* + * Fill in the post_op attr for the wcc data + */ +void fill_post_wcc(struct svc_fh *fhp) +{ + int err; + + if (fhp->fh_post_saved) + printk("nfsd: inode locked twice during operation.\n"); + + err = vfs_getattr(fhp->fh_export->ex_mnt, fhp->fh_dentry, + &fhp->fh_post_attr); + if (err) + fhp->fh_post_saved = 0; + else + fhp->fh_post_saved = 1; +} /* * XDR decode functions @@ -643,8 +635,11 @@ int nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_attrstat *resp) { - if (resp->status == 0) + if (resp->status == 0) { + lease_get_mtime(resp->fh.fh_dentry->d_inode, + &resp->stat.mtime); p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat); + } return xdr_ressize_check(rqstp, p); } @@ -802,7 +797,7 @@ nfs3svc_encode_readdirres(struct svc_rqs static __be32 * 
encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, - int namlen, ino_t ino) + int namlen, u64 ino) { *p++ = xdr_one; /* mark entry present */ p = xdr_encode_hyper(p, ino); /* file id */ @@ -873,7 +868,7 @@ compose_entry_fh(struct nfsd3_readdirres #define NFS3_ENTRYPLUS_BAGGAGE (1 + 21 + 1 + (NFS3_FHSIZE >> 2)) static int encode_entry(struct readdir_cd *ccd, const char *name, int namlen, - loff_t offset, ino_t ino, unsigned int d_type, int plus) + loff_t offset, u64 ino, unsigned int d_type, int plus) { struct nfsd3_readdirres *cd = container_of(ccd, struct nfsd3_readdirres, common); diff -puN fs/nfsd/nfs4callback.c~git-nfsd fs/nfsd/nfs4callback.c --- a/fs/nfsd/nfs4callback.c~git-nfsd +++ a/fs/nfsd/nfs4callback.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -343,26 +344,28 @@ static struct rpc_version * nfs_cb_versi &nfs_cb_version4, }; -/* - * Use the SETCLIENTID credential - */ -static struct rpc_cred * -nfsd4_lookupcred(struct nfs4_client *clp, int taskflags) +/* Reference counting, callback cleanup, etc., all look racy as heck. + * And why is cb_set an atomic? */ + +static int do_probe_callback(void *data) { - struct auth_cred acred; - struct rpc_clnt *clnt = clp->cl_callback.cb_client; - struct rpc_cred *ret; + struct nfs4_client *clp = data; + struct nfs4_callback *cb = &clp->cl_callback; + struct rpc_message msg = { + .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], + .rpc_argp = clp, + }; + int status; - get_group_info(clp->cl_cred.cr_group_info); - acred.uid = clp->cl_cred.cr_uid; - acred.gid = clp->cl_cred.cr_gid; - acred.group_info = clp->cl_cred.cr_group_info; - - dprintk("NFSD: looking up %s cred\n", - clnt->cl_auth->au_ops->au_name); - ret = rpcauth_lookup_credcache(clnt->cl_auth, &acred, taskflags); - put_group_info(clp->cl_cred.cr_group_info); - return ret; + status = rpc_call_sync(cb->cb_client, &msg, RPC_TASK_SOFT); + + if (status) { + rpc_shutdown_client(cb->cb_client); + cb->cb_client = NULL; + } else + atomic_set(&cb->cb_set, 1); + put_nfs4_client(clp); + return 0; } /* @@ -390,11 +393,7 @@ nfsd4_probe_callback(struct nfs4_client .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... 
*/ .flags = (RPC_CLNT_CREATE_NOPING), }; - struct rpc_message msg = { - .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], - .rpc_argp = clp, - }; - int status; + struct task_struct *t; if (atomic_read(&cb->cb_set)) return; @@ -426,16 +425,11 @@ nfsd4_probe_callback(struct nfs4_client /* the task holds a reference to the nfs4_client struct */ atomic_inc(&clp->cl_count); - msg.rpc_cred = nfsd4_lookupcred(clp,0); - if (IS_ERR(msg.rpc_cred)) - goto out_release_clp; - status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_ASYNC, &nfs4_cb_null_ops, NULL); - put_rpccred(msg.rpc_cred); + t = kthread_run(do_probe_callback, clp, "nfs4_cb_probe"); - if (status != 0) { - dprintk("NFSD: asynchronous NFSPROC4_CB_NULL failed!\n"); + if (IS_ERR(t)) goto out_release_clp; - } + return; out_release_clp: @@ -447,30 +441,6 @@ out_err: (int)clp->cl_name.len, clp->cl_name.data); } -static void -nfs4_cb_null(struct rpc_task *task, void *dummy) -{ - struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp; - struct nfs4_callback *cb = &clp->cl_callback; - __be32 addr = htonl(cb->cb_addr); - - dprintk("NFSD: nfs4_cb_null task->tk_status %d\n", task->tk_status); - - if (task->tk_status < 0) { - dprintk("NFSD: callback establishment to client %.*s failed\n", - (int)clp->cl_name.len, clp->cl_name.data); - goto out; - } - atomic_set(&cb->cb_set, 1); - dprintk("NFSD: callback set to client %u.%u.%u.%u\n", NIPQUAD(addr)); -out: - put_nfs4_client(clp); -} - -static const struct rpc_call_ops nfs4_cb_null_ops = { - .rpc_call_done = nfs4_cb_null, -}; - /* * called with dp->dl_count inc'ed. * nfs4_lock_state() may or may not have been called. @@ -491,10 +461,6 @@ nfsd4_cb_recall(struct nfs4_delegation * if ((!atomic_read(&clp->cl_callback.cb_set)) || !clnt) return; - msg.rpc_cred = nfsd4_lookupcred(clp, 0); - if (IS_ERR(msg.rpc_cred)) - goto out; - cbr->cbr_trunc = 0; /* XXX need to implement truncate optimization */ cbr->cbr_dp = dp; @@ -515,13 +481,12 @@ nfsd4_cb_recall(struct nfs4_delegation * status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT); } out_put_cred: - put_rpccred(msg.rpc_cred); -out: if (status == -EIO) atomic_set(&clp->cl_callback.cb_set, 0); /* Success or failure, now we're either waiting for lease expiration * or deleg_return. 
*/ dprintk("NFSD: nfs4_cb_recall: dp %p dl_flock %p dl_count %d\n",dp, dp->dl_flock, atomic_read(&dp->dl_count)); + put_nfs4_client(clp); nfs4_put_delegation(dp); return; } diff -puN fs/nfsd/nfs4idmap.c~git-nfsd fs/nfsd/nfs4idmap.c --- a/fs/nfsd/nfs4idmap.c~git-nfsd +++ a/fs/nfsd/nfs4idmap.c @@ -207,6 +207,7 @@ idtoname_parse(struct cache_detail *cd, { struct ent ent, *res; char *buf1, *bp; + int len; int error = -EINVAL; if (buf[buflen - 1] != '\n') @@ -248,10 +249,11 @@ idtoname_parse(struct cache_detail *cd, goto out; /* Name */ - error = qword_get(&buf, buf1, PAGE_SIZE); - if (error == -EINVAL) + error = -EINVAL; + len = qword_get(&buf, buf1, PAGE_SIZE); + if (len < 0) goto out; - if (error == -ENOENT) + if (len == 0) set_bit(CACHE_NEGATIVE, &ent.h.flags); else { if (error >= IDMAP_NAMESZ) { diff -puN fs/nfsd/nfs4proc.c~git-nfsd fs/nfsd/nfs4proc.c --- a/fs/nfsd/nfs4proc.c~git-nfsd +++ a/fs/nfsd/nfs4proc.c @@ -238,12 +238,12 @@ nfsd4_open(struct svc_rqst *rqstp, struc break; case NFS4_OPEN_CLAIM_DELEGATE_PREV: open->op_stateowner->so_confirmed = 1; - printk("NFSD: unsupported OPEN claim type %d\n", + dprintk("NFSD: unsupported OPEN claim type %d\n", open->op_claim_type); status = nfserr_notsupp; goto out; default: - printk("NFSD: Invalid OPEN claim type %d\n", + dprintk("NFSD: Invalid OPEN claim type %d\n", open->op_claim_type); status = nfserr_inval; goto out; diff -puN fs/nfsd/nfs4state.c~git-nfsd fs/nfsd/nfs4state.c --- a/fs/nfsd/nfs4state.c~git-nfsd +++ a/fs/nfsd/nfs4state.c @@ -358,9 +358,22 @@ alloc_client(struct xdr_netobj name) return clp; } +static void +shutdown_callback_client(struct nfs4_client *clp) +{ + struct rpc_clnt *clnt = clp->cl_callback.cb_client; + + /* shutdown rpc client, ending any outstanding recall rpcs */ + if (clnt) { + clp->cl_callback.cb_client = NULL; + rpc_shutdown_client(clnt); + } +} + static inline void free_client(struct nfs4_client *clp) { + shutdown_callback_client(clp); if (clp->cl_cred.cr_group_info) put_group_info(clp->cl_cred.cr_group_info); kfree(clp->cl_name.data); @@ -375,18 +388,6 @@ put_nfs4_client(struct nfs4_client *clp) } static void -shutdown_callback_client(struct nfs4_client *clp) -{ - struct rpc_clnt *clnt = clp->cl_callback.cb_client; - - /* shutdown rpc client, ending any outstanding recall rpcs */ - if (clnt) { - clp->cl_callback.cb_client = NULL; - rpc_shutdown_client(clnt); - } -} - -static void expire_client(struct nfs4_client *clp) { struct nfs4_stateowner *sop; @@ -396,8 +397,6 @@ expire_client(struct nfs4_client *clp) dprintk("NFSD: expire_client cl_count %d\n", atomic_read(&clp->cl_count)); - shutdown_callback_client(clp); - INIT_LIST_HEAD(&reaplist); spin_lock(&recall_lock); while (!list_empty(&clp->cl_delegations)) { @@ -462,26 +461,28 @@ copy_cred(struct svc_cred *target, struc } static inline int -same_name(const char *n1, const char *n2) { +same_name(const char *n1, const char *n2) +{ return 0 == memcmp(n1, n2, HEXDIR_LEN); } static int -cmp_verf(nfs4_verifier *v1, nfs4_verifier *v2) { - return(!memcmp(v1->data,v2->data,sizeof(v1->data))); +same_verf(nfs4_verifier *v1, nfs4_verifier *v2) +{ + return 0 == memcmp(v1->data, v2->data, sizeof(v1->data)); } static int -cmp_clid(clientid_t * cl1, clientid_t * cl2) { - return((cl1->cl_boot == cl2->cl_boot) && - (cl1->cl_id == cl2->cl_id)); +same_clid(clientid_t *cl1, clientid_t *cl2) +{ + return (cl1->cl_boot == cl2->cl_boot) && (cl1->cl_id == cl2->cl_id); } /* XXX what about NGROUP */ static int -cmp_creds(struct svc_cred *cr1, struct svc_cred *cr2){ - return(cr1->cr_uid == 
cr2->cr_uid); - +same_creds(struct svc_cred *cr1, struct svc_cred *cr2) +{ + return cr1->cr_uid == cr2->cr_uid; } static void @@ -507,7 +508,7 @@ check_name(struct xdr_netobj name) { if (name.len == 0) return 0; if (name.len > NFS4_OPAQUE_LIMIT) { - printk("NFSD: check_name: name too long(%d)!\n", name.len); + dprintk("NFSD: check_name: name too long(%d)!\n", name.len); return 0; } return 1; @@ -546,7 +547,7 @@ find_confirmed_client(clientid_t *clid) unsigned int idhashval = clientid_hashval(clid->cl_id); list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) { - if (cmp_clid(&clp->cl_clientid, clid)) + if (same_clid(&clp->cl_clientid, clid)) return clp; } return NULL; @@ -559,7 +560,7 @@ find_unconfirmed_client(clientid_t *clid unsigned int idhashval = clientid_hashval(clid->cl_id); list_for_each_entry(clp, &unconf_id_hashtbl[idhashval], cl_idhash) { - if (cmp_clid(&clp->cl_clientid, clid)) + if (same_clid(&clp->cl_clientid, clid)) return clp; } return NULL; @@ -753,7 +754,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp * or different ip_address */ status = nfserr_clid_inuse; - if (!cmp_creds(&conf->cl_cred, &rqstp->rq_cred) + if (!same_creds(&conf->cl_cred, &rqstp->rq_cred) || conf->cl_addr != sin->sin_addr.s_addr) { dprintk("NFSD: setclientid: string in use by client" "at %u.%u.%u.%u\n", NIPQUAD(conf->cl_addr)); @@ -772,14 +773,8 @@ nfsd4_setclientid(struct svc_rqst *rqstp new = create_client(clname, dname); if (new == NULL) goto out; - copy_verf(new, &clverifier); - new->cl_addr = sin->sin_addr.s_addr; - copy_cred(&new->cl_cred,&rqstp->rq_cred); gen_clid(new); - gen_confirm(new); - gen_callback(new, setclid); - add_to_unconfirmed(new, strhashval); - } else if (cmp_verf(&conf->cl_verifier, &clverifier)) { + } else if (same_verf(&conf->cl_verifier, &clverifier)) { /* * CASE 1: * cl_name match, confirmed, principal match @@ -804,13 +799,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp new = create_client(clname, dname); if (new == NULL) goto out; - copy_verf(new,&conf->cl_verifier); - new->cl_addr = sin->sin_addr.s_addr; - copy_cred(&new->cl_cred,&rqstp->rq_cred); copy_clid(new, conf); - gen_confirm(new); - gen_callback(new, setclid); - add_to_unconfirmed(new,strhashval); } else if (!unconf) { /* * CASE 2: @@ -823,14 +812,8 @@ nfsd4_setclientid(struct svc_rqst *rqstp new = create_client(clname, dname); if (new == NULL) goto out; - copy_verf(new,&clverifier); - new->cl_addr = sin->sin_addr.s_addr; - copy_cred(&new->cl_cred,&rqstp->rq_cred); gen_clid(new); - gen_confirm(new); - gen_callback(new, setclid); - add_to_unconfirmed(new, strhashval); - } else if (!cmp_verf(&conf->cl_confirm, &unconf->cl_confirm)) { + } else if (!same_verf(&conf->cl_confirm, &unconf->cl_confirm)) { /* * CASE3: * confirmed found (name, principal match) @@ -850,19 +833,19 @@ nfsd4_setclientid(struct svc_rqst *rqstp new = create_client(clname, dname); if (new == NULL) goto out; - copy_verf(new,&clverifier); - new->cl_addr = sin->sin_addr.s_addr; - copy_cred(&new->cl_cred,&rqstp->rq_cred); gen_clid(new); - gen_confirm(new); - gen_callback(new, setclid); - add_to_unconfirmed(new, strhashval); } else { /* No cases hit !!! 
*/ status = nfserr_inval; goto out; } + copy_verf(new, &clverifier); + new->cl_addr = sin->sin_addr.s_addr; + copy_cred(&new->cl_cred, &rqstp->rq_cred); + gen_confirm(new); + gen_callback(new, setclid); + add_to_unconfirmed(new, strhashval); setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; setclid->se_clientid.cl_id = new->cl_clientid.cl_id; memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data)); @@ -910,16 +893,16 @@ nfsd4_setclientid_confirm(struct svc_rqs goto out; if ((conf && unconf) && - (cmp_verf(&unconf->cl_confirm, &confirm)) && - (cmp_verf(&conf->cl_verifier, &unconf->cl_verifier)) && + (same_verf(&unconf->cl_confirm, &confirm)) && + (same_verf(&conf->cl_verifier, &unconf->cl_verifier)) && (same_name(conf->cl_recdir,unconf->cl_recdir)) && - (!cmp_verf(&conf->cl_confirm, &unconf->cl_confirm))) { + (!same_verf(&conf->cl_confirm, &unconf->cl_confirm))) { /* CASE 1: * unconf record that matches input clientid and input confirm. * conf record that matches input clientid. * conf and unconf records match names, verifiers */ - if (!cmp_creds(&conf->cl_cred, &unconf->cl_cred)) + if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) status = nfserr_clid_inuse; else { /* XXX: We just turn off callbacks until we can handle @@ -933,7 +916,7 @@ nfsd4_setclientid_confirm(struct svc_rqs } } else if ((conf && !unconf) || ((conf && unconf) && - (!cmp_verf(&conf->cl_verifier, &unconf->cl_verifier) || + (!same_verf(&conf->cl_verifier, &unconf->cl_verifier) || !same_name(conf->cl_recdir, unconf->cl_recdir)))) { /* CASE 2: * conf record that matches input clientid. @@ -941,18 +924,18 @@ nfsd4_setclientid_confirm(struct svc_rqs * unconf->cl_name or unconf->cl_verifier don't match the * conf record. */ - if (!cmp_creds(&conf->cl_cred,&rqstp->rq_cred)) + if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) status = nfserr_clid_inuse; else status = nfs_ok; } else if (!conf && unconf - && cmp_verf(&unconf->cl_confirm, &confirm)) { + && same_verf(&unconf->cl_confirm, &confirm)) { /* CASE 3: * conf record not found. * unconf record found. 
* unconf->cl_confirm matches input confirm */ - if (!cmp_creds(&unconf->cl_cred, &rqstp->rq_cred)) { + if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred)) { status = nfserr_clid_inuse; } else { unsigned int hash = @@ -967,8 +950,8 @@ nfsd4_setclientid_confirm(struct svc_rqs conf = unconf; status = nfs_ok; } - } else if ((!conf || (conf && !cmp_verf(&conf->cl_confirm, &confirm))) - && (!unconf || (unconf && !cmp_verf(&unconf->cl_confirm, + } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm))) + && (!unconf || (unconf && !same_verf(&unconf->cl_confirm, &confirm)))) { /* CASE 4: * conf record not found, or if conf, conf->cl_confirm does not @@ -1019,7 +1002,7 @@ nfsd4_free_slab(struct kmem_cache **slab *slab = NULL; } -static void +void nfsd4_free_slabs(void) { nfsd4_free_slab(&stateowner_slab); @@ -1207,10 +1190,12 @@ move_to_close_lru(struct nfs4_stateowner } static int -cmp_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner, clientid_t *clid) { - return ((sop->so_owner.len == owner->len) && - !memcmp(sop->so_owner.data, owner->data, owner->len) && - (sop->so_client->cl_clientid.cl_id == clid->cl_id)); +same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner, + clientid_t *clid) +{ + return (sop->so_owner.len == owner->len) && + 0 == memcmp(sop->so_owner.data, owner->data, owner->len) && + (sop->so_client->cl_clientid.cl_id == clid->cl_id); } static struct nfs4_stateowner * @@ -1219,7 +1204,7 @@ find_openstateowner_str(unsigned int has struct nfs4_stateowner *so = NULL; list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { - if (cmp_owner_str(so, &open->op_owner, &open->op_clientid)) + if (same_owner_str(so, &open->op_owner, &open->op_clientid)) return so; } return NULL; @@ -1360,6 +1345,7 @@ void nfsd_break_deleg_cb(struct file_loc * lock) we know the server hasn't removed the lease yet, we know * it's safe to take a reference: */ atomic_inc(&dp->dl_count); + atomic_inc(&dp->dl_client->cl_count); spin_lock(&recall_lock); list_add_tail(&dp->dl_recall_lru, &del_recall_lru); @@ -1368,8 +1354,12 @@ void nfsd_break_deleg_cb(struct file_loc /* only place dl_time is set. protected by lock_kernel*/ dp->dl_time = get_seconds(); - /* XXX need to merge NFSD_LEASE_TIME with fs/locks.c:lease_break_time */ - fl->fl_break_time = jiffies + NFSD_LEASE_TIME * HZ; + /* + * We don't want the locks code to timeout the lease for us; + * we'll remove it ourself if the delegation isn't returned + * in time. 
+ */ + fl->fl_break_time = 0; t = kthread_run(do_recall, dp, "%s", "nfs4_cb_recall"); if (IS_ERR(t)) { @@ -1378,6 +1368,7 @@ void nfsd_break_deleg_cb(struct file_loc printk(KERN_INFO "NFSD: Callback thread failed for " "for client (clientid %08x/%08x)\n", clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); + put_nfs4_client(dp->dl_client); nfs4_put_delegation(dp); } } @@ -1738,7 +1729,7 @@ out: if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && flag == NFS4_OPEN_DELEGATE_NONE && open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) - printk("NFSD: WARNING: refusing delegation reclaim\n"); + dprintk("NFSD: WARNING: refusing delegation reclaim\n"); open->op_delegate_type = flag; } @@ -2044,7 +2035,7 @@ static inline int io_during_grace_disallowed(struct inode *inode, int flags) { return nfs4_in_grace() && (flags & (RD_STATE | WR_STATE)) - && MANDATORY_LOCK(inode); + && mandatory_lock(inode); } /* @@ -2147,7 +2138,7 @@ nfs4_preprocess_seqid_op(struct svc_fh * *sopp = NULL; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { - printk("NFSD: preprocess_seqid_op: magic stateid!\n"); + dprintk("NFSD: preprocess_seqid_op: magic stateid!\n"); return nfserr_bad_stateid; } @@ -2181,25 +2172,24 @@ nfs4_preprocess_seqid_op(struct svc_fh * lkflg = setlkflg(lock->lk_type); if (lock->lk_is_new) { - if (!sop->so_is_open_owner) - return nfserr_bad_stateid; - if (!cmp_clid(&clp->cl_clientid, lockclid)) + if (!sop->so_is_open_owner) + return nfserr_bad_stateid; + if (!same_clid(&clp->cl_clientid, lockclid)) return nfserr_bad_stateid; - /* stp is the open stateid */ - status = nfs4_check_openmode(stp, lkflg); - if (status) - return status; - } else { - /* stp is the lock stateid */ - status = nfs4_check_openmode(stp->st_openstp, lkflg); - if (status) - return status; + /* stp is the open stateid */ + status = nfs4_check_openmode(stp, lkflg); + if (status) + return status; + } else { + /* stp is the lock stateid */ + status = nfs4_check_openmode(stp->st_openstp, lkflg); + if (status) + return status; } - } if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp)) { - printk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n"); + dprintk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n"); return nfserr_bad_stateid; } @@ -2215,22 +2205,22 @@ nfs4_preprocess_seqid_op(struct svc_fh * goto check_replay; if (sop->so_confirmed && flags & CONFIRM) { - printk("NFSD: preprocess_seqid_op: expected" + dprintk("NFSD: preprocess_seqid_op: expected" " unconfirmed stateowner!\n"); return nfserr_bad_stateid; } if (!sop->so_confirmed && !(flags & CONFIRM)) { - printk("NFSD: preprocess_seqid_op: stateowner not" + dprintk("NFSD: preprocess_seqid_op: stateowner not" " confirmed yet!\n"); return nfserr_bad_stateid; } if (stateid->si_generation > stp->st_stateid.si_generation) { - printk("NFSD: preprocess_seqid_op: future stateid?!\n"); + dprintk("NFSD: preprocess_seqid_op: future stateid?!\n"); return nfserr_bad_stateid; } if (stateid->si_generation < stp->st_stateid.si_generation) { - printk("NFSD: preprocess_seqid_op: old stateid!\n"); + dprintk("NFSD: preprocess_seqid_op: old stateid!\n"); return nfserr_old_stateid; } renew_client(sop->so_client); @@ -2242,7 +2232,7 @@ check_replay: /* indicate replay to calling function */ return nfserr_replay_me; } - printk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n", + dprintk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n", sop->so_seqid, seqid); *sopp = NULL; return nfserr_bad_seqid; @@ -2561,7 +2551,7 @@ find_lockstateowner_str(struct inode *in struct 
nfs4_stateowner *op; list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) { - if (cmp_owner_str(op, owner, clid)) + if (same_owner_str(op, owner, clid)) return op; } return NULL; @@ -2855,7 +2845,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, stru file_lock.fl_type = F_WRLCK; break; default: - printk("NFSD: nfs4_lockt: bad lock type!\n"); + dprintk("NFSD: nfs4_lockt: bad lock type!\n"); status = nfserr_inval; goto out; } @@ -3025,7 +3015,7 @@ nfsd4_release_lockowner(struct svc_rqst INIT_LIST_HEAD(&matches); for (i = 0; i < LOCK_HASH_SIZE; i++) { list_for_each_entry(sop, &lock_ownerid_hashtbl[i], so_idhash) { - if (!cmp_owner_str(sop, owner, clid)) + if (!same_owner_str(sop, owner, clid)) continue; list_for_each_entry(stp, &sop->so_stateids, st_perstateowner) { @@ -3149,11 +3139,14 @@ nfs4_check_open_reclaim(clientid_t *clid /* initialization to perform at module load time: */ -void +int nfs4_state_init(void) { - int i; + int i, status; + status = nfsd4_init_slabs(); + if (status) + return status; for (i = 0; i < CLIENT_HASH_SIZE; i++) { INIT_LIST_HEAD(&conf_id_hashtbl[i]); INIT_LIST_HEAD(&conf_str_hashtbl[i]); @@ -3182,6 +3175,7 @@ nfs4_state_init(void) for (i = 0; i < CLIENT_HASH_SIZE; i++) INIT_LIST_HEAD(&reclaim_str_hashtbl[i]); reclaim_str_hashtbl_size = 0; + return 0; } static void @@ -3242,20 +3236,15 @@ __nfs4_state_start(void) set_max_delegations(); } -int +void nfs4_state_start(void) { - int status; - if (nfs4_init) - return 0; - status = nfsd4_init_slabs(); - if (status) - return status; + return; nfsd4_load_reboot_recovery_data(); __nfs4_state_start(); nfs4_init = 1; - return 0; + return; } int @@ -3313,7 +3302,6 @@ nfs4_state_shutdown(void) nfs4_lock_state(); nfs4_release_reclaim(); __nfs4_state_shutdown(); - nfsd4_free_slabs(); nfs4_unlock_state(); } diff -puN fs/nfsd/nfs4xdr.c~git-nfsd fs/nfsd/nfs4xdr.c --- a/fs/nfsd/nfs4xdr.c~git-nfsd +++ a/fs/nfsd/nfs4xdr.c @@ -1479,7 +1479,8 @@ nfsd4_encode_fattr(struct svc_fh *fhp, s err = vfs_getattr(exp->ex_mnt, dentry, &stat); if (err) goto out_nfserr; - if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL)) || + if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | + FATTR4_WORD0_MAXNAME)) || (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL))) { err = vfs_statfs(dentry, &statfs); @@ -1683,7 +1684,7 @@ out_acl: if (bmval0 & FATTR4_WORD0_FILEID) { if ((buflen -= 8) < 0) goto out_resource; - WRITE64((u64) stat.ino); + WRITE64(stat.ino); } if (bmval0 & FATTR4_WORD0_FILES_AVAIL) { if ((buflen -= 8) < 0) @@ -1725,7 +1726,7 @@ out_acl: if (bmval0 & FATTR4_WORD0_MAXNAME) { if ((buflen -= 4) < 0) goto out_resource; - WRITE32(~(u32) 0); + WRITE32(statfs.f_namelen); } if (bmval0 & FATTR4_WORD0_MAXREAD) { if ((buflen -= 8) < 0) @@ -1825,16 +1826,15 @@ out_acl: WRITE32(stat.mtime.tv_nsec); } if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { - struct dentry *mnt_pnt, *mnt_root; - if ((buflen -= 8) < 0) goto out_resource; - mnt_root = exp->ex_mnt->mnt_root; - if (mnt_root->d_inode == dentry->d_inode) { - mnt_pnt = exp->ex_mnt->mnt_mountpoint; - WRITE64((u64) mnt_pnt->d_inode->i_ino); - } else - WRITE64((u64) stat.ino); + if (exp->ex_mnt->mnt_root->d_inode == dentry->d_inode) { + err = vfs_getattr(exp->ex_mnt->mnt_parent, + exp->ex_mnt->mnt_mountpoint, &stat); + if (err) + goto out_nfserr; + } + WRITE64(stat.ino); } *attrlenp = htonl((char *)p - (char *)attrlenp - 4); *countp = p - buffer; diff -puN fs/nfsd/nfsctl.c~git-nfsd fs/nfsd/nfsctl.c --- a/fs/nfsd/nfsctl.c~git-nfsd +++ 
a/fs/nfsd/nfsctl.c @@ -298,7 +298,7 @@ static ssize_t write_filehandle(struct f * qword quoting is used, so filehandle will be \x.... */ char *dname, *path; - int maxsize; + int uninitialized_var(maxsize); char *mesg = buf; int len; struct auth_domain *dom; @@ -554,6 +554,22 @@ static ssize_t write_ports(struct file * kfree(toclose); return len; } + /* + * Add a transport listener by writing it's transport name + */ + if (isalnum(buf[0])) { + int err; + char transport[16]; + int port; + if (sscanf(buf, "%15s %4d", transport, &port) == 2) { + err = nfsd_create_serv(); + if (!err) + err = svc_create_xprt(nfsd_serv, + transport, port, + SVC_SOCK_ANONYMOUS); + return err < 0 ? err : 0; + } + } return -EINVAL; } @@ -679,11 +695,13 @@ static int __init init_nfsd(void) int retval; printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n"); + retval = nfs4_state_init(); /* nfs4 locking state */ + if (retval) + return retval; nfsd_stat_init(); /* Statistics */ nfsd_cache_init(); /* RPC reply cache */ nfsd_export_init(); /* Exports table */ nfsd_lockd_init(); /* lockd->nfsd callbacks */ - nfs4_state_init(); /* NFSv4 locking state */ nfsd_idmap_init(); /* Name to ID mapping */ if (proc_mkdir("fs/nfs", NULL)) { struct proc_dir_entry *entry; @@ -712,6 +730,7 @@ static void __exit exit_nfsd(void) nfsd_stat_shutdown(); nfsd_lockd_shutdown(); nfsd_idmap_shutdown(); + nfsd4_free_slabs(); unregister_filesystem(&nfsd_fs_type); } diff -puN fs/nfsd/nfssvc.c~git-nfsd fs/nfsd/nfssvc.c --- a/fs/nfsd/nfssvc.c~git-nfsd +++ a/fs/nfsd/nfssvc.c @@ -155,8 +155,8 @@ static int killsig; /* signal that was u static void nfsd_last_thread(struct svc_serv *serv) { /* When last nfsd thread exits we need to do some clean-up */ - struct svc_sock *svsk; - list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) + struct svc_xprt *xprt; + list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) lockd_down(); nfsd_serv = NULL; nfsd_racache_shutdown(); @@ -236,7 +236,7 @@ static int nfsd_init_socks(int port) error = lockd_up(IPPROTO_UDP); if (error >= 0) { - error = svc_makesock(nfsd_serv, IPPROTO_UDP, port, + error = svc_create_xprt(nfsd_serv, "udp", port, SVC_SOCK_DEFAULTS); if (error < 0) lockd_down(); @@ -247,7 +247,7 @@ static int nfsd_init_socks(int port) #ifdef CONFIG_NFSD_TCP error = lockd_up(IPPROTO_TCP); if (error >= 0) { - error = svc_makesock(nfsd_serv, IPPROTO_TCP, port, + error = svc_create_xprt(nfsd_serv, "tcp", port, SVC_SOCK_DEFAULTS); if (error < 0) lockd_down(); @@ -349,9 +349,7 @@ nfsd_svc(unsigned short port, int nrserv error = nfsd_racache_init(2*nrservs); if (error<0) goto out; - error = nfs4_state_start(); - if (error<0) - goto out; + nfs4_state_start(); nfsd_reset_versions(); @@ -546,10 +544,8 @@ nfsd_dispatch(struct svc_rqst *rqstp, __ /* Now call the procedure handler, and encode NFS status. 
*/ nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); nfserr = map_new_errors(rqstp->rq_vers, nfserr); - if (nfserr == nfserr_jukebox && rqstp->rq_vers == 2) - nfserr = nfserr_dropit; if (nfserr == nfserr_dropit) { - dprintk("nfsd: Dropping request due to malloc failure!\n"); + dprintk("nfsd: Dropping request; may be revisited later\n"); nfsd_cache_update(rqstp, RC_NOCACHE, NULL); return 0; } diff -puN fs/nfsd/nfsxdr.c~git-nfsd fs/nfsd/nfsxdr.c --- a/fs/nfsd/nfsxdr.c~git-nfsd +++ a/fs/nfsd/nfsxdr.c @@ -523,6 +523,10 @@ nfssvc_encode_entry(void *ccdv, const ch cd->common.err = nfserr_toosmall; return -EINVAL; } + if (ino > ~((u32) 0)) { + cd->common.err = nfserr_fbig; + return -EINVAL; + } *p++ = xdr_one; /* mark entry present */ *p++ = htonl((u32) ino); /* file id */ p = xdr_encode_array(p, name, namlen);/* name length & name */ diff -puN fs/nfsd/vfs.c~git-nfsd fs/nfsd/vfs.c --- a/fs/nfsd/vfs.c~git-nfsd +++ a/fs/nfsd/vfs.c @@ -61,12 +61,6 @@ #define NFSDDBG_FACILITY NFSDDBG_FILEOP -/* We must ignore files (but only files) which might have mandatory - * locks on them because there is no way to know if the accesser has - * the lock. - */ -#define IS_ISMNDLK(i) (S_ISREG((i)->i_mode) && MANDATORY_LOCK(i)) - /* * This is a cache of readahead params that help us choose the proper * readahead strategy. Initially, we set all readahead parameters to 0 @@ -295,7 +289,8 @@ nfsd_setattr(struct svc_rqst *rqstp, str if (!iap->ia_valid) goto out; - /* NFSv2 does not differentiate between "set-[ac]time-to-now" + /* + * NFSv2 does not differentiate between "set-[ac]time-to-now" * which only requires access, and "set-[ac]time-to-X" which * requires ownership. * So if it looks like it might be "set both to the same time which @@ -308,25 +303,33 @@ nfsd_setattr(struct svc_rqst *rqstp, str */ #define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET) #define MAX_TOUCH_TIME_ERROR (30*60) - if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET - && iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec - ) { - /* Looks probable. Now just make sure time is in the right ballpark. - * Solaris, at least, doesn't seem to care what the time request is. - * We require it be within 30 minutes of now. - */ - time_t delta = iap->ia_atime.tv_sec - get_seconds(); - if (delta<0) delta = -delta; - if (delta < MAX_TOUCH_TIME_ERROR && - inode_change_ok(inode, iap) != 0) { - /* turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME - * this will cause notify_change to set these times to "now" + if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET && + iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec) { + /* + * Looks probable. + * + * Now just make sure time is in the right ballpark. + * Solaris, at least, doesn't seem to care what the time + * request is. We require it be within 30 minutes of now. */ - iap->ia_valid &= ~BOTH_TIME_SET; - } + time_t delta = iap->ia_atime.tv_sec - get_seconds(); + if (delta < 0) + delta = -delta; + if (delta < MAX_TOUCH_TIME_ERROR && + inode_change_ok(inode, iap) != 0) { + /* + * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME. + * This will cause notify_change to set these times + * to "now" + */ + iap->ia_valid &= ~BOTH_TIME_SET; + } } - /* The size case is special. It changes the file as well as the attributes. */ + /* + * The size case is special. + * It changes the file as well as the attributes. 
+ */ if (iap->ia_valid & ATTR_SIZE) { if (iap->ia_size < inode->i_size) { err = nfsd_permission(rqstp, fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE); @@ -680,7 +683,12 @@ nfsd_open(struct svc_rqst *rqstp, struct err = nfserr_perm; if (IS_APPEND(inode) && (access & MAY_WRITE)) goto out; - if (IS_ISMNDLK(inode)) + /* + * We must ignore files (but only files) which might have mandatory + * locks on them because there is no way to know if the accesser has + * the lock. + */ + if (S_ISREG((inode)->i_mode) && mandatory_lock(inode)) goto out; if (!inode->i_fop) diff -puN fs/proc/proc_misc.c~git-nfsd fs/proc/proc_misc.c --- a/fs/proc/proc_misc.c~git-nfsd +++ a/fs/proc/proc_misc.c @@ -66,7 +66,6 @@ extern int get_stram_list(char *); extern int get_filesystem_list(char *); extern int get_exec_domain_list(char *); extern int get_dma_list(char *); -extern int get_locks_status (char *, char **, off_t, int); static int proc_calc_metrics(char *page, char **start, off_t off, int count, int *eof, int len) @@ -617,16 +616,18 @@ static int cmdline_read_proc(char *page, return proc_calc_metrics(page, start, off, count, eof, len); } -static int locks_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int locks_open(struct inode *inode, struct file *filp) { - int len = get_locks_status(page, start, off, count); - - if (len < count) - *eof = 1; - return len; + return seq_open(filp, &locks_seq_operations); } +static const struct file_operations proc_locks_operations = { + .open = locks_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static int execdomains_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -684,7 +685,6 @@ void __init proc_misc_init(void) #endif {"filesystems", filesystems_read_proc}, {"cmdline", cmdline_read_proc}, - {"locks", locks_read_proc}, {"execdomains", execdomains_read_proc}, {NULL,} }; @@ -702,6 +702,7 @@ void __init proc_misc_init(void) entry->proc_fops = &proc_kmsg_operations; } #endif + create_seq_entry("locks", 0, &proc_locks_operations); create_seq_entry("devices", 0, &proc_devinfo_operations); create_seq_entry("cpuinfo", 0, &proc_cpuinfo_operations); #ifdef CONFIG_BLOCK diff -puN fs/read_write.c~git-nfsd fs/read_write.c --- a/fs/read_write.c~git-nfsd +++ a/fs/read_write.c @@ -205,7 +205,7 @@ int rw_verify_area(int read_write, struc if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) goto Einval; - if (unlikely(inode->i_flock && MANDATORY_LOCK(inode))) { + if (unlikely(inode->i_flock && mandatory_lock(inode))) { int retval = locks_mandatory_area( read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, inode, file, pos, count); diff -puN include/linux/fs.h~git-nfsd include/linux/fs.h --- a/include/linux/fs.h~git-nfsd +++ a/include/linux/fs.h @@ -883,6 +883,7 @@ extern int vfs_setlease(struct file *, l extern int lease_modify(struct file_lock **, int); extern int lock_may_read(struct inode *, loff_t start, unsigned long count); extern int lock_may_write(struct inode *, loff_t start, unsigned long count); +extern struct seq_operations locks_seq_operations; struct fasync_struct { int magic; @@ -1371,12 +1372,25 @@ extern int locks_mandatory_area(int, str * Candidates for mandatory locking have the setgid bit set * but no group execute bit - an otherwise meaningless combination. 
*/ -#define MANDATORY_LOCK(inode) \ - (IS_MANDLOCK(inode) && ((inode)->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) + +static inline int __mandatory_lock(struct inode *ino) +{ + return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID; +} + +/* + * ... and these candidates should be on MS_MANDLOCK mounted fs, + * otherwise these will be advisory locks + */ + +static inline int mandatory_lock(struct inode *ino) +{ + return IS_MANDLOCK(ino) && __mandatory_lock(ino); +} static inline int locks_verify_locked(struct inode *inode) { - if (MANDATORY_LOCK(inode)) + if (mandatory_lock(inode)) return locks_mandatory_locked(inode); return 0; } @@ -1387,7 +1401,7 @@ static inline int locks_verify_truncate( struct file *filp, loff_t size) { - if (inode->i_flock && MANDATORY_LOCK(inode)) + if (inode->i_flock && mandatory_lock(inode)) return locks_mandatory_area( FLOCK_VERIFY_WRITE, inode, filp, size < inode->i_size ? size : inode->i_size, diff -puN include/linux/nfsd/nfsd.h~git-nfsd include/linux/nfsd/nfsd.h --- a/include/linux/nfsd/nfsd.h~git-nfsd +++ a/include/linux/nfsd/nfsd.h @@ -153,19 +153,21 @@ extern int nfsd_max_blksize; */ #ifdef CONFIG_NFSD_V4 extern unsigned int max_delegations; -void nfs4_state_init(void); -int nfs4_state_start(void); +int nfs4_state_init(void); +void nfsd4_free_slabs(void); +void nfs4_state_start(void); void nfs4_state_shutdown(void); time_t nfs4_lease_time(void); void nfs4_reset_lease(time_t leasetime); int nfs4_reset_recoverydir(char *recdir); #else -static inline void nfs4_state_init(void){}; -static inline int nfs4_state_start(void){return 0;} -static inline void nfs4_state_shutdown(void){} -static inline time_t nfs4_lease_time(void){return 0;} -static inline void nfs4_reset_lease(time_t leasetime){} -static inline int nfs4_reset_recoverydir(char *recdir) {return 0;} +static inline int nfs4_state_init(void) { return 0; } +static inline void nfsd4_free_slabs(void) { } +static inline void nfs4_state_start(void) { } +static inline void nfs4_state_shutdown(void) { } +static inline time_t nfs4_lease_time(void) { return 0; } +static inline void nfs4_reset_lease(time_t leasetime) { } +static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } #endif /* diff -puN include/linux/nfsd/nfsfh.h~git-nfsd include/linux/nfsd/nfsfh.h --- a/include/linux/nfsd/nfsfh.h~git-nfsd +++ a/include/linux/nfsd/nfsfh.h @@ -150,17 +150,7 @@ typedef struct svc_fh { struct timespec fh_pre_ctime; /* ctime before oper */ /* Post-op attributes saved in fh_unlock */ - umode_t fh_post_mode; /* i_mode */ - nlink_t fh_post_nlink; /* i_nlink */ - uid_t fh_post_uid; /* i_uid */ - gid_t fh_post_gid; /* i_gid */ - __u64 fh_post_size; /* i_size */ - unsigned long fh_post_blocks; /* i_blocks */ - unsigned long fh_post_blksize;/* i_blksize */ - __be32 fh_post_rdev[2];/* i_rdev */ - struct timespec fh_post_atime; /* i_atime */ - struct timespec fh_post_mtime; /* i_mtime */ - struct timespec fh_post_ctime; /* i_ctime */ + struct kstat fh_post_attr; /* full attrs after operation */ #endif /* CONFIG_NFSD_V3 */ } svc_fh; @@ -297,36 +287,12 @@ fill_pre_wcc(struct svc_fh *fhp) if (!fhp->fh_pre_saved) { fhp->fh_pre_mtime = inode->i_mtime; fhp->fh_pre_ctime = inode->i_ctime; - fhp->fh_pre_size = inode->i_size; - fhp->fh_pre_saved = 1; + fhp->fh_pre_size = inode->i_size; + fhp->fh_pre_saved = 1; } } -/* - * Fill in the post_op attr for the wcc data - */ -static inline void -fill_post_wcc(struct svc_fh *fhp) -{ - struct inode *inode = fhp->fh_dentry->d_inode; - - if (fhp->fh_post_saved) - printk("nfsd: inode locked twice 
during operation.\n"); - - fhp->fh_post_mode = inode->i_mode; - fhp->fh_post_nlink = inode->i_nlink; - fhp->fh_post_uid = inode->i_uid; - fhp->fh_post_gid = inode->i_gid; - fhp->fh_post_size = inode->i_size; - fhp->fh_post_blksize = BLOCK_SIZE; - fhp->fh_post_blocks = inode->i_blocks; - fhp->fh_post_rdev[0] = htonl((u32)imajor(inode)); - fhp->fh_post_rdev[1] = htonl((u32)iminor(inode)); - fhp->fh_post_atime = inode->i_atime; - fhp->fh_post_mtime = inode->i_mtime; - fhp->fh_post_ctime = inode->i_ctime; - fhp->fh_post_saved = 1; -} +extern void fill_post_wcc(struct svc_fh *); #else #define fill_pre_wcc(ignored) #define fill_post_wcc(notused) diff -puN include/linux/nfsd/xdr4.h~git-nfsd include/linux/nfsd/xdr4.h --- a/include/linux/nfsd/xdr4.h~git-nfsd +++ a/include/linux/nfsd/xdr4.h @@ -428,8 +428,8 @@ set_change_info(struct nfsd4_change_info cinfo->atomic = 1; cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; - cinfo->after_ctime_sec = fhp->fh_post_ctime.tv_sec; - cinfo->after_ctime_nsec = fhp->fh_post_ctime.tv_nsec; + cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; + cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; } int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *); diff -puN include/linux/sunrpc/cache.h~git-nfsd include/linux/sunrpc/cache.h --- a/include/linux/sunrpc/cache.h~git-nfsd +++ a/include/linux/sunrpc/cache.h @@ -136,16 +136,6 @@ sunrpc_cache_update(struct cache_detail struct cache_head *new, struct cache_head *old, int hash); -#define cache_for_each(pos, detail, index, member) \ - for (({read_lock(&(detail)->hash_lock); index = (detail)->hash_size;}) ; \ - ({if (index==0)read_unlock(&(detail)->hash_lock); index--;}); \ - ) \ - for (pos = container_of((detail)->hash_table[index], typeof(*pos), member); \ - &pos->member; \ - pos = container_of(pos->member.next, typeof(*pos), member)) - - - extern void cache_clean_deferred(void *owner); static inline struct cache_head *cache_get(struct cache_head *h) diff -puN include/linux/sunrpc/debug.h~git-nfsd include/linux/sunrpc/debug.h --- a/include/linux/sunrpc/debug.h~git-nfsd +++ a/include/linux/sunrpc/debug.h @@ -20,7 +20,7 @@ #define RPCDBG_BIND 0x0020 #define RPCDBG_SCHED 0x0040 #define RPCDBG_TRANS 0x0080 -#define RPCDBG_SVCSOCK 0x0100 +#define RPCDBG_SVCXPRT 0x0100 #define RPCDBG_SVCDSP 0x0200 #define RPCDBG_MISC 0x0400 #define RPCDBG_CACHE 0x0800 diff -puN include/linux/sunrpc/svc.h~git-nfsd include/linux/sunrpc/svc.h --- a/include/linux/sunrpc/svc.h~git-nfsd +++ a/include/linux/sunrpc/svc.h @@ -204,7 +204,7 @@ union svc_addr_u { struct svc_rqst { struct list_head rq_list; /* idle list */ struct list_head rq_all; /* all threads list */ - struct svc_sock * rq_sock; /* socket */ + struct svc_xprt * rq_xprt; /* transport ptr */ struct sockaddr_storage rq_addr; /* peer address */ size_t rq_addrlen; @@ -214,9 +214,10 @@ struct svc_rqst { struct auth_ops * rq_authop; /* authentication flavour */ u32 rq_flavor; /* pseudoflavor */ struct svc_cred rq_cred; /* auth info */ - struct sk_buff * rq_skbuff; /* fast recv inet buffer */ + void * rq_xprt_ctxt; /* transport specific context ptr */ struct svc_deferred_req*rq_deferred; /* deferred request we are replaying */ + size_t rq_xprt_hlen; /* xprt header len */ struct xdr_buf rq_arg; struct xdr_buf rq_res; struct page * rq_pages[RPCSVC_MAXPAGES]; @@ -317,7 +318,7 @@ static inline void svc_free_res_pages(st struct svc_deferred_req { u32 prot; /* protocol (UDP or TCP) */ - struct svc_sock *svsk; + struct 
svc_xprt *xprt; struct sockaddr_storage addr; /* where reply must go */ size_t addrlen; union svc_addr_u daddr; /* where reply must come from */ diff -puN /dev/null include/linux/sunrpc/svc_xprt.h --- /dev/null +++ a/include/linux/sunrpc/svc_xprt.h @@ -0,0 +1,86 @@ +/* + * linux/include/linux/sunrpc/svc_xprt.h + * + * RPC server transport I/O + */ + +#ifndef SUNRPC_SVC_XPRT_H +#define SUNRPC_SVC_XPRT_H + +#include +#include + +struct svc_xprt_ops { + struct svc_xprt *(*xpo_create)(struct svc_serv *, + struct sockaddr *, + int); + struct svc_xprt *(*xpo_accept)(struct svc_xprt *); + int (*xpo_has_wspace)(struct svc_xprt *); + int (*xpo_recvfrom)(struct svc_rqst *); + void (*xpo_prep_reply_hdr)(struct svc_rqst *); + int (*xpo_sendto)(struct svc_rqst *); + void (*xpo_release)(struct svc_rqst *); + void (*xpo_detach)(struct svc_xprt *); + void (*xpo_free)(struct svc_xprt *); +}; + +struct svc_xprt_class { + const char *xcl_name; + struct module *xcl_owner; + struct svc_xprt_ops *xcl_ops; + struct list_head xcl_list; + u32 xcl_max_payload; +}; + +struct svc_xprt { + struct svc_xprt_class *xpt_class; + struct svc_xprt_ops xpt_ops; + u32 xpt_max_payload; + struct kref xpt_ref; + struct list_head xpt_list; + struct list_head xpt_ready; + unsigned long xpt_flags; +#define XPT_BUSY 0 /* enqueued/receiving */ +#define XPT_CONN 1 /* conn pending */ +#define XPT_CLOSE 2 /* dead or dying */ +#define XPT_DATA 3 /* data pending */ +#define XPT_TEMP 4 /* connected transport */ +#define XPT_DEAD 6 /* transport closed */ +#define XPT_CHNGBUF 7 /* need to change snd/rcv buf sizes */ +#define XPT_DEFERRED 8 /* deferred request pending */ +#define XPT_OLD 9 /* used for xprt aging mark+sweep */ +#define XPT_DETACHED 10 /* detached from tempsocks list */ +#define XPT_LISTENER 11 /* listening endpoint */ +#define XPT_CACHE_AUTH 12 /* cache auth info */ + + struct svc_pool *xpt_pool; /* current pool iff queued */ + struct svc_serv *xpt_server; /* service for transport */ + atomic_t xpt_reserved; /* space on outq that is rsvd */ + struct mutex xpt_mutex; /* to serialize sending data */ + spinlock_t xpt_lock; /* protects sk_deferred + * and xpt_auth_cache */ + void *xpt_auth_cache;/* auth cache */ + struct list_head xpt_deferred; /* deferred requests that need + * to be revisted */ + struct sockaddr_storage xpt_local; /* local address */ + struct sockaddr_storage xpt_remote; /* remote peer's address */ + int xpt_remotelen; /* length of address */ +}; + +int svc_reg_xprt_class(struct svc_xprt_class *); +int svc_unreg_xprt_class(struct svc_xprt_class *); +void svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *, + struct svc_serv *); +int svc_create_xprt(struct svc_serv *, char *, unsigned short, int); +void svc_xprt_received(struct svc_xprt *); +void svc_xprt_enqueue(struct svc_xprt *xprt); +int svc_port_is_privileged(struct sockaddr *sin); +void svc_xprt_put(struct svc_xprt *xprt); +static inline void svc_xprt_get(struct svc_xprt *xprt) +{ + kref_get(&xprt->xpt_ref); +} +void svc_delete_xprt(struct svc_xprt *xprt); +void svc_close_xprt(struct svc_xprt *xprt); +int svc_print_xprts(char *buf, int maxlen); +#endif /* SUNRPC_SVC_XPRT_H */ diff -puN include/linux/sunrpc/svcsock.h~git-nfsd include/linux/sunrpc/svcsock.h --- a/include/linux/sunrpc/svcsock.h~git-nfsd +++ a/include/linux/sunrpc/svcsock.h @@ -10,42 +10,16 @@ #define SUNRPC_SVCSOCK_H #include +#include /* * RPC server socket. 
*/ struct svc_sock { - struct list_head sk_ready; /* list of ready sockets */ - struct list_head sk_list; /* list of all sockets */ + struct svc_xprt sk_xprt; struct socket * sk_sock; /* berkeley socket layer */ struct sock * sk_sk; /* INET layer */ - struct svc_pool * sk_pool; /* current pool iff queued */ - struct svc_serv * sk_server; /* service for this socket */ - atomic_t sk_inuse; /* use count */ - unsigned long sk_flags; -#define SK_BUSY 0 /* enqueued/receiving */ -#define SK_CONN 1 /* conn pending */ -#define SK_CLOSE 2 /* dead or dying */ -#define SK_DATA 3 /* data pending */ -#define SK_TEMP 4 /* temp (TCP) socket */ -#define SK_DEAD 6 /* socket closed */ -#define SK_CHNGBUF 7 /* need to change snd/rcv buffer sizes */ -#define SK_DEFERRED 8 /* request on sk_deferred */ -#define SK_OLD 9 /* used for temp socket aging mark+sweep */ -#define SK_DETACHED 10 /* detached from tempsocks list */ - - atomic_t sk_reserved; /* space on outq that is reserved */ - - spinlock_t sk_lock; /* protects sk_deferred and - * sk_info_authunix */ - struct list_head sk_deferred; /* deferred requests that need to - * be revisted */ - struct mutex sk_mutex; /* to serialize sending data */ - - int (*sk_recvfrom)(struct svc_rqst *rqstp); - int (*sk_sendto)(struct svc_rqst *rqstp); - /* We keep the old state_change and data_ready CB's here */ void (*sk_ostate)(struct sock *); void (*sk_odata)(struct sock *, int bytes); @@ -54,21 +28,12 @@ struct svc_sock { /* private TCP part */ int sk_reclen; /* length of record */ int sk_tcplen; /* current read length */ - time_t sk_lastrecv; /* time of last received request */ - - /* cache of various info for TCP sockets */ - void *sk_info_authunix; - - struct sockaddr_storage sk_local; /* local address */ - struct sockaddr_storage sk_remote; /* remote peer's address */ - int sk_remotelen; /* length of address */ }; /* * Function prototypes. */ -int svc_makesock(struct svc_serv *, int, unsigned short, int flags); -void svc_force_close_socket(struct svc_sock *); +void svc_close_all(struct list_head *); int svc_recv(struct svc_rqst *, long); int svc_send(struct svc_rqst *); void svc_drop(struct svc_rqst *); @@ -78,6 +43,8 @@ int svc_addsock(struct svc_serv *serv, int fd, char *name_return, int *proto); +void svc_init_xprt_sock(void); +void svc_cleanup_xprt_sock(void); /* * svc_makesock socket characteristics diff -puN net/sunrpc/Makefile~git-nfsd net/sunrpc/Makefile --- a/net/sunrpc/Makefile~git-nfsd +++ a/net/sunrpc/Makefile @@ -11,6 +11,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprt auth.o auth_null.o auth_unix.o \ svc.o svcsock.o svcauth.o svcauth_unix.o \ rpcb_clnt.o timer.o xdr.o \ - sunrpc_syms.o cache.o rpc_pipe.o + sunrpc_syms.o cache.o rpc_pipe.o \ + svc_xprt.o sunrpc-$(CONFIG_PROC_FS) += stats.o sunrpc-$(CONFIG_SYSCTL) += sysctl.o diff -puN net/sunrpc/auth_gss/svcauth_gss.c~git-nfsd net/sunrpc/auth_gss/svcauth_gss.c --- a/net/sunrpc/auth_gss/svcauth_gss.c~git-nfsd +++ a/net/sunrpc/auth_gss/svcauth_gss.c @@ -631,7 +631,8 @@ svc_safe_putnetobj(struct kvec *resv, st return 0; } -/* Verify the checksum on the header and return SVC_OK on success. +/* + * Verify the checksum on the header and return SVC_OK on success. * Otherwise, return SVC_DROP (in the case of a bad sequence number) * or return SVC_DENIED and indicate error in authp. 
*/ @@ -961,6 +962,78 @@ gss_write_init_verf(struct svc_rqst *rqs } /* + * Having read the cred already and found we're in the context + * initiation case, read the verifier and initiate (or check the results + * of) upcalls to userspace for help with context initiation. If + * the upcall results are available, write the verifier and result. + * Otherwise, drop the request pending an answer to the upcall. + */ +static int svcauth_gss_handle_init(struct svc_rqst *rqstp, + struct rpc_gss_wire_cred *gc, __be32 *authp) +{ + struct kvec *argv = &rqstp->rq_arg.head[0]; + struct kvec *resv = &rqstp->rq_res.head[0]; + struct xdr_netobj tmpobj; + struct rsi *rsip, rsikey; + + /* Read the verifier; should be NULL: */ + *authp = rpc_autherr_badverf; + if (argv->iov_len < 2 * 4) + return SVC_DENIED; + if (svc_getnl(argv) != RPC_AUTH_NULL) + return SVC_DENIED; + if (svc_getnl(argv) != 0) + return SVC_DENIED; + + /* Martial context handle and token for upcall: */ + *authp = rpc_autherr_badcred; + if (gc->gc_proc == RPC_GSS_PROC_INIT && gc->gc_ctx.len != 0) + return SVC_DENIED; + memset(&rsikey, 0, sizeof(rsikey)); + if (dup_netobj(&rsikey.in_handle, &gc->gc_ctx)) + return SVC_DROP; + *authp = rpc_autherr_badverf; + if (svc_safe_getnetobj(argv, &tmpobj)) { + kfree(rsikey.in_handle.data); + return SVC_DENIED; + } + if (dup_netobj(&rsikey.in_token, &tmpobj)) { + kfree(rsikey.in_handle.data); + return SVC_DROP; + } + + /* Perform upcall, or find upcall result: */ + rsip = rsi_lookup(&rsikey); + rsi_free(&rsikey); + if (!rsip) + return SVC_DROP; + switch (cache_check(&rsi_cache, &rsip->h, &rqstp->rq_chandle)) { + case -EAGAIN: + case -ETIMEDOUT: + case -ENOENT: + /* No upcall result: */ + return SVC_DROP; + case 0: + /* Got an answer to the upcall; use it: */ + if (gss_write_init_verf(rqstp, rsip)) + return SVC_DROP; + if (resv->iov_len + 4 > PAGE_SIZE) + return SVC_DROP; + svc_putnl(resv, RPC_SUCCESS); + if (svc_safe_putnetobj(resv, &rsip->out_handle)) + return SVC_DROP; + if (resv->iov_len + 3 * 4 > PAGE_SIZE) + return SVC_DROP; + svc_putnl(resv, rsip->major_status); + svc_putnl(resv, rsip->minor_status); + svc_putnl(resv, GSS_SEQ_WIN); + if (svc_safe_putnetobj(resv, &rsip->out_token)) + return SVC_DROP; + } + return SVC_COMPLETE; +} + +/* * Accept an rpcsec packet. * If context establishment, punt to user space * If data exchange, verify/decrypt @@ -974,11 +1047,9 @@ svcauth_gss_accept(struct svc_rqst *rqst struct kvec *argv = &rqstp->rq_arg.head[0]; struct kvec *resv = &rqstp->rq_res.head[0]; u32 crlen; - struct xdr_netobj tmpobj; struct gss_svc_data *svcdata = rqstp->rq_auth_data; struct rpc_gss_wire_cred *gc; struct rsc *rsci = NULL; - struct rsi *rsip, rsikey; __be32 *rpcstart; __be32 *reject_stat = resv->iov_base + resv->iov_len; int ret; @@ -1023,30 +1094,14 @@ svcauth_gss_accept(struct svc_rqst *rqst if ((gc->gc_proc != RPC_GSS_PROC_DATA) && (rqstp->rq_proc != 0)) goto auth_err; - /* - * We've successfully parsed the credential. Let's check out the - * verifier. An AUTH_NULL verifier is allowed (and required) for - * INIT and CONTINUE_INIT requests. AUTH_RPCSEC_GSS is required for - * PROC_DATA and PROC_DESTROY. - * - * AUTH_NULL verifier is 0 (AUTH_NULL), 0 (length). - * AUTH_RPCSEC_GSS verifier is: - * 6 (AUTH_RPCSEC_GSS), length, checksum. - * checksum is calculated over rpcheader from xid up to here. 
- */ *authp = rpc_autherr_badverf; switch (gc->gc_proc) { case RPC_GSS_PROC_INIT: case RPC_GSS_PROC_CONTINUE_INIT: - if (argv->iov_len < 2 * 4) - goto auth_err; - if (svc_getnl(argv) != RPC_AUTH_NULL) - goto auth_err; - if (svc_getnl(argv) != 0) - goto auth_err; - break; + return svcauth_gss_handle_init(rqstp, gc, authp); case RPC_GSS_PROC_DATA: case RPC_GSS_PROC_DESTROY: + /* Look up the context, and check the verifier: */ *authp = rpcsec_gsserr_credproblem; rsci = gss_svc_searchbyctx(&gc->gc_ctx); if (!rsci) @@ -1067,51 +1122,6 @@ svcauth_gss_accept(struct svc_rqst *rqst /* now act upon the command: */ switch (gc->gc_proc) { - case RPC_GSS_PROC_INIT: - case RPC_GSS_PROC_CONTINUE_INIT: - *authp = rpc_autherr_badcred; - if (gc->gc_proc == RPC_GSS_PROC_INIT && gc->gc_ctx.len != 0) - goto auth_err; - memset(&rsikey, 0, sizeof(rsikey)); - if (dup_netobj(&rsikey.in_handle, &gc->gc_ctx)) - goto drop; - *authp = rpc_autherr_badverf; - if (svc_safe_getnetobj(argv, &tmpobj)) { - kfree(rsikey.in_handle.data); - goto auth_err; - } - if (dup_netobj(&rsikey.in_token, &tmpobj)) { - kfree(rsikey.in_handle.data); - goto drop; - } - - rsip = rsi_lookup(&rsikey); - rsi_free(&rsikey); - if (!rsip) { - goto drop; - } - switch(cache_check(&rsi_cache, &rsip->h, &rqstp->rq_chandle)) { - case -EAGAIN: - case -ETIMEDOUT: - case -ENOENT: - goto drop; - case 0: - if (gss_write_init_verf(rqstp, rsip)) - goto drop; - if (resv->iov_len + 4 > PAGE_SIZE) - goto drop; - svc_putnl(resv, RPC_SUCCESS); - if (svc_safe_putnetobj(resv, &rsip->out_handle)) - goto drop; - if (resv->iov_len + 3 * 4 > PAGE_SIZE) - goto drop; - svc_putnl(resv, rsip->major_status); - svc_putnl(resv, rsip->minor_status); - svc_putnl(resv, GSS_SEQ_WIN); - if (svc_safe_putnetobj(resv, &rsip->out_token)) - goto drop; - } - goto complete; case RPC_GSS_PROC_DESTROY: if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; @@ -1158,7 +1168,7 @@ svcauth_gss_accept(struct svc_rqst *rqst goto out; } auth_err: - /* Restore write pointer to original value: */ + /* Restore write pointer to its original value: */ xdr_ressize_check(rqstp, reject_stat); ret = SVC_DENIED; goto out; diff -puN net/sunrpc/sunrpc_syms.c~git-nfsd net/sunrpc/sunrpc_syms.c --- a/net/sunrpc/sunrpc_syms.c~git-nfsd +++ a/net/sunrpc/sunrpc_syms.c @@ -72,7 +72,6 @@ EXPORT_SYMBOL(svc_drop); EXPORT_SYMBOL(svc_process); EXPORT_SYMBOL(svc_recv); EXPORT_SYMBOL(svc_wake_up); -EXPORT_SYMBOL(svc_makesock); EXPORT_SYMBOL(svc_reserve); EXPORT_SYMBOL(svc_auth_register); EXPORT_SYMBOL(auth_domain_lookup); @@ -151,7 +150,8 @@ init_sunrpc(void) #endif cache_register(&ip_map_cache); cache_register(&unix_gid_cache); - init_socket_xprt(); + svc_init_xprt_sock(); /* svc sock transport */ + init_socket_xprt(); /* clnt sock transport */ rpcauth_init_module(); out: return err; @@ -162,6 +162,7 @@ cleanup_sunrpc(void) { rpcauth_remove_module(); cleanup_socket_xprt(); + svc_cleanup_xprt_sock(); unregister_rpc_pipefs(); rpc_destroy_mempool(); if (cache_unregister(&ip_map_cache)) diff -puN net/sunrpc/svc.c~git-nfsd net/sunrpc/svc.c --- a/net/sunrpc/svc.c~git-nfsd +++ a/net/sunrpc/svc.c @@ -458,9 +458,6 @@ svc_create_pooled(struct svc_program *pr void svc_destroy(struct svc_serv *serv) { - struct svc_sock *svsk; - struct svc_sock *tmp; - dprintk("svc: svc_destroy(%s, %d)\n", serv->sv_program->pg_name, serv->sv_nrthreads); @@ -475,14 +472,12 @@ svc_destroy(struct svc_serv *serv) del_timer_sync(&serv->sv_temptimer); - list_for_each_entry_safe(svsk, tmp, &serv->sv_tempsocks, sk_list) - 
svc_force_close_socket(svsk); + svc_close_all(&serv->sv_tempsocks); if (serv->sv_shutdown) serv->sv_shutdown(serv); - list_for_each_entry_safe(svsk, tmp, &serv->sv_permsocks, sk_list) - svc_force_close_socket(svsk); + svc_close_all(&serv->sv_permsocks); BUG_ON(!list_empty(&serv->sv_permsocks)); BUG_ON(!list_empty(&serv->sv_tempsocks)); @@ -777,6 +772,30 @@ svc_register(struct svc_serv *serv, int } /* + * Printk the given error with the address of the client that caused it. + */ +static int +__attribute__ ((format (printf, 2, 3))) +svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) +{ + va_list args; + int r; + char buf[RPC_MAX_ADDRBUFLEN]; + + if (!net_ratelimit()) + return 0; + + printk(KERN_WARNING "svc: %s: ", + svc_print_addr(rqstp, buf, sizeof(buf))); + + va_start(args, fmt); + r = vprintk(fmt, args); + va_end(args); + + return r; +} + +/* * Process the RPC request. */ int @@ -815,9 +834,9 @@ svc_process(struct svc_rqst *rqstp) rqstp->rq_res.tail[0].iov_len = 0; /* Will be turned off only in gss privacy case: */ rqstp->rq_splice_ok = 1; - /* tcp needs a space for the record length... */ - if (rqstp->rq_prot == IPPROTO_TCP) - svc_putnl(resv, 0); + + /* Setup reply header */ + rqstp->rq_xprt->xpt_ops.xpo_prep_reply_hdr(rqstp); rqstp->rq_xid = svc_getu32(argv); svc_putu32(resv, rqstp->rq_xid); @@ -963,14 +982,13 @@ svc_process(struct svc_rqst *rqstp) return 0; err_short_len: - if (net_ratelimit()) - printk("svc: short len %Zd, dropping request\n", argv->iov_len); + svc_printk(rqstp, "short len %Zd, dropping request\n", + argv->iov_len); goto dropit; /* drop request */ err_bad_dir: - if (net_ratelimit()) - printk("svc: bad direction %d, dropping request\n", dir); + svc_printk(rqstp, "bad direction %d, dropping request\n", dir); serv->sv_stats->rpcbadfmt++; goto dropit; /* drop request */ @@ -1000,8 +1018,7 @@ err_bad_prog: goto sendit; err_bad_vers: - if (net_ratelimit()) - printk("svc: unknown version (%d for prog %d, %s)\n", + svc_printk(rqstp, "unknown version (%d for prog %d, %s)\n", vers, prog, progp->pg_name); serv->sv_stats->rpcbadfmt++; @@ -1011,16 +1028,14 @@ err_bad_vers: goto sendit; err_bad_proc: - if (net_ratelimit()) - printk("svc: unknown procedure (%d)\n", proc); + svc_printk(rqstp, "unknown procedure (%d)\n", proc); serv->sv_stats->rpcbadfmt++; svc_putnl(resv, RPC_PROC_UNAVAIL); goto sendit; err_garbage: - if (net_ratelimit()) - printk("svc: failed to decode args\n"); + svc_printk(rqstp, "failed to decode args\n"); rpc_stat = rpc_garbage_args; err_bad: @@ -1034,10 +1049,8 @@ err_bad: */ u32 svc_max_payload(const struct svc_rqst *rqstp) { - int max = RPCSVC_MAXPAYLOAD_TCP; + int max = rqstp->rq_xprt->xpt_max_payload; - if (rqstp->rq_sock->sk_sock->type == SOCK_DGRAM) - max = RPCSVC_MAXPAYLOAD_UDP; if (rqstp->rq_server->sv_max_payload < max) max = rqstp->rq_server->sv_max_payload; return max; diff -puN /dev/null net/sunrpc/svc_xprt.c --- /dev/null +++ a/net/sunrpc/svc_xprt.c @@ -0,0 +1,954 @@ +/* + * linux/net/sunrpc/svc_xprt.c + * + * Author: Tom Tucker + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt); +static int svc_deferred_recv(struct svc_rqst *rqstp); +static struct cache_deferred_req *svc_defer(struct cache_req 
*req); +static void svc_age_temp_xprts(unsigned long closure); +/* apparently the "standard" is that clients close + * idle connections after 5 minutes, servers after + * 6 minutes + * http://www.connectathon.org/talks96/nfstcp.pdf + */ +static int svc_conn_age_period = 6*60; + +/* List of registered transport classes */ +static spinlock_t svc_xprt_class_lock = SPIN_LOCK_UNLOCKED; +static LIST_HEAD(svc_xprt_class_list); + +int svc_reg_xprt_class(struct svc_xprt_class *xcl) +{ + struct svc_xprt_class *cl; + int res = -EEXIST; + + dprintk("svc: Adding svc transport class '%s'\n", + xcl->xcl_name); + + INIT_LIST_HEAD(&xcl->xcl_list); + spin_lock(&svc_xprt_class_lock); + list_for_each_entry(cl, &svc_xprt_class_list, xcl_list) { + if (xcl == cl) + goto out; + } + list_add_tail(&xcl->xcl_list, &svc_xprt_class_list); + res = 0; +out: + spin_unlock(&svc_xprt_class_lock); + return res; +} +EXPORT_SYMBOL_GPL(svc_reg_xprt_class); + +int svc_unreg_xprt_class(struct svc_xprt_class *xcl) +{ + struct svc_xprt_class *cl; + int res = 0; + + dprintk("svc: Removing svc transport class '%s'\n", xcl->xcl_name); + + spin_lock(&svc_xprt_class_lock); + list_for_each_entry(cl, &svc_xprt_class_list, xcl_list) { + if (xcl == cl) { + list_del_init(&cl->xcl_list); + goto out; + } + } + res = -ENOENT; + out: + spin_unlock(&svc_xprt_class_lock); + return res; +} +EXPORT_SYMBOL_GPL(svc_unreg_xprt_class); + +/* + * Format the transport list for printing + */ +int svc_print_xprts(char *buf, int maxlen) +{ + struct list_head *le; + char tmpstr[80]; + int len = 0; + buf[0] = '\0'; + + spin_lock(&svc_xprt_class_lock); + list_for_each(le, &svc_xprt_class_list) { + int slen; + struct svc_xprt_class *xcl = + list_entry(le, struct svc_xprt_class, xcl_list); + + sprintf(tmpstr, "%s %d\n", xcl->xcl_name, xcl->xcl_max_payload); + slen = strlen(tmpstr); + if (len + slen > maxlen) + break; + len += slen; + strcat(buf, tmpstr); + } + spin_unlock(&svc_xprt_class_lock); + + return len; +} + +static inline void svc_xprt_free(struct kref *kref) +{ + struct svc_xprt *xprt = + container_of(kref, struct svc_xprt, xpt_ref); + struct module *owner = xprt->xpt_class->xcl_owner; + BUG_ON(atomic_read(&kref->refcount)); + xprt->xpt_ops.xpo_free(xprt); + if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags) + && xprt->xpt_auth_cache != NULL) + svcauth_unix_info_release(xprt->xpt_auth_cache); + module_put(owner); +} + +void svc_xprt_put(struct svc_xprt *xprt) +{ + kref_put(&xprt->xpt_ref, svc_xprt_free); +} +EXPORT_SYMBOL_GPL(svc_xprt_put); + +/* + * Called by transport drivers to initialize the transport independent + * portion of the transport instance. 
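
A minimal sketch (not part of the patch) of how a transport provider plugs into the registration interface above. The "foo" transport, its svc_foo_* handlers and the payload limit are hypothetical placeholders assumed to be defined elsewhere in the module; only the structure layouts and the svc_reg_xprt_class()/svc_unreg_xprt_class() calls come from this patch:

	#include <linux/module.h>
	#include <linux/sunrpc/svc.h>
	#include <linux/sunrpc/svc_xprt.h>

	/* Hypothetical per-transport handlers, defined elsewhere in the module. */
	struct svc_xprt *svc_foo_create(struct svc_serv *, struct sockaddr *, int);
	struct svc_xprt *svc_foo_accept(struct svc_xprt *);
	int  svc_foo_has_wspace(struct svc_xprt *);
	int  svc_foo_recvfrom(struct svc_rqst *);
	void svc_foo_prep_reply_hdr(struct svc_rqst *);
	int  svc_foo_sendto(struct svc_rqst *);
	void svc_foo_release(struct svc_rqst *);
	void svc_foo_detach(struct svc_xprt *);
	void svc_foo_free(struct svc_xprt *);

	static struct svc_xprt_ops svc_foo_ops = {
		.xpo_create		= svc_foo_create,
		.xpo_accept		= svc_foo_accept,
		.xpo_has_wspace		= svc_foo_has_wspace,
		.xpo_recvfrom		= svc_foo_recvfrom,
		.xpo_prep_reply_hdr	= svc_foo_prep_reply_hdr,
		.xpo_sendto		= svc_foo_sendto,
		.xpo_release		= svc_foo_release,
		.xpo_detach		= svc_foo_detach,
		.xpo_free		= svc_foo_free,
	};

	static struct svc_xprt_class svc_foo_class = {
		.xcl_name	 = "foo",
		.xcl_owner	 = THIS_MODULE,
		.xcl_ops	 = &svc_foo_ops,
		.xcl_max_payload = 32 * 1024,	/* placeholder limit */
	};

	static int __init svc_foo_module_init(void)
	{
		/* Make the class visible to svc_create_xprt() lookups by name. */
		return svc_reg_xprt_class(&svc_foo_class);
	}

	static void __exit svc_foo_module_exit(void)
	{
		svc_unreg_xprt_class(&svc_foo_class);
	}

	module_init(svc_foo_module_init);
	module_exit(svc_foo_module_exit);

Once registered, core code can instantiate a listening endpoint for the class by name, e.g. svc_create_xprt(serv, "foo", port, flags); the class's xpo_create() handler is then expected to allocate its transport instance and call svc_xprt_init() on the embedded svc_xprt, as described below.
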
+ */ +void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xpt, + struct svc_serv *serv) +{ + xpt->xpt_class = xcl; + xpt->xpt_ops = *xcl->xcl_ops; + xpt->xpt_max_payload = xcl->xcl_max_payload; + kref_init(&xpt->xpt_ref); + xpt->xpt_server = serv; + INIT_LIST_HEAD(&xpt->xpt_list); + INIT_LIST_HEAD(&xpt->xpt_ready); + INIT_LIST_HEAD(&xpt->xpt_deferred); + mutex_init(&xpt->xpt_mutex); + spin_lock_init(&xpt->xpt_lock); +} +EXPORT_SYMBOL_GPL(svc_xprt_init); + +int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, + int flags) +{ + struct svc_xprt_class *xcl; + int ret = -ENOENT; + struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_addr.s_addr = INADDR_ANY, + .sin_port = htons(port), + }; + dprintk("svc: creating transport %s[%d]\n", xprt_name, port); + spin_lock(&svc_xprt_class_lock); + list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) { + if (strcmp(xprt_name, xcl->xcl_name) == 0) { + spin_unlock(&svc_xprt_class_lock); + if (try_module_get(xcl->xcl_owner)) { + struct svc_xprt *newxprt; + ret = 0; + newxprt = xcl->xcl_ops->xpo_create + (serv, (struct sockaddr *)&sin, flags); + if (IS_ERR(newxprt)) { + module_put(xcl->xcl_owner); + ret = PTR_ERR(newxprt); + } else { + clear_bit(XPT_TEMP, + &newxprt->xpt_flags); + spin_lock_bh(&serv->sv_lock); + list_add(&newxprt->xpt_list, + &serv->sv_permsocks); + spin_unlock_bh(&serv->sv_lock); + } + } + goto out; + } + } + spin_unlock(&svc_xprt_class_lock); + dprintk("svc: transport %s not found\n", xprt_name); + out: + return ret; +} +EXPORT_SYMBOL_GPL(svc_create_xprt); + +/* + * Queue up an idle server thread. Must have pool->sp_lock held. + * Note: this is really a stack rather than a queue, so that we only + * use as many different threads as we need, and the rest don't pollute + * the cache. + */ +static inline void +svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp) +{ + list_add(&rqstp->rq_list, &pool->sp_threads); +} + +/* + * Dequeue an nfsd thread. Must have pool->sp_lock held. + */ +static inline void +svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp) +{ + list_del(&rqstp->rq_list); +} + +/* + * Queue up a transport with data pending. If there are idle nfsd + * processes, wake 'em up. + * + */ +void +svc_xprt_enqueue(struct svc_xprt *xprt) +{ + struct svc_serv *serv = xprt->xpt_server; + struct svc_pool *pool; + struct svc_rqst *rqstp; + int cpu; + + if (!(xprt->xpt_flags & + ((1<xpt_flags)) + return; + + cpu = get_cpu(); + pool = svc_pool_for_cpu(xprt->xpt_server, cpu); + put_cpu(); + + spin_lock_bh(&pool->sp_lock); + + if (!list_empty(&pool->sp_threads) && + !list_empty(&pool->sp_sockets)) + printk(KERN_ERR + "svc_xprt_enqueue: threads and xprt both waiting??\n"); + + if (test_bit(XPT_DEAD, &xprt->xpt_flags)) { + /* Don't enqueue dead transports */ + dprintk("svc: transport %p is dead, not enqueued\n", xprt); + goto out_unlock; + } + + /* Mark transport as busy. It will remain in this state until the + * server has processed all pending data and put the transport back + * on the idle list. We update XPT_BUSY atomically because + * it also guards against trying to enqueue the svc_sock twice. 
+ */ + if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) { + /* Don't enqueue transport while already enqueued */ + dprintk("svc: transport %p busy, not enqueued\n", xprt); + goto out_unlock; + } + BUG_ON(xprt->xpt_pool != NULL); + xprt->xpt_pool = pool; + + /* Handle pending connection */ + if (test_bit(XPT_CONN, &xprt->xpt_flags)) + goto process; + + /* Handle close in-progress */ + if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) + goto process; + + /* Check if we have space to reply to a request */ + if (!xprt->xpt_ops.xpo_has_wspace(xprt)) { + /* Don't enqueue while not enough space for reply */ + dprintk("svc: no write space, transport %p not enqueued\n", xprt); + xprt->xpt_pool = NULL; + clear_bit(XPT_BUSY, &xprt->xpt_flags); + goto out_unlock; + } + + process: + if (!list_empty(&pool->sp_threads)) { + rqstp = list_entry(pool->sp_threads.next, + struct svc_rqst, + rq_list); + dprintk("svc: transport %p served by daemon %p\n", + xprt, rqstp); + svc_thread_dequeue(pool, rqstp); + if (rqstp->rq_xprt) + printk(KERN_ERR + "svc_xprt_enqueue: server %p, rq_xprt=%p!\n", + rqstp, rqstp->rq_xprt); + rqstp->rq_xprt = xprt; + svc_xprt_get(xprt); + rqstp->rq_reserved = serv->sv_max_mesg; + atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); + BUG_ON(xprt->xpt_pool != pool); + wake_up(&rqstp->rq_wait); + } else { + dprintk("svc: transport %p put into queue\n", xprt); + list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); + BUG_ON(xprt->xpt_pool != pool); + } + +out_unlock: + spin_unlock_bh(&pool->sp_lock); +} +EXPORT_SYMBOL_GPL(svc_xprt_enqueue); + +/* + * Dequeue the first transport. Must be called with the pool->sp_lock held. + */ +static inline struct svc_xprt * +svc_xprt_dequeue(struct svc_pool *pool) +{ + struct svc_xprt *xprt; + + if (list_empty(&pool->sp_sockets)) + return NULL; + + xprt = list_entry(pool->sp_sockets.next, + struct svc_xprt, xpt_ready); + list_del_init(&xprt->xpt_ready); + + dprintk("svc: transport %p dequeued, inuse=%d\n", + xprt, atomic_read(&xprt->xpt_ref.refcount)); + + return xprt; +} + +/* + * Having read something from a transport, check whether it + * needs to be re-enqueued. + * Note: XPT_DATA only gets cleared when a read-attempt finds + * no (or insufficient) data. + */ +void +svc_xprt_received(struct svc_xprt *xprt) +{ + xprt->xpt_pool = NULL; + clear_bit(XPT_BUSY, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); +} +EXPORT_SYMBOL_GPL(svc_xprt_received); + +/** + * svc_reserve - change the space reserved for the reply to a request. + * @rqstp: The request in question + * @space: new max space to reserve + * + * Each request reserves some space on the output queue of the transport + * to make sure the reply fits. This function reduces that reserved + * space to be the amount of space used already, plus @space. + * + */ +void svc_reserve(struct svc_rqst *rqstp, int space) +{ + space += rqstp->rq_res.head[0].iov_len; + + if (space < rqstp->rq_reserved) { + struct svc_xprt *xprt = rqstp->rq_xprt; + atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved); + rqstp->rq_reserved = space; + + svc_xprt_enqueue(xprt); + } +} + +static void +svc_xprt_release(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt = rqstp->rq_xprt; + + rqstp->rq_xprt->xpt_ops.xpo_release(rqstp); + + svc_free_res_pages(rqstp); + rqstp->rq_res.page_len = 0; + rqstp->rq_res.page_base = 0; + + /* Reset response buffer and release + * the reservation. + * But first, check that enough space was reserved + * for the reply, otherwise we have a bug! 
+ */ + if ((rqstp->rq_res.len) > rqstp->rq_reserved) + printk(KERN_ERR "RPC request reserved %d but used %d\n", + rqstp->rq_reserved, + rqstp->rq_res.len); + + rqstp->rq_res.head[0].iov_len = 0; + svc_reserve(rqstp, 0); + rqstp->rq_xprt = NULL; + + svc_xprt_put(xprt); +} + +/* + * External function to wake up a server waiting for data + * This really only makes sense for services like lockd + * which have exactly one thread anyway. + */ +void +svc_wake_up(struct svc_serv *serv) +{ + struct svc_rqst *rqstp; + unsigned int i; + struct svc_pool *pool; + + for (i = 0; i < serv->sv_nrpools; i++) { + pool = &serv->sv_pools[i]; + + spin_lock_bh(&pool->sp_lock); + if (!list_empty(&pool->sp_threads)) { + rqstp = list_entry(pool->sp_threads.next, + struct svc_rqst, + rq_list); + dprintk("svc: daemon %p woken up.\n", rqstp); + /* + svc_thread_dequeue(pool, rqstp); + rqstp->rq_xprt = NULL; + */ + wake_up(&rqstp->rq_wait); + } + spin_unlock_bh(&pool->sp_lock); + } +} + +static void +svc_check_conn_limits(struct svc_serv *serv) +{ + char buf[RPC_MAX_ADDRBUFLEN]; + + /* make sure that we don't have too many active connections. + * If we have, something must be dropped. + * + * There's no point in trying to do random drop here for + * DoS prevention. The NFS clients does 1 reconnect in 15 + * seconds. An attacker can easily beat that. + * + * The only somewhat efficient mechanism would be if drop + * old connections from the same IP first. + */ + if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) { + struct svc_xprt *xprt = NULL; + spin_lock_bh(&serv->sv_lock); + if (!list_empty(&serv->sv_tempsocks)) { + if (net_ratelimit()) { + /* Try to help the admin */ + printk(KERN_NOTICE "%s: too many open " + "connections, consider increasing the " + "number of nfsd threads\n", + serv->sv_name); + printk(KERN_NOTICE + "%s: last connection from %s\n", + serv->sv_name, buf); + } + /* + * Always select the oldest connection. It's not fair, + * but so is life + */ + xprt = list_entry(serv->sv_tempsocks.prev, + struct svc_xprt, + xpt_list); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_get(xprt); + } + spin_unlock_bh(&serv->sv_lock); + + if (xprt) { + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); + } + } +} + +static inline void svc_copy_addr(struct svc_rqst *rqstp, struct svc_xprt *xprt) +{ + struct sockaddr *sin; + + /* sock_recvmsg doesn't fill in the name/namelen, so we must.. + */ + memcpy(&rqstp->rq_addr, &xprt->xpt_remote, xprt->xpt_remotelen); + rqstp->rq_addrlen = xprt->xpt_remotelen; + + /* Destination address in request is needed for binding the + * source address in RPC callbacks later. + */ + sin = (struct sockaddr *)&xprt->xpt_local; + switch (sin->sa_family) { + case AF_INET: + rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr; + break; + case AF_INET6: + rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr; + break; + } +} + +/* + * Receive the next request on any transport. This code is carefully + * organised not to touch any cachelines in the shared svc_serv + * structure, only cachelines in the local svc_pool. 
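
Seen from a service thread, svc_recv() below reduces to a simple contract: it blocks for up to the given timeout, returns -EINTR when the thread should shut down, -EAGAIN when there is nothing to process, and otherwise the length of a request ready for svc_process(), which sends the reply itself. A minimal sketch of such a loop, simplified from the pattern existing services follow rather than taken from this patch; the function name is a placeholder and the usual svc_rqst/thread setup is assumed to have happened already:

	#include <linux/sunrpc/svc.h>
	#include <linux/sunrpc/svcsock.h>

	static int example_service_thread(struct svc_rqst *rqstp)
	{
		int err;

		for (;;) {
			/* Block for up to 30s waiting for a request on any transport. */
			err = svc_recv(rqstp, 30 * HZ);
			if (err == -EINTR)
				break;		/* signalled: shut the thread down */
			if (err < 0)
				continue;	/* -EAGAIN and friends: nothing yet */
			/* Decode, dispatch, and send the reply. */
			svc_process(rqstp);
		}
		return 0;
	}
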
+ */ +int +svc_recv(struct svc_rqst *rqstp, long timeout) +{ + struct svc_xprt *xprt = NULL; + struct svc_serv *serv = rqstp->rq_server; + struct svc_pool *pool = rqstp->rq_pool; + int len, i; + int pages; + struct xdr_buf *arg; + DECLARE_WAITQUEUE(wait, current); + + dprintk("svc: server %p waiting for data (to = %ld)\n", + rqstp, timeout); + + if (rqstp->rq_xprt) + printk(KERN_ERR + "svc_recv: service %p, transport not NULL!\n", + rqstp); + if (waitqueue_active(&rqstp->rq_wait)) + printk(KERN_ERR + "svc_recv: service %p, wait queue active!\n", + rqstp); + + + /* now allocate needed pages. If we get a failure, sleep briefly */ + pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE; + for (i = 0; i < pages ; i++) + while (rqstp->rq_pages[i] == NULL) { + struct page *p = alloc_page(GFP_KERNEL); + if (!p) + schedule_timeout_uninterruptible(msecs_to_jiffies(500)); + rqstp->rq_pages[i] = p; + } + rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */ + BUG_ON(pages >= RPCSVC_MAXPAGES); + + /* Make arg->head point to first page and arg->pages point to rest */ + arg = &rqstp->rq_arg; + arg->head[0].iov_base = page_address(rqstp->rq_pages[0]); + arg->head[0].iov_len = PAGE_SIZE; + arg->pages = rqstp->rq_pages + 1; + arg->page_base = 0; + /* save at least one page for response */ + arg->page_len = (pages-2)*PAGE_SIZE; + arg->len = (pages-1)*PAGE_SIZE; + arg->tail[0].iov_len = 0; + + try_to_freeze(); + cond_resched(); + if (signalled()) + return -EINTR; + + spin_lock_bh(&pool->sp_lock); + if ((xprt = svc_xprt_dequeue(pool)) != NULL) { + rqstp->rq_xprt = xprt; + svc_xprt_get(xprt); + rqstp->rq_reserved = serv->sv_max_mesg; + atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); + } else { + /* No data pending. Go to sleep */ + svc_thread_enqueue(pool, rqstp); + + /* + * We have to be able to interrupt this wait + * to bring down the daemons ... + */ + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&rqstp->rq_wait, &wait); + spin_unlock_bh(&pool->sp_lock); + + schedule_timeout(timeout); + + try_to_freeze(); + + spin_lock_bh(&pool->sp_lock); + remove_wait_queue(&rqstp->rq_wait, &wait); + + if (!(xprt = rqstp->rq_xprt)) { + svc_thread_dequeue(pool, rqstp); + spin_unlock_bh(&pool->sp_lock); + dprintk("svc: server %p, no data yet\n", rqstp); + return signalled()? 
-EINTR : -EAGAIN; + } + } + spin_unlock_bh(&pool->sp_lock); + + len = 0; + if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) { + dprintk("svc_recv: found XPT_CLOSE\n"); + svc_delete_xprt(xprt); + } else if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) { + struct svc_xprt *newxpt; + newxpt = xprt->xpt_ops.xpo_accept(xprt); + if (newxpt) { + svc_xprt_received(newxpt); + /* + * We know this module_get will succeed because the + * listener holds a reference too + */ + __module_get(newxpt->xpt_class->xcl_owner); + svc_check_conn_limits(xprt->xpt_server); + spin_lock_bh(&serv->sv_lock); + set_bit(XPT_TEMP, &newxpt->xpt_flags); + list_add(&newxpt->xpt_list, &serv->sv_tempsocks); + serv->sv_tmpcnt++; + if (serv->sv_temptimer.function == NULL) { + /* setup timer to age temp transports */ + setup_timer(&serv->sv_temptimer, svc_age_temp_xprts, + (unsigned long)serv); + mod_timer(&serv->sv_temptimer, + jiffies + svc_conn_age_period * HZ); + } + spin_unlock_bh(&serv->sv_lock); + } + svc_xprt_received(xprt); + } else { + dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", + rqstp, pool->sp_id, xprt, + atomic_read(&xprt->xpt_ref.refcount)); + + if ((rqstp->rq_deferred = svc_deferred_dequeue(xprt))) { + svc_xprt_received(xprt); + len = svc_deferred_recv(rqstp); + } else + len = xprt->xpt_ops.xpo_recvfrom(rqstp); + svc_copy_addr(rqstp, xprt); + dprintk("svc: got len=%d\n", len); + } + + /* No data, incomplete (TCP) read, or accept() */ + if (len == 0 || len == -EAGAIN) { + rqstp->rq_res.len = 0; + svc_xprt_release(rqstp); + return -EAGAIN; + } + clear_bit(XPT_OLD, &xprt->xpt_flags); + + rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp)); + rqstp->rq_chandle.defer = svc_defer; + + if (serv->sv_stats) + serv->sv_stats->netcnt++; + return len; +} + +/* + * Drop request + */ +void +svc_drop(struct svc_rqst *rqstp) +{ + dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt); + svc_xprt_release(rqstp); +} + +/* + * Return reply to client. + */ +int +svc_send(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt; + int len; + struct xdr_buf *xb; + + if ((xprt = rqstp->rq_xprt) == NULL) { + printk(KERN_WARNING "NULL transport pointer in %s:%d\n", + __FILE__, __LINE__); + return -EFAULT; + } + + /* release the receive skb before sending the reply */ + rqstp->rq_xprt->xpt_ops.xpo_release(rqstp); + + /* calculate over-all length */ + xb = & rqstp->rq_res; + xb->len = xb->head[0].iov_len + + xb->page_len + + xb->tail[0].iov_len; + + /* Grab mutex to serialize outgoing data. */ + mutex_lock(&xprt->xpt_mutex); + if (test_bit(XPT_DEAD, &xprt->xpt_flags)) + len = -ENOTCONN; + else + len = xprt->xpt_ops.xpo_sendto(rqstp); + mutex_unlock(&xprt->xpt_mutex); + svc_xprt_release(rqstp); + + if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) + return 0; + return len; +} + +/* + * Timer function to close old temporary transports, using + * a mark-and-sweep algorithm. + */ +static void +svc_age_temp_xprts(unsigned long closure) +{ + struct svc_serv *serv = (struct svc_serv *)closure; + struct svc_xprt *xprt; + struct list_head *le, *next; + LIST_HEAD(to_be_aged); + + dprintk("svc_age_temp_xprts\n"); + + if (!spin_trylock_bh(&serv->sv_lock)) { + /* busy, try again 1 sec later */ + dprintk("svc_age_temp_xprts: busy\n"); + mod_timer(&serv->sv_temptimer, jiffies + HZ); + return; + } + + list_for_each_safe(le, next, &serv->sv_tempsocks) { + xprt = list_entry(le, struct svc_xprt, xpt_list); + + /* First time through, just mark it OLD. Second time + * through, close it. 
*/ + if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags)) + continue; + if (atomic_read(&xprt->xpt_ref.refcount) > 1 + || test_bit(XPT_BUSY, &xprt->xpt_flags)) + continue; + svc_xprt_get(xprt); + list_move(le, &to_be_aged); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + set_bit(XPT_DETACHED, &xprt->xpt_flags); + } + spin_unlock_bh(&serv->sv_lock); + + while (!list_empty(&to_be_aged)) { + le = to_be_aged.next; + /* fiddling the xpt_list node is safe 'cos we're XPT_DETACHED */ + list_del_init(le); + xprt = list_entry(le, struct svc_xprt, xpt_list); + + dprintk("queuing xprt %p for closing\n", xprt); + + /* a thread will dequeue and close it soon */ + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); + } + + mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); +} + +/* + * Remove a dead transport + */ +void +svc_delete_xprt(struct svc_xprt *xprt) +{ + struct svc_serv *serv; + + dprintk("svc: svc_delete_xprt(%p)\n", xprt); + + serv = xprt->xpt_server; + + xprt->xpt_ops.xpo_detach(xprt); + + spin_lock_bh(&serv->sv_lock); + + if (!test_and_set_bit(XPT_DETACHED, &xprt->xpt_flags)) + list_del_init(&xprt->xpt_list); + /* + * We used to delete the transport from whichever list + * it's sk_xprt.xpt_ready node was on, but we don't actually + * need to. This is because the only time we're called + * while still attached to a queue, the queue itself + * is about to be destroyed (in svc_destroy). + */ + if (!test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) { + BUG_ON(atomic_read(&xprt->xpt_ref.refcount) < 2); + svc_xprt_put(xprt); + if (test_bit(XPT_TEMP, &xprt->xpt_flags)) + serv->sv_tmpcnt--; + } + + spin_unlock_bh(&serv->sv_lock); +} + +void svc_close_xprt(struct svc_xprt *xprt) +{ + set_bit(XPT_CLOSE, &xprt->xpt_flags); + if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) + /* someone else will have to effect the close */ + return; + + svc_xprt_get(xprt); + svc_delete_xprt(xprt); + clear_bit(XPT_BUSY, &xprt->xpt_flags); + svc_xprt_put(xprt); +} + +void svc_close_all(struct list_head *xprt_list) +{ + struct svc_xprt *xprt; + struct svc_xprt *tmp; + + list_for_each_entry_safe(xprt, tmp, xprt_list, xpt_list) { + set_bit(XPT_CLOSE, &xprt->xpt_flags); + if (test_bit(XPT_BUSY, &xprt->xpt_flags)) { + /* Waiting to be processed, but no threads left, + * So just remove it from the waiting list + */ + list_del_init(&xprt->xpt_ready); + clear_bit(XPT_BUSY, &xprt->xpt_flags); + } + svc_close_xprt(xprt); + } +} + +int svc_port_is_privileged(struct sockaddr *sin) +{ + switch (sin->sa_family) { + case AF_INET: + return ntohs(((struct sockaddr_in *)sin)->sin_port) + < PROT_SOCK; + case AF_INET6: + return ntohs(((struct sockaddr_in6 *)sin)->sin6_port) + < PROT_SOCK; + default: + return 0; + } +} + +/* + * Handle defer and revisit of requests + */ + +static void svc_revisit(struct cache_deferred_req *dreq, int too_many) +{ + struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle); + struct svc_xprt *xprt = dr->xprt; + + if (too_many) { + svc_xprt_put(xprt); + kfree(dr); + return; + } + dprintk("revisit queued\n"); + dr->xprt = NULL; + spin_lock(&xprt->xpt_lock); + list_add(&dr->handle.recent, &xprt->xpt_deferred); + spin_unlock(&xprt->xpt_lock); + set_bit(XPT_DEFERRED, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); +} + +/* + * Save the request off for later processing. The request buffer looks + * like this: + * + * + * + * This code can only handle requests that consist of an xprt-header + * and rpc-header. 
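
The deferral code below copies the saved request starting rq_xprt_hlen bytes before rq_arg.head[0], so transports with no header of their own lose nothing (TCP and UDP set rq_xprt_hlen to 0 in svc_recvfrom()), while a transport that does carry a private header only needs to record its size from xpo_recvfrom(). A hedged sketch of the latter, in which the foo transport, foo_read_message() and FOO_HDR_LEN are all hypothetical and a single-page message is assumed:

	#define FOO_HDR_LEN 24		/* placeholder transport header size */
	int foo_read_message(struct svc_rqst *rqstp, void *buf);	/* placeholder */

	static int svc_foo_recvfrom(struct svc_rqst *rqstp)
	{
		void *buf = page_address(rqstp->rq_pages[0]);
		int len;

		len = foo_read_message(rqstp, buf);	/* placeholder receive */
		if (len <= 0)
			return len;

		/*
		 * The first FOO_HDR_LEN bytes are the transport's own header.
		 * Point head[0] at the RPC header that follows it, and record
		 * the transport header size so svc_defer() can back up over it
		 * when saving <xprt-header><rpc-header> for a later revisit.
		 */
		rqstp->rq_arg.head[0].iov_base = buf + FOO_HDR_LEN;
		rqstp->rq_arg.head[0].iov_len  = len - FOO_HDR_LEN;
		rqstp->rq_arg.page_len         = 0;
		rqstp->rq_arg.len              = len - FOO_HDR_LEN;
		rqstp->rq_xprt_hlen            = FOO_HDR_LEN;

		return len - FOO_HDR_LEN;
	}
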
+ */ +static struct cache_deferred_req * +svc_defer(struct cache_req *req) +{ + struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); + struct svc_deferred_req *dr; + + if (rqstp->rq_arg.page_len) + return NULL; /* if more than a page, give up FIXME */ + if (rqstp->rq_deferred) { + dr = rqstp->rq_deferred; + rqstp->rq_deferred = NULL; + } else { + int skip; + int size; + /* FIXME maybe discard if size too large */ + size = sizeof(struct svc_deferred_req) + rqstp->rq_arg.len + + rqstp->rq_xprt_hlen; + dr = kmalloc(size, GFP_KERNEL); + if (dr == NULL) + return NULL; + + dr->handle.owner = rqstp->rq_server; + dr->prot = rqstp->rq_prot; + memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen); + dr->addrlen = rqstp->rq_addrlen; + dr->daddr = rqstp->rq_daddr; + dr->argslen = (rqstp->rq_arg.len + rqstp->rq_xprt_hlen) >> 2; + + /* back up head to the start of the buffer and copy */ + skip = (rqstp->rq_arg.len + rqstp->rq_xprt_hlen) - + rqstp->rq_arg.head[0].iov_len; + memcpy(dr->args, rqstp->rq_arg.head[0].iov_base - skip, + dr->argslen << 2); + } + svc_xprt_get(rqstp->rq_xprt); + dr->xprt = rqstp->rq_xprt; + + dr->handle.revisit = svc_revisit; + return &dr->handle; +} + +/* + * recv data from a deferred request into an active one + */ +static int svc_deferred_recv(struct svc_rqst *rqstp) +{ + struct svc_deferred_req *dr = rqstp->rq_deferred; + + rqstp->rq_arg.head[0].iov_base = dr->args; + rqstp->rq_arg.head[0].iov_len = dr->argslen<<2; + rqstp->rq_arg.page_len = 0; + rqstp->rq_arg.len = dr->argslen<<2; + rqstp->rq_prot = dr->prot; + memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen); + rqstp->rq_addrlen = dr->addrlen; + rqstp->rq_daddr = dr->daddr; + rqstp->rq_respages = rqstp->rq_pages; + return dr->argslen<<2; +} + + +static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) +{ + struct svc_deferred_req *dr = NULL; + + if (!test_bit(XPT_DEFERRED, &xprt->xpt_flags)) + return NULL; + spin_lock(&xprt->xpt_lock); + clear_bit(XPT_DEFERRED, &xprt->xpt_flags); + if (!list_empty(&xprt->xpt_deferred)) { + dr = list_entry(xprt->xpt_deferred.next, + struct svc_deferred_req, + handle.recent); + list_del_init(&dr->handle.recent); + set_bit(XPT_DEFERRED, &xprt->xpt_flags); + } + spin_unlock(&xprt->xpt_lock); + return dr; +} diff -puN net/sunrpc/svcauth_unix.c~git-nfsd net/sunrpc/svcauth_unix.c --- a/net/sunrpc/svcauth_unix.c~git-nfsd +++ a/net/sunrpc/svcauth_unix.c @@ -384,41 +384,45 @@ void svcauth_unix_purge(void) static inline struct ip_map * ip_map_cached_get(struct svc_rqst *rqstp) { - struct ip_map *ipm; - struct svc_sock *svsk = rqstp->rq_sock; - spin_lock(&svsk->sk_lock); - ipm = svsk->sk_info_authunix; - if (ipm != NULL) { - if (!cache_valid(&ipm->h)) { - /* - * The entry has been invalidated since it was - * remembered, e.g. by a second mount from the - * same IP address. - */ - svsk->sk_info_authunix = NULL; - spin_unlock(&svsk->sk_lock); - cache_put(&ipm->h, &ip_map_cache); - return NULL; + struct ip_map *ipm = NULL; + struct svc_xprt *xprt = rqstp->rq_xprt; + + if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) { + spin_lock(&xprt->xpt_lock); + ipm = xprt->xpt_auth_cache; + if (ipm != NULL) { + if (!cache_valid(&ipm->h)) { + /* + * The entry has been invalidated since it was + * remembered, e.g. by a second mount from the + * same IP address. 
+ */ + xprt->xpt_auth_cache = NULL; + spin_unlock(&xprt->xpt_lock); + cache_put(&ipm->h, &ip_map_cache); + return NULL; + } + cache_get(&ipm->h); } - cache_get(&ipm->h); + spin_unlock(&xprt->xpt_lock); } - spin_unlock(&svsk->sk_lock); return ipm; } static inline void ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm) { - struct svc_sock *svsk = rqstp->rq_sock; + struct svc_xprt *xprt = rqstp->rq_xprt; - spin_lock(&svsk->sk_lock); - if (svsk->sk_sock->type == SOCK_STREAM && - svsk->sk_info_authunix == NULL) { - /* newly cached, keep the reference */ - svsk->sk_info_authunix = ipm; - ipm = NULL; + if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) { + spin_lock(&xprt->xpt_lock); + if (xprt->xpt_auth_cache == NULL) { + /* newly cached, keep the reference */ + xprt->xpt_auth_cache = ipm; + ipm = NULL; + } + spin_unlock(&xprt->xpt_lock); } - spin_unlock(&svsk->sk_lock); if (ipm) cache_put(&ipm->h, &ip_map_cache); } diff -puN net/sunrpc/svcsock.c~git-nfsd net/sunrpc/svcsock.c --- a/net/sunrpc/svcsock.c~git-nfsd +++ a/net/sunrpc/svcsock.c @@ -5,7 +5,7 @@ * * The server scheduling algorithm does not always distribute the load * evenly when servicing a single client. May need to modify the - * svc_sock_enqueue procedure... + * svc_xprt_enqueue procedure... * * TCP support is largely untested and may be a little slow. The problem * is that we currently do two separate recvfrom's, one for the 4-byte @@ -51,51 +51,44 @@ /* SMP locking strategy: * * svc_pool->sp_lock protects most of the fields of that pool. - * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. + * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. * when both need to be taken (rare), svc_serv->sv_lock is first. * BKL protects svc_serv->sv_nrthread. * svc_sock->sk_lock protects the svc_sock->sk_deferred list * and the ->sk_info_authunix cache. - * svc_sock->sk_flags.SK_BUSY prevents a svc_sock being enqueued multiply. + * svc_sock->sk_xprt.xpt_flags.XPT_BUSY prevents a svc_sock being + * enqueued multiply. * * Some flags can be set to certain values at any time * providing that certain rules are followed: * - * SK_CONN, SK_DATA, can be set or cleared at any time. - * after a set, svc_sock_enqueue must be called. + * XPT_CONN, XPT_DATA, can be set or cleared at any time. + * after a set, svc_xprt_enqueue must be called. * after a clear, the socket must be read/accepted * if this succeeds, it must be set again. - * SK_CLOSE can set at any time. It is never cleared. - * sk_inuse contains a bias of '1' until SK_DEAD is set. - * so when sk_inuse hits zero, we know the socket is dead + * XPT_CLOSE can set at any time. It is never cleared. + * xpt_ref contains a bias of '1' until XPT_DEAD is set. + * so when xprt_ref hits zero, we know the transport is dead * and no-one is using it. - * SK_DEAD can only be set while SK_BUSY is held which ensures + * XPT_DEAD can only be set while XPT_BUSY is held which ensures * no other thread will be using the socket or will try to - * set SK_DEAD. + * set XPT_DEAD. 
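
In practice the flag rules above boil down to one idiom: a transport signals new work by setting the flag and then enqueuing itself, which is safe at any time. A tiny sketch of that idiom, mirroring what svc_udp_data_ready() and svc_tcp_data_ready() below do; the callback name and its svc_xprt argument are hypothetical simplifications (the real socket callbacks receive a struct sock *):

	static void svc_foo_data_ready(struct svc_xprt *xprt)
	{
		/* Mark data pending, then (re)queue the transport so an
		 * idle nfsd thread can pick it up. */
		set_bit(XPT_DATA, &xprt->xpt_flags);
		svc_xprt_enqueue(xprt);
	}
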
* */ -#define RPCDBG_FACILITY RPCDBG_SVCSOCK +#define RPCDBG_FACILITY RPCDBG_SVCXPRT static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, int *errp, int flags); -static void svc_delete_socket(struct svc_sock *svsk); static void svc_udp_data_ready(struct sock *, int); static int svc_udp_recvfrom(struct svc_rqst *); static int svc_udp_sendto(struct svc_rqst *); -static void svc_close_socket(struct svc_sock *svsk); +static void svc_sock_detach(struct svc_xprt *); +static void svc_sock_free(struct svc_xprt *); -static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk); -static int svc_deferred_recv(struct svc_rqst *rqstp); -static struct cache_deferred_req *svc_defer(struct cache_req *req); - -/* apparently the "standard" is that clients close - * idle connections after 5 minutes, servers after - * 6 minutes - * http://www.connectathon.org/talks96/nfstcp.pdf - */ -static int svc_conn_age_period = 6*60; +static struct svc_xprt * +svc_create_socket(struct svc_serv *, int, struct sockaddr *, int, int); #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key svc_key[2]; @@ -162,40 +155,21 @@ char *svc_print_addr(struct svc_rqst *rq EXPORT_SYMBOL_GPL(svc_print_addr); /* - * Queue up an idle server thread. Must have pool->sp_lock held. - * Note: this is really a stack rather than a queue, so that we only - * use as many different threads as we need, and the rest don't pollute - * the cache. - */ -static inline void -svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp) -{ - list_add(&rqstp->rq_list, &pool->sp_threads); -} - -/* - * Dequeue an nfsd thread. Must have pool->sp_lock held. - */ -static inline void -svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp) -{ - list_del(&rqstp->rq_list); -} - -/* * Release an skbuff after use */ -static inline void +static void svc_release_skb(struct svc_rqst *rqstp) { - struct sk_buff *skb = rqstp->rq_skbuff; + struct sk_buff *skb = rqstp->rq_xprt_ctxt; struct svc_deferred_req *dr = rqstp->rq_deferred; if (skb) { - rqstp->rq_skbuff = NULL; + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + rqstp->rq_xprt_ctxt = NULL; dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); - skb_free_datagram(rqstp->rq_sock->sk_sk, skb); + skb_free_datagram(svsk->sk_sk, skb); } if (dr) { rqstp->rq_deferred = NULL; @@ -219,237 +193,6 @@ svc_sock_wspace(struct svc_sock *svsk) return wspace; } -/* - * Queue up a socket with data pending. If there are idle nfsd - * processes, wake 'em up. - * - */ -static void -svc_sock_enqueue(struct svc_sock *svsk) -{ - struct svc_serv *serv = svsk->sk_server; - struct svc_pool *pool; - struct svc_rqst *rqstp; - int cpu; - - if (!(svsk->sk_flags & - ( (1<sk_flags)) - return; - - cpu = get_cpu(); - pool = svc_pool_for_cpu(svsk->sk_server, cpu); - put_cpu(); - - spin_lock_bh(&pool->sp_lock); - - if (!list_empty(&pool->sp_threads) && - !list_empty(&pool->sp_sockets)) - printk(KERN_ERR - "svc_sock_enqueue: threads and sockets both waiting??\n"); - - if (test_bit(SK_DEAD, &svsk->sk_flags)) { - /* Don't enqueue dead sockets */ - dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk); - goto out_unlock; - } - - /* Mark socket as busy. It will remain in this state until the - * server has processed all pending data and put the socket back - * on the idle list. We update SK_BUSY atomically because - * it also guards against trying to enqueue the svc_sock twice. 
- */ - if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) { - /* Don't enqueue socket while already enqueued */ - dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk); - goto out_unlock; - } - BUG_ON(svsk->sk_pool != NULL); - svsk->sk_pool = pool; - - set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); - if (((atomic_read(&svsk->sk_reserved) + serv->sv_max_mesg)*2 - > svc_sock_wspace(svsk)) - && !test_bit(SK_CLOSE, &svsk->sk_flags) - && !test_bit(SK_CONN, &svsk->sk_flags)) { - /* Don't enqueue while not enough space for reply */ - dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n", - svsk->sk_sk, atomic_read(&svsk->sk_reserved)+serv->sv_max_mesg, - svc_sock_wspace(svsk)); - svsk->sk_pool = NULL; - clear_bit(SK_BUSY, &svsk->sk_flags); - goto out_unlock; - } - clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); - - - if (!list_empty(&pool->sp_threads)) { - rqstp = list_entry(pool->sp_threads.next, - struct svc_rqst, - rq_list); - dprintk("svc: socket %p served by daemon %p\n", - svsk->sk_sk, rqstp); - svc_thread_dequeue(pool, rqstp); - if (rqstp->rq_sock) - printk(KERN_ERR - "svc_sock_enqueue: server %p, rq_sock=%p!\n", - rqstp, rqstp->rq_sock); - rqstp->rq_sock = svsk; - atomic_inc(&svsk->sk_inuse); - rqstp->rq_reserved = serv->sv_max_mesg; - atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); - BUG_ON(svsk->sk_pool != pool); - wake_up(&rqstp->rq_wait); - } else { - dprintk("svc: socket %p put into queue\n", svsk->sk_sk); - list_add_tail(&svsk->sk_ready, &pool->sp_sockets); - BUG_ON(svsk->sk_pool != pool); - } - -out_unlock: - spin_unlock_bh(&pool->sp_lock); -} - -/* - * Dequeue the first socket. Must be called with the pool->sp_lock held. - */ -static inline struct svc_sock * -svc_sock_dequeue(struct svc_pool *pool) -{ - struct svc_sock *svsk; - - if (list_empty(&pool->sp_sockets)) - return NULL; - - svsk = list_entry(pool->sp_sockets.next, - struct svc_sock, sk_ready); - list_del_init(&svsk->sk_ready); - - dprintk("svc: socket %p dequeued, inuse=%d\n", - svsk->sk_sk, atomic_read(&svsk->sk_inuse)); - - return svsk; -} - -/* - * Having read something from a socket, check whether it - * needs to be re-enqueued. - * Note: SK_DATA only gets cleared when a read-attempt finds - * no (or insufficient) data. - */ -static inline void -svc_sock_received(struct svc_sock *svsk) -{ - svsk->sk_pool = NULL; - clear_bit(SK_BUSY, &svsk->sk_flags); - svc_sock_enqueue(svsk); -} - - -/** - * svc_reserve - change the space reserved for the reply to a request. - * @rqstp: The request in question - * @space: new max space to reserve - * - * Each request reserves some space on the output queue of the socket - * to make sure the reply fits. This function reduces that reserved - * space to be the amount of space used already, plus @space. - * - */ -void svc_reserve(struct svc_rqst *rqstp, int space) -{ - space += rqstp->rq_res.head[0].iov_len; - - if (space < rqstp->rq_reserved) { - struct svc_sock *svsk = rqstp->rq_sock; - atomic_sub((rqstp->rq_reserved - space), &svsk->sk_reserved); - rqstp->rq_reserved = space; - - svc_sock_enqueue(svsk); - } -} - -/* - * Release a socket after use. - */ -static inline void -svc_sock_put(struct svc_sock *svsk) -{ - if (atomic_dec_and_test(&svsk->sk_inuse)) { - BUG_ON(! 
test_bit(SK_DEAD, &svsk->sk_flags)); - - dprintk("svc: releasing dead socket\n"); - if (svsk->sk_sock->file) - sockfd_put(svsk->sk_sock); - else - sock_release(svsk->sk_sock); - if (svsk->sk_info_authunix != NULL) - svcauth_unix_info_release(svsk->sk_info_authunix); - kfree(svsk); - } -} - -static void -svc_sock_release(struct svc_rqst *rqstp) -{ - struct svc_sock *svsk = rqstp->rq_sock; - - svc_release_skb(rqstp); - - svc_free_res_pages(rqstp); - rqstp->rq_res.page_len = 0; - rqstp->rq_res.page_base = 0; - - - /* Reset response buffer and release - * the reservation. - * But first, check that enough space was reserved - * for the reply, otherwise we have a bug! - */ - if ((rqstp->rq_res.len) > rqstp->rq_reserved) - printk(KERN_ERR "RPC request reserved %d but used %d\n", - rqstp->rq_reserved, - rqstp->rq_res.len); - - rqstp->rq_res.head[0].iov_len = 0; - svc_reserve(rqstp, 0); - rqstp->rq_sock = NULL; - - svc_sock_put(svsk); -} - -/* - * External function to wake up a server waiting for data - * This really only makes sense for services like lockd - * which have exactly one thread anyway. - */ -void -svc_wake_up(struct svc_serv *serv) -{ - struct svc_rqst *rqstp; - unsigned int i; - struct svc_pool *pool; - - for (i = 0; i < serv->sv_nrpools; i++) { - pool = &serv->sv_pools[i]; - - spin_lock_bh(&pool->sp_lock); - if (!list_empty(&pool->sp_threads)) { - rqstp = list_entry(pool->sp_threads.next, - struct svc_rqst, - rq_list); - dprintk("svc: daemon %p woken up.\n", rqstp); - /* - svc_thread_dequeue(pool, rqstp); - rqstp->rq_sock = NULL; - */ - wake_up(&rqstp->rq_wait); - } - spin_unlock_bh(&pool->sp_lock); - } -} - union svc_pktinfo_u { struct in_pktinfo pkti; struct in6_pktinfo pkti6; @@ -459,7 +202,9 @@ union svc_pktinfo_u { static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) { - switch (rqstp->rq_sock->sk_sk->sk_family) { + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + switch (svsk->sk_sk->sk_family) { case AF_INET: { struct in_pktinfo *pki = CMSG_DATA(cmh); @@ -492,7 +237,8 @@ static void svc_set_cmsg_data(struct svc static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) { - struct svc_sock *svsk = rqstp->rq_sock; + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); struct socket *sock = svsk->sk_sock; int slen; union { @@ -565,7 +311,7 @@ svc_sendto(struct svc_rqst *rqstp, struc } out: dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n", - rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, + svsk, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf))); return len; @@ -602,7 +348,7 @@ svc_sock_names(char *buf, struct svc_ser if (!serv) return 0; spin_lock_bh(&serv->sv_lock); - list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) { + list_for_each_entry(svsk, &serv->sv_permsocks, sk_xprt.xpt_list) { int onelen = one_sock_name(buf+len, svsk); if (toclose && strcmp(toclose, buf+len) == 0) closesk = svsk; @@ -614,7 +360,7 @@ svc_sock_names(char *buf, struct svc_ser /* Should unregister with portmap, but you cannot * unregister just one protocol... 
*/ - svc_close_socket(closesk); + svc_close_xprt(&closesk->sk_xprt); else if (toclose) return -ENOENT; return len; @@ -641,37 +387,21 @@ svc_recv_available(struct svc_sock *svsk static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, int buflen) { - struct svc_sock *svsk = rqstp->rq_sock; + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; - struct sockaddr *sin; int len; + /* TCP/UDP have no transport header */ + rqstp->rq_xprt_hlen = 0; + len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen, msg.msg_flags); - /* sock_recvmsg doesn't fill in the name/namelen, so we must.. - */ - memcpy(&rqstp->rq_addr, &svsk->sk_remote, svsk->sk_remotelen); - rqstp->rq_addrlen = svsk->sk_remotelen; - - /* Destination address in request is needed for binding the - * source address in RPC callbacks later. - */ - sin = (struct sockaddr *)&svsk->sk_local; - switch (sin->sa_family) { - case AF_INET: - rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr; - break; - case AF_INET6: - rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr; - break; - } - dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", svsk, iov[0].iov_base, iov[0].iov_len, len); - return len; } @@ -711,9 +441,10 @@ svc_udp_data_ready(struct sock *sk, int if (svsk) { dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n", - svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags)); - set_bit(SK_DATA, &svsk->sk_flags); - svc_sock_enqueue(svsk); + svsk, sk, count, + test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); } if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible(sk->sk_sleep); @@ -729,8 +460,8 @@ svc_write_space(struct sock *sk) if (svsk) { dprintk("svc: socket %p(inet %p), write_space busy=%d\n", - svsk, sk, test_bit(SK_BUSY, &svsk->sk_flags)); - svc_sock_enqueue(svsk); + svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); + svc_xprt_enqueue(&svsk->sk_xprt); } if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) { @@ -743,7 +474,9 @@ svc_write_space(struct sock *sk) static inline void svc_udp_get_dest_address(struct svc_rqst *rqstp, struct cmsghdr *cmh) { - switch (rqstp->rq_sock->sk_sk->sk_family) { + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + switch (svsk->sk_sk->sk_family) { case AF_INET: { struct in_pktinfo *pki = CMSG_DATA(cmh); rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr; @@ -763,8 +496,9 @@ static inline void svc_udp_get_dest_addr static int svc_udp_recvfrom(struct svc_rqst *rqstp) { - struct svc_sock *svsk = rqstp->rq_sock; - struct svc_serv *serv = svsk->sk_server; + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + struct svc_serv *serv = svsk->sk_xprt.xpt_server; struct sk_buff *skb; union { struct cmsghdr hdr; @@ -779,7 +513,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) .msg_flags = MSG_DONTWAIT, }; - if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) + if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) /* udp sockets need large rcvbuf as all pending * requests are still in that buffer. 
sndbuf must * also be large enough that there is enough space @@ -792,17 +526,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) (serv->sv_nrthreads+3) * serv->sv_max_mesg, (serv->sv_nrthreads+3) * serv->sv_max_mesg); - if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { - svc_sock_received(svsk); - return svc_deferred_recv(rqstp); - } - - if (test_bit(SK_CLOSE, &svsk->sk_flags)) { - svc_delete_socket(svsk); - return 0; - } - - clear_bit(SK_DATA, &svsk->sk_flags); + clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); skb = NULL; err = kernel_recvmsg(svsk->sk_sock, &msg, NULL, 0, 0, MSG_PEEK | MSG_DONTWAIT); @@ -813,9 +537,9 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) if (err != -EAGAIN) { /* possibly an icmp error */ dprintk("svc: recvfrom returned error %d\n", -err); - set_bit(SK_DATA, &svsk->sk_flags); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); } - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); return -EAGAIN; } rqstp->rq_addrlen = sizeof(rqstp->rq_addr); @@ -825,12 +549,12 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) need that much accuracy */ } svsk->sk_sk->sk_stamp = skb->tstamp; - set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */ + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */ /* * Maybe more packets - kick another thread ASAP. */ - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); len = skb->len - sizeof(struct udphdr); rqstp->rq_arg.len = len; @@ -867,7 +591,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) skb_free_datagram(svsk->sk_sk, skb); return 0; } - rqstp->rq_skbuff = skb; + rqstp->rq_xprt_ctxt = skb; } rqstp->rq_arg.page_base = 0; @@ -901,26 +625,83 @@ svc_udp_sendto(struct svc_rqst *rqstp) } static void -svc_udp_init(struct svc_sock *svsk) +svc_udp_prep_reply_hdr(struct svc_rqst *rqstp) +{ +} + +static int +svc_udp_has_wspace(struct svc_xprt *xprt) +{ + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct svc_serv *serv = svsk->sk_xprt.xpt_server; + int required; + + /* + * Set the SOCK_NOSPACE flag before checking the available + * sock space. 
+ */ + set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg; + if (required*2 > sock_wspace(svsk->sk_sk)) + return 0; + clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + return 1; +} + +static struct svc_xprt * +svc_udp_accept(struct svc_xprt *xprt) +{ + BUG(); + return NULL; +} + +static struct svc_xprt * +svc_udp_create(struct svc_serv *serv, struct sockaddr *sa, int flags) +{ + return svc_create_socket(serv, IPPROTO_UDP, sa, + sizeof(struct sockaddr_in), flags); +} + +static struct svc_xprt_ops svc_udp_ops = { + .xpo_create = svc_udp_create, + .xpo_recvfrom = svc_udp_recvfrom, + .xpo_sendto = svc_udp_sendto, + .xpo_release = svc_release_skb, + .xpo_detach = svc_sock_detach, + .xpo_free = svc_sock_free, + .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr, + .xpo_has_wspace = svc_udp_has_wspace, + .xpo_accept = svc_udp_accept, +}; + +static struct svc_xprt_class svc_udp_class = { + .xcl_name = "udp", + .xcl_owner = THIS_MODULE, + .xcl_ops = &svc_udp_ops, + .xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP, +}; + +static void +svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) { int one = 1; mm_segment_t oldfs; + svc_xprt_init(&svc_udp_class, &svsk->sk_xprt, serv); + clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); svsk->sk_sk->sk_data_ready = svc_udp_data_ready; svsk->sk_sk->sk_write_space = svc_write_space; - svsk->sk_recvfrom = svc_udp_recvfrom; - svsk->sk_sendto = svc_udp_sendto; /* initialise setting must have enough space to * receive and respond to one request. * svc_udp_recvfrom will re-adjust if necessary */ svc_sock_setbufsize(svsk->sk_sock, - 3 * svsk->sk_server->sv_max_mesg, - 3 * svsk->sk_server->sv_max_mesg); + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg, + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg); - set_bit(SK_DATA, &svsk->sk_flags); /* might have come in before data_ready set up */ - set_bit(SK_CHNGBUF, &svsk->sk_flags); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* might have come in before data_ready set up */ + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); oldfs = get_fs(); set_fs(KERNEL_DS); @@ -954,8 +735,8 @@ svc_tcp_listen_data_ready(struct sock *s */ if (sk->sk_state == TCP_LISTEN) { if (svsk) { - set_bit(SK_CONN, &svsk->sk_flags); - svc_sock_enqueue(svsk); + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); } else printk("svc: socket %p: no user data\n", sk); } @@ -978,8 +759,8 @@ svc_tcp_state_change(struct sock *sk) if (!svsk) printk("svc: socket %p: no user data\n", sk); else { - set_bit(SK_CLOSE, &svsk->sk_flags); - svc_sock_enqueue(svsk); + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); } if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible_all(sk->sk_sleep); @@ -993,36 +774,23 @@ svc_tcp_data_ready(struct sock *sk, int dprintk("svc: socket %p TCP data ready (svsk %p)\n", sk, sk->sk_user_data); if (svsk) { - set_bit(SK_DATA, &svsk->sk_flags); - svc_sock_enqueue(svsk); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); } if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible(sk->sk_sleep); } -static inline int svc_port_is_privileged(struct sockaddr *sin) -{ - switch (sin->sa_family) { - case AF_INET: - return ntohs(((struct sockaddr_in *)sin)->sin_port) - < PROT_SOCK; - case AF_INET6: - return ntohs(((struct sockaddr_in6 *)sin)->sin6_port) - < PROT_SOCK; - default: - return 0; - } -} - /* * Accept a TCP connection */ -static void -svc_tcp_accept(struct 
svc_sock *svsk) +static struct svc_xprt * +svc_tcp_accept(struct svc_xprt *xprt) { + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); struct sockaddr_storage addr; struct sockaddr *sin = (struct sockaddr *) &addr; - struct svc_serv *serv = svsk->sk_server; + struct svc_serv *serv = svsk->sk_xprt.xpt_server; struct socket *sock = svsk->sk_sock; struct socket *newsock; struct svc_sock *newsvsk; @@ -1031,9 +799,9 @@ svc_tcp_accept(struct svc_sock *svsk) dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); if (!sock) - return; + return NULL; - clear_bit(SK_CONN, &svsk->sk_flags); + clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); err = kernel_accept(sock, &newsock, O_NONBLOCK); if (err < 0) { if (err == -ENOMEM) @@ -1042,11 +810,11 @@ svc_tcp_accept(struct svc_sock *svsk) else if (err != -EAGAIN && net_ratelimit()) printk(KERN_WARNING "%s: accept failed (err %d)!\n", serv->sv_name, -err); - return; + return NULL; } - set_bit(SK_CONN, &svsk->sk_flags); - svc_sock_enqueue(svsk); + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); err = kernel_getpeername(newsock, sin, &slen); if (err < 0) { @@ -1077,70 +845,23 @@ svc_tcp_accept(struct svc_sock *svsk) if (!(newsvsk = svc_setup_socket(serv, newsock, &err, (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY)))) goto failed; - memcpy(&newsvsk->sk_remote, sin, slen); - newsvsk->sk_remotelen = slen; + memcpy(&newsvsk->sk_xprt.xpt_remote, sin, slen); + newsvsk->sk_xprt.xpt_remotelen = slen; err = kernel_getsockname(newsock, sin, &slen); if (unlikely(err < 0)) { dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err); slen = offsetof(struct sockaddr, sa_data); } - memcpy(&newsvsk->sk_local, sin, slen); - - svc_sock_received(newsvsk); - - /* make sure that we don't have too many active connections. - * If we have, something must be dropped. - * - * There's no point in trying to do random drop here for - * DoS prevention. The NFS clients does 1 reconnect in 15 - * seconds. An attacker can easily beat that. - * - * The only somewhat efficient mechanism would be if drop - * old connections from the same IP first. But right now - * we don't even record the client IP in svc_sock. - */ - if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) { - struct svc_sock *svsk = NULL; - spin_lock_bh(&serv->sv_lock); - if (!list_empty(&serv->sv_tempsocks)) { - if (net_ratelimit()) { - /* Try to help the admin */ - printk(KERN_NOTICE "%s: too many open TCP " - "sockets, consider increasing the " - "number of nfsd threads\n", - serv->sv_name); - printk(KERN_NOTICE - "%s: last TCP connect from %s\n", - serv->sv_name, __svc_print_addr(sin, - buf, sizeof(buf))); - } - /* - * Always select the oldest socket. 
It's not fair, - * but so is life - */ - svsk = list_entry(serv->sv_tempsocks.prev, - struct svc_sock, - sk_list); - set_bit(SK_CLOSE, &svsk->sk_flags); - atomic_inc(&svsk->sk_inuse); - } - spin_unlock_bh(&serv->sv_lock); - - if (svsk) { - svc_sock_enqueue(svsk); - svc_sock_put(svsk); - } - - } + memcpy(&newsvsk->sk_xprt.xpt_local, sin, slen); if (serv->sv_stats) serv->sv_stats->nettcpconn++; - return; + return &newsvsk->sk_xprt; failed: sock_release(newsock); - return; + return NULL; } /* @@ -1149,34 +870,19 @@ failed: static int svc_tcp_recvfrom(struct svc_rqst *rqstp) { - struct svc_sock *svsk = rqstp->rq_sock; - struct svc_serv *serv = svsk->sk_server; + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + struct svc_serv *serv = svsk->sk_xprt.xpt_server; int len; struct kvec *vec; int pnum, vlen; dprintk("svc: tcp_recv %p data %d conn %d close %d\n", - svsk, test_bit(SK_DATA, &svsk->sk_flags), - test_bit(SK_CONN, &svsk->sk_flags), - test_bit(SK_CLOSE, &svsk->sk_flags)); - - if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { - svc_sock_received(svsk); - return svc_deferred_recv(rqstp); - } + svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags), + test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags), + test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); - if (test_bit(SK_CLOSE, &svsk->sk_flags)) { - svc_delete_socket(svsk); - return 0; - } - - if (svsk->sk_sk->sk_state == TCP_LISTEN) { - svc_tcp_accept(svsk); - svc_sock_received(svsk); - return 0; - } - - if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) + if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) /* sndbuf needs to have room for one request * per thread, otherwise we can stall even when the * network isn't a bottleneck. @@ -1193,7 +899,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) (serv->sv_nrthreads+3) * serv->sv_max_mesg, 3 * serv->sv_max_mesg); - clear_bit(SK_DATA, &svsk->sk_flags); + clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* Receive data. If we haven't got the record length yet, get * the next four bytes. 
Otherwise try to gobble up as much as @@ -1212,7 +918,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) if (len < want) { dprintk("svc: short recvfrom while reading record length (%d of %lu)\n", len, want); - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); return -EAGAIN; /* record header not complete */ } @@ -1248,11 +954,11 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) if (len < svsk->sk_reclen) { dprintk("svc: incomplete TCP record (%d of %d)\n", len, svsk->sk_reclen); - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); return -EAGAIN; /* record not complete */ } len = svsk->sk_reclen; - set_bit(SK_DATA, &svsk->sk_flags); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); vec = rqstp->rq_vec; vec[0] = rqstp->rq_arg.head[0]; @@ -1281,30 +987,30 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; } - rqstp->rq_skbuff = NULL; + rqstp->rq_xprt_ctxt = NULL; rqstp->rq_prot = IPPROTO_TCP; /* Reset TCP read info */ svsk->sk_reclen = 0; svsk->sk_tcplen = 0; - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); if (serv->sv_stats) serv->sv_stats->nettcpcnt++; return len; err_delete: - svc_delete_socket(svsk); + svc_delete_xprt(&svsk->sk_xprt); return -EAGAIN; error: if (len == -EAGAIN) { dprintk("RPC: TCP recvfrom got EAGAIN\n"); - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); } else { printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", - svsk->sk_server->sv_name, -len); + svsk->sk_xprt.xpt_server->sv_name, -len); goto err_delete; } @@ -1328,35 +1034,103 @@ svc_tcp_sendto(struct svc_rqst *rqstp) reclen = htonl(0x80000000|((xbufp->len ) - 4)); memcpy(xbufp->head[0].iov_base, &reclen, 4); - if (test_bit(SK_DEAD, &rqstp->rq_sock->sk_flags)) + if (test_bit(XPT_DEAD, &rqstp->rq_xprt->xpt_flags)) return -ENOTCONN; sent = svc_sendto(rqstp, &rqstp->rq_res); if (sent != xbufp->len) { printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", - rqstp->rq_sock->sk_server->sv_name, + rqstp->rq_xprt->xpt_server->sv_name, (sent<0)?"got error":"sent only", sent, xbufp->len); - set_bit(SK_CLOSE, &rqstp->rq_sock->sk_flags); - svc_sock_enqueue(rqstp->rq_sock); + set_bit(XPT_CLOSE, &rqstp->rq_xprt->xpt_flags); + svc_xprt_enqueue(rqstp->rq_xprt); sent = -EAGAIN; } return sent; } +/* + * Setup response header. TCP has a 4B record length field. + */ +static void +svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp) +{ + struct kvec *resv = &rqstp->rq_res.head[0]; + + /* tcp needs a space for the record length... */ + svc_putnl(resv, 0); +} + +static int +svc_tcp_has_wspace(struct svc_xprt *xprt) +{ + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct svc_serv *serv = svsk->sk_xprt.xpt_server; + int required; + + /* + * Set the SOCK_NOSPACE flag before checking the available + * sock space. 
+ */ + set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg; + if (required*2 > sk_stream_wspace(svsk->sk_sk)) + return 0; + clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + return 1; +} + +static struct svc_xprt * +svc_tcp_create(struct svc_serv *serv, struct sockaddr *sa, int flags) +{ + return svc_create_socket(serv, IPPROTO_TCP, sa, + sizeof(struct sockaddr_in), flags); +} + +static struct svc_xprt_ops svc_tcp_ops = { + .xpo_create = svc_tcp_create, + .xpo_recvfrom = svc_tcp_recvfrom, + .xpo_sendto = svc_tcp_sendto, + .xpo_release = svc_release_skb, + .xpo_detach = svc_sock_detach, + .xpo_free = svc_sock_free, + .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr, + .xpo_has_wspace = svc_tcp_has_wspace, + .xpo_accept = svc_tcp_accept, +}; + +static struct svc_xprt_class svc_tcp_class = { + .xcl_name = "tcp", + .xcl_owner = THIS_MODULE, + .xcl_ops = &svc_tcp_ops, + .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, +}; + +void svc_init_xprt_sock(void) +{ + svc_reg_xprt_class(&svc_tcp_class); + svc_reg_xprt_class(&svc_udp_class); +} + +void svc_cleanup_xprt_sock(void) +{ + svc_unreg_xprt_class(&svc_tcp_class); + svc_unreg_xprt_class(&svc_udp_class); +} + static void -svc_tcp_init(struct svc_sock *svsk) +svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) { struct sock *sk = svsk->sk_sk; struct tcp_sock *tp = tcp_sk(sk); - svsk->sk_recvfrom = svc_tcp_recvfrom; - svsk->sk_sendto = svc_tcp_sendto; - + svc_xprt_init(&svc_tcp_class, &svsk->sk_xprt, serv); + set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); if (sk->sk_state == TCP_LISTEN) { dprintk("setting up TCP socket for listening\n"); sk->sk_data_ready = svc_tcp_listen_data_ready; - set_bit(SK_CONN, &svsk->sk_flags); + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); } else { dprintk("setting up TCP socket for reading\n"); sk->sk_state_change = svc_tcp_state_change; @@ -1373,13 +1147,13 @@ svc_tcp_init(struct svc_sock *svsk) * svc_tcp_recvfrom will re-adjust if necessary */ svc_sock_setbufsize(svsk->sk_sock, - 3 * svsk->sk_server->sv_max_mesg, - 3 * svsk->sk_server->sv_max_mesg); + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg, + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg); - set_bit(SK_CHNGBUF, &svsk->sk_flags); - set_bit(SK_DATA, &svsk->sk_flags); + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); if (sk->sk_state != TCP_ESTABLISHED) - set_bit(SK_CLOSE, &svsk->sk_flags); + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); } } @@ -1395,229 +1169,15 @@ svc_sock_update_bufs(struct svc_serv *se spin_lock_bh(&serv->sv_lock); list_for_each(le, &serv->sv_permsocks) { struct svc_sock *svsk = - list_entry(le, struct svc_sock, sk_list); - set_bit(SK_CHNGBUF, &svsk->sk_flags); + list_entry(le, struct svc_sock, sk_xprt.xpt_list); + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); } list_for_each(le, &serv->sv_tempsocks) { struct svc_sock *svsk = - list_entry(le, struct svc_sock, sk_list); - set_bit(SK_CHNGBUF, &svsk->sk_flags); - } - spin_unlock_bh(&serv->sv_lock); -} - -/* - * Receive the next request on any socket. This code is carefully - * organised not to touch any cachelines in the shared svc_serv - * structure, only cachelines in the local svc_pool. 
- */ -int -svc_recv(struct svc_rqst *rqstp, long timeout) -{ - struct svc_sock *svsk = NULL; - struct svc_serv *serv = rqstp->rq_server; - struct svc_pool *pool = rqstp->rq_pool; - int len, i; - int pages; - struct xdr_buf *arg; - DECLARE_WAITQUEUE(wait, current); - - dprintk("svc: server %p waiting for data (to = %ld)\n", - rqstp, timeout); - - if (rqstp->rq_sock) - printk(KERN_ERR - "svc_recv: service %p, socket not NULL!\n", - rqstp); - if (waitqueue_active(&rqstp->rq_wait)) - printk(KERN_ERR - "svc_recv: service %p, wait queue active!\n", - rqstp); - - - /* now allocate needed pages. If we get a failure, sleep briefly */ - pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE; - for (i=0; i < pages ; i++) - while (rqstp->rq_pages[i] == NULL) { - struct page *p = alloc_page(GFP_KERNEL); - if (!p) - schedule_timeout_uninterruptible(msecs_to_jiffies(500)); - rqstp->rq_pages[i] = p; - } - rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */ - BUG_ON(pages >= RPCSVC_MAXPAGES); - - /* Make arg->head point to first page and arg->pages point to rest */ - arg = &rqstp->rq_arg; - arg->head[0].iov_base = page_address(rqstp->rq_pages[0]); - arg->head[0].iov_len = PAGE_SIZE; - arg->pages = rqstp->rq_pages + 1; - arg->page_base = 0; - /* save at least one page for response */ - arg->page_len = (pages-2)*PAGE_SIZE; - arg->len = (pages-1)*PAGE_SIZE; - arg->tail[0].iov_len = 0; - - try_to_freeze(); - cond_resched(); - if (signalled()) - return -EINTR; - - spin_lock_bh(&pool->sp_lock); - if ((svsk = svc_sock_dequeue(pool)) != NULL) { - rqstp->rq_sock = svsk; - atomic_inc(&svsk->sk_inuse); - rqstp->rq_reserved = serv->sv_max_mesg; - atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); - } else { - /* No data pending. Go to sleep */ - svc_thread_enqueue(pool, rqstp); - - /* - * We have to be able to interrupt this wait - * to bring down the daemons ... - */ - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&rqstp->rq_wait, &wait); - spin_unlock_bh(&pool->sp_lock); - - schedule_timeout(timeout); - - try_to_freeze(); - - spin_lock_bh(&pool->sp_lock); - remove_wait_queue(&rqstp->rq_wait, &wait); - - if (!(svsk = rqstp->rq_sock)) { - svc_thread_dequeue(pool, rqstp); - spin_unlock_bh(&pool->sp_lock); - dprintk("svc: server %p, no data yet\n", rqstp); - return signalled()? -EINTR : -EAGAIN; - } - } - spin_unlock_bh(&pool->sp_lock); - - dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n", - rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse)); - len = svsk->sk_recvfrom(rqstp); - dprintk("svc: got len=%d\n", len); - - /* No data, incomplete (TCP) read, or accept() */ - if (len == 0 || len == -EAGAIN) { - rqstp->rq_res.len = 0; - svc_sock_release(rqstp); - return -EAGAIN; - } - svsk->sk_lastrecv = get_seconds(); - clear_bit(SK_OLD, &svsk->sk_flags); - - rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp)); - rqstp->rq_chandle.defer = svc_defer; - - if (serv->sv_stats) - serv->sv_stats->netcnt++; - return len; -} - -/* - * Drop request - */ -void -svc_drop(struct svc_rqst *rqstp) -{ - dprintk("svc: socket %p dropped request\n", rqstp->rq_sock); - svc_sock_release(rqstp); -} - -/* - * Return reply to client. 
- */ -int -svc_send(struct svc_rqst *rqstp) -{ - struct svc_sock *svsk; - int len; - struct xdr_buf *xb; - - if ((svsk = rqstp->rq_sock) == NULL) { - printk(KERN_WARNING "NULL socket pointer in %s:%d\n", - __FILE__, __LINE__); - return -EFAULT; - } - - /* release the receive skb before sending the reply */ - svc_release_skb(rqstp); - - /* calculate over-all length */ - xb = & rqstp->rq_res; - xb->len = xb->head[0].iov_len + - xb->page_len + - xb->tail[0].iov_len; - - /* Grab svsk->sk_mutex to serialize outgoing data. */ - mutex_lock(&svsk->sk_mutex); - if (test_bit(SK_DEAD, &svsk->sk_flags)) - len = -ENOTCONN; - else - len = svsk->sk_sendto(rqstp); - mutex_unlock(&svsk->sk_mutex); - svc_sock_release(rqstp); - - if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) - return 0; - return len; -} - -/* - * Timer function to close old temporary sockets, using - * a mark-and-sweep algorithm. - */ -static void -svc_age_temp_sockets(unsigned long closure) -{ - struct svc_serv *serv = (struct svc_serv *)closure; - struct svc_sock *svsk; - struct list_head *le, *next; - LIST_HEAD(to_be_aged); - - dprintk("svc_age_temp_sockets\n"); - - if (!spin_trylock_bh(&serv->sv_lock)) { - /* busy, try again 1 sec later */ - dprintk("svc_age_temp_sockets: busy\n"); - mod_timer(&serv->sv_temptimer, jiffies + HZ); - return; - } - - list_for_each_safe(le, next, &serv->sv_tempsocks) { - svsk = list_entry(le, struct svc_sock, sk_list); - - if (!test_and_set_bit(SK_OLD, &svsk->sk_flags)) - continue; - if (atomic_read(&svsk->sk_inuse) > 1 || test_bit(SK_BUSY, &svsk->sk_flags)) - continue; - atomic_inc(&svsk->sk_inuse); - list_move(le, &to_be_aged); - set_bit(SK_CLOSE, &svsk->sk_flags); - set_bit(SK_DETACHED, &svsk->sk_flags); + list_entry(le, struct svc_sock, sk_xprt.xpt_list); + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); } spin_unlock_bh(&serv->sv_lock); - - while (!list_empty(&to_be_aged)) { - le = to_be_aged.next; - /* fiddling the sk_list node is safe 'cos we're SK_DETACHED */ - list_del_init(le); - svsk = list_entry(le, struct svc_sock, sk_list); - - dprintk("queuing svsk %p for closing, %lu seconds old\n", - svsk, get_seconds() - svsk->sk_lastrecv); - - /* a thread will dequeue and close it soon */ - svc_sock_enqueue(svsk); - svc_sock_put(svsk); - } - - mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); } /* @@ -1631,7 +1191,6 @@ static struct svc_sock *svc_setup_socket struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); - int is_temporary = flags & SVC_SOCK_TEMPORARY; dprintk("svc: svc_setup_socket %p\n", sock); if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { @@ -1651,44 +1210,19 @@ static struct svc_sock *svc_setup_socket return NULL; } - set_bit(SK_BUSY, &svsk->sk_flags); + set_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags); inet->sk_user_data = svsk; svsk->sk_sock = sock; svsk->sk_sk = inet; svsk->sk_ostate = inet->sk_state_change; svsk->sk_odata = inet->sk_data_ready; svsk->sk_owspace = inet->sk_write_space; - svsk->sk_server = serv; - atomic_set(&svsk->sk_inuse, 1); - svsk->sk_lastrecv = get_seconds(); - spin_lock_init(&svsk->sk_lock); - INIT_LIST_HEAD(&svsk->sk_deferred); - INIT_LIST_HEAD(&svsk->sk_ready); - mutex_init(&svsk->sk_mutex); /* Initialize the socket */ if (sock->type == SOCK_DGRAM) - svc_udp_init(svsk); + svc_udp_init(svsk, serv); else - svc_tcp_init(svsk); - - spin_lock_bh(&serv->sv_lock); - if (is_temporary) { - set_bit(SK_TEMP, &svsk->sk_flags); - list_add(&svsk->sk_list, &serv->sv_tempsocks); - serv->sv_tmpcnt++; - if 
(serv->sv_temptimer.function == NULL) { - /* setup timer to age temp sockets */ - setup_timer(&serv->sv_temptimer, svc_age_temp_sockets, - (unsigned long)serv); - mod_timer(&serv->sv_temptimer, - jiffies + svc_conn_age_period * HZ); - } - } else { - clear_bit(SK_TEMP, &svsk->sk_flags); - list_add(&svsk->sk_list, &serv->sv_permsocks); - } - spin_unlock_bh(&serv->sv_lock); + svc_tcp_init(svsk, serv); dprintk("svc: svc_setup_socket created %p (inet %p)\n", svsk, svsk->sk_sk); @@ -1717,9 +1251,15 @@ int svc_addsock(struct svc_serv *serv, else { svsk = svc_setup_socket(serv, so, &err, SVC_SOCK_DEFAULTS); if (svsk) { - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); err = 0; } + if (so->sk->sk_protocol == IPPROTO_TCP) + set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); + clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags); + spin_lock_bh(&serv->sv_lock); + list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks); + spin_unlock_bh(&serv->sv_lock); } if (err) { sockfd_put(so); @@ -1733,8 +1273,9 @@ EXPORT_SYMBOL_GPL(svc_addsock); /* * Create socket for RPC service. */ -static int svc_create_socket(struct svc_serv *serv, int protocol, - struct sockaddr *sin, int len, int flags) +static struct svc_xprt * +svc_create_socket(struct svc_serv *serv, int protocol, + struct sockaddr *sin, int len, int flags) { struct svc_sock *svsk; struct socket *sock; @@ -1749,13 +1290,13 @@ static int svc_create_socket(struct svc_ if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) { printk(KERN_WARNING "svc: only UDP and TCP " "sockets supported\n"); - return -EINVAL; + return ERR_PTR(-EINVAL); } type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; error = sock_create_kern(sin->sa_family, type, protocol, &sock); if (error < 0) - return error; + return ERR_PTR(error); svc_reclassify_socket(sock); @@ -1771,197 +1312,48 @@ static int svc_create_socket(struct svc_ } if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) { - svc_sock_received(svsk); - return ntohs(inet_sk(svsk->sk_sk)->sport); + if (protocol == IPPROTO_TCP) + set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); + svc_xprt_received(&svsk->sk_xprt); + return (struct svc_xprt *)svsk; } bummer: dprintk("svc: svc_create_socket error = %d\n", -error); sock_release(sock); - return error; + return ERR_PTR(error); } /* - * Remove a dead socket + * Detach the svc_sock from the socket so that no + * more callbacks occur. */ static void -svc_delete_socket(struct svc_sock *svsk) +svc_sock_detach(struct svc_xprt *xprt) { - struct svc_serv *serv; - struct sock *sk; + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct sock *sk = svsk->sk_sk; - dprintk("svc: svc_delete_socket(%p)\n", svsk); - - serv = svsk->sk_server; - sk = svsk->sk_sk; + dprintk("svc: svc_sock_detach(%p)\n", svsk); + /* put back the old socket callbacks */ sk->sk_state_change = svsk->sk_ostate; sk->sk_data_ready = svsk->sk_odata; sk->sk_write_space = svsk->sk_owspace; - - spin_lock_bh(&serv->sv_lock); - - if (!test_and_set_bit(SK_DETACHED, &svsk->sk_flags)) - list_del_init(&svsk->sk_list); - /* - * We used to delete the svc_sock from whichever list - * it's sk_ready node was on, but we don't actually - * need to. This is because the only time we're called - * while still attached to a queue, the queue itself - * is about to be destroyed (in svc_destroy). 
- */ - if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) { - BUG_ON(atomic_read(&svsk->sk_inuse)<2); - atomic_dec(&svsk->sk_inuse); - if (test_bit(SK_TEMP, &svsk->sk_flags)) - serv->sv_tmpcnt--; - } - - spin_unlock_bh(&serv->sv_lock); -} - -static void svc_close_socket(struct svc_sock *svsk) -{ - set_bit(SK_CLOSE, &svsk->sk_flags); - if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) - /* someone else will have to effect the close */ - return; - - atomic_inc(&svsk->sk_inuse); - svc_delete_socket(svsk); - clear_bit(SK_BUSY, &svsk->sk_flags); - svc_sock_put(svsk); -} - -void svc_force_close_socket(struct svc_sock *svsk) -{ - set_bit(SK_CLOSE, &svsk->sk_flags); - if (test_bit(SK_BUSY, &svsk->sk_flags)) { - /* Waiting to be processed, but no threads left, - * So just remove it from the waiting list - */ - list_del_init(&svsk->sk_ready); - clear_bit(SK_BUSY, &svsk->sk_flags); - } - svc_close_socket(svsk); -} - -/** - * svc_makesock - Make a socket for nfsd and lockd - * @serv: RPC server structure - * @protocol: transport protocol to use - * @port: port to use - * @flags: requested socket characteristics - * - */ -int svc_makesock(struct svc_serv *serv, int protocol, unsigned short port, - int flags) -{ - struct sockaddr_in sin = { - .sin_family = AF_INET, - .sin_addr.s_addr = INADDR_ANY, - .sin_port = htons(port), - }; - - dprintk("svc: creating socket proto = %d\n", protocol); - return svc_create_socket(serv, protocol, (struct sockaddr *) &sin, - sizeof(sin), flags); -} - -/* - * Handle defer and revisit of requests - */ - -static void svc_revisit(struct cache_deferred_req *dreq, int too_many) -{ - struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle); - struct svc_sock *svsk; - - if (too_many) { - svc_sock_put(dr->svsk); - kfree(dr); - return; - } - dprintk("revisit queued\n"); - svsk = dr->svsk; - dr->svsk = NULL; - spin_lock(&svsk->sk_lock); - list_add(&dr->handle.recent, &svsk->sk_deferred); - spin_unlock(&svsk->sk_lock); - set_bit(SK_DEFERRED, &svsk->sk_flags); - svc_sock_enqueue(svsk); - svc_sock_put(svsk); -} - -static struct cache_deferred_req * -svc_defer(struct cache_req *req) -{ - struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); - int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len); - struct svc_deferred_req *dr; - - if (rqstp->rq_arg.page_len) - return NULL; /* if more than a page, give up FIXME */ - if (rqstp->rq_deferred) { - dr = rqstp->rq_deferred; - rqstp->rq_deferred = NULL; - } else { - int skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; - /* FIXME maybe discard if size too large */ - dr = kmalloc(size, GFP_KERNEL); - if (dr == NULL) - return NULL; - - dr->handle.owner = rqstp->rq_server; - dr->prot = rqstp->rq_prot; - memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen); - dr->addrlen = rqstp->rq_addrlen; - dr->daddr = rqstp->rq_daddr; - dr->argslen = rqstp->rq_arg.len >> 2; - memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2); - } - atomic_inc(&rqstp->rq_sock->sk_inuse); - dr->svsk = rqstp->rq_sock; - - dr->handle.revisit = svc_revisit; - return &dr->handle; } /* - * recv data from a deferred request into an active one + * Free the svc_sock's socket resources and the svc_sock itself. 
*/ -static int svc_deferred_recv(struct svc_rqst *rqstp) -{ - struct svc_deferred_req *dr = rqstp->rq_deferred; - - rqstp->rq_arg.head[0].iov_base = dr->args; - rqstp->rq_arg.head[0].iov_len = dr->argslen<<2; - rqstp->rq_arg.page_len = 0; - rqstp->rq_arg.len = dr->argslen<<2; - rqstp->rq_prot = dr->prot; - memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen); - rqstp->rq_addrlen = dr->addrlen; - rqstp->rq_daddr = dr->daddr; - rqstp->rq_respages = rqstp->rq_pages; - return dr->argslen<<2; -} - - -static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk) +static void +svc_sock_free(struct svc_xprt *xprt) { - struct svc_deferred_req *dr = NULL; + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + dprintk("svc: svc_sock_free(%p)\n", svsk); - if (!test_bit(SK_DEFERRED, &svsk->sk_flags)) - return NULL; - spin_lock(&svsk->sk_lock); - clear_bit(SK_DEFERRED, &svsk->sk_flags); - if (!list_empty(&svsk->sk_deferred)) { - dr = list_entry(svsk->sk_deferred.next, - struct svc_deferred_req, - handle.recent); - list_del_init(&dr->handle.recent); - set_bit(SK_DEFERRED, &svsk->sk_flags); - } - spin_unlock(&svsk->sk_lock); - return dr; + if (svsk->sk_sock->file) + sockfd_put(svsk->sk_sock); + else + sock_release(svsk->sk_sock); + kfree(svsk); } diff -puN net/sunrpc/sysctl.c~git-nfsd net/sunrpc/sysctl.c --- a/net/sunrpc/sysctl.c~git-nfsd +++ a/net/sunrpc/sysctl.c @@ -18,6 +18,7 @@ #include #include #include +#include /* * Declare the debug flags here @@ -27,6 +28,8 @@ unsigned int nfs_debug; unsigned int nfsd_debug; unsigned int nlm_debug; +char xprt_buf[128]; + #ifdef RPC_DEBUG static struct ctl_table_header *sunrpc_table_header; @@ -48,6 +51,32 @@ rpc_unregister_sysctl(void) } } +static int proc_do_xprt(ctl_table *table, int write, struct file *file, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char tmpbuf[sizeof(xprt_buf)]; + int len; + if ((*ppos && !write) || !*lenp) { + *lenp = 0; + return 0; + } + if (write) + return -EINVAL; + else { + + len = svc_print_xprts(tmpbuf, sizeof(tmpbuf)); + if (!access_ok(VERIFY_WRITE, buffer, len)) + return -EFAULT; + + if (__copy_to_user(buffer, tmpbuf, len)) + return -EFAULT; + } + + *lenp -= len; + *ppos += len; + return 0; +} + static int proc_dodebug(ctl_table *table, int write, struct file *file, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -145,6 +174,14 @@ static ctl_table debug_table[] = { .mode = 0644, .proc_handler = &proc_dodebug }, + { + .ctl_name = CTL_TRANSPORTS, + .procname = "transports", + .data = xprt_buf, + .maxlen = sizeof(xprt_buf), + .mode = 0444, + .proc_handler = &proc_do_xprt, + }, { .ctl_name = 0 } }; _
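
As an illustration of the container_of() idiom the new xpo_* callbacks rely on throughout this patch (e.g. container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt)), here is a minimal, self-contained userspace sketch. The structure names are simplified stand-ins for svc_xprt/svc_sock, not the kernel definitions.

/*
 * Recover the transport-specific container from a pointer to the
 * generic transport object embedded inside it.
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct xprt {			/* stand-in for struct svc_xprt */
	unsigned long flags;
};

struct sock_xprt {		/* stand-in for struct svc_sock */
	int fd;
	struct xprt base;	/* embedded generic transport */
};

static void handle(struct xprt *x)
{
	/* generic code passes only &base; recover the whole object */
	struct sock_xprt *sx = container_of(x, struct sock_xprt, base);
	printf("fd = %d\n", sx->fd);
}

int main(void)
{
	struct sock_xprt s = { .fd = 42 };
	handle(&s.base);	/* callbacks see only the generic part */
	return 0;
}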
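
The new xpo_has_wspace callbacks gate dispatch on available send space: the transport claims space only when the socket's free write space is more than twice the outstanding reservation plus one maximum-sized message. A minimal sketch of that arithmetic, with plain integers standing in for xpt_reserved and sock_wspace()/sk_stream_wspace():

#include <stdio.h>

/* mirror "required = reserved + max_mesg; if (required*2 > wspace) ..." */
static int has_wspace(int reserved, int max_mesg, int wspace)
{
	int required = reserved + max_mesg;

	if (required * 2 > wspace)
		return 0;	/* not enough room, keep SOCK_NOSPACE set */
	return 1;
}

int main(void)
{
	/* 32 KiB already reserved, 32 KiB max reply, 96 KiB free -> no room */
	printf("%d\n", has_wspace(32 << 10, 32 << 10, 96 << 10));
	/* same reservation with 160 KiB free -> room available */
	printf("%d\n", has_wspace(32 << 10, 32 << 10, 160 << 10));
	return 0;
}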
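
The svc_xprt_class/svc_xprt_ops tables registered by svc_init_xprt_sock() are what svc_print_xprts() later walks for /proc/sys/sunrpc/transports. A rough, self-contained sketch of that registration pattern; the names here (xprt_class, register_class, print_classes) are invented for the example and are not the sunrpc API.

#include <stdio.h>

struct xprt_ops {
	int (*has_wspace)(void *xprt);
};

struct xprt_class {
	const char *name;
	size_t max_payload;
	const struct xprt_ops *ops;
	struct xprt_class *next;
};

static struct xprt_class *classes;	/* singly linked registry */

static void register_class(struct xprt_class *xcl)
{
	xcl->next = classes;
	classes = xcl;
}

/* roughly the shape of a "transports" dump: one "name max_payload" per line */
static int print_classes(char *buf, size_t len)
{
	size_t used = 0;
	for (struct xprt_class *c = classes; c && used < len; c = c->next)
		used += snprintf(buf + used, len - used, "%s %zu\n",
				 c->name, c->max_payload);
	return (int)used;
}

static int always_has_space(void *xprt) { (void)xprt; return 1; }

static const struct xprt_ops tcp_ops = { .has_wspace = always_has_space };
static struct xprt_class tcp_class = {
	.name = "tcp", .max_payload = 1 << 20, .ops = &tcp_ops,
};

int main(void)
{
	char buf[128];

	register_class(&tcp_class);
	print_classes(buf, sizeof(buf));
	fputs(buf, stdout);
	return 0;
}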