GIT a53800edb0dc9fcb995ea2a8e5f629d264cae42c git://git.linux-nfs.org/~bfields/linux.git#for-mm commit e8bfefa9408c7cbd09089e82a795813e90925055 Author: akpm@linux-foundation.org Date: Tue Nov 20 01:10:43 2007 -0800 git-nfsd build fix From: Andrew Morton net/sunrpc/svcsock.c: In function 'svc_reclassify_socket': net/sunrpc/svcsock.c:100: error: 'struct sock' has no member named 'sk_xprt' Cc: "J. Bruce Fields" Cc: Neil Brown Signed-off-by: Andrew Morton commit 5b3e7e25231883944e95358235832f54afd2635b Author: J. Bruce Fields Date: Tue Nov 13 17:39:08 2007 -0500 Fix compiler warning in server rdma code I was getting stuff like this: net/sunrpc/svc_rdma_transport.c: In function ‘dto_tasklet_func’: net/sunrpc/svc_rdma_transport.c:232: warning: passing argument 2 of ‘test_and_clear_bit’ from incompatible pointer type net/sunrpc/svc_rdma_transport.c:241: warning: passing argument 2 of ‘constant_test_bit’ from incompatible pointer type net/sunrpc/svc_rdma_transport.c:241: warning: passing argument 2 of ‘variable_test_bit’ from incompatible pointer type net/sunrpc/svc_rdma_transport.c:245: warning: passing argument 2 of ‘test_and_clear_bit’ from incompatible pointer type net/sunrpc/svc_rdma_transport.c: In function ‘rq_comp_handler’: net/sunrpc/svc_rdma_transport.c:271: warning: passing argument 2 of ‘set_bit’ from incompatible pointer type net/sunrpc/svc_rdma_transport.c: In function ‘sq_comp_handler’: net/sunrpc/svc_rdma_transport.c:382: warning: passing argument 2 of ‘set_bit’ from incompatible pointer type net/sunrpc/svc_rdma_transport.c: In function ‘rdma_cma_handler’: net/sunrpc/svc_rdma_transport.c:638: warning: passing argument 2 of ‘clear_bit’ from incompatible pointer type net/sunrpc/svc_rdma_transport.c: In function ‘svc_rdma_accept’: net/sunrpc/svc_rdma_transport.c:864: warning: passing argument 2 of ‘set_bit’ from incompatible pointer type Signed-off-by: J. Bruce Fields commit f994c2b85b597707ef3f1b349c761d74c22a0893 Author: J. Bruce Fields Date: Tue Nov 13 16:59:33 2007 -0500 rdma: remove binary names for new rdma sysctls We don't need these numbers if we just want to give userland access to the string names. See Documentation/sysctl/ctl_unnumbered.txt. Signed-off-by: J. Bruce Fields commit 630a44e632de67929fc515f27d7745c94697dce1 Author: Tom Tucker Date: Fri Oct 19 16:56:30 2007 -0500 rdma: Kconfig Add NFS_RDMA as an option to the Kconfig file. Signed-off-by: Tom Tucker commit 85445790fccebb75b54cc73c0130f3bc904d3741 Author: Tom Tucker Date: Fri Oct 19 16:56:28 2007 -0500 rdma: makefile Add the NFSD_RDMA module to the sunrpc makefile. Signed-off-by: Tom Tucker commit 42de116e9b02a21a8c96674b69c4a632ac67a8ee Author: Tom Tucker Date: Fri Oct 19 16:56:26 2007 -0500 rdma: ONCRPC RDMA protocol marshalling This logic parses the ONCRDMA protocol headers that precede the actual RPC header. It is placed in a separate file to keep all protocol aware code in a single place. Signed-off-by: Tom Tucker commit d63a9cfe23a97c217493aeba15c34108e438c267 Author: Tom Tucker Date: Fri Oct 19 16:56:24 2007 -0500 rdma: SVCRDMA sendto This file implements the RDMA transport sendto function. An RPC reply on an RDMA transport consists of some number of RDMA_WRITE requests followed by an RDMA_SEND request. The sendto function parses the ONCRPC RDMA reply header to determine how to send the reply back to the client. The send queue is sized so as to be able to send complete replies for requests in most cases. In the event that there are not enough SQ WR slots to reply, e.g.
big data, the send will block the NFSD thread. The I/O callback functions in svc_rdma_transport.c that reap WR completions wake any waiters blocked on the SQ. In general, the goal is not to block NFSD threads and the has_wspace method stalls requests when the SQ is nearly full. Signed-off-by: Tom Tucker commit cb825238051107749715859fcb4442e4d45eab16 Author: Tom Tucker Date: Fri Oct 19 16:56:22 2007 -0500 rdma: SVCRDMA recvfrom This file implements the RDMA transport recvfrom function. The function dequeues work request completion contexts from an I/O list that it shares with the I/O tasklet in svc_rdma_transport.c. For ONCRPC RDMA, an RPC may not be complete when it is received. Instead, the RDMA header that precedes the RPC message informs the transport where to get the RPC data from on the client and where to place it in the RPC message before it is delivered to the server. The svc_rdma_recvfrom function therefore parses this RDMA header and issues any necessary RDMA operations to fetch the remainder of the RPC from the client. Special handling is required when the request involves an RDMA_READ. In this case, recvfrom submits the RDMA_READ requests to the underlying transport driver and then returns 0. When the transport completes the last RDMA_READ for the request, it enqueues it on a read completion queue and enqueues the transport. The recvfrom code favors this queue over the regular DTO queue when satisfying reads. Signed-off-by: Tom Tucker commit e12ee2197577bd0fae520a824a41b276b02b827d Author: Tom Tucker Date: Fri Oct 19 16:56:19 2007 -0500 rdma: SVCRDMA Core Transport Services This file implements the core transport data management and I/O path. The I/O path for RDMA involves receiving callbacks on interrupt context. Since all the svc transport locks are _bh locks we enqueue the transport on a list and schedule a tasklet to dequeue data indications from the RDMA completion queue. The tasklet in turn takes _bh locks to enqueue receive data indications on a list for the transport. The svc_rdma_recvfrom transport function dequeues data from this list in an NFSD thread context. Signed-off-by: Tom Tucker commit 23a581064cafa29a1890b92c4ef92902bfb0c65f Author: Tom Tucker Date: Fri Oct 19 16:56:17 2007 -0500 rdma: SVCRDMA Transport Module This file implements the RDMA transport module initialization and termination logic and registers the transport sysctl variables. Signed-off-by: Tom Tucker commit d26b204a6dd7a4dd98d67c72648915c2d7261c8c Author: Tom Tucker Date: Fri Oct 19 16:56:15 2007 -0500 rdma: SVCRDMA Header File This file defines the data types used by the SVCRDMA transport module. The principal data structure is the transport specific extension to the svcxprt structure. Signed-off-by: Tom Tucker commit 6fa0e60bdb4906b4a74f9468dc0ca1a38d92e0ec Author: Tom Tucker Date: Fri Oct 19 16:56:13 2007 -0500 rdma: sysctl for SVCRDMA transport module Add sysctl flags for SVCRDMA transport module debug messages. Signed-off-by: Tom Tucker commit 0a29ac94afe55ba203ad10eeb0901ebe4e707f29 Author: Tom Tucker Date: Fri Oct 19 16:45:30 2007 -0500 svc: Move svc_xprt_received call to follow addition of xprt to list The svc_xprt_received function should be called after the transport is completely initialized and added to the tempsocks list. Signed-off-by: Tom Tucker commit a0144a254e7f304310da1a9e26affbcbe4cb14ea Author: Tom Tucker Date: Fri Oct 19 16:45:28 2007 -0500 svc: Add svc_xprt_names service to replace svc_sock_names Create a transport independent version of the svc_sock_names function.
The toclose capability of the svc_sock_names service can be implemented using the svc_xprt_find and svc_xprt_close services. Signed-off-by: Tom Tucker commit 92b661e7da14497cb3d7f41f848807a5309ef5b1 Author: Tom Tucker Date: Fri Oct 19 16:45:26 2007 -0500 svc: svc_addsock needs to set the svc_xprt address The svc_addsock function needs to set the local address in the svc_xprt structure. Signed-off-by: Tom Tucker commit 113fcce2975701ec138a1cc013335ba771f4c31e Author: Tom Tucker Date: Fri Oct 19 16:45:23 2007 -0500 svc: Fix skip computation in svc_defer and svc_revisit The rq_arg.len includes the size of the transport header. The computations assumed that it did not. Signed-off-by: Tom Tucker commit 74f331309aef487f137e7ca50d231c4deb3f16e1 Author: Tom Tucker Date: Fri Oct 12 14:21:10 2007 -0500 svc: Restore rq_xprt_hlen in svc_deferred_recv Fixed a couple bugs in the deferral processing. This won't see significant testing until I complete the rdma driver mods because tcp/udp always set the xprt_hlen to zero. The changes are as follows: - Restore the length of the transport header in the rqstp structure in case the request gets deferred again. - Update the iov_head and arg.len with xprt_hlen. - Add comments that describe how xprt_hlen is used. Signed-off-by: Tom Tucker commit 3338154a7626bab4997ec25a472176f4c925c62c Author: Tom Tucker Date: Fri Oct 12 14:00:48 2007 -0500 svc: bzero the xprt memory in svc_xprt_init The transport class driver is responsible for allocating the memory that contains svc_xprt. We don't know whether the driver used kzalloc or kmalloc. Initialize the svc_xprt structure to zeroes just in case. Also changed the name of the svc_xprt ptr variable to xprt to be consistent with the naming used everywhere else for the transport ptr. Signed-off-by: Tom Tucker commit 9466bde5b4f777ab01bca58d71006199c27c6f7a Author: Tom Tucker Date: Fri Oct 12 13:58:32 2007 -0500 svc: Don't call xpo_free until after releasing auth data The svc_xprt_free function is calling xpo_free and then using the xprt pointer while attempting to free the cached auth data. Reverse the order of these operations. Signed-off-by: Tom Tucker commit b57d6d1cbaff501adbbddc7b47162132b9818db4 Author: Tom Tucker Date: Wed Oct 10 21:29:01 2007 -0500 knfsd: Modify write_ports to use svc_find_xprt service This patch enhances the write_ports function as follows: - Check if a server transport instance already exists before attempting to create a new one, and - Implement the ability to remove a previously created server transport instance. Signed-off-by: Tom Tucker commit 087b11940da763ee75188a9878f29d5f1ce7e812 Author: Tom Tucker Date: Wed Oct 10 21:28:59 2007 -0500 svc: Fix bugs in svc_find_xprt The bug fixes are as follows: - Verify the required arguments to the function. - Change the address family wildcard to the more type-friendly AF_UNSPEC. - Properly handle the address family when comparing ports - The svc_find_xprt service needs a lock on the sv_permsocks list Signed-off-by: Tom Tucker commit d1c3b78fbf63766a1650adc454a0b98e8de7e642 Author: Tom Tucker Date: Wed Oct 10 21:28:57 2007 -0500 svc: Change sockaddr to sockaddr_storage in svc_create_xprt A sockaddr is too small to handle an IPv6 address. Change the newsin local variable to point to sockaddr_storage. Signed-off-by: Tom Tucker commit e6f2cb37ae11818cff26d66ff5f83d546cc8f93f Author: Tom Tucker Date: Tue Oct 9 17:57:40 2007 -0500 svc: Add xprt header len to svc_deferred_req A transport may have a header that precedes the RPC message. 
Save the size of this header in the svc_deferred_req so that when the RPC is revisited, the deferred recv function doesn't need to re-parse the header to determine its length. In addition, a misleading comment related to the transport header for TCP/UDP has been rewritten. Signed-off-by: Tom Tucker commit 2bbeff4e08d7456d0c236c22e063e1c8a203e076 Author: Tom Tucker Date: Tue Oct 9 10:37:22 2007 -0500 svc: Place type on same line for new API The convention is to place the type name on the same line as the function name. The inline directive was also removed to allow the compiler to elect whether or not to inline. Signed-off-by: Tom Tucker commit b3b9c67919140bd715f9a02b47fb638c6fc1a476 Author: Tom Tucker Date: Tue Oct 9 10:37:20 2007 -0500 svc: Add svc API that queries for a transport instance Add a new svc function that allows a service to query whether a transport instance has already been created. This is used in lockd to determine whether or not a transport needs to be created when a lockd instance is brought up. Specifying 0 for the address family or port is effectively a wild-card, and will result in matching the first transport in the service's list that has a matching class name. Signed-off-by: Tom Tucker commit 96af7ac997f1efb9d7c1d4010f49223546e080e0 Author: Tom Tucker Date: Tue Oct 9 17:52:53 2007 -0500 svc: Modify svc_create_xprt to return local port Please take this version of the patch. The previous version has a whitespace issue. Sorry for the inconvenience. This patch fixes a regression introduced by the svc transport switch as follows: - Listening endpoints need to have their local address set properly. - svc_create_xprt needs to return the local port number since the nfs4 callback service uses the return value to determine the local port elected by the transport when binding to zero. Signed-off-by: Tom Tucker commit 4b3e84ca37115c248e8e253ae500acb36d080d44 Author: Tom Tucker Date: Tue Oct 9 10:37:18 2007 -0500 svc: Remove extraneous debug svc_send printk The svc_send function has a debug check and kern info printk for a null xprt pointer in the rqstp structure. Remove the printk. Signed-off-by: Tom Tucker commit e68b3f98b1d1684eb35ee010434b81fa22efd6cf Author: Tom Tucker Date: Tue Oct 9 10:37:16 2007 -0500 svc: Add a sockaddr length argument to the xpo_create function. The xpo_create function doesn't currently accept a sockaddr length. Add this as a parameter to be consistent with other kernel interfaces taking a sockaddr. Signed-off-by: Tom Tucker commit be62c37927c38a185f9a7e78e2ddafe63c82b753 Author: Tom Tucker Date: Tue Oct 9 10:37:13 2007 -0500 svc: Move setting of XPT_LISTENER bit to svc_tcp_init Move the setting of the XPT_LISTENER bit to svc_tcp_init where the remaining TCP transport initialization is done. Signed-off-by: Tom Tucker commit 3fa9f8b9615acdb1238ef45552aa0a95b81b60a2 Author: Tom Tucker Date: Mon Oct 1 14:28:48 2007 -0500 knfsd: Support adding transports by writing portlist file Update the write handler for the portlist file to allow creating new listening endpoints on a transport. The general form of the string is a transport name followed by a port number, for example: "tcp 2049". This is intended to support the creation of a listening endpoint for RDMA transports without adding #ifdef code to the nfssvc.c file. Signed-off-by: Tom Tucker commit 9a1ab487e0d5346d03962f56bb21122d93bab69c Author: Tom Tucker Date: Mon Oct 1 14:28:45 2007 -0500 svc: Add /proc/sys/sunrpc/transport files Add a file that when read lists the set of registered svc transports.
Signed-off-by: Tom Tucker commit 11950ad554185ff67fce9f9f3a587c19292ea574 Author: Tom Tucker Date: Mon Oct 1 14:28:43 2007 -0500 svc: Add transport hdr size for defer/revisit Some transports have a header in front of the RPC header. The current defer/revisit processing considers only the iov_len and arg_len to determine how much to back up when saving the original request to revisit. Add a field to the rqstp structure to save the size of the transport header so svc_defer can correctly compute the start of a request. Signed-off-by: Tom Tucker commit 5f07a19e51086626a76efdc964a50d4e51ad7fb3 Author: Tom Tucker Date: Mon Oct 1 14:28:41 2007 -0500 svc: Move the xprt independent code to the svc_xprt.c file This functionally trivial patch moves all of the transport independent functions from the svcsock.c file to the transport independent svc_xprt.c file. Signed-off-by: Tom Tucker commit a42d869c5d6c4ff2909ff59bc0be08a469d81957 Author: Tom Tucker Date: Mon Oct 1 14:28:39 2007 -0500 svc: Make svc_check_conn_limits xprt independent The svc_check_conn_limits function only manipulates xprt fields. Change references to svc_sock->sk_xprt to svc_xprt directly. Signed-off-by: Tom Tucker commit 9fe0334eb0f823f38b9e17fc8592572fc0d438d4 Author: Tom Tucker Date: Mon Oct 1 14:28:36 2007 -0500 svc: Removing remaining references to rq_sock in rqstp This functionally empty patch removes rq_sock and the unnamed union from the rqstp structure. Signed-off-by: Tom Tucker commit 08ec442fbf2423b44793ecf0ef17f11db832260b Author: Tom Tucker Date: Mon Oct 1 14:28:34 2007 -0500 svc: Move common create logic to common code Move the code that adds a transport instance to the sv_tempsocks and sv_permsocks lists out of the transport specific functions and into core logic. The svc_addsock routine still manipulates sv_permsocks directly. This code may be removed when rpc.nfsd is modified to create transports by writing to the portlist file. Signed-off-by: Tom Tucker commit 44067941da9ebf4d709be02099ea26e5d152ad11 Author: Tom Tucker Date: Mon Oct 1 14:28:32 2007 -0500 svc: Make svc_age_temp_sockets svc_age_temp_transports This function is transport independent. Change it to use svc_xprt directly and change its name to reflect this. Signed-off-by: Tom Tucker commit 52bcd8c18c289fb3daf65f5d854edcd9bd8c57c2 Author: Tom Tucker Date: Mon Oct 1 14:28:30 2007 -0500 svc: Make svc_recv transport neutral All of the transport fields and functions used by svc_recv are now transport independent. Change the svc_recv function to use the svc_xprt structure directly instead of the transport specific svc_sock structure. Signed-off-by: Tom Tucker commit 07aff74e73fe6c87ae9b7626a6ed8a15590dc0c1 Author: Tom Tucker Date: Mon Oct 1 14:28:28 2007 -0500 svc: Make svc_sock_release svc_xprt_release The svc_sock_release function only touches transport independent fields. Change the function to manipulate svc_xprt directly instead of the transport dependent svc_sock structure. Signed-off-by: Tom Tucker commit 1a9725d7953b7698349dca5329c075bdbcbf191a Author: Tom Tucker Date: Mon Oct 1 14:28:25 2007 -0500 svc: Move the sockaddr information to svc_xprt Move the IP address fields to the svc_xprt structure. Note that this assumes that _all_ RPC transports must have IP based 4-tuples. This seems reasonable given the tight coupling with the portmapper etc... Thoughts?
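For concreteness, after the address move described in the preceding patch the generic endpoint might carry its addresses roughly as sketched below. This is only an illustration based on the commit descriptions; the member names (xpt_class, xpt_local, xpt_remote, and so on) are assumptions, not the names used in the actual patches, and most of the structure is elided.

#include <linux/socket.h>

struct svc_xprt_class;

/* Sketch only: transport-independent endpoint with the address 4-tuple. */
struct svc_xprt {
	struct svc_xprt_class	*xpt_class;	/* transport class: name, ops */
	/* ... flags, kref, list linkage, deferral queue, send mutex ... */
	struct sockaddr_storage	xpt_local;	/* local (listening) address */
	size_t			xpt_locallen;
	struct sockaddr_storage	xpt_remote;	/* peer address */
	size_t			xpt_remotelen;
};

Using sockaddr_storage here lines up with the fix elsewhere in the series that changes svc_create_xprt to use sockaddr_storage, since a bare sockaddr is too small to hold an IPv6 address.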
Signed-off-by: Tom Tucker commit 382b09c2e84397a9b9981b55648728a1a86dbbbd Author: Tom Tucker Date: Mon Oct 1 14:28:23 2007 -0500 svc: Make deferral processing xprt independent This functionally trivial patch moves the transport independent sk_deferred list to the svc_xprt structure and updates the svc_deferred_req structure to keep pointers to svc_xprt's directly. Signed-off-by: Tom Tucker commit 2c7349c7dc3e88a471aa9d0e40fa1f34d4f29165 Author: Tom Tucker Date: Mon Oct 1 14:28:21 2007 -0500 svc: Move the authinfo cache to svc_xprt. Move the authinfo cache to svc_xprt. This allows both the TCP and RDMA transports to share this logic. A flag bit is used to determine if auth information is to be cached or not. Previously, this code looked at the transport protocol. I've also changed the spin_lock/unlock logic so that a lock is not taken for transports that are not caching auth info. Signed-off-by: Tom Tucker commit fa922cc09a0c4e775c5110f9764a03ce91d1896d Author: Tom Tucker Date: Mon Oct 1 14:28:19 2007 -0500 svc: Remove sk_lastrecv With the implementation of the new mark and sweep algorithm for shutting down old connections, the sk_lastrecv field is no longer needed. Signed-off-by: Tom Tucker commit 09d451518d0a9a0f4d3d08a742d321ad46603598 Author: Tom Tucker Date: Mon Oct 1 14:28:17 2007 -0500 svc: Change svc_sock_received to svc_xprt_received and export it All fields touched by svc_sock_received are now transport independent. Change it to use svc_xprt directly. This function is called from transport dependent code, so export it. Signed-off-by: Tom Tucker commit 0700396e3de67151a4bff8e7a35099dc50a5226e Author: Tom Tucker Date: Mon Oct 1 14:28:14 2007 -0500 svc: Make svc_send transport neutral Move the sk_mutex field to the transport independent svc_xprt structure. Now all the fields that svc_send touches are transport neutral. Change the svc_send function to use the transport independent svc_xprt directly instead of the transport dependent svc_sock structure. Signed-off-by: Tom Tucker commit 9f95da8d64f040e01320855cb0ae978cc7f08f8c Author: Tom Tucker Date: Mon Oct 1 14:28:12 2007 -0500 svc: Make the enqueue service transport neutral and export it. The svc_sock_enqueue function is now transport independent since all of the fields it touches have been moved to the transport independent svc_xprt structure. Change the function to use the svc_xprt structure directly instead of the transport specific svc_sock structure. Transport specific data-ready handlers need to call this function, so export it. Signed-off-by: Tom Tucker commit 3ed031e00c5b315d0dc5057306c491e661e28d34 Author: Tom Tucker Date: Mon Oct 1 14:28:10 2007 -0500 svc: Move sk_reserved to svc_xprt This functionally trivial patch moves the sk_reserved field to the transport independent svc_xprt structure. Signed-off-by: Tom Tucker commit 7890a922705f8e27b71b58036fa7670b48acf7bf Author: Tom Tucker Date: Mon Oct 1 14:28:08 2007 -0500 svc: Make close transport independent Move sk_list and sk_ready to svc_xprt. This involves close because these lists are walked by svcs when closing all their transports. So I combined the moving of these lists to svc_xprt with making close transport independent. The svc_force_sock_close has been changed to svc_close_all and takes a list as an argument. This removes some svc internals knowledge from the svcs. This code races with module removal and transport addition. 
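As a rough illustration of the interface change in that last patch, svc_close_all() takes a list of transports rather than a svc_sock. This is a sketch only: the real code also has to deal with locking, reference counting, and the module-removal race noted above, and the member names (xpt_list, xpt_flags) are assumptions.

#include <linux/list.h>
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/svc_xprt.h>

static void svc_close_all(struct list_head *xprt_list)
{
	struct svc_xprt *xprt, *tmp;

	list_for_each_entry_safe(xprt, tmp, xprt_list, xpt_list) {
		set_bit(SK_CLOSE, &xprt->xpt_flags);
		svc_delete_xprt(xprt);		/* detach and free the transport */
	}
}

/* A service tears down both of its transport lists on shutdown: */
static void svc_shutdown_example(struct svc_serv *serv)
{
	svc_close_all(&serv->sv_tempsocks);
	svc_close_all(&serv->sv_permsocks);
}

Passing the list rather than a svc_sock is what removes the svc-internals knowledge from the individual services, as the description above notes.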
Signed-off-by: Tom Tucker commit 2ff64f8df5dd623d0b76f55eb7fe4fd2b0ecde91 Author: Tom Tucker Date: Mon Oct 1 14:28:05 2007 -0500 svc: Move sk_server and sk_pool to svc_xprt This is another incremental change that moves transport independent fields from svc_sock to the svc_xprt structure. The changes should be functionally null. Signed-off-by: Tom Tucker commit 6b871d42bb250c0157afd700a90dc0012d0c63f8 Author: Tom Tucker Date: Mon Oct 1 14:28:03 2007 -0500 svc: Move sk_flags to the svc_xprt structure This functionally trivial change moves the transport independent sk_flags field to the transport independent svc_xprt structure. Signed-off-by: Tom Tucker commit 16bebf2f2c11d183c185b1094cf5ac11c5f5febc Author: Tom Tucker Date: Mon Oct 1 14:28:01 2007 -0500 svc: Change sk_inuse to a kref Change the atomic_t reference count to a kref and move it to the transport independent svc_xprt structure. Change the reference count wrapper names to be generic. Signed-off-by: Tom Tucker commit 9219a0749f91211081d7d6e9ec9ff9284e96d9be Author: Tom Tucker Date: Mon Oct 1 14:27:59 2007 -0500 svc: Change services to use new svc_create_xprt service Modify the various kernel RPC svcs to use the svc_create_xprt service. Signed-off-by: Tom Tucker commit 634a65406cff905436dacf57245a7f26ab25e472 Author: Tom Tucker Date: Mon Oct 1 14:27:56 2007 -0500 svc: Add a generic transport svc_create_xprt function The svc_create_xprt function is a transport independent version of the svc_makesock function. Since transport instance creation contains transport dependent and independent components, add an xpo_create transport function. The transport implementation of this function allocates the memory for the endpoint, implements the transport dependent initialization logic, and calls svc_xprt_init to initialize the transport independent field (svc_xprt) in its data structure. Signed-off-by: Tom Tucker commit 235ba7e76d4056cd94b5b29db5ec6871cab45ad8 Author: Tom Tucker Date: Mon Oct 1 14:27:54 2007 -0500 svc: Add xpo_accept transport function Previously, the accept logic looked into the socket state to determine whether to call accept or recv when data-ready was indicated on an endpoint. Since some transports don't use sockets, this logic was changed to use a flag bit (SK_LISTENER) to identify listening endpoints. A transport function (xpo_accept) was added to allow each transport to define its own accept processing. A transport's initialization logic is responsible for setting the SK_LISTENER bit. I didn't see any way to do this in transport independent logic since the passive side of a UDP connection doesn't listen and always recv's. In the svc_recv function, if the SK_LISTENER bit is set, the transport xpo_accept function is called to handle accept processing. Note that all functions are defined even if they don't make sense for a given transport. For example, accept doesn't mean anything for UDP. The function is defined anyway and bug checks if called. The UDP transport should never set the SK_LISTENER bit. The code that poaches connections when the connection limit is hit was moved to a subroutine to make the accept logic path easier to follow. Since this is in the new connection path, it should not be a performance issue. Signed-off-by: Tom Tucker commit c6e08ae8251a6a142bce203a01b3db7a38e53f58 Author: Tom Tucker Date: Mon Oct 1 14:27:51 2007 -0500 svc: Move close processing to a single place Close handling was duplicated in the UDP and TCP recvfrom methods. This code has been moved to the transport independent svc_recv function.
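To make the control flow of the last two patches concrete, here is a rough, simplified sketch of the resulting transport-independent dispatch in svc_recv. The flag and function names (SK_CLOSE, SK_LISTENER, xpo_accept, xpo_recvfrom, svc_xprt_received, svc_check_conn_limits) are taken from the commit descriptions; the structure member names and everything omitted (locking, write-space checks, error handling) are assumptions, not the exact code in the series.

#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/svc_xprt.h>

/* Sketch only: dispatch on a ready transport inside svc_recv(). */
static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt,
			   struct svc_serv *serv)
{
	int len = 0;

	if (test_bit(SK_CLOSE, &xprt->xpt_flags)) {
		/* Close handling now lives here, not in each recvfrom method. */
		svc_delete_xprt(xprt);
	} else if (test_bit(SK_LISTENER, &xprt->xpt_flags)) {
		/* Listening endpoint: the transport performs its own accept. */
		struct svc_xprt *newxprt = xprt->xpt_ops->xpo_accept(xprt);

		if (newxprt)
			/* Poach old connections if we are over the limit. */
			svc_check_conn_limits(serv);
	} else {
		/* Established or datagram endpoint: pull in one request. */
		len = xprt->xpt_ops->xpo_recvfrom(rqstp);
	}
	svc_xprt_received(xprt);	/* allow the transport to be queued again */
	return len;
}

For UDP the SK_LISTENER bit is never set, so a UDP endpoint always takes the xpo_recvfrom path, which matches the note above that accept is a defined-but-BUG operation for that transport.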
Signed-off-by: Tom Tucker commit fec29087ce522a8a93d2df743f595019fce4e130 Author: Tom Tucker Date: Mon Oct 1 14:27:49 2007 -0500 svc: Add a transport function that checks for write space In order to avoid blocking a service thread, the receive side checks to see if there is sufficient write space to reply to the request. Each transport has a different mechanism for determining if there is enough write space to reply. The code that checked for write space was coupled with code that checked for CLOSE and CONN. These checks have been broken out into separate statements to make the code easier to read. Signed-off-by: Tom Tucker commit a5d1752381643042875f81b502f2ca77665c18ff Author: Tom Tucker Date: Mon Oct 1 14:27:47 2007 -0500 svc: Add xpo_prep_reply_hdr Some transports add fields to the RPC header for replies, e.g. the TCP record length. This function is called when preparing the reply header to allow each transport to add whatever fields it requires. Signed-off-by: Tom Tucker commit ce1bdd09e99a9db8a96b0f79b4d356e382ab5d7a Author: Tom Tucker Date: Mon Oct 1 14:27:45 2007 -0500 svc: Add per-transport delete functions Add transport specific xpo_detach and xpo_free functions. The xpo_detach function causes the transport to stop delivering data-ready events and enqueuing the transport for I/O. The xpo_free function frees all resources associated with the particular transport instance. Signed-off-by: Tom Tucker commit f222cb10963daf45f623157cc7e3923c9128b1b7 Author: Tom Tucker Date: Mon Oct 1 14:27:42 2007 -0500 svc: Add transport specific xpo_release_rqst function The svc_sock_release function releases pages allocated to a thread. For UDP, this also returns the receive skb to the stack. For RDMA it will post a receive WR and bump the client credit count. Signed-off-by: Tom Tucker commit 4f02459d6fd5c1c73383eb71bf6873ae0a89e7c2 Author: Tom Tucker Date: Mon Oct 1 14:27:40 2007 -0500 svc: Move sk_sendto and sk_recvfrom to svc_xprt_class The sk_sendto and sk_recvfrom are function pointers that allow svc_sock to be used for both UDP and TCP. Move these function pointers to the svc_xprt_ops structure. Signed-off-by: Tom Tucker commit 0411a5be6628b7ea78c234d877e0a480a7ac3a2d Author: Tom Tucker Date: Mon Oct 1 14:27:38 2007 -0500 svc: Add a max payload value to the transport The svc_max_payload function currently looks at the socket type to determine the max payload. Add a max payload value to svc_xprt_class so it can be returned directly. Signed-off-by: Tom Tucker commit 6418140dffaad64b1fd15e1955ae3ca5a52c1e74 Author: Tom Tucker Date: Mon Oct 1 14:27:35 2007 -0500 svc: Change the svc_sock in the rqstp structure to a transport The rqstp structure contains a pointer to the transport for the RPC request. This functionally trivial patch adds an unnamed union with pointers to both svc_sock and svc_xprt. Ultimately the union will be removed and only the rq_xprt field will remain. This allows incrementally extracting transport independent interfaces without one gigundo patch. Signed-off-by: Tom Tucker commit 75aeda80c7993393a4daeb439984969ab4dd02f0 Author: Tom Tucker Date: Mon Oct 1 14:27:33 2007 -0500 svc: Make svc_sock the tcp/udp transport Make TCP and UDP svc_sock transports, and register them with the svc transport core. A transport type (svc_sock) has an svc_xprt as its first member, and calls svc_xprt_init to initialize this field.
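A minimal sketch of the pattern just described: the transport structure embeds a struct svc_xprt as its first member, the xpo_create method allocates it and calls svc_xprt_init, and the class is registered with the transport core using the svc_reg_xprt_class/svc_unreg_xprt_class calls mentioned in the following commit description. The field names, the svc_xprt_init signature, and the max-payload value below are assumptions, not the exact code in the series.

#include <linux/err.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/svc_xprt.h>

static struct svc_xprt_class svc_udp_class;	/* defined below */

/* Sketch only: a transport type wraps the generic endpoint. */
struct svc_sock {
	struct svc_xprt	sk_xprt;	/* must be the first member */
	struct socket	*sk_sock;
	/* ... other socket-specific state ... */
};

static struct svc_xprt *svc_udp_create(struct svc_serv *serv,
				       struct sockaddr *sa, int salen,
				       int flags)
{
	struct svc_sock *svsk = kzalloc(sizeof(*svsk), GFP_KERNEL);

	if (!svsk)
		return ERR_PTR(-ENOMEM);
	/* Initialize the transport-independent part of the endpoint. */
	svc_xprt_init(&svc_udp_class, &svsk->sk_xprt);
	/* ... create and bind the socket, hook up data-ready callbacks ... */
	return &svsk->sk_xprt;
}

static struct svc_xprt_ops svc_udp_ops = {
	.xpo_create	= svc_udp_create,
	/* .xpo_recvfrom, .xpo_sendto, .xpo_detach, .xpo_free, ... */
};

static struct svc_xprt_class svc_udp_class = {
	.xcl_name	 = "udp",
	.xcl_owner	 = THIS_MODULE,
	.xcl_ops	 = &svc_udp_ops,
	.xcl_max_payload = 32768,	/* illustrative value */
};

/* A transport driver registers its class when it loads and
 * unregisters it when it is removed: */
static int __init example_transport_init(void)
{
	return svc_reg_xprt_class(&svc_udp_class);
}

static void __exit example_transport_exit(void)
{
	svc_unreg_xprt_class(&svc_udp_class);
}

Because the svc_xprt is the first member, the core can hand around a struct svc_xprt * and the transport can get back to its own structure with a container_of (or cast), which is what lets the generic svc_xprt.c code stay free of socket knowledge.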
Signed-off-by: Tom Tucker commit 631a2fc20eb64fd0a5e5c44a8a7b848b88ec8fc4 Author: Tom Tucker Date: Mon Oct 1 14:27:31 2007 -0500 svc: Add an svc transport class The transport class (svc_xprt_class) represents a type of transport, e.g. udp, tcp, rdma. A transport class has a unique name and a set of transport operations kept in the svc_xprt_ops structure. A transport class can be dynamically registered and unregistered. The svc_xprt_class represents the module that implements the transport type and keeps reference counts on the module to avoid unloading while there are active users. The endpoint (svc_xprt) is a generic, transport independent endpoint that can be used to send and receive data for an RPC service. It inherits its operations from the transport class. A transport driver module registers and unregisters itself with svc sunrpc by calling svc_reg_xprt_class, and svc_unreg_xprt_class respectively. Signed-off-by: Tom Tucker commit 5e344de68d9c26a455cb2353c2c11fdfe99bb861 Author: J. Bruce Fields Date: Mon Oct 29 00:11:22 2007 -0400 locks: make deadlock detection ignore processes with shared files If the process waiting for a lock has a shared file table, then it may be difficult to determine whether it will actually be "blocked" on a given lock--although one task is blocked, others may be free to unlock for it (since tasks with the same file table are able to manage each other's posix file locks). So just give up and allow any lock that depends (directly or indirectly) on a task with a shared file table. Ditto for locks held by remote clients; we don't necessarily have the information required to identify deadlocks in that case anyway. Signed-off-by: J. Bruce Fields commit 8f619285d90941771089f6761e1437664874bb6e Author: J. Bruce Fields Date: Fri Oct 26 18:05:40 2007 -0400 locks: clarify posix_locks_deadlock For such a short function (with such a long comment), posix_locks_deadlock() seems to cause a lot of confusion. Attempt to make it a bit clearer: - Remove the initial posix_same_owner() check, which can never pass (since this is only called in the case that block_fl and caller_fl conflict) - Use an explicit loop (and a helper function) instead of a goto. - Rewrite the comment, attempting a clearer explanation, and removing some uninteresting historical detail. Signed-off-by: J. Bruce Fields commit 2d0555b1110f71119fc3724e4c670dd3c00bc36b Author: J. Bruce Fields Date: Tue Nov 20 15:54:10 2007 -0500 nfsd4: kill unneeded cl_confirm check We generate a unique cl_confirm for every new client; so if we've already checked that this cl_confirm agrees with the cl_confirm of unconf, then we already know that it does not agree with the cl_confirm of conf. Signed-off-by: J. Bruce Fields commit 9327494235a8e983bc49a47282ab44b49c5c79c7 Author: J. Bruce Fields Date: Tue Nov 20 16:52:07 2007 -0500 nfsd4: remove unnecessary cl_verifier check from setclientid_confirm Again, the only way conf and unconf can have the same clientid is if they were created in the "probable callback update" case of setclientid, in which case we already know that the cl_verifier fields must agree. Signed-off-by: J. Bruce Fields commit 15a21ef140d7339c514bf39abd5e69361a028087 Author: J. Bruce Fields Date: Tue Nov 20 15:39:07 2007 -0500 nfsd4: kill unnecessary same_name() in setclientid_confirm If conf and unconf are both found in the lookup by cl_clientid, then they share the same cl_clientid.
We always create a unique new cl_clientid field when creating a new client--the only exception is the "probable callback update" case in setclientid, where we copy the old cl_clientid from another clientid with the same name. Therefore two clients with the same cl_clientid field also always share the same cl_name field, and a couple of the checks here are redundant. Signed-off-by: J. Bruce Fields commit f0b286d3f8e258b4344a5746e3c9d75b5225070f Author: J. Bruce Fields Date: Mon Nov 19 20:31:04 2007 -0500 nfsd: uniquify cl_confirm values Using a counter instead of the nanoseconds value seems more likely to produce a unique cl_confirm. Signed-off-by: J. Bruce Fields commit 8340c48c0c1f6fc37b128279eb3e6aeb86507246 Author: J. Bruce Fields Date: Mon Nov 19 19:09:50 2007 -0500 nfsd: eliminate final bogus case from setclientid logic We're supposed to generate a different cl_confirm verifier for each new client, so these two cl_confirm values should never be the same. Signed-off-by: J. Bruce Fields commit 2c12a7d575f6fe806a5952ae20b8ee592eb01e3e Author: J. Bruce Fields Date: Tue Nov 20 16:11:27 2007 -0500 nfsd4: kill some unneeded setclientid comments Most of these comments just summarize the code. The matching of code to the cases described in the RFC may still be useful, though; add specific section references to make that easier to follow. Also update references to the outdated RFC 3010. Signed-off-by: J. Bruce Fields commit 40886af3d1a6107551e40b9a43e37e58df71ef3f Author: J. Bruce Fields Date: Thu Nov 15 17:06:58 2007 -0500 nfsd: minor fs/nfsd/auth.h cleanup While we're here, let's remove the redundant (and now wrong) pathname in the comment, and the #ifdef __KERNEL__'s. Acked-by: NeilBrown Signed-off-by: J. Bruce Fields commit 6f7a5fe73a083dee1b0afa597406bae28fbe5e1e Author: J. Bruce Fields Date: Thu Nov 15 17:05:43 2007 -0500 nfsd: move nfsd/auth.h into fs/nfsd This header is used only in a few places in fs/nfsd, so there seems to be little point to having it in include/. (Thanks to Robert Day for pointing this out.) Cc: Robert P. J. Day Acked-by: NeilBrown Signed-off-by: J. Bruce Fields commit cdc97cde35593e58b90183345b8e41cc63fe036f Author: J. Bruce Fields Date: Thu Nov 8 17:20:34 2007 -0500 knfsd: allow cache_register to return error on failure Newer server features such as nfsv4 and gss depend on proc to work, so a failure to initialize the proc files they need should be treated as fatal. Acked-by: NeilBrown Signed-off-by: J. Bruce Fields commit a0ccb578c3c5019cac12a5896607ce48a036a70c Author: J. Bruce Fields Date: Mon Nov 12 17:04:29 2007 -0500 nfsd: move cache proc (un)registration to separate function Just some minor cleanup. Also I don't see much point in trying to register further proc entries if initial entries fail; so just stop trying in that case. Acked-by: NeilBrown Signed-off-by: J. Bruce Fields commit 60b0767dbf802ccce107e906fd78b7c7ffa0b3df Author: J. Bruce Fields Date: Mon Nov 12 17:32:21 2007 -0500 nfsd: fail init on /proc/fs/nfs/exports creation failure I assume the reason failure of creation was ignored here was just to continue supporting embedded systems that want nfsd but not proc. However, in cases where proc is supported it would be clearer to fail entirely than to come up with some features disabled. Acked-by: NeilBrown Signed-off-by: J. Bruce Fields commit d8b9cf21dfdc897a053280b53801a5f7d6817330 Author: J.
Bruce Fields Date: Mon Nov 12 17:09:49 2007 -0500 nfsd: select CONFIG_PROC_FS in nfsv4 and gss server cases The server depends on upcalls under /proc to support nfsv4 and gss. Acked-by: NeilBrown Signed-off-by: J. Bruce Fields commit f48e9fea96d094ae2de380704d602441b8b7268c Author: J. Bruce Fields Date: Thu Nov 8 16:09:59 2007 -0500 knfsd: cache unregistration needn't return error There's really nothing much the caller can do if cache unregistration fails. And indeed, all any caller does in this case is print an error and continue. So just return void and move the printk's inside cache_unregister. Acked-by: NeilBrown Signed-off-by: J. Bruce Fields commit 9a867f3a5fca7bd79e816bb528b0ebcb15191f00 Author: J. Bruce Fields Date: Fri Nov 9 14:10:56 2007 -0500 nfsd: fail module init on reply cache init failure If the reply cache initialization fails due to a kmalloc failure, currently we try to soldier on with a reduced (or nonexistent) reply cache. Better to just fail immediately: the failure is then much easier to understand and debug, and it could save us complexity in some later code. (But actually, it doesn't help currently because the cache is also turned off in some odd failure cases; we should probably find a better way to handle those failure cases some day.) Fix some minor style problems while we're at it, and rename nfsd_cache_init() to remove the need for a comment describing it. Acked-by: NeilBrown Signed-off-by: J. Bruce Fields commit 53802c401f5419fbd910f18591a7aa6a56c996cd Author: J. Bruce Fields Date: Fri Nov 9 13:44:06 2007 -0500 nfsd: cleanup nfsd module initialization cleanup Handle the failure case here with something closer to the standard kernel style. Doesn't really matter for now, but I'd like to add a few more failure cases, and then this'll help. Acked-by: NeilBrown Signed-off-by: J. Bruce Fields commit 13ab8bc67589730d4aaa9e8cf8ec44354fcae90c Author: J. Bruce Fields Date: Fri Nov 9 12:31:55 2007 -0500 knfsd: cleanup nfsd4 properly on module init failure We forgot to shut down the nfs4 state and idmapping code in this case. Acked-by: NeilBrown Signed-off-by: J. Bruce Fields commit b7db425af87af68df327db8d4b394a0e9bc5b78a Author: J. Bruce Fields Date: Sun Nov 11 15:43:12 2007 -0500 nfsd: Fix handling of negative lengths in read_buf() The length "nbytes" passed into read_buf should never be negative, but we check only for too-large values of "nbytes", not for too-small values. Make nbytes unsigned, so it's clear that the former tests are sufficient. (Despite this, read_buf() currently correctly returns an xdr error in the case of a negative length, thanks to an unsigned comparison with size_of() and bounds-checking in kmalloc(). This seems very fragile, though.) Signed-off-by: J. Bruce Fields commit b36b584afab2b692a248097a69120a123a8f98ed Author: J. Bruce Fields Date: Tue Nov 6 14:15:19 2007 -0500 knfsd: fix cache.c comment The path here must be left over from some earlier draft; fix it. And do some more minor cleanup while we're there. Signed-off-by: J. Bruce Fields commit 9be605cb56ada24e48b69a25d98b5c51823fb10e Author: Chuck Lever Date: Thu Nov 1 16:57:25 2007 -0400 NFSD: Path name length signage in nfsd request argument structures Clean up: For consistency, store the length of path name strings in nfsd argument structures as unsigned integers. Signed-off-by: Chuck Lever Acked-By: NeilBrown Signed-off-by: J.
Bruce Fields commit 23f889b97d8d7fdcab402998e960eb877693c83a Author: Chuck Lever Date: Thu Nov 1 16:57:20 2007 -0400 NFSD: Fix mixed sign comparison in nfs3svc_decode_symlinkargs Signed-off-by: Chuck Lever Acked-By: NeilBrown Signed-off-by: J. Bruce Fields commit f27f08ad35146006ca987d276eae253ea6417a8b Author: Chuck Lever Date: Thu Nov 1 16:57:14 2007 -0400 NFSD: Use unsigned length argument for decode_pathname Clean up: path name lengths are unsigned on the wire, negative lengths are not meaningful natively either. Signed-off-by: Chuck Lever Acked-By: NeilBrown Signed-off-by: J. Bruce Fields commit 370967d1e2e0918e1b657f707848155c5b933b92 Author: Chuck Lever Date: Thu Nov 1 16:57:09 2007 -0400 NFSD: Adjust filename length argument of nfsd_lookup Clean up: adjust the sign of the length argument of nfsd_lookup and nfsd_lookup_dentry, for consistency with recent changes. NFSD version 4 callers already pass an unsigned file name length. Signed-off-by: Chuck Lever Acked-By: NeilBrown Signed-off-by: J. Bruce Fields commit e7280d4b3b82036f5cede2547fed7db81a94689c Author: Chuck Lever Date: Thu Nov 1 16:57:04 2007 -0400 NFSD: File name length signage in nfsd request argument structures Clean up: For consistency, store the length of file name strings in nfsd argument structures as unsigned integers. This matches the XDR routines and client argument structures for the same operation types. Signed-off-by: Chuck Lever Acked-By: NeilBrown Signed-off-by: J. Bruce Fields commit 21c00418967713ddf66f4b6c4f7df2471afdeba9 Author: Chuck Lever Date: Thu Nov 1 16:56:58 2007 -0400 NFSD: Use unsigned length argument for decode_filename Clean up: file name lengths are unsigned on the wire, negative lengths are not meaningful natively either. Signed-off-by: Chuck Lever Acked-By: NeilBrown Signed-off-by: J. Bruce Fields commit 6ddeeeb552597cc5a5f16596c6ae3c55ae8dc658 Author: Chuck Lever Date: Thu Nov 1 16:56:53 2007 -0400 NLM: Fix sign of length of NLM variable length strings According to The Open Group's NLM specification, NLM callers are variable length strings. XDR variable length strings use an unsigned 32 bit length. And internally, negative string lengths are not meaningful for the Linux NLM implementation. Clean up: Make nlm_lock.len and nlm_reboot.len unsigned integers. This makes the sign of NLM string lengths consistent with the sign of xdr_netobj lengths. Signed-off-by: Chuck Lever Acked-By: NeilBrown Signed-off-by: J. Bruce Fields commit 19005f7e94cb23ae687aa7f23a4cd9605d045987 Author: Chuck Lever Date: Thu Nov 1 16:56:47 2007 -0400 SUNRPC: Use unsigned string lengths in xdr_decode_string_inplace XDR strings, opaques, and net objects should all use unsigned lengths. To wit, RFC 4506 says: 4.2. Unsigned Integer An XDR unsigned integer is a 32-bit datum that encodes a non-negative integer in the range [0,4294967295]. ... 4.11. String The standard defines a string of n (numbered 0 through n-1) ASCII bytes to be the number n encoded as an unsigned integer (as described above), and followed by the n bytes of the string. After this patch, xdr_decode_string_inplace now matches the other XDR string and array helpers that take a string length argument. See: xdr_encode_opaque_fixed, xdr_encode_opaque, xdr_encode_array Signed-off-by: Chuck Lever Acked-By: NeilBrown Signed-off-by: J. 
Bruce Fields commit 82393bd6701fb2274a0af154ca86e8696536ccee Author: Chuck Lever Date: Fri Oct 26 13:31:20 2007 -0400 SUNRPC: Prevent length underflow in read_flush() Make sure we compare an unsigned length to an unsigned count in read_flush(). Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields commit 86383e878e20070995c351bc669ff25a28510fbf Author: J. Bruce Fields Date: Fri Oct 26 13:32:50 2007 -0400 knfsd: fix broken length check in nfs4idmap.c Obviously at some point we thought "error" represented the length when positive. This appears to be a long-standing typo. Thanks to Prasad Potluri for finding the problem and proposing an earlier version of this patch. Cc: Steve French Cc: Prasad V Potluri Signed-off-by: J. Bruce Fields commit a3fe04c90c9052ec4029167698317f64559ad1ca Author: J. Bruce Fields Date: Thu Oct 25 19:00:26 2007 -0400 nfsd: move callback rpc_client creation into separate thread The whole reason to move this callback-channel probe into a separate thread was because (for now) we don't have an easy way to create the rpc_client asynchronously. But I forgot to move the rpc_create() to the spawned thread. Doh! Fix that. This also narrows the window for a preexisting race that occurs when two probes are initiated at the same time; further patches will close that window. Signed-off-by: J. Bruce Fields commit 2f27490172599187f3307725e49cae082828c794 Author: Prasad P Date: Wed Oct 24 15:14:32 2007 -0500 Fix incorrect assignment Dereferenced pointer "dentry" without checking and assigned to inode in the declaration. (We could just delete the NULL checks that follow instead, as we never get to the encode function in this particular case. But it takes a little detective work to verify that fact, so it's probably safer to leave the checks in place.) Cc: Steve French Signed-off-by: Prasad V Potluri Signed-off-by: J. 
Bruce Fields fs/Kconfig | 9 + fs/lockd/host.c | 19 +- fs/lockd/svc.c | 25 +- fs/locks.c | 103 +++-- fs/nfs/callback.c | 4 +- fs/nfsd/auth.h | 22 + fs/nfsd/export.c | 18 +- fs/nfsd/nfs2acl.c | 2 +- fs/nfsd/nfs3xdr.c | 11 +- fs/nfsd/nfs4callback.c | 80 ++-- fs/nfsd/nfs4idmap.c | 28 +- fs/nfsd/nfs4state.c | 159 ++---- fs/nfsd/nfs4xdr.c | 7 +- fs/nfsd/nfscache.c | 28 +- fs/nfsd/nfsctl.c | 118 +++- fs/nfsd/nfsfh.c | 1 + fs/nfsd/nfssvc.c | 8 +- fs/nfsd/nfsxdr.c | 9 +- fs/nfsd/vfs.c | 4 +- include/linux/lockd/lockd.h | 9 +- include/linux/lockd/xdr.h | 4 +- include/linux/nfsd/Kbuild | 1 - include/linux/nfsd/auth.h | 27 - include/linux/nfsd/cache.h | 4 +- include/linux/nfsd/export.h | 2 +- include/linux/nfsd/nfsd.h | 5 +- include/linux/nfsd/syscall.h | 1 - include/linux/nfsd/xdr.h | 14 +- include/linux/nfsd/xdr3.h | 16 +- include/linux/nfsd_idmap.h | 4 +- include/linux/sunrpc/cache.h | 4 +- include/linux/sunrpc/debug.h | 4 +- include/linux/sunrpc/svc.h | 8 +- include/linux/sunrpc/svc_rdma.h | 262 ++++++++ include/linux/sunrpc/svc_xprt.h | 88 +++ include/linux/sunrpc/svcsock.h | 43 +-- include/linux/sunrpc/xdr.h | 3 +- net/sunrpc/Makefile | 7 +- net/sunrpc/auth_gss/svcauth_gss.c | 23 +- net/sunrpc/cache.c | 143 +++-- net/sunrpc/sunrpc_syms.c | 11 +- net/sunrpc/svc.c | 19 +- net/sunrpc/svc_rdma.c | 266 +++++++++ net/sunrpc/svc_rdma_marshal.c | 412 +++++++++++++ net/sunrpc/svc_rdma_recvfrom.c | 576 ++++++++++++++++++ net/sunrpc/svc_rdma_sendto.c | 515 ++++++++++++++++ net/sunrpc/svc_rdma_transport.c | 1070 +++++++++++++++++++++++++++++++++ net/sunrpc/svc_xprt.c | 1030 ++++++++++++++++++++++++++++++++ net/sunrpc/svcauth_unix.c | 54 +- net/sunrpc/svcsock.c | 1176 +++++++++---------------------------- net/sunrpc/sysctl.c | 36 ++ net/sunrpc/xdr.c | 8 +- 52 files changed, 5100 insertions(+), 1400 deletions(-) diff --git a/fs/Kconfig b/fs/Kconfig index 429a002..d0c7db2 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1670,6 +1670,8 @@ config NFSD select CRYPTO_MD5 if NFSD_V4 select CRYPTO if NFSD_V4 select FS_POSIX_ACL if NFSD_V4 + select PROC_FS if NFSD_V4 + select PROC_FS if SUNRPC_GSS help If you want your Linux box to act as an NFS *server*, so that other computers on your local network which support NFS can access certain @@ -1693,6 +1695,13 @@ config NFSD To compile the NFS server support as a module, choose M here: the module will be called nfsd. If unsure, say N. +config NFSD_RDMA + tristate "Provide NFS server over RDMA support (EXPERIMENTAL)" + depends on SUNRPC && NFSD && INFINIBAND && EXPERIMENTAL + help + If you want your NFS server to support RDMA connections, + say M or Y here. If unsure, say N. 
+ config NFSD_V2_ACL bool depends on NFSD diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 572601e..ebec009 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -34,10 +34,10 @@ static DEFINE_MUTEX(nlm_host_mutex); static void nlm_gc_hosts(void); static struct nsm_handle * __nsm_find(const struct sockaddr_in *, - const char *, int, int); + const char *, unsigned int, int); static struct nsm_handle * nsm_find(const struct sockaddr_in *sin, const char *hostname, - int hostname_len); + unsigned int hostname_len); /* * Common host lookup routine for server & client @@ -45,7 +45,8 @@ static struct nsm_handle * nsm_find(const struct sockaddr_in *sin, static struct nlm_host * nlm_lookup_host(int server, const struct sockaddr_in *sin, int proto, int version, const char *hostname, - int hostname_len, const struct sockaddr_in *ssin) + unsigned int hostname_len, + const struct sockaddr_in *ssin) { struct hlist_head *chain; struct hlist_node *pos; @@ -176,7 +177,7 @@ nlm_destroy_host(struct nlm_host *host) */ struct nlm_host * nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version, - const char *hostname, int hostname_len) + const char *hostname, unsigned int hostname_len) { struct sockaddr_in ssin = {0}; @@ -189,7 +190,7 @@ nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version, */ struct nlm_host * nlmsvc_lookup_host(struct svc_rqst *rqstp, - const char *hostname, int hostname_len) + const char *hostname, unsigned int hostname_len) { struct sockaddr_in ssin = {0}; @@ -307,7 +308,8 @@ void nlm_release_host(struct nlm_host *host) * Release all resources held by that peer. */ void nlm_host_rebooted(const struct sockaddr_in *sin, - const char *hostname, int hostname_len, + const char *hostname, + unsigned int hostname_len, u32 new_state) { struct hlist_head *chain; @@ -449,7 +451,7 @@ static DEFINE_MUTEX(nsm_mutex); static struct nsm_handle * __nsm_find(const struct sockaddr_in *sin, - const char *hostname, int hostname_len, + const char *hostname, unsigned int hostname_len, int create) { struct nsm_handle *nsm = NULL; @@ -503,7 +505,8 @@ out: } static struct nsm_handle * -nsm_find(const struct sockaddr_in *sin, const char *hostname, int hostname_len) +nsm_find(const struct sockaddr_in *sin, const char *hostname, + unsigned int hostname_len) { return __nsm_find(sin, hostname, hostname_len, 1); } diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 82e2192..ee4a9bc 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -219,19 +219,6 @@ lockd(struct svc_rqst *rqstp) module_put_and_exit(0); } - -static int find_socket(struct svc_serv *serv, int proto) -{ - struct svc_sock *svsk; - int found = 0; - list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) - if (svsk->sk_sk->sk_protocol == proto) { - found = 1; - break; - } - return found; -} - /* * Make any sockets that are needed but not present. 
* If nlm_udpport or nlm_tcpport were set as module @@ -243,13 +230,13 @@ static int make_socks(struct svc_serv *serv, int proto) int err = 0; if (proto == IPPROTO_UDP || nlm_udpport) - if (!find_socket(serv, IPPROTO_UDP)) - err = svc_makesock(serv, IPPROTO_UDP, nlm_udpport, - SVC_SOCK_DEFAULTS); + if (!svc_find_xprt(serv, "udp", AF_UNSPEC, 0)) + err = svc_create_xprt(serv, "udp", nlm_udpport, + SVC_SOCK_DEFAULTS); if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) - if (!find_socket(serv, IPPROTO_TCP)) - err = svc_makesock(serv, IPPROTO_TCP, nlm_tcpport, - SVC_SOCK_DEFAULTS); + if (!svc_find_xprt(serv, "tcp", AF_UNSPEC, 0)) + err = svc_create_xprt(serv, "tcp", nlm_tcpport, + SVC_SOCK_DEFAULTS); if (err >= 0) { warned = 0; diff --git a/fs/locks.c b/fs/locks.c index 8b8388e..7ffd465 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -512,6 +512,8 @@ static void locks_delete_block(struct file_lock *waiter) unlock_kernel(); } +static int posix_owner_shared(struct file_lock *caller_fl); + /* Insert waiter into blocker's block list. * We use a circular list so that processes can be easily woken up in * the order they blocked. The documentation doesn't require this but @@ -523,7 +525,7 @@ static void locks_insert_block(struct file_lock *blocker, BUG_ON(!list_empty(&waiter->fl_block)); list_add_tail(&waiter->fl_block, &blocker->fl_block); waiter->fl_next = blocker; - if (IS_POSIX(blocker)) + if (IS_POSIX(blocker) && !posix_owner_shared(waiter)) list_add(&waiter->fl_link, &blocked_list); } @@ -683,46 +685,79 @@ posix_test_lock(struct file *filp, struct file_lock *fl) EXPORT_SYMBOL(posix_test_lock); -/* This function tests for deadlock condition before putting a process to - * sleep. The detection scheme is no longer recursive. Recursive was neat, - * but dangerous - we risked stack corruption if the lock data was bad, or - * if the recursion was too deep for any other reason. - * - * We rely on the fact that a task can only be on one lock's wait queue - * at a time. When we find blocked_task on a wait queue we can re-search - * with blocked_task equal to that queue's owner, until either blocked_task - * isn't found, or blocked_task is found on a queue owned by my_task. - * - * Note: the above assumption may not be true when handling lock requests - * from a broken NFS client. But broken NFS clients have a lot more to - * worry about than proper deadlock detection anyway... --okir - * - * However, the failure of this assumption (also possible in the case of - * multiple tasks sharing the same open file table) also means there's no - * guarantee that the loop below will terminate. As a hack, we give up - * after a few iterations. +/* + * Deadlock detection: + * + * We attempt to detect deadlocks that are due purely to posix file + * locks. + * + * We assume that a task can be waiting for at most one lock at a time. + * So for any acquired lock, the process holding that lock may be + * waiting on at most one other lock. That lock in turns may be held by + * someone waiting for at most one other lock. Given a requested lock + * caller_fl which is about to wait for a conflicting lock block_fl, we + * follow this chain of waiters to ensure we are not about to create a + * cycle. + * + * Since we do this before we ever put a process to sleep on a lock, we + * are ensured that there is never a cycle; that is what guarantees that + * the while() loop in posix_locks_deadlock() eventually completes. + * + * Note: the above assumption may not be true when handling lock + * requests from a broken NFS client. 
It may also fail in the presence + * of tasks (such as posix threads) sharing the same open file table. + * + * We don't necessarily care about returning EDEALK correctly in such + * cases, but we do need to avoid cycles in the lock dependency graph in + * order to ensure the loop in posix_locks_deadlock eventually + * terminates. To that end, we enforce the assumption above by refusing + * to return EDEADLK or add to the list of blocked locks in any case + * where a lock owner might be able to block on more than one lock. */ -#define MAX_DEADLK_ITERATIONS 10 +static int posix_owner_shared(struct file_lock *caller_fl) +{ + /* + * The caller is a lock manager (lockd/nfsd), and won't + * necessarily guarantee that a single lock owner won't block on + * two locks at once: + */ + if (caller_fl->fl_lmops && caller_fl->fl_lmops->fl_compare_owner) + return 1; + /* + * Multiple tasks share current->files, also allowing the same + * "owner" to block on two locks at once: + */ + if (current->files == NULL || atomic_read(¤t->files->count) > 1) + return 1; + /* + * The lock is not on behalf of a file manager, and no other + * tasks share this file owner (and, as long as this task is + * stuck waiting for a lock, that's not going to change): + */ + return 0; +} + +/* Find a lock that the owner of the given block_fl is blocking on. */ +static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl) +{ + struct file_lock *fl; + + list_for_each_entry(fl, &blocked_list, fl_link) + if (posix_same_owner(fl, block_fl)) + return fl->fl_next; + return NULL; +} static int posix_locks_deadlock(struct file_lock *caller_fl, struct file_lock *block_fl) { - struct file_lock *fl; - int i = 0; + if (posix_owner_shared(caller_fl)) + return 0; -next_task: - if (posix_same_owner(caller_fl, block_fl)) - return 1; - list_for_each_entry(fl, &blocked_list, fl_link) { - if (posix_same_owner(fl, block_fl)) { - if (i++ > MAX_DEADLK_ITERATIONS) - return 0; - fl = fl->fl_next; - block_fl = fl; - goto next_task; - } - } + while ((block_fl = what_owner_is_waiting_for(block_fl))) + if (posix_same_owner(caller_fl, block_fl)) + return 1; return 0; } diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index a796be5..e27ca14 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -123,8 +123,8 @@ int nfs_callback_up(void) if (!serv) goto out_err; - ret = svc_makesock(serv, IPPROTO_TCP, nfs_callback_set_tcpport, - SVC_SOCK_ANONYMOUS); + ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport, + SVC_SOCK_ANONYMOUS); if (ret <= 0) goto out_destroy; nfs_callback_tcpport = ret; diff --git a/fs/nfsd/auth.h b/fs/nfsd/auth.h new file mode 100644 index 0000000..78b3c0e --- /dev/null +++ b/fs/nfsd/auth.h @@ -0,0 +1,22 @@ +/* + * nfsd-specific authentication stuff. + * uid/gid mapping not yet implemented. + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#ifndef LINUX_NFSD_AUTH_H +#define LINUX_NFSD_AUTH_H + +#define nfsd_luid(rq, uid) ((u32)(uid)) +#define nfsd_lgid(rq, gid) ((u32)(gid)) +#define nfsd_ruid(rq, uid) ((u32)(uid)) +#define nfsd_rgid(rq, gid) ((u32)(gid)) + +/* + * Set the current process's fsuid/fsgid etc to those of the NFS + * client user + */ +int nfsd_setuser(struct svc_rqst *, struct svc_export *); + +#endif /* LINUX_NFSD_AUTH_H */ diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 66d0aeb..cbbc594 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1637,13 +1637,19 @@ exp_verify_string(char *cp, int max) /* * Initialize the exports module. 
*/ -void +int nfsd_export_init(void) { + int rv; dprintk("nfsd: initializing export module.\n"); - cache_register(&svc_export_cache); - cache_register(&svc_expkey_cache); + rv = cache_register(&svc_export_cache); + if (rv) + return rv; + rv = cache_register(&svc_expkey_cache); + if (rv) + cache_unregister(&svc_export_cache); + return rv; } @@ -1670,10 +1676,8 @@ nfsd_export_shutdown(void) exp_writelock(); - if (cache_unregister(&svc_expkey_cache)) - printk(KERN_ERR "nfsd: failed to unregister expkey cache\n"); - if (cache_unregister(&svc_export_cache)) - printk(KERN_ERR "nfsd: failed to unregister export cache\n"); + cache_unregister(&svc_expkey_cache); + cache_unregister(&svc_export_cache); svcauth_unix_purge(); exp_writeunlock(); diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 0e5fa11..d5fca59 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -221,7 +221,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_getaclres *resp) { struct dentry *dentry = resp->fh.fh_dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode; struct kvec *head = rqstp->rq_res.head; unsigned int base; int n; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 2d116d2..1fd897d 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -21,6 +21,7 @@ #include #include #include +#include "auth.h" #define NFSDDBG_FACILITY NFSDDBG_XDR @@ -88,10 +89,10 @@ encode_fh(__be32 *p, struct svc_fh *fhp) * no slashes or null bytes. */ static __be32 * -decode_filename(__be32 *p, char **namp, int *lenp) +decode_filename(__be32 *p, char **namp, unsigned int *lenp) { char *name; - int i; + unsigned int i; if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS3_MAXNAMLEN)) != NULL) { for (i = 0, name = *namp; i < *lenp; i++, name++) { @@ -449,8 +450,7 @@ int nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_symlinkargs *args) { - unsigned int len; - int avail; + unsigned int len, avail; char *old, *new; struct kvec *vec; @@ -483,7 +483,8 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p, /* now copy next page if there is one */ if (len && !avail && rqstp->rq_arg.page_len) { avail = rqstp->rq_arg.page_len; - if (avail > PAGE_SIZE) avail = PAGE_SIZE; + if (avail > PAGE_SIZE) + avail = PAGE_SIZE; old = page_address(rqstp->rq_arg.pages[0]); } while (len && avail && *old) { diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 9d536a8..250bcf3 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -350,30 +350,6 @@ static struct rpc_version * nfs_cb_version[] = { static int do_probe_callback(void *data) { struct nfs4_client *clp = data; - struct nfs4_callback *cb = &clp->cl_callback; - struct rpc_message msg = { - .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], - .rpc_argp = clp, - }; - int status; - - status = rpc_call_sync(cb->cb_client, &msg, RPC_TASK_SOFT); - - if (status) { - rpc_shutdown_client(cb->cb_client); - cb->cb_client = NULL; - } else - atomic_set(&cb->cb_set, 1); - put_nfs4_client(clp); - return 0; -} - -/* - * Set up the callback client and put a NFSPROC4_CB_NULL on the wire... - */ -void -nfsd4_probe_callback(struct nfs4_client *clp) -{ struct sockaddr_in addr; struct nfs4_callback *cb = &clp->cl_callback; struct rpc_timeout timeparms = { @@ -390,13 +366,15 @@ nfsd4_probe_callback(struct nfs4_client *clp) .timeout = &timeparms, .program = program, .version = nfs_cb_version[1]->number, - .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... 
*/ + .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ .flags = (RPC_CLNT_CREATE_NOPING), }; - struct task_struct *t; - - if (atomic_read(&cb->cb_set)) - return; + struct rpc_message msg = { + .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], + .rpc_argp = clp, + }; + struct rpc_clnt *client; + int status; /* Initialize address */ memset(&addr, 0, sizeof(addr)); @@ -416,29 +394,51 @@ nfsd4_probe_callback(struct nfs4_client *clp) program->stats->program = program; /* Create RPC client */ - cb->cb_client = rpc_create(&args); - if (IS_ERR(cb->cb_client)) { + client = rpc_create(&args); + if (IS_ERR(client)) { dprintk("NFSD: couldn't create callback client\n"); + status = PTR_ERR(client); goto out_err; } + status = rpc_call_sync(client, &msg, RPC_TASK_SOFT); + + if (status) + goto out_release_client; + + cb->cb_client = client; + atomic_set(&cb->cb_set, 1); + put_nfs4_client(clp); + return 0; +out_release_client: + rpc_shutdown_client(client); +out_err: + put_nfs4_client(clp); + dprintk("NFSD: warning: no callback path to client %.*s\n", + (int)clp->cl_name.len, clp->cl_name.data); + return status; +} + +/* + * Set up the callback client and put a NFSPROC4_CB_NULL on the wire... + */ +void +nfsd4_probe_callback(struct nfs4_client *clp) +{ + struct task_struct *t; + + if (atomic_read(&clp->cl_callback.cb_set)) + return; + /* the task holds a reference to the nfs4_client struct */ atomic_inc(&clp->cl_count); t = kthread_run(do_probe_callback, clp, "nfs4_cb_probe"); if (IS_ERR(t)) - goto out_release_clp; + atomic_dec(&clp->cl_count); return; - -out_release_clp: - atomic_dec(&clp->cl_count); - rpc_shutdown_client(cb->cb_client); -out_err: - cb->cb_client = NULL; - dprintk("NFSD: warning: no callback path to client %.*s\n", - (int)clp->cl_name.len, clp->cl_name.data); } /* diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 4c0c683..996bd88 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -255,13 +255,10 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen) goto out; if (len == 0) set_bit(CACHE_NEGATIVE, &ent.h.flags); - else { - if (error >= IDMAP_NAMESZ) { - error = -EINVAL; - goto out; - } + else if (len >= IDMAP_NAMESZ) + goto out; + else memcpy(ent.name, buf1, sizeof(ent.name)); - } error = -ENOMEM; res = idtoname_update(&ent, res); if (res == NULL) @@ -467,20 +464,25 @@ nametoid_update(struct ent *new, struct ent *old) * Exported API */ -void +int nfsd_idmap_init(void) { - cache_register(&idtoname_cache); - cache_register(&nametoid_cache); + int rv; + + rv = cache_register(&idtoname_cache); + if (rv) + return rv; + rv = cache_register(&nametoid_cache); + if (rv) + cache_unregister(&idtoname_cache); + return rv; } void nfsd_idmap_shutdown(void) { - if (cache_unregister(&idtoname_cache)) - printk(KERN_ERR "nfsd: failed to unregister idtoname cache\n"); - if (cache_unregister(&nametoid_cache)) - printk(KERN_ERR "nfsd: failed to unregister nametoid cache\n"); + cache_unregister(&idtoname_cache); + cache_unregister(&nametoid_cache); } /* diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 31673cd..22eb8d2 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -492,14 +492,14 @@ gen_clid(struct nfs4_client *clp) { } static void -gen_confirm(struct nfs4_client *clp) { - struct timespec tv; - u32 * p; +gen_confirm(struct nfs4_client *clp) +{ + static u32 i = 0; + u32 *p; - tv = CURRENT_TIME; p = (u32 *)clp->cl_confirm.data; - *p++ = tv.tv_sec; - *p++ = tv.tv_nsec; + *p++ = get_seconds(); + *p++ = i++; } static int @@ -683,39 +683,6 @@ 
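Several of the init paths in this series (nfsd_export_init(), nfsd_idmap_init(), and later gss_svc_init()) now share the same shape once cache_register() returns an errno: register in order, unwind in reverse on failure, and mirror that order again at shutdown. A minimal standalone sketch of that shape, using placeholder register_cache()/unregister_cache() stubs rather than the real kernel calls:

/* Stand-ins for cache_register()/cache_unregister(); not real kernel APIs. */
static int register_cache(int *cache)   { *cache = 1; return 0; }
static void unregister_cache(int *cache) { *cache = 0; }

static int example_init(int *first, int *second)
{
	int rv;

	rv = register_cache(first);
	if (rv)
		return rv;			/* nothing registered yet, nothing to undo */
	rv = register_cache(second);
	if (rv)
		unregister_cache(first);	/* unwind in reverse order */
	return rv;
}

static void example_shutdown(int *first, int *second)
{
	/* teardown mirrors init, last-registered first */
	unregister_cache(second);
	unregister_cache(first);
}

The same reverse-order unwinding is what the reworked init_nfsd() later in this diff does with its out_free_* labels.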
out_err: return; } -/* - * RFC 3010 has a complex implmentation description of processing a - * SETCLIENTID request consisting of 5 bullets, labeled as - * CASE0 - CASE4 below. - * - * NOTES: - * callback information will be processed in a future patch - * - * an unconfirmed record is added when: - * NORMAL (part of CASE 4): there is no confirmed nor unconfirmed record. - * CASE 1: confirmed record found with matching name, principal, - * verifier, and clientid. - * CASE 2: confirmed record found with matching name, principal, - * and there is no unconfirmed record with matching - * name and principal - * - * an unconfirmed record is replaced when: - * CASE 3: confirmed record found with matching name, principal, - * and an unconfirmed record is found with matching - * name, principal, and with clientid and - * confirm that does not match the confirmed record. - * CASE 4: there is no confirmed record with matching name and - * principal. there is an unconfirmed record with - * matching name, principal. - * - * an unconfirmed record is deleted when: - * CASE 1: an unconfirmed record that matches input name, verifier, - * and confirmed clientid. - * CASE 4: any unconfirmed records with matching name and principal - * that exist after an unconfirmed record has been replaced - * as described above. - * - */ __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setclientid *setclid) @@ -748,11 +715,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); conf = find_confirmed_client_by_str(dname, strhashval); if (conf) { - /* - * CASE 0: - * clname match, confirmed, different principal - * or different ip_address - */ + /* RFC 3530 14.2.33 CASE 0: */ status = nfserr_clid_inuse; if (!same_creds(&conf->cl_cred, &rqstp->rq_cred) || conf->cl_addr != sin->sin_addr.s_addr) { @@ -761,12 +724,17 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } } + /* + * section 14.2.33 of RFC 3530 (under the heading "IMPLEMENTATION") + * has a description of SETCLIENTID request processing consisting + * of 5 bullet points, labeled as CASE0 - CASE4 below. + */ unconf = find_unconfirmed_client_by_str(dname, strhashval); status = nfserr_resource; if (!conf) { - /* - * CASE 4: - * placed first, because it is the normal case. + /* + * RFC 3530 14.2.33 CASE 4: + * placed first, because it is the normal case */ if (unconf) expire_client(unconf); @@ -776,17 +744,8 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, gen_clid(new); } else if (same_verf(&conf->cl_verifier, &clverifier)) { /* - * CASE 1: - * cl_name match, confirmed, principal match - * verifier match: probable callback update - * - * remove any unconfirmed nfs4_client with - * matching cl_name, cl_verifier, and cl_clientid - * - * create and insert an unconfirmed nfs4_client with same - * cl_name, cl_verifier, and cl_clientid as existing - * nfs4_client, but with the new callback info and a - * new cl_confirm + * RFC 3530 14.2.33 CASE 1: + * probable callback update */ if (unconf) { /* Note this is removing unconfirmed {*x***}, @@ -802,43 +761,25 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, copy_clid(new, conf); } else if (!unconf) { /* - * CASE 2: - * clname match, confirmed, principal match - * verfier does not match - * no unconfirmed. 
create a new unconfirmed nfs4_client - * using input clverifier, clname, and callback info - * and generate a new cl_clientid and cl_confirm. + * RFC 3530 14.2.33 CASE 2: + * probable client reboot; state will be removed if + * confirmed. */ new = create_client(clname, dname); if (new == NULL) goto out; gen_clid(new); - } else if (!same_verf(&conf->cl_confirm, &unconf->cl_confirm)) { - /* - * CASE3: - * confirmed found (name, principal match) - * confirmed verifier does not match input clverifier - * - * unconfirmed found (name match) - * confirmed->cl_confirm != unconfirmed->cl_confirm - * - * remove unconfirmed. - * - * create an unconfirmed nfs4_client - * with same cl_name as existing confirmed nfs4_client, - * but with new callback info, new cl_clientid, - * new cl_verifier and a new cl_confirm + } else { + /* + * RFC 3530 14.2.33 CASE 3: + * probable client reboot; state will be removed if + * confirmed. */ expire_client(unconf); new = create_client(clname, dname); if (new == NULL) goto out; gen_clid(new); - } else { - /* No cases hit !!! */ - status = nfserr_inval; - goto out; - } copy_verf(new, &clverifier); new->cl_addr = sin->sin_addr.s_addr; @@ -857,11 +798,9 @@ out: /* - * RFC 3010 has a complex implmentation description of processing a - * SETCLIENTID_CONFIRM request consisting of 4 bullets describing - * processing on a DRC miss, labeled as CASE1 - CASE4 below. - * - * NOTE: callback information will be processed here in a future patch + * Section 14.2.34 of RFC 3530 (under the heading "IMPLEMENTATION") has + * a description of SETCLIENTID_CONFIRM request processing consisting of 4 + * bullets, labeled as CASE1 - CASE4 below. */ __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, @@ -892,16 +831,13 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, if (unconf && unconf->cl_addr != sin->sin_addr.s_addr) goto out; - if ((conf && unconf) && - (same_verf(&unconf->cl_confirm, &confirm)) && - (same_verf(&conf->cl_verifier, &unconf->cl_verifier)) && - (same_name(conf->cl_recdir,unconf->cl_recdir)) && - (!same_verf(&conf->cl_confirm, &unconf->cl_confirm))) { - /* CASE 1: - * unconf record that matches input clientid and input confirm. - * conf record that matches input clientid. - * conf and unconf records match names, verifiers - */ + /* + * section 14.2.34 of RFC 3530 has a description of + * SETCLIENTID_CONFIRM request processing consisting + * of 4 bullet points, labeled as CASE1 - CASE4 below. + */ + if (conf && unconf && same_verf(&confirm, &unconf->cl_confirm)) { + /* RFC 3530 14.2.34 CASE 1: */ if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) status = nfserr_clid_inuse; else { @@ -914,27 +850,15 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, status = nfs_ok; } - } else if ((conf && !unconf) || - ((conf && unconf) && - (!same_verf(&conf->cl_verifier, &unconf->cl_verifier) || - !same_name(conf->cl_recdir, unconf->cl_recdir)))) { - /* CASE 2: - * conf record that matches input clientid. - * if unconf record matches input clientid, then - * unconf->cl_name or unconf->cl_verifier don't match the - * conf record. - */ + } else if (conf && !unconf) { + /* RFC 3530 14.2.34 CASE 2: */ if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) status = nfserr_clid_inuse; else status = nfs_ok; } else if (!conf && unconf && same_verf(&unconf->cl_confirm, &confirm)) { - /* CASE 3: - * conf record not found. - * unconf record found. 
- * unconf->cl_confirm matches input confirm - */ + /* RFC 3530 14.2.34 CASE 3: */ if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred)) { status = nfserr_clid_inuse; } else { @@ -953,12 +877,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm))) && (!unconf || (unconf && !same_verf(&unconf->cl_confirm, &confirm)))) { - /* CASE 4: - * conf record not found, or if conf, conf->cl_confirm does not - * match input confirm. - * unconf record not found, or if unconf, unconf->cl_confirm - * does not match input confirm. - */ + /* RFC 3530 14.2.34 CASE 4: */ status = nfserr_stale_clientid; } else { /* check that we have hit one of the cases...*/ diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 5733394..25c7ae2 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -148,7 +148,7 @@ xdr_error: \ } \ } while (0) -static __be32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes) +static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) { /* We want more bytes than seem to be available. * Maybe we need a new page, maybe we have just run out @@ -169,6 +169,11 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes) return NULL; } + /* + * The following memcpy is safe because read_buf is always + * called with nbytes > avail, and the two cases above both + * guarantee p points to at least nbytes bytes. + */ memcpy(p, argp->p, avail); /* step to next page */ argp->p = page_address(argp->pagelist[0]); diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 578f2c9..92cb5ae 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -44,17 +44,18 @@ static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec); */ static DEFINE_SPINLOCK(cache_lock); -void -nfsd_cache_init(void) +int +nfsd_reply_cache_init(void) { struct svc_cacherep *rp; int i; INIT_LIST_HEAD(&lru_head); i = CACHESIZE; - while(i) { + while (i) { rp = kmalloc(sizeof(*rp), GFP_KERNEL); - if (!rp) break; + if (!rp) + goto out_nomem; list_add(&rp->c_lru, &lru_head); rp->c_state = RC_UNUSED; rp->c_type = RC_NOCACHE; @@ -62,23 +63,20 @@ nfsd_cache_init(void) i--; } - if (i) - printk (KERN_ERR "nfsd: cannot allocate all %d cache entries, only got %d\n", - CACHESIZE, CACHESIZE-i); - hash_list = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL); - if (!hash_list) { - nfsd_cache_shutdown(); - printk (KERN_ERR "nfsd: cannot allocate %Zd bytes for hash list\n", - HASHSIZE * sizeof(struct hlist_head)); - return; - } + if (!hash_list) + goto out_nomem; cache_disabled = 0; + return 0; +out_nomem: + printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); + nfsd_reply_cache_shutdown(); + return -ENOMEM; } void -nfsd_cache_shutdown(void) +nfsd_reply_cache_shutdown(void) { struct svc_cacherep *rp; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 77dc989..b2db172 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -503,7 +503,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size) int len = 0; lock_kernel(); if (nfsd_serv) - len = svc_sock_names(buf, nfsd_serv, NULL); + len = svc_xprt_names(nfsd_serv, buf, 0); unlock_kernel(); return len; } @@ -540,7 +540,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size) } return err < 0 ? 
err : 0; } - if (buf[0] == '-') { + if (buf[0] == '-' && isdigit(buf[1])) { char *toclose = kstrdup(buf+1, GFP_KERNEL); int len = 0; if (!toclose) @@ -554,6 +554,52 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size) kfree(toclose); return len; } + /* + * Add a transport listener by writing it's transport name + */ + if (isalpha(buf[0])) { + int err; + char transport[16]; + int port; + if (sscanf(buf, "%15s %4d", transport, &port) == 2) { + err = nfsd_create_serv(); + if (!err) { + if (svc_find_xprt(nfsd_serv, transport, + AF_UNSPEC, port)) + return -EADDRINUSE; + + err = svc_create_xprt(nfsd_serv, + transport, port, + SVC_SOCK_ANONYMOUS); + } + return err < 0 ? err : 0; + } + } + /* + * Remove a transport by writing it's transport name and port number + */ + if (buf[0] == '-' && isalpha(buf[1])) { + struct svc_xprt *xprt; + int err = -EINVAL; + char transport[16]; + int port; + if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) { + if (port == 0) + return -EINVAL; + lock_kernel(); + if (nfsd_serv) { + xprt = svc_find_xprt(nfsd_serv, transport, + AF_UNSPEC, port); + if (xprt) { + svc_close_xprt(xprt); + err = 0; + } else + err = -ENOENT; + } + unlock_kernel(); + return err < 0 ? err : 0; + } + } return -EINVAL; } @@ -674,6 +720,27 @@ static struct file_system_type nfsd_fs_type = { .kill_sb = kill_litter_super, }; +#ifdef CONFIG_PROC_FS +static inline int create_proc_exports_entry(void) +{ + struct proc_dir_entry *entry; + + entry = proc_mkdir("fs/nfs", NULL); + if (!entry) + return -ENOMEM; + entry = create_proc_entry("fs/nfs/exports", 0, NULL); + if (!entry) + return -ENOMEM; + entry->proc_fops = &exports_operations; + return 0; +} +#else /* CONFIG_PROC_FS */ +static inline int create_proc_exports_entry(void) +{ + return 0; +} +#endif + static int __init init_nfsd(void) { int retval; @@ -683,32 +750,43 @@ static int __init init_nfsd(void) if (retval) return retval; nfsd_stat_init(); /* Statistics */ - nfsd_cache_init(); /* RPC reply cache */ - nfsd_export_init(); /* Exports table */ + retval = nfsd_reply_cache_init(); + if (retval) + goto out_free_stat; + retval = nfsd_export_init(); + if (retval) + goto out_free_cache; nfsd_lockd_init(); /* lockd->nfsd callbacks */ - nfsd_idmap_init(); /* Name to ID mapping */ - if (proc_mkdir("fs/nfs", NULL)) { - struct proc_dir_entry *entry; - entry = create_proc_entry("fs/nfs/exports", 0, NULL); - if (entry) - entry->proc_fops = &exports_operations; - } + retval = nfsd_idmap_init(); + if (retval) + goto out_free_lockd; + retval = create_proc_exports_entry(); + if (retval) + goto out_free_idmap; retval = register_filesystem(&nfsd_fs_type); - if (retval) { - nfsd_export_shutdown(); - nfsd_cache_shutdown(); - remove_proc_entry("fs/nfs/exports", NULL); - remove_proc_entry("fs/nfs", NULL); - nfsd_stat_shutdown(); - nfsd_lockd_shutdown(); - } + if (retval) + goto out_free_all; + return 0; +out_free_all: + remove_proc_entry("fs/nfs/exports", NULL); + remove_proc_entry("fs/nfs", NULL); +out_free_idmap: + nfsd_idmap_shutdown(); +out_free_lockd: + nfsd_lockd_shutdown(); + nfsd_export_shutdown(); +out_free_cache: + nfsd_reply_cache_shutdown(); +out_free_stat: + nfsd_stat_shutdown(); + nfsd4_free_slabs(); return retval; } static void __exit exit_nfsd(void) { nfsd_export_shutdown(); - nfsd_cache_shutdown(); + nfsd_reply_cache_shutdown(); remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); nfsd_stat_shutdown(); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 468f17a..8fbd2dc 100644 --- a/fs/nfsd/nfsfh.c +++ 
b/fs/nfsd/nfsfh.c @@ -22,6 +22,7 @@ #include #include #include +#include "auth.h" #define NFSDDBG_FACILITY NFSDDBG_FH diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 1190aea..9647b0f 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -155,8 +155,8 @@ static int killsig; /* signal that was used to kill last nfsd */ static void nfsd_last_thread(struct svc_serv *serv) { /* When last nfsd thread exits we need to do some clean-up */ - struct svc_sock *svsk; - list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) + struct svc_xprt *xprt; + list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) lockd_down(); nfsd_serv = NULL; nfsd_racache_shutdown(); @@ -236,7 +236,7 @@ static int nfsd_init_socks(int port) error = lockd_up(IPPROTO_UDP); if (error >= 0) { - error = svc_makesock(nfsd_serv, IPPROTO_UDP, port, + error = svc_create_xprt(nfsd_serv, "udp", port, SVC_SOCK_DEFAULTS); if (error < 0) lockd_down(); @@ -247,7 +247,7 @@ static int nfsd_init_socks(int port) #ifdef CONFIG_NFSD_TCP error = lockd_up(IPPROTO_TCP); if (error >= 0) { - error = svc_makesock(nfsd_serv, IPPROTO_TCP, port, + error = svc_create_xprt(nfsd_serv, "tcp", port, SVC_SOCK_DEFAULTS); if (error < 0) lockd_down(); diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 986f9b3..383d6d2 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -15,6 +15,7 @@ #include #include #include +#include "auth.h" #define NFSDDBG_FACILITY NFSDDBG_XDR @@ -62,10 +63,10 @@ encode_fh(__be32 *p, struct svc_fh *fhp) * no slashes or null bytes. */ static __be32 * -decode_filename(__be32 *p, char **namp, int *lenp) +decode_filename(__be32 *p, char **namp, unsigned int *lenp) { char *name; - int i; + unsigned int i; if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXNAMLEN)) != NULL) { for (i = 0, name = *namp; i < *lenp; i++, name++) { @@ -78,10 +79,10 @@ decode_filename(__be32 *p, char **namp, int *lenp) } static __be32 * -decode_pathname(__be32 *p, char **namp, int *lenp) +decode_pathname(__be32 *p, char **namp, unsigned int *lenp) { char *name; - int i; + unsigned int i; if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXPATHLEN)) != NULL) { for (i = 0, name = *namp; i < *lenp; i++, name++) { diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d019918..755ba43 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -132,7 +132,7 @@ out: __be32 nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, - const char *name, int len, + const char *name, unsigned int len, struct svc_export **exp_ret, struct dentry **dentry_ret) { struct svc_export *exp; @@ -226,7 +226,7 @@ out_nfserr: */ __be32 nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, - int len, struct svc_fh *resfh) + unsigned int len, struct svc_fh *resfh) { struct svc_export *exp; struct dentry *dentry; diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index e2d1ce3..4babb2a 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -173,14 +173,17 @@ void nlmclnt_next_cookie(struct nlm_cookie *); /* * Host cache */ -struct nlm_host * nlmclnt_lookup_host(const struct sockaddr_in *, int, int, const char *, int); -struct nlm_host * nlmsvc_lookup_host(struct svc_rqst *, const char *, int); +struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *, int, int, + const char *, unsigned int); +struct nlm_host *nlmsvc_lookup_host(struct svc_rqst *, const char *, + unsigned int); struct rpc_clnt * nlm_bind_host(struct nlm_host *); void nlm_rebind_host(struct nlm_host *); struct nlm_host * nlm_get_host(struct 
nlm_host *); void nlm_release_host(struct nlm_host *); void nlm_shutdown_hosts(void); -extern void nlm_host_rebooted(const struct sockaddr_in *, const char *, int, u32); +extern void nlm_host_rebooted(const struct sockaddr_in *, const char *, + unsigned int, u32); void nsm_release(struct nsm_handle *); diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h index 83a1f9f..df18fa0 100644 --- a/include/linux/lockd/xdr.h +++ b/include/linux/lockd/xdr.h @@ -29,7 +29,7 @@ struct svc_rqst; /* Lock info passed via NLM */ struct nlm_lock { char * caller; - int len; /* length of "caller" */ + unsigned int len; /* length of "caller" */ struct nfs_fh fh; struct xdr_netobj oh; u32 svid; @@ -78,7 +78,7 @@ struct nlm_res { */ struct nlm_reboot { char * mon; - int len; + unsigned int len; u32 state; __be32 addr; __be32 vers; diff --git a/include/linux/nfsd/Kbuild b/include/linux/nfsd/Kbuild index d9c5455..e726fc3 100644 --- a/include/linux/nfsd/Kbuild +++ b/include/linux/nfsd/Kbuild @@ -4,4 +4,3 @@ unifdef-y += stats.h unifdef-y += syscall.h unifdef-y += nfsfh.h unifdef-y += debug.h -unifdef-y += auth.h diff --git a/include/linux/nfsd/auth.h b/include/linux/nfsd/auth.h deleted file mode 100644 index 0fb9f72..0000000 --- a/include/linux/nfsd/auth.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * include/linux/nfsd/auth.h - * - * nfsd-specific authentication stuff. - * uid/gid mapping not yet implemented. - * - * Copyright (C) 1995, 1996 Olaf Kirch - */ - -#ifndef LINUX_NFSD_AUTH_H -#define LINUX_NFSD_AUTH_H - -#ifdef __KERNEL__ - -#define nfsd_luid(rq, uid) ((u32)(uid)) -#define nfsd_lgid(rq, gid) ((u32)(gid)) -#define nfsd_ruid(rq, uid) ((u32)(uid)) -#define nfsd_rgid(rq, gid) ((u32)(gid)) - -/* - * Set the current process's fsuid/fsgid etc to those of the NFS - * client user - */ -int nfsd_setuser(struct svc_rqst *, struct svc_export *); - -#endif /* __KERNEL__ */ -#endif /* LINUX_NFSD_AUTH_H */ diff --git a/include/linux/nfsd/cache.h b/include/linux/nfsd/cache.h index 007480c..7b5d784 100644 --- a/include/linux/nfsd/cache.h +++ b/include/linux/nfsd/cache.h @@ -72,8 +72,8 @@ enum { */ #define RC_DELAY (HZ/5) -void nfsd_cache_init(void); -void nfsd_cache_shutdown(void); +int nfsd_reply_cache_init(void); +void nfsd_reply_cache_shutdown(void); int nfsd_cache_lookup(struct svc_rqst *, int); void nfsd_cache_update(struct svc_rqst *, int, __be32 *); diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h index bcb7aba..3a16872 100644 --- a/include/linux/nfsd/export.h +++ b/include/linux/nfsd/export.h @@ -122,7 +122,7 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp); /* * Function declarations */ -void nfsd_export_init(void); +int nfsd_export_init(void); void nfsd_export_shutdown(void); void nfsd_export_flush(void); void exp_readlock(void); diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index 604a0d7..8caf4c4 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -20,7 +20,6 @@ #include #include #include -#include #include /* * nfsd version @@ -70,9 +69,9 @@ void nfsd_racache_shutdown(void); int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, struct svc_export **expp); __be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, - const char *, int, struct svc_fh *); + const char *, unsigned int, struct svc_fh *); __be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *, - const char *, int, + const char *, unsigned int, struct svc_export **, struct dentry **); __be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *, 
struct iattr *, int, time_t); diff --git a/include/linux/nfsd/syscall.h b/include/linux/nfsd/syscall.h index 8bcddcc..4e43976 100644 --- a/include/linux/nfsd/syscall.h +++ b/include/linux/nfsd/syscall.h @@ -18,7 +18,6 @@ #include #include #include -#include /* * Version of the syscall interface diff --git a/include/linux/nfsd/xdr.h b/include/linux/nfsd/xdr.h index 67885d5..a0132ef 100644 --- a/include/linux/nfsd/xdr.h +++ b/include/linux/nfsd/xdr.h @@ -23,7 +23,7 @@ struct nfsd_sattrargs { struct nfsd_diropargs { struct svc_fh fh; char * name; - int len; + unsigned int len; }; struct nfsd_readargs { @@ -43,17 +43,17 @@ struct nfsd_writeargs { struct nfsd_createargs { struct svc_fh fh; char * name; - int len; + unsigned int len; struct iattr attrs; }; struct nfsd_renameargs { struct svc_fh ffh; char * fname; - int flen; + unsigned int flen; struct svc_fh tfh; char * tname; - int tlen; + unsigned int tlen; }; struct nfsd_readlinkargs { @@ -65,15 +65,15 @@ struct nfsd_linkargs { struct svc_fh ffh; struct svc_fh tfh; char * tname; - int tlen; + unsigned int tlen; }; struct nfsd_symlinkargs { struct svc_fh ffh; char * fname; - int flen; + unsigned int flen; char * tname; - int tlen; + unsigned int tlen; struct iattr attrs; }; diff --git a/include/linux/nfsd/xdr3.h b/include/linux/nfsd/xdr3.h index 89d9d60..421eddd 100644 --- a/include/linux/nfsd/xdr3.h +++ b/include/linux/nfsd/xdr3.h @@ -21,7 +21,7 @@ struct nfsd3_sattrargs { struct nfsd3_diropargs { struct svc_fh fh; char * name; - int len; + unsigned int len; }; struct nfsd3_accessargs { @@ -48,7 +48,7 @@ struct nfsd3_writeargs { struct nfsd3_createargs { struct svc_fh fh; char * name; - int len; + unsigned int len; int createmode; struct iattr attrs; __be32 * verf; @@ -57,7 +57,7 @@ struct nfsd3_createargs { struct nfsd3_mknodargs { struct svc_fh fh; char * name; - int len; + unsigned int len; __u32 ftype; __u32 major, minor; struct iattr attrs; @@ -66,10 +66,10 @@ struct nfsd3_mknodargs { struct nfsd3_renameargs { struct svc_fh ffh; char * fname; - int flen; + unsigned int flen; struct svc_fh tfh; char * tname; - int tlen; + unsigned int tlen; }; struct nfsd3_readlinkargs { @@ -81,15 +81,15 @@ struct nfsd3_linkargs { struct svc_fh ffh; struct svc_fh tfh; char * tname; - int tlen; + unsigned int tlen; }; struct nfsd3_symlinkargs { struct svc_fh ffh; char * fname; - int flen; + unsigned int flen; char * tname; - int tlen; + unsigned int tlen; struct iattr attrs; }; diff --git a/include/linux/nfsd_idmap.h b/include/linux/nfsd_idmap.h index e82746f..f5dd037 100644 --- a/include/linux/nfsd_idmap.h +++ b/include/linux/nfsd_idmap.h @@ -44,10 +44,10 @@ #define IDMAP_NAMESZ 128 #ifdef CONFIG_NFSD_V4 -void nfsd_idmap_init(void); +int nfsd_idmap_init(void); void nfsd_idmap_shutdown(void); #else -static inline void nfsd_idmap_init(void) {}; +static inline int nfsd_idmap_init(void) {}; static inline void nfsd_idmap_shutdown(void) {}; #endif diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index bd7a6b0..03547d6 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -169,8 +169,8 @@ extern int cache_check(struct cache_detail *detail, extern void cache_flush(void); extern void cache_purge(struct cache_detail *detail); #define NEVER (0x7FFFFFFF) -extern void cache_register(struct cache_detail *cd); -extern int cache_unregister(struct cache_detail *cd); +extern int cache_register(struct cache_detail *cd); +extern void cache_unregister(struct cache_detail *cd); extern void qword_add(char **bpp, int *lp, char 
*str); extern void qword_addhex(char **bpp, int *lp, char *buf, int blen); diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index 3912cf1..2ea3980 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -20,7 +20,7 @@ #define RPCDBG_BIND 0x0020 #define RPCDBG_SCHED 0x0040 #define RPCDBG_TRANS 0x0080 -#define RPCDBG_SVCSOCK 0x0100 +#define RPCDBG_SVCXPRT 0x0100 #define RPCDBG_SVCDSP 0x0200 #define RPCDBG_MISC 0x0400 #define RPCDBG_CACHE 0x0800 @@ -52,7 +52,7 @@ extern unsigned int nlm_debug; #define dprintk(args...) dfprintk(FACILITY, ## args) #undef ifdebug -#ifdef RPC_DEBUG +#ifdef RPC_DEBUG # define ifdebug(fac) if (unlikely(rpc_debug & RPCDBG_##fac)) # define dfprintk(fac, args...) do { ifdebug(fac) printk(args); } while(0) # define RPC_IFDEBUG(x) x diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 8531a70..f2ada2a 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -204,7 +204,7 @@ union svc_addr_u { struct svc_rqst { struct list_head rq_list; /* idle list */ struct list_head rq_all; /* all threads list */ - struct svc_sock * rq_sock; /* socket */ + struct svc_xprt * rq_xprt; /* transport ptr */ struct sockaddr_storage rq_addr; /* peer address */ size_t rq_addrlen; @@ -214,9 +214,10 @@ struct svc_rqst { struct auth_ops * rq_authop; /* authentication flavour */ u32 rq_flavor; /* pseudoflavor */ struct svc_cred rq_cred; /* auth info */ - struct sk_buff * rq_skbuff; /* fast recv inet buffer */ + void * rq_xprt_ctxt; /* transport specific context ptr */ struct svc_deferred_req*rq_deferred; /* deferred request we are replaying */ + size_t rq_xprt_hlen; /* xprt header len */ struct xdr_buf rq_arg; struct xdr_buf rq_res; struct page * rq_pages[RPCSVC_MAXPAGES]; @@ -317,11 +318,12 @@ static inline void svc_free_res_pages(struct svc_rqst *rqstp) struct svc_deferred_req { u32 prot; /* protocol (UDP or TCP) */ - struct svc_sock *svsk; + struct svc_xprt *xprt; struct sockaddr_storage addr; /* where reply must go */ size_t addrlen; union svc_addr_u daddr; /* where reply must come from */ struct cache_deferred_req handle; + int xprt_hlen; int argslen; __be32 args[0]; }; diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h new file mode 100644 index 0000000..c11bbcc --- /dev/null +++ b/include/linux/sunrpc/svc_rdma.h @@ -0,0 +1,262 @@ +/* + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker + */ + +#ifndef SVC_RDMA_H +#define SVC_RDMA_H +#include +#include +#include +#include +#include +#define SVCRDMA_DEBUG + +/* RPC/RDMA parameters and stats */ +extern unsigned int svcrdma_ord; +extern unsigned int svcrdma_max_requests; +extern unsigned int svcrdma_max_req_size; + +extern atomic_t rdma_stat_recv; +extern atomic_t rdma_stat_read; +extern atomic_t rdma_stat_write; +extern atomic_t rdma_stat_sq_starve; +extern atomic_t rdma_stat_rq_starve; +extern atomic_t rdma_stat_rq_poll; +extern atomic_t rdma_stat_rq_prod; +extern atomic_t rdma_stat_sq_poll; +extern atomic_t rdma_stat_sq_prod; + +#define RPCRDMA_VERSION 1 + +/* + * Contexts are built when an RDMA request is created and are a + * record of the resources that can be recovered when the request + * completes. + */ +struct svc_rdma_op_ctxt { + struct svc_rdma_op_ctxt *next; + struct xdr_buf arg; + struct list_head dto_q; + enum ib_wr_opcode wr_op; + enum ib_wc_status wc_status; + u32 byte_len; + struct svcxprt_rdma *xprt; + unsigned long flags; + enum dma_data_direction direction; + int count; + struct ib_sge sge[RPCSVC_MAXPAGES]; + struct page *pages[RPCSVC_MAXPAGES]; +}; + +#define RDMACTXT_F_READ_DONE 1 +#define RDMACTXT_F_LAST_CTXT 2 + +struct svcxprt_rdma { + struct svc_xprt sc_xprt; /* SVC transport structure */ + struct rdma_cm_id *sc_cm_id; /* RDMA connection id */ + struct list_head sc_accept_q; /* Conn. waiting accept */ + int sc_ord; /* RDMA read limit */ + wait_queue_head_t sc_read_wait; + int sc_max_sge; + + int sc_sq_depth; /* Depth of SQ */ + atomic_t sc_sq_count; /* Number of SQ WR on queue */ + + int sc_max_requests; /* Depth of RQ */ + int sc_max_req_size; /* Size of each RQ WR buf */ + + struct ib_pd *sc_pd; + + struct svc_rdma_op_ctxt *sc_ctxt_head; + int sc_ctxt_cnt; + int sc_ctxt_bump; + int sc_ctxt_max; + spinlock_t sc_ctxt_lock; + struct list_head sc_rq_dto_q; + spinlock_t sc_rq_dto_lock; + struct ib_qp *sc_qp; + struct ib_cq *sc_rq_cq; + struct ib_cq *sc_sq_cq; + struct ib_mr *sc_phys_mr; /* MR for server memory */ + + spinlock_t sc_lock; /* transport lock */ + + wait_queue_head_t sc_send_wait; /* SQ exhaustion waitlist */ + unsigned long sc_flags; + struct list_head sc_dto_q; /* DTO tasklet I/O pending Q */ + struct list_head sc_read_complete_q; + spinlock_t sc_read_complete_lock; +}; +/* sc_flags */ +#define RDMAXPRT_RQ_PENDING 1 +#define RDMAXPRT_SQ_PENDING 2 +#define RDMAXPRT_CONN_PENDING 3 + +#define RPCRDMA_LISTEN_BACKLOG 10 +/* The default ORD value is based on two outstanding full-size writes with a + * page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ. 
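As a cross-check on the comment above, the default ORD works out to 16 under the stated assumptions (two outstanding full-size 32KB writes, 4KB pages). The EXAMPLE_* names below are purely illustrative, not part of the patch:

enum {
	EXAMPLE_WRITE_SIZE  = 32 * 1024,	/* one full-size write */
	EXAMPLE_PAGE_SIZE   = 4 * 1024,
	EXAMPLE_DEFAULT_ORD = (2 * EXAMPLE_WRITE_SIZE) / EXAMPLE_PAGE_SIZE,
						/* = 16, matching RPCRDMA_ORD (64/4) below */
};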
*/ +#define RPCRDMA_ORD (64/4) +#define RPCRDMA_SQ_DEPTH_MULT 8 +#define RPCRDMA_MAX_THREADS 16 +#define RPCRDMA_MAX_REQUESTS 16 +#define RPCRDMA_MAX_REQ_SIZE 4096 + +/* svc_rdma_marshal.c */ +extern void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *, + int *, int *); +extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *); +extern int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *); +extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, + struct rpcrdma_msg *, + enum rpcrdma_errcode, u32 *); +extern void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *, int); +extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int); +extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int, + u32, u64, u32); +extern void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *, + struct rpcrdma_msg *, + struct rpcrdma_msg *, + enum rpcrdma_proc); +extern int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *); + +/* svc_rdma_recvfrom.c */ +extern int svc_rdma_recvfrom(struct svc_rqst *); + +/* svc_rdma_sendto.c */ +extern int svc_rdma_sendto(struct svc_rqst *); + +/* svc_rdma_transport.c */ +extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *); +extern int svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *, + enum rpcrdma_errcode); +struct page *svc_rdma_get_page(void); +extern int svc_rdma_post_recv(struct svcxprt_rdma *); +extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); +extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); +extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int); +extern void svc_sq_reap(struct svcxprt_rdma *); +extern void svc_rq_reap(struct svcxprt_rdma *); +extern struct svc_xprt_class svc_rdma_class; +extern void svc_rdma_prep_reply_hdr(struct svc_rqst *); + +/* svc_rdma.c */ +extern int svc_rdma_init(void); +extern void svc_rdma_cleanup(void); + +/* + * Returns the address of the first read chunk or NULL if no read chunk is + * present + */ +static inline struct rpcrdma_read_chunk * +svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp) +{ + struct rpcrdma_read_chunk *ch = + (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; + + if (ch->rc_discrim == 0) + return NULL; + + return ch; +} + +/* + * Returns the address of the first read write array element or NULL if no + * write array list is present + */ +static inline struct rpcrdma_write_array * +svc_rdma_get_write_array(struct rpcrdma_msg *rmsgp) +{ + if (rmsgp->rm_body.rm_chunks[0] != 0 + || rmsgp->rm_body.rm_chunks[1] == 0) + return NULL; + + return (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1]; +} + +/* + * Returns the address of the first reply array element or NULL if no + * reply array is present + */ +static inline struct rpcrdma_write_array * +svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp) +{ + struct rpcrdma_read_chunk *rch; + struct rpcrdma_write_array *wr_ary; + struct rpcrdma_write_array *rp_ary; + + /* XXX: Need to fix when reply list may occur with read-list and/or + * write list */ + if (rmsgp->rm_body.rm_chunks[0] != 0 || + rmsgp->rm_body.rm_chunks[1] != 0) + return NULL; + + rch = svc_rdma_get_read_chunk(rmsgp); + if (rch) { + while (rch->rc_discrim) + rch++; + + /* The reply list follows an empty write array located + * at 'rc_position' here. The reply array is at rc_target.
+ */ + rp_ary = (struct rpcrdma_write_array *)&rch->rc_target; + + goto found_it; + } + + wr_ary = svc_rdma_get_write_array(rmsgp); + if (wr_ary) { + rp_ary = (struct rpcrdma_write_array *) + &wr_ary-> + wc_array[wr_ary->wc_nchunks].wc_target.rs_length; + + goto found_it; + } + + /* No read list, no write list */ + rp_ary = (struct rpcrdma_write_array *) + &rmsgp->rm_body.rm_chunks[2]; + + found_it: + if (rp_ary->wc_discrim == 0) + return NULL; + + return rp_ary; +} +#endif diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h new file mode 100644 index 0000000..b7d94ef --- /dev/null +++ b/include/linux/sunrpc/svc_xprt.h @@ -0,0 +1,88 @@ +/* + * linux/include/linux/sunrpc/svc_xprt.h + * + * RPC server transport I/O + */ + +#ifndef SUNRPC_SVC_XPRT_H +#define SUNRPC_SVC_XPRT_H + +#include +#include + +struct svc_xprt_ops { + struct svc_xprt *(*xpo_create)(struct svc_serv *, + struct sockaddr *, int, + int); + struct svc_xprt *(*xpo_accept)(struct svc_xprt *); + int (*xpo_has_wspace)(struct svc_xprt *); + int (*xpo_recvfrom)(struct svc_rqst *); + void (*xpo_prep_reply_hdr)(struct svc_rqst *); + int (*xpo_sendto)(struct svc_rqst *); + void (*xpo_release_rqst)(struct svc_rqst *); + void (*xpo_detach)(struct svc_xprt *); + void (*xpo_free)(struct svc_xprt *); +}; + +struct svc_xprt_class { + const char *xcl_name; + struct module *xcl_owner; + struct svc_xprt_ops *xcl_ops; + struct list_head xcl_list; + u32 xcl_max_payload; +}; + +struct svc_xprt { + struct svc_xprt_class *xpt_class; + struct svc_xprt_ops *xpt_ops; + struct kref xpt_ref; + struct list_head xpt_list; + struct list_head xpt_ready; + unsigned long xpt_flags; +#define XPT_BUSY 0 /* enqueued/receiving */ +#define XPT_CONN 1 /* conn pending */ +#define XPT_CLOSE 2 /* dead or dying */ +#define XPT_DATA 3 /* data pending */ +#define XPT_TEMP 4 /* connected transport */ +#define XPT_DEAD 6 /* transport closed */ +#define XPT_CHNGBUF 7 /* need to change snd/rcv buf sizes */ +#define XPT_DEFERRED 8 /* deferred request pending */ +#define XPT_OLD 9 /* used for xprt aging mark+sweep */ +#define XPT_DETACHED 10 /* detached from tempsocks list */ +#define XPT_LISTENER 11 /* listening endpoint */ +#define XPT_CACHE_AUTH 12 /* cache auth info */ + + struct svc_pool *xpt_pool; /* current pool iff queued */ + struct svc_serv *xpt_server; /* service for transport */ + atomic_t xpt_reserved; /* space on outq that is rsvd */ + struct mutex xpt_mutex; /* to serialize sending data */ + spinlock_t xpt_lock; /* protects sk_deferred + * and xpt_auth_cache */ + void *xpt_auth_cache;/* auth cache */ + struct list_head xpt_deferred; /* deferred requests that need + * to be revisted */ + struct sockaddr_storage xpt_local; /* local address */ + struct sockaddr_storage xpt_remote; /* remote peer's address */ + int xpt_remotelen; /* length of address */ +}; + +int svc_reg_xprt_class(struct svc_xprt_class *); +int svc_unreg_xprt_class(struct svc_xprt_class *); +void svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *, + struct svc_serv *); +int svc_create_xprt(struct svc_serv *, char *, unsigned short, int); +void svc_xprt_received(struct svc_xprt *); +void svc_xprt_enqueue(struct svc_xprt *xprt); +int svc_port_is_privileged(struct sockaddr *sin); +void svc_xprt_put(struct svc_xprt *xprt); +static inline void svc_xprt_get(struct svc_xprt *xprt) +{ + kref_get(&xprt->xpt_ref); +} +void svc_delete_xprt(struct svc_xprt *xprt); +void svc_close_xprt(struct svc_xprt *xprt); +int svc_print_xprts(char *buf, int maxlen); +struct svc_xprt 
*svc_find_xprt(struct svc_serv *serv, char *xprt_class, + int af, int port); +int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen); +#endif /* SUNRPC_SVC_XPRT_H */ diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index a53e0fa..206f092 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -10,42 +10,16 @@ #define SUNRPC_SVCSOCK_H #include +#include /* * RPC server socket. */ struct svc_sock { - struct list_head sk_ready; /* list of ready sockets */ - struct list_head sk_list; /* list of all sockets */ + struct svc_xprt sk_xprt; struct socket * sk_sock; /* berkeley socket layer */ struct sock * sk_sk; /* INET layer */ - struct svc_pool * sk_pool; /* current pool iff queued */ - struct svc_serv * sk_server; /* service for this socket */ - atomic_t sk_inuse; /* use count */ - unsigned long sk_flags; -#define SK_BUSY 0 /* enqueued/receiving */ -#define SK_CONN 1 /* conn pending */ -#define SK_CLOSE 2 /* dead or dying */ -#define SK_DATA 3 /* data pending */ -#define SK_TEMP 4 /* temp (TCP) socket */ -#define SK_DEAD 6 /* socket closed */ -#define SK_CHNGBUF 7 /* need to change snd/rcv buffer sizes */ -#define SK_DEFERRED 8 /* request on sk_deferred */ -#define SK_OLD 9 /* used for temp socket aging mark+sweep */ -#define SK_DETACHED 10 /* detached from tempsocks list */ - - atomic_t sk_reserved; /* space on outq that is reserved */ - - spinlock_t sk_lock; /* protects sk_deferred and - * sk_info_authunix */ - struct list_head sk_deferred; /* deferred requests that need to - * be revisted */ - struct mutex sk_mutex; /* to serialize sending data */ - - int (*sk_recvfrom)(struct svc_rqst *rqstp); - int (*sk_sendto)(struct svc_rqst *rqstp); - /* We keep the old state_change and data_ready CB's here */ void (*sk_ostate)(struct sock *); void (*sk_odata)(struct sock *, int bytes); @@ -54,21 +28,12 @@ struct svc_sock { /* private TCP part */ int sk_reclen; /* length of record */ int sk_tcplen; /* current read length */ - time_t sk_lastrecv; /* time of last received request */ - - /* cache of various info for TCP sockets */ - void *sk_info_authunix; - - struct sockaddr_storage sk_local; /* local address */ - struct sockaddr_storage sk_remote; /* remote peer's address */ - int sk_remotelen; /* length of address */ }; /* * Function prototypes. 
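The slimmed-down svc_sock above keeps only socket-private state and pushes the generic fields into the embedded sk_xprt, the same pattern svcxprt_rdma uses with sc_xprt. A small standalone sketch of that embed-and-recover pattern, with simplified structures and a local container_of purely for illustration:

#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct svc_xprt { unsigned long xpt_flags; };	/* generic transport header */

struct example_sock {
	struct svc_xprt	sk_xprt;	/* embedded generic part, as in svc_sock */
	int		sk_reclen;	/* transport-private state */
};

/* Generic code passes struct svc_xprt * around; a transport recovers its
 * own structure from it like this: */
static struct example_sock *xprt_to_sock(struct svc_xprt *xprt)
{
	return container_of(xprt, struct example_sock, sk_xprt);
}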
*/ -int svc_makesock(struct svc_serv *, int, unsigned short, int flags); -void svc_force_close_socket(struct svc_sock *); +void svc_close_all(struct list_head *); int svc_recv(struct svc_rqst *, long); int svc_send(struct svc_rqst *); void svc_drop(struct svc_rqst *); @@ -78,6 +43,8 @@ int svc_addsock(struct svc_serv *serv, int fd, char *name_return, int *proto); +void svc_init_xprt_sock(void); +void svc_cleanup_xprt_sock(void); /* * svc_makesock socket characteristics diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 0751c94..e4057d7 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -112,7 +112,8 @@ struct xdr_buf { __be32 *xdr_encode_opaque_fixed(__be32 *p, const void *ptr, unsigned int len); __be32 *xdr_encode_opaque(__be32 *p, const void *ptr, unsigned int len); __be32 *xdr_encode_string(__be32 *p, const char *s); -__be32 *xdr_decode_string_inplace(__be32 *p, char **sp, int *lenp, int maxlen); +__be32 *xdr_decode_string_inplace(__be32 *p, char **sp, unsigned int *lenp, + unsigned int maxlen); __be32 *xdr_encode_netobj(__be32 *p, const struct xdr_netobj *); __be32 *xdr_decode_netobj(__be32 *p, struct xdr_netobj *); diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index 5c69a72..6d03dbf 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile @@ -11,6 +11,11 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ auth.o auth_null.o auth_unix.o \ svc.o svcsock.o svcauth.o svcauth_unix.o \ rpcb_clnt.o timer.o xdr.o \ - sunrpc_syms.o cache.o rpc_pipe.o + sunrpc_syms.o cache.o rpc_pipe.o \ + svc_xprt.o sunrpc-$(CONFIG_PROC_FS) += stats.o sunrpc-$(CONFIG_SYSCTL) += sysctl.o + +obj-$(CONFIG_NFSD_RDMA) += svcrdma.o +svcrdma-y := svc_rdma.o svc_rdma_transport.o \ + svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 73940df..aa790bb 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1386,19 +1386,26 @@ int gss_svc_init(void) { int rv = svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss); - if (rv == 0) { - cache_register(&rsc_cache); - cache_register(&rsi_cache); - } + if (rv) + return rv; + rv = cache_register(&rsc_cache); + if (rv) + goto out1; + rv = cache_register(&rsi_cache); + if (rv) + goto out2; + return 0; +out2: + cache_unregister(&rsc_cache); +out1: + svc_auth_unregister(RPC_AUTH_GSS); return rv; } void gss_svc_shutdown(void) { - if (cache_unregister(&rsc_cache)) - printk(KERN_ERR "auth_rpcgss: failed to unregister rsc cache\n"); - if (cache_unregister(&rsi_cache)) - printk(KERN_ERR "auth_rpcgss: failed to unregister rsi cache\n"); + cache_unregister(&rsc_cache); + cache_unregister(&rsi_cache); svc_auth_unregister(RPC_AUTH_GSS); } diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 8e05557..d41fe3c 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -290,44 +290,78 @@ static const struct file_operations cache_flush_operations; static void do_cache_clean(struct work_struct *work); static DECLARE_DELAYED_WORK(cache_cleaner, do_cache_clean); -void cache_register(struct cache_detail *cd) +void remove_cache_proc_entries(struct cache_detail *cd) { - cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc); - if (cd->proc_ent) { - struct proc_dir_entry *p; - cd->proc_ent->owner = cd->owner; - cd->channel_ent = cd->content_ent = NULL; + if (cd->proc_ent == NULL) + return; + if (cd->flush_ent) + remove_proc_entry("flush", cd->proc_ent); + if (cd->channel_ent) + remove_proc_entry("channel", 
cd->proc_ent); + if (cd->content_ent) + remove_proc_entry("content", cd->proc_ent); + cd->proc_ent = NULL; + remove_proc_entry(cd->name, proc_net_rpc); +} - p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR, - cd->proc_ent); - cd->flush_ent = p; - if (p) { - p->proc_fops = &cache_flush_operations; - p->owner = cd->owner; - p->data = cd; - } +#ifdef CONFIG_PROC_FS +int create_cache_proc_entries(struct cache_detail *cd) +{ + struct proc_dir_entry *p; - if (cd->cache_request || cd->cache_parse) { - p = create_proc_entry("channel", S_IFREG|S_IRUSR|S_IWUSR, - cd->proc_ent); - cd->channel_ent = p; - if (p) { - p->proc_fops = &cache_file_operations; - p->owner = cd->owner; - p->data = cd; - } - } - if (cd->cache_show) { - p = create_proc_entry("content", S_IFREG|S_IRUSR|S_IWUSR, - cd->proc_ent); - cd->content_ent = p; - if (p) { - p->proc_fops = &content_file_operations; - p->owner = cd->owner; - p->data = cd; - } - } + cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc); + if (cd->proc_ent == NULL) + goto out_nomem; + cd->proc_ent->owner = cd->owner; + cd->channel_ent = cd->content_ent = NULL; + + p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR, cd->proc_ent); + cd->flush_ent = p; + if (p == NULL) + goto out_nomem; + p->proc_fops = &cache_flush_operations; + p->owner = cd->owner; + p->data = cd; + + if (cd->cache_request || cd->cache_parse) { + p = create_proc_entry("channel", S_IFREG|S_IRUSR|S_IWUSR, + cd->proc_ent); + cd->channel_ent = p; + if (p == NULL) + goto out_nomem; + p->proc_fops = &cache_file_operations; + p->owner = cd->owner; + p->data = cd; } + if (cd->cache_show) { + p = create_proc_entry("content", S_IFREG|S_IRUSR|S_IWUSR, + cd->proc_ent); + cd->content_ent = p; + if (p == NULL) + goto out_nomem; + p->proc_fops = &content_file_operations; + p->owner = cd->owner; + p->data = cd; + } + return 0; +out_nomem: + remove_cache_proc_entries(cd); + return -ENOMEM; +} +#else /* CONFIG_PROC_FS */ +int create_cache_proc_entries(struct cache_detail *cd) +{ + return 0; +} +#endif + +int cache_register(struct cache_detail *cd) +{ + int ret; + + ret = create_cache_proc_entries(cd); + if (ret) + return ret; rwlock_init(&cd->hash_lock); INIT_LIST_HEAD(&cd->queue); spin_lock(&cache_list_lock); @@ -341,9 +375,10 @@ void cache_register(struct cache_detail *cd) /* start the cleaning process */ schedule_delayed_work(&cache_cleaner, 0); + return 0; } -int cache_unregister(struct cache_detail *cd) +void cache_unregister(struct cache_detail *cd) { cache_purge(cd); spin_lock(&cache_list_lock); @@ -351,29 +386,21 @@ int cache_unregister(struct cache_detail *cd) if (cd->entries || atomic_read(&cd->inuse)) { write_unlock(&cd->hash_lock); spin_unlock(&cache_list_lock); - return -EBUSY; + goto out; } if (current_detail == cd) current_detail = NULL; list_del_init(&cd->others); write_unlock(&cd->hash_lock); spin_unlock(&cache_list_lock); - if (cd->proc_ent) { - if (cd->flush_ent) - remove_proc_entry("flush", cd->proc_ent); - if (cd->channel_ent) - remove_proc_entry("channel", cd->proc_ent); - if (cd->content_ent) - remove_proc_entry("content", cd->proc_ent); - - cd->proc_ent = NULL; - remove_proc_entry(cd->name, proc_net_rpc); - } + remove_cache_proc_entries(cd); if (list_empty(&cache_list)) { /* module must be being unloaded so its safe to kill the worker */ cancel_delayed_work_sync(&cache_cleaner); } - return 0; + return; +out: + printk(KERN_ERR "nfsd: failed to unregister %s cache\n", cd->name); } /* clean cache tries to find something to clean @@ -634,13 +661,13 @@ void cache_clean_deferred(void 
*owner) /* * communicate with user-space - * - * We have a magic /proc file - /proc/sunrpc/cache - * On read, you get a full request, or block - * On write, an update request is processed - * Poll works if anything to read, and always allows write + * We have a magic /proc file - /proc/sunrpc/<cachename>/channel. + * On read, you get a full request, or block. + * On write, an update request is processed. + * Poll works if anything to read, and always allows write. * * Implemented by linked list of requests. Each open file has - * a ->private that also exists in this list. New request are added + * a ->private that also exists in this list. New requests are added * to the end and may wakeup and preceding readers. * New readers are added to the head. If, on read, an item is found with * CACHE_UPCALLING clear, we free it from the list. @@ -1242,18 +1269,18 @@ static ssize_t read_flush(struct file *file, char __user *buf, struct cache_detail *cd = PDE(file->f_path.dentry->d_inode)->data; char tbuf[20]; unsigned long p = *ppos; - int len; + size_t len; sprintf(tbuf, "%lu\n", cd->flush_time); len = strlen(tbuf); if (p >= len) return 0; len -= p; - if (len > count) len = count; + if (len > count) + len = count; if (copy_to_user(buf, (void*)(tbuf+p), len)) - len = -EFAULT; - else - *ppos += len; + return -EFAULT; + *ppos += len; return len; } diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 33d89e8..df382f2 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -72,7 +72,6 @@ EXPORT_SYMBOL(svc_drop); EXPORT_SYMBOL(svc_process); EXPORT_SYMBOL(svc_recv); EXPORT_SYMBOL(svc_wake_up); -EXPORT_SYMBOL(svc_makesock); EXPORT_SYMBOL(svc_reserve); EXPORT_SYMBOL(svc_auth_register); EXPORT_SYMBOL(auth_domain_lookup); @@ -151,7 +150,8 @@ init_sunrpc(void) #endif cache_register(&ip_map_cache); cache_register(&unix_gid_cache); - init_socket_xprt(); + svc_init_xprt_sock(); /* svc sock transport */ + init_socket_xprt(); /* clnt sock transport */ rpcauth_init_module(); out: return err; @@ -162,12 +162,11 @@ cleanup_sunrpc(void) { rpcauth_remove_module(); cleanup_socket_xprt(); + svc_cleanup_xprt_sock(); unregister_rpc_pipefs(); rpc_destroy_mempool(); - if (cache_unregister(&ip_map_cache)) - printk(KERN_ERR "sunrpc: failed to unregister ip_map cache\n"); - if (cache_unregister(&unix_gid_cache)) - printk(KERN_ERR "sunrpc: failed to unregister unix_gid cache\n"); + cache_unregister(&ip_map_cache); + cache_unregister(&unix_gid_cache); #ifdef RPC_DEBUG rpc_unregister_sysctl(); #endif diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index a4a6bf7..07c9d8a 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -458,9 +458,6 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize, void svc_destroy(struct svc_serv *serv) { - struct svc_sock *svsk; - struct svc_sock *tmp; - dprintk("svc: svc_destroy(%s, %d)\n", serv->sv_program->pg_name, serv->sv_nrthreads); @@ -475,14 +472,12 @@ svc_destroy(struct svc_serv *serv) del_timer_sync(&serv->sv_temptimer); - list_for_each_entry_safe(svsk, tmp, &serv->sv_tempsocks, sk_list) - svc_force_close_socket(svsk); + svc_close_all(&serv->sv_tempsocks); if (serv->sv_shutdown) serv->sv_shutdown(serv); - list_for_each_entry_safe(svsk, tmp, &serv->sv_permsocks, sk_list) - svc_force_close_socket(svsk); + svc_close_all(&serv->sv_permsocks); BUG_ON(!list_empty(&serv->sv_permsocks)); BUG_ON(!list_empty(&serv->sv_tempsocks)); @@ -839,9 +834,9 @@ svc_process(struct svc_rqst *rqstp) rqstp->rq_res.tail[0].iov_len = 0; /* Will be turned off only in gss privacy case:
*/ rqstp->rq_splice_ok = 1; - /* tcp needs a space for the record length... */ - if (rqstp->rq_prot == IPPROTO_TCP) - svc_putnl(resv, 0); + + /* Setup reply header */ + rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp); rqstp->rq_xid = svc_getu32(argv); svc_putu32(resv, rqstp->rq_xid); @@ -1054,10 +1049,8 @@ err_bad: */ u32 svc_max_payload(const struct svc_rqst *rqstp) { - int max = RPCSVC_MAXPAYLOAD_TCP; + int max = rqstp->rq_xprt->xpt_class->xcl_max_payload; - if (rqstp->rq_sock->sk_sock->type == SOCK_DGRAM) - max = RPCSVC_MAXPAYLOAD_UDP; if (rqstp->rq_server->sv_max_payload < max) max = rqstp->rq_server->sv_max_payload; return max; diff --git a/net/sunrpc/svc_rdma.c b/net/sunrpc/svc_rdma.c new file mode 100644 index 0000000..b9993f4 --- /dev/null +++ b/net/sunrpc/svc_rdma.c @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
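With the svc_process() change above, the record-marking special case moves behind the xpo_prep_reply_hdr method: a stream transport reserves the 4-byte record marker itself, while a datagram transport has nothing to do. A hedged sketch of what the two callbacks presumably look like, reusing the svc_putnl() call that the removed inline code used (not the literal svcsock.c implementation):

/* TCP: reserve space for the record length word, filled in at send time. */
static void example_tcp_prep_reply_hdr(struct svc_rqst *rqstp)
{
	struct kvec *resv = &rqstp->rq_res.head[0];

	svc_putnl(resv, 0);
}

/* UDP: datagrams carry no record marker, so nothing to prepare. */
static void example_udp_prep_reply_hdr(struct svc_rqst *rqstp)
{
}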
+ * + * Author: Tom Tucker + */ +#include +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +/* RPC/RDMA parameters */ +unsigned int svcrdma_ord = RPCRDMA_ORD; +static unsigned int min_ord = 1; +static unsigned int max_ord = 4096; +unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS; +static unsigned int min_max_requests = 4; +static unsigned int max_max_requests = 16384; +unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE; +static unsigned int min_max_inline = 4096; +static unsigned int max_max_inline = 65536; + +#define MAX_RDMA_STAT_LEN 32 +atomic_t rdma_stat_recv; +atomic_t rdma_stat_read; +atomic_t rdma_stat_write; +atomic_t rdma_stat_sq_starve; +atomic_t rdma_stat_rq_starve; +atomic_t rdma_stat_rq_poll; +atomic_t rdma_stat_rq_prod; +atomic_t rdma_stat_sq_poll; +atomic_t rdma_stat_sq_prod; + +/* + * This function implements reading and resetting an atomic_t stat + * variable through read/write to a proc file. Any write to the file + * resets the associated statistic to zero. Any read returns it's + * current value. + */ +static int read_reset_stat(ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + atomic_t *stat = (atomic_t *)table->data; + if (!stat) + return -EINVAL; + + if (write) + atomic_set(stat, 0); + else { + char str_buf[32]; + char *data; + int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat)); + if (len) + return -EFAULT; + len = strlen(str_buf); + if (*ppos > len) { + *lenp = 0; + return 0; + } + data = &str_buf[*ppos]; + len -= *ppos; + if (len > *lenp) + len = *lenp; + if (len) + if (copy_to_user(str_buf, buffer, len)) + return -EFAULT; + *lenp = len; + *ppos += len; + } + return 0; +} + +static struct ctl_table_header *svcrdma_table_header; +static ctl_table svcrdma_parm_table[] = { + { + .procname = "max_requests", + .data = &svcrdma_max_requests, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_max_requests, + .extra2 = &max_max_requests + }, + { + .procname = "max_req_size", + .data = &svcrdma_max_req_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_max_inline, + .extra2 = &max_max_inline + }, + { + .procname = "max_outbound_read_requests", + .data = &svcrdma_ord, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_ord, + .extra2 = &max_ord, + }, + + { + .procname = "rdma_stat_read", + .data = &rdma_stat_read, + .maxlen = MAX_RDMA_STAT_LEN, + .mode = 0644, + .proc_handler = &read_reset_stat, + }, + { + .procname = "rdma_stat_recv", + .data = &rdma_stat_recv, + .maxlen = MAX_RDMA_STAT_LEN, + .mode = 0644, + .proc_handler = &read_reset_stat, + }, + { + .procname = "rdma_stat_write", + .data = &rdma_stat_write, + .maxlen = MAX_RDMA_STAT_LEN, + .mode = 0644, + .proc_handler = &read_reset_stat, + }, + { + .procname = "rdma_stat_sq_starve", + .data = &rdma_stat_sq_starve, + .maxlen = MAX_RDMA_STAT_LEN, + .mode = 0644, + .proc_handler = &read_reset_stat, + }, + { + .procname = "rdma_stat_rq_starve", + .data = &rdma_stat_rq_starve, + .maxlen = MAX_RDMA_STAT_LEN, + .mode = 0644, + .proc_handler = &read_reset_stat, + }, + { + .procname = "rdma_stat_rq_poll", + .data = &rdma_stat_rq_poll, + .maxlen = MAX_RDMA_STAT_LEN, + .mode = 0644, + .proc_handler = &read_reset_stat, + }, + { + .procname 
= "rdma_stat_rq_prod", + .data = &rdma_stat_rq_prod, + .maxlen = MAX_RDMA_STAT_LEN, + .mode = 0644, + .proc_handler = &read_reset_stat, + }, + { + .procname = "rdma_stat_sq_poll", + .data = &rdma_stat_sq_poll, + .maxlen = MAX_RDMA_STAT_LEN, + .mode = 0644, + .proc_handler = &read_reset_stat, + }, + { + .procname = "rdma_stat_sq_prod", + .data = &rdma_stat_sq_prod, + .maxlen = MAX_RDMA_STAT_LEN, + .mode = 0644, + .proc_handler = &read_reset_stat, + }, + { + .ctl_name = 0, + }, +}; + +static ctl_table svcrdma_table[] = { + { + .procname = "svc_rdma", + .mode = 0555, + .child = svcrdma_parm_table + }, + { + .ctl_name = 0, + }, +}; + +static ctl_table svcrdma_root_table[] = { + { + .procname = "sunrpc", + .mode = 0555, + .child = svcrdma_table + }, + { + .ctl_name = 0, + }, +}; + +void svc_rdma_cleanup(void) +{ + dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); + if (svcrdma_table_header) { + unregister_sysctl_table(svcrdma_table_header); + svcrdma_table_header = NULL; + } + svc_unreg_xprt_class(&svc_rdma_class); +} + +int svc_rdma_init(void) +{ + dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); + dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); + dprintk("\tmax_requests : %d\n", svcrdma_max_requests); + dprintk("\tsq_depth : %d\n", + svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT); + dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); + if (!svcrdma_table_header) + svcrdma_table_header = + register_sysctl_table(svcrdma_root_table); + + /* Register RDMA with the SVC transport switch */ + svc_reg_xprt_class(&svc_rdma_class); + return 0; +} +MODULE_AUTHOR("Tom Tucker "); +MODULE_DESCRIPTION("SVC RDMA Transport"); +MODULE_LICENSE("Dual BSD/GPL"); +module_init(svc_rdma_init); +module_exit(svc_rdma_cleanup); diff --git a/net/sunrpc/svc_rdma_marshal.c b/net/sunrpc/svc_rdma_marshal.c new file mode 100644 index 0000000..9530ef2 --- /dev/null +++ b/net/sunrpc/svc_rdma_marshal.c @@ -0,0 +1,412 @@ +/* + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker + */ + +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +/* + * Decodes a read chunk list. The expected format is as follows: + * descrim : xdr_one + * position : u32 offset into XDR stream + * handle : u32 RKEY + * . . . + * end-of-list: xdr_zero + */ +static u32 *decode_read_list(u32 *va, u32 *vaend) +{ + struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va; + + while (ch->rc_discrim != xdr_zero) { + u64 ch_offset; + + if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) > + (unsigned long)vaend) { + dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch); + return NULL; + } + + ch->rc_discrim = ntohl(ch->rc_discrim); + ch->rc_position = ntohl(ch->rc_position); + ch->rc_target.rs_handle = ntohl(ch->rc_target.rs_handle); + ch->rc_target.rs_length = ntohl(ch->rc_target.rs_length); + va = (u32 *)&ch->rc_target.rs_offset; + xdr_decode_hyper(va, &ch_offset); + put_unaligned(ch_offset, (u64 *)va); + ch++; + } + return (u32 *)&ch->rc_position; +} + +/* + * Determine number of chunks and total bytes in chunk list. The chunk + * list has already been verified to fit within the RPCRDMA header. + */ +void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch, + int *ch_count, int *byte_count) +{ + /* compute the number of bytes represented by read chunks */ + *byte_count = 0; + *ch_count = 0; + for (; ch->rc_discrim != 0; ch++) { + *byte_count = *byte_count + ch->rc_target.rs_length; + *ch_count = *ch_count + 1; + } +} + +/* + * Decodes a write chunk list. The expected format is as follows: + * descrim : xdr_one + * nchunks : + * handle : u32 RKEY ---+ + * length : u32 | + * offset : remove va + + * . . . 
| + * ---+ + */ +static u32 *decode_write_list(u32 *va, u32 *vaend) +{ + int ch_no; + struct rpcrdma_write_array *ary = + (struct rpcrdma_write_array *)va; + + /* Check for not write-array */ + if (ary->wc_discrim == xdr_zero) + return (u32 *)&ary->wc_nchunks; + + if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > + (unsigned long)vaend) { + dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); + return NULL; + } + ary->wc_discrim = ntohl(ary->wc_discrim); + ary->wc_nchunks = ntohl(ary->wc_nchunks); + if (((unsigned long)&ary->wc_array[0] + + (sizeof(struct rpcrdma_write_chunk) * ary->wc_nchunks)) > + (unsigned long)vaend) { + dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n", + ary, ary->wc_nchunks, vaend); + return NULL; + } + for (ch_no = 0; ch_no < ary->wc_nchunks; ch_no++) { + u64 ch_offset; + + ary->wc_array[ch_no].wc_target.rs_handle = + ntohl(ary->wc_array[ch_no].wc_target.rs_handle); + ary->wc_array[ch_no].wc_target.rs_length = + ntohl(ary->wc_array[ch_no].wc_target.rs_length); + va = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset; + xdr_decode_hyper(va, &ch_offset); + put_unaligned(ch_offset, (u64 *)va); + } + + /* + * rs_length is the 2nd 4B field in wc_target and taking its + * address skips the list terminator + */ + return (u32 *)&ary->wc_array[ch_no].wc_target.rs_length; +} + +static u32 *decode_reply_array(u32 *va, u32 *vaend) +{ + int ch_no; + struct rpcrdma_write_array *ary = + (struct rpcrdma_write_array *)va; + + /* Check for no reply-array */ + if (ary->wc_discrim == xdr_zero) + return (u32 *)&ary->wc_nchunks; + + if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > + (unsigned long)vaend) { + dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); + return NULL; + } + ary->wc_discrim = ntohl(ary->wc_discrim); + ary->wc_nchunks = ntohl(ary->wc_nchunks); + if (((unsigned long)&ary->wc_array[0] + + (sizeof(struct rpcrdma_write_chunk) * ary->wc_nchunks)) > + (unsigned long)vaend) { + dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n", + ary, ary->wc_nchunks, vaend); + return NULL; + } + for (ch_no = 0; ch_no < ary->wc_nchunks; ch_no++) { + u64 ch_offset; + + ary->wc_array[ch_no].wc_target.rs_handle = + ntohl(ary->wc_array[ch_no].wc_target.rs_handle); + ary->wc_array[ch_no].wc_target.rs_length = + ntohl(ary->wc_array[ch_no].wc_target.rs_length); + va = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset; + xdr_decode_hyper(va, &ch_offset); + put_unaligned(ch_offset, (u64 *)va); + } + + return (u32 *)&ary->wc_array[ch_no]; +} + +int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req, + struct svc_rqst *rqstp) +{ + struct rpcrdma_msg *rmsgp = NULL; + u32 *va; + u32 *vaend; + u32 hdr_len; + + rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; + + /* Verify that there's enough bytes for header + something */ + if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_MIN) { + dprintk("svcrdma: header too short = %d\n", + rqstp->rq_arg.len); + return -EINVAL; + } + + /* Decode the header */ + rmsgp->rm_xid = ntohl(rmsgp->rm_xid); + rmsgp->rm_vers = ntohl(rmsgp->rm_vers); + rmsgp->rm_credit = ntohl(rmsgp->rm_credit); + rmsgp->rm_type = ntohl(rmsgp->rm_type); + + if (rmsgp->rm_vers != RPCRDMA_VERSION) + return -ENOSYS; + + /* Pull in the extra for the padded case and bump our pointer */ + if (rmsgp->rm_type == RDMA_MSGP) { + int hdrlen; + rmsgp->rm_body.rm_padded.rm_align = + ntohl(rmsgp->rm_body.rm_padded.rm_align); + rmsgp->rm_body.rm_padded.rm_thresh = + ntohl(rmsgp->rm_body.rm_padded.rm_thresh); + + va = &rmsgp->rm_body.rm_padded.rm_pempty[4]; + 
rqstp->rq_arg.head[0].iov_base = va; + hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp); + rqstp->rq_arg.head[0].iov_len -= hdrlen; + if (hdrlen > rqstp->rq_arg.len) + return -EINVAL; + return hdrlen; + } + + /* The chunk list may contain either a read chunk list or a write + * chunk list and a reply chunk list. + */ + va = &rmsgp->rm_body.rm_chunks[0]; + vaend = (u32 *)((unsigned long)rmsgp + rqstp->rq_arg.len); + va = decode_read_list(va, vaend); + if (!va) + return -EINVAL; + va = decode_write_list(va, vaend); + if (!va) + return -EINVAL; + va = decode_reply_array(va, vaend); + if (!va) + return -EINVAL; + + rqstp->rq_arg.head[0].iov_base = va; + hdr_len = (unsigned long)va - (unsigned long)rmsgp; + rqstp->rq_arg.head[0].iov_len -= hdr_len; + + *rdma_req = rmsgp; + return hdr_len; +} + +int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *rqstp) +{ + struct rpcrdma_msg *rmsgp = NULL; + struct rpcrdma_read_chunk *ch; + struct rpcrdma_write_array *ary; + u32 *va; + u32 hdrlen; + + dprintk("svcrdma: processing deferred RDMA header on rqstp=%p\n", + rqstp); + rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; + + /* Pull in the extra for the padded case and bump our pointer */ + if (rmsgp->rm_type == RDMA_MSGP) { + va = &rmsgp->rm_body.rm_padded.rm_pempty[4]; + rqstp->rq_arg.head[0].iov_base = va; + hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp); + rqstp->rq_arg.head[0].iov_len -= hdrlen; + return hdrlen; + } + + /* + * Skip all chunks to find RPC msg. These were previously processed + */ + va = &rmsgp->rm_body.rm_chunks[0]; + + /* Skip read-list */ + for (ch = (struct rpcrdma_read_chunk *)va; + ch->rc_discrim != xdr_zero; ch++); + va = (u32 *)&ch->rc_position; + + /* Skip write-list */ + ary = (struct rpcrdma_write_array *)va; + if (ary->wc_discrim == xdr_zero) + va = (u32 *)&ary->wc_nchunks; + else + /* + * rs_length is the 2nd 4B field in wc_target and taking its + * address skips the list terminator + */ + va = (u32 *)&ary->wc_array[ary->wc_nchunks].wc_target.rs_length; + + /* Skip reply-array */ + ary = (struct rpcrdma_write_array *)va; + if (ary->wc_discrim == xdr_zero) + va = (u32 *)&ary->wc_nchunks; + else + va = (u32 *)&ary->wc_array[ary->wc_nchunks]; + + rqstp->rq_arg.head[0].iov_base = va; + hdrlen = (unsigned long)va - (unsigned long)rmsgp; + rqstp->rq_arg.head[0].iov_len -= hdrlen; + + return hdrlen; +} + +int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, + struct rpcrdma_msg *rmsgp, + enum rpcrdma_errcode err, u32 *va) +{ + u32 *startp = va; + + *va++ = htonl(rmsgp->rm_xid); + *va++ = htonl(rmsgp->rm_vers); + *va++ = htonl(xprt->sc_max_requests); + *va++ = htonl(RDMA_ERROR); + *va++ = htonl(err); + if (err == ERR_VERS) { + *va++ = htonl(RPCRDMA_VERSION); + *va++ = htonl(RPCRDMA_VERSION); + } + + return (int)((unsigned long)va - (unsigned long)startp); +} + +int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp) +{ + struct rpcrdma_write_array *wr_ary; + + /* There is no read-list in a reply */ + + /* skip write list */ + wr_ary = (struct rpcrdma_write_array *) + &rmsgp->rm_body.rm_chunks[1]; + if (wr_ary->wc_discrim) + wr_ary = (struct rpcrdma_write_array *) + &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)]. 
+ wc_target.rs_length; + else + wr_ary = (struct rpcrdma_write_array *) + &wr_ary->wc_nchunks; + + /* skip reply array */ + if (wr_ary->wc_discrim) + wr_ary = (struct rpcrdma_write_array *) + &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)]; + else + wr_ary = (struct rpcrdma_write_array *) + &wr_ary->wc_nchunks; + + return (unsigned long) wr_ary - (unsigned long) rmsgp; +} + +void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks) +{ + struct rpcrdma_write_array *ary; + + /* no read-list */ + rmsgp->rm_body.rm_chunks[0] = xdr_zero; + + /* write-array discrim */ + ary = (struct rpcrdma_write_array *) + &rmsgp->rm_body.rm_chunks[1]; + ary->wc_discrim = xdr_one; + ary->wc_nchunks = htonl(chunks); + + /* write-list terminator */ + ary->wc_array[chunks].wc_target.rs_handle = xdr_zero; + + /* reply-array discriminator */ + ary->wc_array[chunks].wc_target.rs_length = xdr_zero; +} + +void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary, + int chunks) +{ + ary->wc_discrim = xdr_one; + ary->wc_nchunks = htonl(chunks); +} + +void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary, + int chunk_no, + u32 rs_handle, u64 rs_offset, + u32 write_len) +{ + struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target; + seg->rs_handle = htonl(rs_handle); + seg->rs_length = htonl(write_len); + xdr_encode_hyper((u32 *) &seg->rs_offset, rs_offset); +} + +void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt, + struct rpcrdma_msg *rdma_argp, + struct rpcrdma_msg *rdma_resp, + enum rpcrdma_proc rdma_type) +{ + rdma_resp->rm_xid = htonl(rdma_argp->rm_xid); + rdma_resp->rm_vers = htonl(rdma_argp->rm_vers); + rdma_resp->rm_credit = htonl(xprt->sc_max_requests); + rdma_resp->rm_type = htonl(rdma_type); + + /* Encode chunks lists */ + rdma_resp->rm_body.rm_chunks[0] = xdr_zero; + rdma_resp->rm_body.rm_chunks[1] = xdr_zero; + rdma_resp->rm_body.rm_chunks[2] = xdr_zero; +} diff --git a/net/sunrpc/svc_rdma_recvfrom.c b/net/sunrpc/svc_rdma_recvfrom.c new file mode 100644 index 0000000..b940bdf --- /dev/null +++ b/net/sunrpc/svc_rdma_recvfrom.c @@ -0,0 +1,576 @@ +/* + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker + */ + +#include +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +/* + * Replace the pages in the rq_argpages array with the pages from the SGE in + * the RDMA_RECV completion. The SGL should contain full pages up until the + * last one. + */ +static void rdma_build_arg_xdr(struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *ctxt, + u32 byte_count) +{ + struct page *page; + u32 bc; + int sge_no; + + /* Swap the page in the SGE with the page in argpages */ + page = ctxt->pages[0]; + put_page(rqstp->rq_pages[0]); + rqstp->rq_pages[0] = page; + + /* Set up the XDR head */ + rqstp->rq_arg.head[0].iov_base = page_address(page); + rqstp->rq_arg.head[0].iov_len = min(byte_count, ctxt->sge[0].length); + rqstp->rq_arg.len = byte_count; + rqstp->rq_arg.buflen = byte_count; + + /* Compute bytes past head in the SGL */ + bc = byte_count - rqstp->rq_arg.head[0].iov_len; + + /* If data remains, store it in the pagelist */ + rqstp->rq_arg.page_len = bc; + rqstp->rq_arg.page_base = 0; + rqstp->rq_arg.pages = &rqstp->rq_pages[1]; + sge_no = 1; + while (bc && sge_no < ctxt->count) { + page = ctxt->pages[sge_no]; + put_page(rqstp->rq_pages[sge_no]); + rqstp->rq_pages[sge_no] = page; + bc -= min(bc, ctxt->sge[sge_no].length); + rqstp->rq_arg.buflen += ctxt->sge[sge_no].length; + sge_no++; + } + rqstp->rq_respages = &rqstp->rq_pages[sge_no]; + + /* We should never run out of SGE because the limit is defined to + * support the max allowed RPC data length + */ + BUG_ON(bc && (sge_no == ctxt->count)); + BUG_ON((rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len) + != byte_count); + BUG_ON(rqstp->rq_arg.len != byte_count); + + /* If not all pages were used from the SGL, free the remaining ones */ + bc = sge_no; + while (sge_no < ctxt->count) { + page = ctxt->pages[sge_no++]; + put_page(page); + } + ctxt->count = bc; + + /* Set up tail */ + rqstp->rq_arg.tail[0].iov_base = NULL; + rqstp->rq_arg.tail[0].iov_len = 0; +} + +struct chunk_sge { + int start; /* sge no for this chunk */ + int count; /* sge count for this chunk */ +}; + +/* Encode a read-chunk-list as an array of IB SGE + * + * Assumptions: + * - chunk[0]->position points to pages[0] at an offset of 0 + * - pages[] is not physically or virtually contigous and consists of + * PAGE_SIZE elements. + * + * Output: + * - sge array pointing into pages[] array. 
+ * - chunk_sge array specifying sge index and count for each + * chunk in the read list + * + */ +static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, + struct rpcrdma_msg *rmsgp, + struct ib_sge *sge, + struct chunk_sge *ch_sge_ary, + int ch_count, + int byte_count) +{ + int sge_no; + int sge_bytes; + int page_off; + int page_no; + int ch_bytes; + int ch_no; + struct rpcrdma_read_chunk *ch; + + sge_no = 0; + page_no = 0; + page_off = 0; + ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; + ch_no = 0; + ch_bytes = ch->rc_target.rs_length; + head->arg.head[0] = rqstp->rq_arg.head[0]; + head->arg.tail[0] = rqstp->rq_arg.tail[0]; + head->arg.pages = &head->pages[head->count]; + head->sge[0].length = head->count; /* save count of hdr pages */ + head->arg.page_base = 0; + head->arg.page_len = ch_bytes; + head->arg.len = rqstp->rq_arg.len + ch_bytes; + head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes; + head->count++; + ch_sge_ary[0].start = 0; + while (byte_count) { + sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes); + sge[sge_no].addr = + ib_dma_map_page(xprt->sc_cm_id->device, + rqstp->rq_arg.pages[page_no], + page_off, sge_bytes, + DMA_FROM_DEVICE); + sge[sge_no].length = sge_bytes; + sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + /* + * Don't bump head->count here because the same page + * may be used by multiple SGE. + */ + head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; + rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1]; + + byte_count -= sge_bytes; + ch_bytes -= sge_bytes; + sge_no++; + /* + * If all bytes for this chunk have been mapped to an + * SGE, move to the next SGE + */ + if (ch_bytes == 0) { + ch_sge_ary[ch_no].count = + sge_no - ch_sge_ary[ch_no].start; + ch_no++; + ch++; + ch_sge_ary[ch_no].start = sge_no; + ch_bytes = ch->rc_target.rs_length; + /* If bytes remaining account for next chunk */ + if (byte_count) { + head->arg.page_len += ch_bytes; + head->arg.len += ch_bytes; + head->arg.buflen += ch_bytes; + } + } + /* + * If this SGE consumed all of the page, move to the + * next page + */ + if ((sge_bytes + page_off) == PAGE_SIZE) { + page_no++; + page_off = 0; + /* + * If there are still bytes left to map, bump + * the page count + */ + if (byte_count) + head->count++; + } else + page_off += sge_bytes; + } + BUG_ON(byte_count != 0); + return sge_no; +} + +static void rdma_set_ctxt_sge(struct svc_rdma_op_ctxt *ctxt, + struct ib_sge *sge, + u64 *sgl_offset, + int count) +{ + int i; + + ctxt->count = count; + for (i = 0; i < count; i++) { + ctxt->sge[i].addr = sge[i].addr; + ctxt->sge[i].length = sge[i].length; + *sgl_offset = *sgl_offset + sge[i].length; + } +} + +static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) +{ +#ifdef RDMA_TRANSPORT_IWARP + if ((RDMA_TRANSPORT_IWARP == + rdma_node_get_transport(xprt->sc_cm_id-> + device->node_type)) + && sge_count > 1) + return 1; + else +#endif + return min_t(int, sge_count, xprt->sc_max_sge); +} + +/* + * Use RDMA_READ to read data from the advertised client buffer into the + * XDR stream starting at rq_arg.head[0].iov_base. + * Each chunk in the array + * contains the following fields: + * discrim - '1', This isn't used for data placement + * position - The xdr stream offset (the same for every chunk) + * handle - RMR for client memory region + * length - data transfer length + * offset - 64 bit tagged offset in remote memory region + * + * On our side, we need to read into a pagelist. 
The first page immediately + * follows the RPC header. + * + * This function returns 1 to indicate success. The data is not yet in + * the pagelist and therefore the RPC request must be deferred. The + * I/O completion will enqueue the transport again and + * svc_rdma_recvfrom will complete the request. + * + * NOTE: The ctxt must not be touched after the last WR has been posted + * because the I/O completion processing may occur on another + * processor and free / modify the context. Ne touche pas! + */ +static int rdma_read_xdr(struct svcxprt_rdma *xprt, + struct rpcrdma_msg *rmsgp, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *hdr_ctxt) +{ + struct ib_send_wr read_wr; + int err = 0; + int ch_no; + struct ib_sge *sge; + int ch_count; + int byte_count; + int sge_count; + u64 sgl_offset; + struct rpcrdma_read_chunk *ch; + struct svc_rdma_op_ctxt *ctxt = NULL; + struct svc_rdma_op_ctxt *head; + struct svc_rdma_op_ctxt *tmp_sge_ctxt; + struct svc_rdma_op_ctxt *tmp_ch_ctxt; + struct chunk_sge *ch_sge_ary; + + /* If no read list is present, return 0 */ + ch = svc_rdma_get_read_chunk(rmsgp); + if (!ch) + return 0; + + /* Allocate temporary contexts to keep SGE */ + BUG_ON(sizeof(struct ib_sge) < sizeof(struct chunk_sge)); + tmp_sge_ctxt = svc_rdma_get_context(xprt); + sge = tmp_sge_ctxt->sge; + tmp_ch_ctxt = svc_rdma_get_context(xprt); + ch_sge_ary = (struct chunk_sge *)tmp_ch_ctxt->sge; + + svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count); + sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp, + sge, ch_sge_ary, + ch_count, byte_count); + head = svc_rdma_get_context(xprt); + sgl_offset = 0; + ch_no = 0; + + for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; + ch->rc_discrim != 0; ch++, ch_no++) { + next_sge: + if (!ctxt) + ctxt = head; + else { + ctxt->next = svc_rdma_get_context(xprt); + ctxt = ctxt->next; + } + ctxt->next = NULL; + ctxt->direction = DMA_FROM_DEVICE; + clear_bit(RDMACTXT_F_READ_DONE, &ctxt->flags); + clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + if ((ch+1)->rc_discrim == 0) { + /* + * Checked in sq_cq_reap to see if we need to + * be enqueued + */ + set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + ctxt->next = hdr_ctxt; + hdr_ctxt->next = head; + } + + /* Prepare READ WR */ + memset(&read_wr, 0, sizeof read_wr); + ctxt->wr_op = IB_WR_RDMA_READ; + read_wr.wr_id = (unsigned long)ctxt; + read_wr.opcode = IB_WR_RDMA_READ; + read_wr.send_flags = IB_SEND_SIGNALED; + read_wr.wr.rdma.rkey = ch->rc_target.rs_handle; + read_wr.wr.rdma.remote_addr = + get_unaligned(&(ch->rc_target.rs_offset)) + + sgl_offset; + read_wr.sg_list = &sge[ch_sge_ary[ch_no].start]; + read_wr.num_sge = + rdma_read_max_sge(xprt, ch_sge_ary[ch_no].count); + rdma_set_ctxt_sge(ctxt, &sge[ch_sge_ary[ch_no].start], + &sgl_offset, + read_wr.num_sge); + + /* Post the read */ + err = svc_rdma_send(xprt, &read_wr); + if (err) { + printk(KERN_ERR "svcrdma: Error posting send = %d\n", + err); + /* + * Break the circular list so free knows when + * to stop if the error happened to occur on + * the last read + */ + ctxt->next = NULL; + goto out; + } + atomic_inc(&rdma_stat_read); + + if (read_wr.num_sge < ch_sge_ary[ch_no].count) { + ch_sge_ary[ch_no].count -= read_wr.num_sge; + ch_sge_ary[ch_no].start += read_wr.num_sge; + goto next_sge; + } + sgl_offset = 0; + err = 0; + } + + out: + svc_rdma_put_context(tmp_sge_ctxt, 0); + svc_rdma_put_context(tmp_ch_ctxt, 0); + + /* Detach arg pages. 
svc_recv will replenish them */ + for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++) + rqstp->rq_pages[ch_no] = NULL; + + /* + * Detach res pages. svc_release must see a resused count of + * zero or it will attempt to put them. + */ + while (rqstp->rq_resused) + rqstp->rq_respages[--rqstp->rq_resused] = NULL; + + if (err) { + printk(KERN_ERR "svcrdma : RDMA_READ error = %d\n", err); + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + /* Free the linked list of read contexts */ + while (head != NULL) { + ctxt = head->next; + svc_rdma_put_context(head, 1); + head = ctxt; + } + return 0; + } + + return 1; +} + +static int rdma_read_complete(struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *data) +{ + struct svc_rdma_op_ctxt *head = data->next; + struct svcxprt_rdma *rdma_xprt = + container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt); + int page_no; + int ret; + + BUG_ON(!head); + + /* Copy RPC pages */ + for (page_no = 0; page_no < head->count; page_no++) { + put_page(rqstp->rq_pages[page_no]); + rqstp->rq_pages[page_no] = head->pages[page_no]; + } + /* Point rq_arg.pages past header */ + rqstp->rq_arg.pages = &rqstp->rq_pages[head->sge[0].length]; + rqstp->rq_arg.page_len = head->arg.page_len; + rqstp->rq_arg.page_base = head->arg.page_base; + + /* rq_respages starts after the last arg page */ + rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; + rqstp->rq_resused = 0; + + /* Rebuild rq_arg head and tail. */ + rqstp->rq_arg.head[0] = head->arg.head[0]; + rqstp->rq_arg.tail[0] = head->arg.tail[0]; + rqstp->rq_arg.len = head->arg.len; + rqstp->rq_arg.buflen = head->arg.buflen; + + rqstp->rq_prot = IPPROTO_MAX; + memcpy(&rqstp->rq_addr, + &rdma_xprt->sc_cm_id->route.addr.dst_addr, + sizeof(rqstp->rq_addr)); + rqstp->rq_addrlen = sizeof(rqstp->rq_addr); + + /* + * Free the contexts we used to build the RDMA_READ. We have + * to be careful here because the context list uses the same + * next pointer used to chain the contexts associated with the + * RDMA_READ + */ + data->next = NULL; /* terminate circular list */ + do { + data = head->next; + svc_rdma_put_context(head, 0); + head = data; + } while (head != NULL); + + ret = rqstp->rq_arg.head[0].iov_len + + rqstp->rq_arg.page_len + + rqstp->rq_arg.tail[0].iov_len; + dprintk("svcrdma: deferred read ret=%d, rq_arg.len =%d, " + "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n", + ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base, + rqstp->rq_arg.head[0].iov_len); + svc_xprt_received(rqstp->rq_xprt); + return ret; +} + +/* + * Set up the rqstp thread context to point to the RQ buffer. If + * necessary, pull additional data from the client with an RDMA_READ + * request. 
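
As a minimal sketch (not from the patch itself, and omitting SGE mapping, op-context chaining and completion handling), pulling one client-advertised read chunk boils down to turning its (handle, offset, length) triple into an RDMA_READ work request, much as rdma_read_xdr above does in full:

	/*
	 * Illustrative sketch only: post a single RDMA_READ for one
	 * client-advertised chunk.  Assumes "sge" already describes the
	 * local pages the data should land in; a real caller must also
	 * set read_wr.wr_id to its op context so the SQ reaper can find
	 * it, as rdma_read_xdr does.
	 */
	static int sketch_read_one_chunk(struct svcxprt_rdma *xprt,
					 struct rpcrdma_read_chunk *ch,
					 struct ib_sge *sge, int num_sge)
	{
		struct ib_send_wr read_wr;

		memset(&read_wr, 0, sizeof read_wr);
		read_wr.opcode = IB_WR_RDMA_READ;
		read_wr.send_flags = IB_SEND_SIGNALED;
		read_wr.wr.rdma.rkey = ch->rc_target.rs_handle;
		read_wr.wr.rdma.remote_addr =
			get_unaligned(&ch->rc_target.rs_offset);
		read_wr.sg_list = sge;
		read_wr.num_sge = num_sge;
		return svc_rdma_send(xprt, &read_wr);
	}
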
+ */ +int svc_rdma_recvfrom(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt = rqstp->rq_xprt; + struct svcxprt_rdma *rdma_xprt = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + struct svc_rdma_op_ctxt *ctxt = NULL; + struct rpcrdma_msg *rmsgp; + int ret = 0; + int len; + + dprintk("svcrdma: rqstp=%p\n", rqstp); + spin_lock_bh(&rdma_xprt->sc_read_complete_lock); + if (!list_empty(&rdma_xprt->sc_read_complete_q)) { + ctxt = list_entry(rdma_xprt->sc_read_complete_q.next, + struct svc_rdma_op_ctxt, + dto_q); + list_del_init(&ctxt->dto_q); + } + spin_unlock_bh(&rdma_xprt->sc_read_complete_lock); + if (ctxt) + return rdma_read_complete(rqstp, ctxt); + + spin_lock_bh(&rdma_xprt->sc_rq_dto_lock); + if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { + ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next, + struct svc_rdma_op_ctxt, + dto_q); + list_del_init(&ctxt->dto_q); + } else { + atomic_inc(&rdma_stat_rq_starve); + clear_bit(XPT_DATA, &xprt->xpt_flags); + ctxt = NULL; + } + spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); + if (!ctxt) { + /* This is the EAGAIN path. The svc_recv routine will + * return -EAGAIN, the nfsd thread will go to call into + * svc_recv again and we shouldn't be on the active + * transport list + */ + if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) + goto close_out; + + BUG_ON(ret); + goto out; + } + dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n", + ctxt, rdma_xprt, rqstp, ctxt->wc_status); + BUG_ON(ctxt->wc_status != IB_WC_SUCCESS); + atomic_inc(&rdma_stat_recv); + + /* rqstp struct expects transport to fill in peer address */ + rqstp->rq_prot = IPPROTO_MAX; + memcpy(&rqstp->rq_addr, + &rdma_xprt->sc_cm_id->route.addr.dst_addr, + sizeof(rqstp->rq_addr)); + rqstp->rq_addrlen = sizeof(rqstp->rq_addr); + + /* Build up the XDR from the receive buffers. */ + rdma_build_arg_xdr(rqstp, ctxt, ctxt->byte_len); + + /* Decode the RDMA header. */ + len = svc_rdma_xdr_decode_req(&rmsgp, rqstp); + rqstp->rq_xprt_hlen = len; + + /* If the request is invalid, reply with an error */ + if (len < 0) { + if (len == -ENOSYS) + (void)svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS); + goto close_out; + } + + /* Read read-list data. If we would need to wait, defer it */ + if (rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt)) { + svc_xprt_received(xprt); + return 0; + } + + ret = rqstp->rq_arg.head[0].iov_len + + rqstp->rq_arg.page_len + + rqstp->rq_arg.tail[0].iov_len; + svc_rdma_put_context(ctxt, 0); + out: + dprintk("svcrdma: ret = %d, rq_arg.len =%d, " + "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n", + ret, rqstp->rq_arg.len, + rqstp->rq_arg.head[0].iov_base, + rqstp->rq_arg.head[0].iov_len); + svc_xprt_received(xprt); + return ret; + + close_out: + if (ctxt) + svc_rdma_put_context(ctxt, 1); + dprintk("svcrdma: transport %p is closing\n", xprt); + /* + * Set the close bit and enqueue it. svc_recv will see the + * close bit and call svc_xprt_delete + */ + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_received(xprt); + return 0; +} diff --git a/net/sunrpc/svc_rdma_sendto.c b/net/sunrpc/svc_rdma_sendto.c new file mode 100644 index 0000000..cbedfd1 --- /dev/null +++ b/net/sunrpc/svc_rdma_sendto.c @@ -0,0 +1,515 @@ +/* + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker + */ + +#include +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +/* Encode an XDR as an array of IB SGE + * + * Assumptions: + * - head[0] is physically contiguous. + * - tail[0] is physically contiguous. + * - pages[] is not physically or virtually contigous and consists of + * PAGE_SIZE elements. + * + * Output: + * SGE[0] reserved for RCPRDMA header + * SGE[1] data from xdr->head[] + * SGE[2..sge_count-2] data from xdr->pages[] + * SGE[sge_count-1] data from xdr->tail. 
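
For instance (assuming 4 KB pages), a 9000-byte reply with a 120-byte head, 8800 bytes of page data starting at page_base 0, and an 80-byte tail maps to sge_count = 6: SGE[0] reserved for the RPC/RDMA header, SGE[1] = 120 bytes of head, SGE[2..4] = 4096 + 4096 + 608 bytes of page data, and SGE[5] = 80 bytes of tail.
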
+ * + */ +static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt, + struct xdr_buf *xdr, + struct ib_sge *sge, + int *sge_count) +{ + /* Max we need is the length of the XDR / pagesize + one for + * head + one for tail + one for RPCRDMA header + */ + int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3; + int sge_no; + u32 byte_count = xdr->len; + u32 sge_bytes; + u32 page_bytes; + int page_off; + int page_no; + + /* Skip the first sge, this is for the RPCRDMA header */ + sge_no = 1; + + /* Head SGE */ + sge[sge_no].addr = ib_dma_map_single(xprt->sc_cm_id->device, + xdr->head[0].iov_base, + xdr->head[0].iov_len, + DMA_TO_DEVICE); + sge_bytes = min_t(u32, byte_count, xdr->head[0].iov_len); + byte_count -= sge_bytes; + sge[sge_no].length = sge_bytes; + sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + sge_no++; + + /* pages SGE */ + page_no = 0; + page_bytes = xdr->page_len; + page_off = xdr->page_base; + while (byte_count && page_bytes) { + sge_bytes = min_t(u32, byte_count, (PAGE_SIZE-page_off)); + sge[sge_no].addr = + ib_dma_map_page(xprt->sc_cm_id->device, + xdr->pages[page_no], page_off, + sge_bytes, DMA_TO_DEVICE); + sge_bytes = min(sge_bytes, page_bytes); + byte_count -= sge_bytes; + page_bytes -= sge_bytes; + sge[sge_no].length = sge_bytes; + sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + + sge_no++; + page_no++; + page_off = 0; /* reset for next time through loop */ + } + + /* Tail SGE */ + if (byte_count && xdr->tail[0].iov_len) { + sge[sge_no].addr = + ib_dma_map_single(xprt->sc_cm_id->device, + xdr->tail[0].iov_base, + xdr->tail[0].iov_len, + DMA_TO_DEVICE); + sge_bytes = min_t(u32, byte_count, xdr->tail[0].iov_len); + byte_count -= sge_bytes; + sge[sge_no].length = sge_bytes; + sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + sge_no++; + } + + BUG_ON(sge_no > sge_max); + BUG_ON(byte_count != 0); + + *sge_count = sge_no; + return sge; +} + + +/* Assumptions: + * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE + */ +static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, + u32 rmr, u64 to, + u32 xdr_off, int write_len, + struct ib_sge *xdr_sge, int sge_count) +{ + struct svc_rdma_op_ctxt *tmp_sge_ctxt; + struct ib_send_wr write_wr; + struct ib_sge *sge; + int xdr_sge_no; + int sge_no; + int sge_bytes; + int sge_off; + int bc; + struct svc_rdma_op_ctxt *ctxt; + int ret = 0; + + BUG_ON(sge_count >= 32); + dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, " + "write_len=%d, xdr_sge=%p, sge_count=%d\n", + rmr, to, xdr_off, write_len, xdr_sge, sge_count); + + ctxt = svc_rdma_get_context(xprt); + ctxt->count = 0; + tmp_sge_ctxt = svc_rdma_get_context(xprt); + sge = tmp_sge_ctxt->sge; + + /* Find the SGE associated with xdr_off */ + for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < sge_count; + xdr_sge_no++) { + if (xdr_sge[xdr_sge_no].length > bc) + break; + bc -= xdr_sge[xdr_sge_no].length; + } + + sge_off = bc; + bc = write_len; + sge_no = 0; + + /* Copy the remaining SGE */ + while (bc != 0 && xdr_sge_no < sge_count) { + sge[sge_no].addr = xdr_sge[xdr_sge_no].addr + sge_off; + sge[sge_no].lkey = xdr_sge[xdr_sge_no].lkey; + sge_bytes = min((size_t)bc, + (size_t)(xdr_sge[xdr_sge_no].length-sge_off)); + sge[sge_no].length = sge_bytes; + + sge_off = 0; + sge_no++; + xdr_sge_no++; + bc -= sge_bytes; + } + + BUG_ON(bc != 0); + BUG_ON(xdr_sge_no > sge_count); + + /* Prepare WRITE WR */ + memset(&write_wr, 0, sizeof write_wr); + ctxt->wr_op = IB_WR_RDMA_WRITE; + write_wr.wr_id = (unsigned long)ctxt; + write_wr.sg_list = &sge[0]; + write_wr.num_sge 
= sge_no; + write_wr.opcode = IB_WR_RDMA_WRITE; + write_wr.send_flags = IB_SEND_SIGNALED; + write_wr.wr.rdma.rkey = rmr; + write_wr.wr.rdma.remote_addr = to; + + /* Post It */ + atomic_inc(&rdma_stat_write); + if (svc_rdma_send(xprt, &write_wr)) { + svc_rdma_put_context(ctxt, 1); + /* Fatal error, close transport */ + ret = -EIO; + } + svc_rdma_put_context(tmp_sge_ctxt, 0); + return ret; +} + +static int send_write_chunks(struct svcxprt_rdma *xprt, + struct rpcrdma_msg *rdma_argp, + struct rpcrdma_msg *rdma_resp, + struct svc_rqst *rqstp, + struct ib_sge *sge, + int sge_count) +{ + u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len; + int write_len; + int max_write; + u32 xdr_off; + int chunk_off; + int chunk_no; + struct rpcrdma_write_array *arg_ary; + struct rpcrdma_write_array *res_ary; + int ret; + + arg_ary = svc_rdma_get_write_array(rdma_argp); + if (!arg_ary) + return 0; + res_ary = (struct rpcrdma_write_array *) + &rdma_resp->rm_body.rm_chunks[1]; + + max_write = xprt->sc_max_sge * PAGE_SIZE; + + /* Write chunks start at the pagelist */ + for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; + xfer_len && chunk_no < arg_ary->wc_nchunks; + chunk_no++) { + struct rpcrdma_segment *arg_ch; + u64 rs_offset; + + arg_ch = &arg_ary->wc_array[chunk_no].wc_target; + write_len = min(xfer_len, arg_ch->rs_length); + + /* Prepare the response chunk given the length actually + * written */ + rs_offset = get_unaligned(&(arg_ch->rs_offset)); + svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no, + arg_ch->rs_handle, + rs_offset, + write_len); + chunk_off = 0; + while (write_len) { + int this_write; + this_write = min(write_len, max_write); + ret = send_write(xprt, rqstp, + arg_ch->rs_handle, + rs_offset + chunk_off, + xdr_off, + this_write, + sge, + sge_count); + if (ret) { + dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", + ret); + return -EIO; + } + chunk_off += this_write; + xdr_off += this_write; + xfer_len -= this_write; + write_len -= this_write; + } + } + /* Update the req with the number of chunks actually used */ + svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no); + + return rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len; +} + +static int send_reply_chunks(struct svcxprt_rdma *xprt, + struct rpcrdma_msg *rdma_argp, + struct rpcrdma_msg *rdma_resp, + struct svc_rqst *rqstp, + struct ib_sge *sge, + int sge_count) +{ + u32 xfer_len = rqstp->rq_res.len; + int write_len; + int max_write; + u32 xdr_off; + int chunk_no; + int chunk_off; + struct rpcrdma_segment *ch; + struct rpcrdma_write_array *arg_ary; + struct rpcrdma_write_array *res_ary; + int ret; + + arg_ary = svc_rdma_get_reply_array(rdma_argp); + if (!arg_ary) + return 0; + /* XXX: need to fix when reply lists occur with read-list and or + * write-list */ + res_ary = (struct rpcrdma_write_array *) + &rdma_resp->rm_body.rm_chunks[2]; + + max_write = xprt->sc_max_sge * PAGE_SIZE; + + /* xdr offset starts at RPC message */ + for (xdr_off = 0, chunk_no = 0; + xfer_len && chunk_no < arg_ary->wc_nchunks; + chunk_no++) { + u64 rs_offset; + ch = &arg_ary->wc_array[chunk_no].wc_target; + write_len = min(xfer_len, ch->rs_length); + + + /* Prepare the reply chunk given the length actually + * written */ + rs_offset = get_unaligned(&(ch->rs_offset)); + svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no, + ch->rs_handle, rs_offset, + write_len); + chunk_off = 0; + while (write_len) { + int this_write; + + this_write = min(write_len, max_write); + ret = send_write(xprt, rqstp, + ch->rs_handle, + rs_offset + chunk_off, 
+ xdr_off, + this_write, + sge, + sge_count); + if (ret) { + dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", + ret); + return -EIO; + } + chunk_off += this_write; + xdr_off += this_write; + xfer_len -= this_write; + write_len -= this_write; + } + } + /* Update the req with the number of chunks actually used */ + svc_rdma_xdr_encode_reply_array(res_ary, chunk_no); + + return rqstp->rq_res.len; +} + +/* This function prepares the portion of the RPCRDMA message to be + * sent in the RDMA_SEND. This function is called after data sent via + * RDMA has already been transmitted. There are three cases: + * - The RPCRDMA header, RPC header, and payload are all sent in a + * single RDMA_SEND. This is the "inline" case. + * - The RPCRDMA header and some portion of the RPC header and data + * are sent via this RDMA_SEND and another portion of the data is + * sent via RDMA. + * - The RPCRDMA header [NOMSG] is sent in this RDMA_SEND and the RPC + * header and data are all transmitted via RDMA. + * In all three cases, this function prepares the RPCRDMA header in + * sge[0], the 'type' parameter indicates the type to place in the + * RPCRDMA header, and the 'byte_count' field indicates how much of + * the XDR to include in this RDMA_SEND. + */ +static int send_reply(struct svcxprt_rdma *rdma, + struct svc_rqst *rqstp, + struct page *page, + struct rpcrdma_msg *rdma_resp, + struct svc_rdma_op_ctxt *ctxt, + int sge_count, + int byte_count) +{ + struct ib_send_wr send_wr; + int sge_no; + int sge_bytes; + int page_no; + int ret; + + /* Prepare the context */ + ctxt->pages[0] = page; + ctxt->count = 1; + + /* Prepare the SGE for the RPCRDMA Header */ + ctxt->sge[0].addr = + ib_dma_map_page(rdma->sc_cm_id->device, + page, 0, PAGE_SIZE, DMA_TO_DEVICE); + ctxt->direction = DMA_TO_DEVICE; + ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); + ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey; + + /* Determine how many of our SGE are to be transmitted */ + for (sge_no = 1; byte_count && sge_no < sge_count; sge_no++) { + sge_bytes = min((size_t)ctxt->sge[sge_no].length, + (size_t)byte_count); + byte_count -= sge_bytes; + } + BUG_ON(byte_count != 0); + + /* Save all respages in the ctxt and remove them from the + * respages array. They are our pages until the I/O + * completes. + */ + for (page_no = 0; page_no < rqstp->rq_resused; page_no++) { + ctxt->pages[page_no+1] = rqstp->rq_respages[page_no]; + ctxt->count++; + rqstp->rq_respages[page_no] = NULL; + } + + BUG_ON(sge_no > rdma->sc_max_sge); + memset(&send_wr, 0, sizeof send_wr); + ctxt->wr_op = IB_WR_SEND; + send_wr.wr_id = (unsigned long)ctxt; + send_wr.sg_list = ctxt->sge; + send_wr.num_sge = sge_no; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = IB_SEND_SIGNALED; + + ret = svc_rdma_send(rdma, &send_wr); + if (ret) + svc_rdma_put_context(ctxt, 1); + + return ret; +} + +void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp) +{ + struct svcxprt_rdma *rdma = + container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt); + /* + * Return the receive WR to the RQ. Any error posting to the + * RQ will surface on the SQ post below. 
+ */ + (void)svc_rdma_post_recv(rdma); +} + +int svc_rdma_sendto(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt = rqstp->rq_xprt; + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + struct rpcrdma_msg *rdma_argp; + struct rpcrdma_msg *rdma_resp; + struct rpcrdma_write_array *reply_ary; + enum rpcrdma_proc reply_type; + int ret; + int inline_bytes; + struct ib_sge *sge; + int sge_count = 0; + struct page *res_page; + struct svc_rdma_op_ctxt *ctxt; + + dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); + + /* Get the RDMA request header. */ + rdma_argp = page_address(rqstp->rq_pages[0]); + + /* Build an SGE for the XDR */ + ctxt = svc_rdma_get_context(rdma); + ctxt->direction = DMA_TO_DEVICE; + sge = xdr_to_sge(rdma, &rqstp->rq_res, ctxt->sge, &sge_count); + + inline_bytes = rqstp->rq_res.len; + + /* Create the RDMA response header */ + res_page = svc_rdma_get_page(); + rdma_resp = page_address(res_page); + reply_ary = svc_rdma_get_reply_array(rdma_argp); + if (reply_ary) + reply_type = RDMA_NOMSG; + else + reply_type = RDMA_MSG; + svc_rdma_xdr_encode_reply_header(rdma, rdma_argp, + rdma_resp, reply_type); + + /* Send any write-chunk data and build resp write-list */ + ret = send_write_chunks(rdma, rdma_argp, rdma_resp, + rqstp, sge, sge_count); + if (ret < 0) { + printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n", + ret); + goto error; + } + inline_bytes -= ret; + + /* Send any reply-list data and update resp reply-list */ + ret = send_reply_chunks(rdma, rdma_argp, rdma_resp, + rqstp, sge, sge_count); + if (ret < 0) { + printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n", + ret); + goto error; + } + inline_bytes -= ret; + + ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, sge_count, + inline_bytes); + dprintk("svcrdma: send_reply returns %d\n", ret); + return ret; + error: + svc_rdma_put_context(ctxt, 0); + put_page(res_page); + return ret; +} diff --git a/net/sunrpc/svc_rdma_transport.c b/net/sunrpc/svc_rdma_transport.c new file mode 100644 index 0000000..6e62c2a --- /dev/null +++ b/net/sunrpc/svc_rdma_transport.c @@ -0,0 +1,1070 @@ +/* + * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker + */ + +#include +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, + struct sockaddr *sa, int salen, + int flags); +static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt); +static void svc_rdma_release_rqst(struct svc_rqst *); +static void rdma_destroy_xprt(struct svcxprt_rdma *xprt); +static void dto_tasklet_func(unsigned long data); +static void svc_rdma_detach(struct svc_xprt *xprt); +static void svc_rdma_free(struct svc_xprt *xprt); +static int svc_rdma_has_wspace(struct svc_xprt *xprt); +static void rq_cq_reap(struct svcxprt_rdma *xprt); +static void sq_cq_reap(struct svcxprt_rdma *xprt); + +DECLARE_TASKLET(dto_tasklet, dto_tasklet_func, 0UL); +static spinlock_t dto_lock = SPIN_LOCK_UNLOCKED; +static LIST_HEAD(dto_xprt_q); + +static struct svc_xprt_ops svc_rdma_ops = { + .xpo_create = svc_rdma_create, + .xpo_recvfrom = svc_rdma_recvfrom, + .xpo_sendto = svc_rdma_sendto, + .xpo_release_rqst = svc_rdma_release_rqst, + .xpo_detach = svc_rdma_detach, + .xpo_free = svc_rdma_free, + .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr, + .xpo_has_wspace = svc_rdma_has_wspace, + .xpo_accept = svc_rdma_accept, +}; + +struct svc_xprt_class svc_rdma_class = { + .xcl_name = "rdma", + .xcl_owner = THIS_MODULE, + .xcl_ops = &svc_rdma_ops, + .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, +}; + +static int rdma_bump_context_cache(struct svcxprt_rdma *xprt) +{ + int target; + int at_least_one = 0; + struct svc_rdma_op_ctxt *ctxt; + + target = min(xprt->sc_ctxt_cnt + xprt->sc_ctxt_bump, + xprt->sc_ctxt_max); + + spin_lock_bh(&xprt->sc_ctxt_lock); + while (xprt->sc_ctxt_cnt < target) { + xprt->sc_ctxt_cnt++; + spin_unlock_bh(&xprt->sc_ctxt_lock); + + ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); + + spin_lock_bh(&xprt->sc_ctxt_lock); + if (ctxt) { + at_least_one = 1; + ctxt->next = xprt->sc_ctxt_head; + xprt->sc_ctxt_head = ctxt; + } else { + /* kmalloc failed...give up for now */ + xprt->sc_ctxt_cnt--; + break; + } + } + spin_unlock_bh(&xprt->sc_ctxt_lock); + dprintk("svcrdma: sc_ctxt_max=%d, sc_ctxt_cnt=%d\n", + xprt->sc_ctxt_max, xprt->sc_ctxt_cnt); + return at_least_one; +} + +struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_op_ctxt *ctxt; + + while (1) { + spin_lock_bh(&xprt->sc_ctxt_lock); + if (unlikely(xprt->sc_ctxt_head == NULL)) { + /* Try to bump my cache. 
*/ + spin_unlock_bh(&xprt->sc_ctxt_lock); + + if (rdma_bump_context_cache(xprt)) + continue; + + printk(KERN_INFO "svcrdma: sleeping waiting for " + "context memory on xprt=%p\n", + xprt); + schedule_timeout_uninterruptible(msecs_to_jiffies(500)); + continue; + } + ctxt = xprt->sc_ctxt_head; + xprt->sc_ctxt_head = ctxt->next; + spin_unlock_bh(&xprt->sc_ctxt_lock); + ctxt->xprt = xprt; + INIT_LIST_HEAD(&ctxt->dto_q); + ctxt->count = 0; + break; + } + return ctxt; +} + +void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) +{ + struct svcxprt_rdma *xprt; + int i; + + BUG_ON(!ctxt); + xprt = ctxt->xprt; + if (free_pages) + for (i = 0; i < ctxt->count; i++) + put_page(ctxt->pages[i]); + + for (i = 0; i < ctxt->count; i++) + dma_unmap_single(xprt->sc_cm_id->device->dma_device, + ctxt->sge[i].addr, + ctxt->sge[i].length, + ctxt->direction); + spin_lock_bh(&xprt->sc_ctxt_lock); + ctxt->next = xprt->sc_ctxt_head; + xprt->sc_ctxt_head = ctxt; + spin_unlock_bh(&xprt->sc_ctxt_lock); +} + +/* ib_cq event handler */ +static void cq_event_handler(struct ib_event *event, void *context) +{ + struct svc_xprt *xprt = context; + dprintk("svcrdma: received CQ event id=%d, context=%p\n", + event->event, context); + set_bit(XPT_CLOSE, &xprt->xpt_flags); +} + +/* QP event handler */ +static void qp_event_handler(struct ib_event *event, void *context) +{ + struct svc_xprt *xprt = context; + + switch (event->event) { + /* These are considered benign events */ + case IB_EVENT_PATH_MIG: + case IB_EVENT_COMM_EST: + case IB_EVENT_SQ_DRAINED: + case IB_EVENT_QP_LAST_WQE_REACHED: + dprintk("svcrdma: QP event %d received for QP=%p\n", + event->event, event->element.qp); + break; + /* These are considered fatal events */ + case IB_EVENT_PATH_MIG_ERR: + case IB_EVENT_QP_FATAL: + case IB_EVENT_QP_REQ_ERR: + case IB_EVENT_QP_ACCESS_ERR: + case IB_EVENT_DEVICE_FATAL: + default: + dprintk("svcrdma: QP ERROR event %d received for QP=%p, " + "closing transport\n", + event->event, event->element.qp); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + break; + } +} + +/* + * Data Transfer Operation Tasklet + * + * Walks a list of transports with I/O pending, removing entries as + * they are added to the server's I/O pending list. Two bits indicate + * if SQ, RQ, or both have I/O pending. The dto_lock is an irqsave + * spinlock that serializes access to the transport list with the RQ + * and SQ interrupt handlers. + */ +static void dto_tasklet_func(unsigned long data) +{ + struct svcxprt_rdma *xprt; + unsigned long flags; + + spin_lock_irqsave(&dto_lock, flags); + while (!list_empty(&dto_xprt_q)) { + xprt = list_entry(dto_xprt_q.next, + struct svcxprt_rdma, sc_dto_q); + list_del_init(&xprt->sc_dto_q); + spin_unlock_irqrestore(&dto_lock, flags); + + if (test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags)) { + ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP); + rq_cq_reap(xprt); + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + /* + * If data arrived before established event, + * don't enqueue. This defers RPC I/O until the + * RDMA connection is complete. 
+ */ + if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags)) + svc_xprt_enqueue(&xprt->sc_xprt); + } + + if (test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) { + ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP); + sq_cq_reap(xprt); + } + + spin_lock_irqsave(&dto_lock, flags); + } + spin_unlock_irqrestore(&dto_lock, flags); +} + +/* + * Receive Queue Completion Handler + * + * Since an RQ completion handler is called on interrupt context, we + * need to defer the handling of the I/O to a tasklet + */ +static void rq_comp_handler(struct ib_cq *cq, void *cq_context) +{ + struct svcxprt_rdma *xprt = cq_context; + unsigned long flags; + + /* + * Set the bit regardless of whether or not it's on the list + * because it may be on the list already due to an SQ + * completion. + */ + set_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags); + + /* + * If this transport is not already on the DTO transport queue, + * add it + */ + spin_lock_irqsave(&dto_lock, flags); + if (list_empty(&xprt->sc_dto_q)) + list_add_tail(&xprt->sc_dto_q, &dto_xprt_q); + spin_unlock_irqrestore(&dto_lock, flags); + + /* Tasklet does all the work to avoid irqsave locks. */ + tasklet_schedule(&dto_tasklet); +} + +/* + * rq_cq_reap - Process the RQ CQ. + * + * Take all completing WC off the CQE and enqueue the associated DTO + * context on the dto_q for the transport. + */ +static void rq_cq_reap(struct svcxprt_rdma *xprt) +{ + int ret; + struct ib_wc wc; + struct svc_rdma_op_ctxt *ctxt = NULL; + + atomic_inc(&rdma_stat_rq_poll); + + spin_lock_bh(&xprt->sc_rq_dto_lock); + while ((ret = ib_poll_cq(xprt->sc_rq_cq, 1, &wc)) > 0) { + ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; + ctxt->wc_status = wc.status; + ctxt->byte_len = wc.byte_len; + if (wc.status != IB_WC_SUCCESS) { + /* Close the transport */ + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + svc_rdma_put_context(ctxt, 1); + continue; + } + list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q); + } + spin_unlock_bh(&xprt->sc_rq_dto_lock); + + if (ctxt) + atomic_inc(&rdma_stat_rq_prod); +} + +/* + * Send Queue Completion Handler - potentially called on interrupt context. 
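Both reap functions recover the per-operation context from the 64-bit wr_id cookie that was stored when the work request was posted; the round trip is just a pair of casts. A self-contained sketch of the idiom (hypothetical type and names):

#include <assert.h>
#include <stdint.h>

struct op_ctxt {
	int byte_len;
	int wc_status;
};

/* Posting side: hide the context pointer in the opaque 64-bit cookie. */
static uint64_t ctxt_to_cookie(struct op_ctxt *ctxt)
{
	return (uint64_t)(unsigned long)ctxt;
}

/* Completion side: recover the pointer from the work completion's wr_id. */
static struct op_ctxt *cookie_to_ctxt(uint64_t wr_id)
{
	return (struct op_ctxt *)(unsigned long)wr_id;
}

int main(void)
{
	struct op_ctxt ctxt = { 0, 0 };

	assert(cookie_to_ctxt(ctxt_to_cookie(&ctxt)) == &ctxt);
	return 0;
}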
+ */ +static void sq_cq_reap(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_op_ctxt *ctxt = NULL; + struct ib_wc wc; + struct ib_cq *cq = xprt->sc_sq_cq; + int ret; + + atomic_inc(&rdma_stat_sq_poll); + while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { + ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; + xprt = ctxt->xprt; + + if (wc.status != IB_WC_SUCCESS) + /* Close the transport */ + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + + /* Decrement used SQ WR count */ + atomic_dec(&xprt->sc_sq_count); + wake_up(&xprt->sc_send_wait); + + switch (ctxt->wr_op) { + case IB_WR_SEND: + case IB_WR_RDMA_WRITE: + svc_rdma_put_context(ctxt, 1); + break; + + case IB_WR_RDMA_READ: + if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + set_bit(RDMACTXT_F_READ_DONE, &ctxt->flags); + spin_lock_bh(&xprt->sc_read_complete_lock); + list_add_tail(&ctxt->dto_q, + &xprt->sc_read_complete_q); + spin_unlock_bh(&xprt->sc_read_complete_lock); + svc_xprt_enqueue(&xprt->sc_xprt); + } + break; + + default: + printk(KERN_ERR "svcrdma: unexpected completion type, " + "opcode=%d, status=%d\n", + wc.opcode, wc.status); + break; + } + } + + if (ctxt) + atomic_inc(&rdma_stat_sq_prod); +} + +static void sq_comp_handler(struct ib_cq *cq, void *cq_context) +{ + struct svcxprt_rdma *xprt = cq_context; + unsigned long flags; + + /* + * Set the bit regardless of whether or not it's on the list + * because it may be on the list already due to an RQ + * completion. + */ + set_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags); + + /* + * If this transport is not already on the DTO transport queue, + * add it + */ + spin_lock_irqsave(&dto_lock, flags); + if (list_empty(&xprt->sc_dto_q)) + list_add_tail(&xprt->sc_dto_q, &dto_xprt_q); + spin_unlock_irqrestore(&dto_lock, flags); + + /* Tasklet does all the work to avoid irqsave locks. 
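The two completion handlers are deliberately tiny: in interrupt context they only set a pending bit, link the transport onto the global dto_xprt_q if it is not already there, and schedule the tasklet, which then does the heavier work under _bh locks. A condensed user-space model of that hand-off (illustrative only; a pthread mutex and plain ints stand in for the irqsave spinlock and the atomic flag bits):

#include <pthread.h>
#include <stdio.h>

struct xprt {
	int rq_pending;
	int sq_pending;
	int queued;
	struct xprt *next;	/* link on the global dto queue */
};

static struct xprt *dto_q;
static pthread_mutex_t dto_lock = PTHREAD_MUTEX_INITIALIZER;

/* "Interrupt" side: record what happened and make sure the transport is queued. */
static void comp_handler(struct xprt *x, int is_sq)
{
	pthread_mutex_lock(&dto_lock);
	if (is_sq)
		x->sq_pending = 1;
	else
		x->rq_pending = 1;
	if (!x->queued) {
		x->next = dto_q;
		dto_q = x;
		x->queued = 1;
	}
	pthread_mutex_unlock(&dto_lock);
	/* real code: tasklet_schedule(&dto_tasklet) */
}

/* "Tasklet" side: drain the queue, dropping the queue lock while each
 * transport is processed, as dto_tasklet_func() does. */
static void dto_drain(void)
{
	pthread_mutex_lock(&dto_lock);
	while (dto_q) {
		struct xprt *x = dto_q;
		dto_q = x->next;
		x->queued = 0;
		pthread_mutex_unlock(&dto_lock);

		if (x->rq_pending) {
			x->rq_pending = 0;
			printf("reap RQ completions for %p\n", (void *)x);
		}
		if (x->sq_pending) {
			x->sq_pending = 0;
			printf("reap SQ completions for %p\n", (void *)x);
		}

		pthread_mutex_lock(&dto_lock);
	}
	pthread_mutex_unlock(&dto_lock);
}

int main(void)
{
	struct xprt x = { 0, 0, 0, NULL };

	comp_handler(&x, 0);
	comp_handler(&x, 1);
	dto_drain();
	return 0;
}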
*/ + tasklet_schedule(&dto_tasklet); +} + +static void create_context_cache(struct svcxprt_rdma *xprt, + int ctxt_count, int ctxt_bump, int ctxt_max) +{ + struct svc_rdma_op_ctxt *ctxt; + int i; + + xprt->sc_ctxt_max = ctxt_max; + xprt->sc_ctxt_bump = ctxt_bump; + xprt->sc_ctxt_cnt = 0; + xprt->sc_ctxt_head = NULL; + for (i = 0; i < ctxt_count; i++) { + ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); + if (ctxt) { + ctxt->next = xprt->sc_ctxt_head; + xprt->sc_ctxt_head = ctxt; + xprt->sc_ctxt_cnt++; + } + } +} + +static void destroy_context_cache(struct svc_rdma_op_ctxt *ctxt) +{ + struct svc_rdma_op_ctxt *next; + if (!ctxt) + return; + + do { + next = ctxt->next; + kfree(ctxt); + ctxt = next; + } while (next); +} + +static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, + int listener) +{ + struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL); + + if (!cma_xprt) + return NULL; + svc_xprt_init(&svc_rdma_class, &cma_xprt->sc_xprt, serv); + INIT_LIST_HEAD(&cma_xprt->sc_accept_q); + INIT_LIST_HEAD(&cma_xprt->sc_dto_q); + INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); + INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); + init_waitqueue_head(&cma_xprt->sc_send_wait); + + spin_lock_init(&cma_xprt->sc_lock); + spin_lock_init(&cma_xprt->sc_read_complete_lock); + spin_lock_init(&cma_xprt->sc_ctxt_lock); + spin_lock_init(&cma_xprt->sc_rq_dto_lock); + + cma_xprt->sc_ord = svcrdma_ord; + + cma_xprt->sc_max_req_size = svcrdma_max_req_size; + cma_xprt->sc_max_requests = svcrdma_max_requests; + cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT; + atomic_set(&cma_xprt->sc_sq_count, 0); + + if (!listener) { + int reqs = cma_xprt->sc_max_requests; + create_context_cache(cma_xprt, + reqs << 1, /* starting size */ + reqs, /* bump amount */ + reqs + + cma_xprt->sc_sq_depth + + RPCRDMA_MAX_THREADS + 1); /* max */ + if (!cma_xprt->sc_ctxt_head) { + kfree(cma_xprt); + return NULL; + } + clear_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); + } else + set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); + + return cma_xprt; +} + +struct page *svc_rdma_get_page(void) +{ + struct page *page; + + while ((page = alloc_page(GFP_KERNEL)) == NULL) { + /* If we can't get memory, wait a bit and try again */ + printk(KERN_INFO "svcrdma: out of memory...retrying in 1000 " + "jiffies.\n"); + schedule_timeout_uninterruptible(msecs_to_jiffies(1000)); + } + return page; +} + +int svc_rdma_post_recv(struct svcxprt_rdma *xprt) +{ + struct ib_recv_wr recv_wr, *bad_recv_wr; + struct svc_rdma_op_ctxt *ctxt; + struct page *page; + unsigned long pa; + int sge_no; + int buflen; + int ret; + + ctxt = svc_rdma_get_context(xprt); + buflen = 0; + ctxt->direction = DMA_FROM_DEVICE; + for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) { + BUG_ON(sge_no >= xprt->sc_max_sge); + page = svc_rdma_get_page(); + ctxt->pages[sge_no] = page; + pa = ib_dma_map_page(xprt->sc_cm_id->device, + page, 0, PAGE_SIZE, + DMA_FROM_DEVICE); + ctxt->sge[sge_no].addr = pa; + ctxt->sge[sge_no].length = PAGE_SIZE; + ctxt->sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + buflen += PAGE_SIZE; + } + ctxt->count = sge_no; + recv_wr.next = NULL; + recv_wr.sg_list = &ctxt->sge[0]; + recv_wr.num_sge = ctxt->count; + recv_wr.wr_id = (u64)(unsigned long)ctxt; + + ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr); + return ret; +} + +/* + * This function handles the CONNECT_REQUEST event on a listening + * endpoint. It is passed the cma_id for the _new_ connection. 
The context in + * this cma_id is inherited from the listening cma_id and is the svc_xprt + * structure for the listening endpoint. + * + * This function creates a new xprt for the new connection and enqueues it on + * the accept queue for the listent xprt. When the listen thread is kicked, it + * will call the recvfrom method on the listen xprt which will accept the new + * connection. + */ +static void handle_connect_req(struct rdma_cm_id *new_cma_id) +{ + struct svcxprt_rdma *listen_xprt = new_cma_id->context; + struct svcxprt_rdma *newxprt; + + /* Create a new transport */ + newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0); + if (!newxprt) { + dprintk("svcrdma: failed to create new transport\n"); + return; + } + newxprt->sc_cm_id = new_cma_id; + new_cma_id->context = newxprt; + dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n", + newxprt, newxprt->sc_cm_id, listen_xprt); + + /* + * Enqueue the new transport on the accept queue of the listening + * transport + */ + spin_lock_bh(&listen_xprt->sc_lock); + list_add_tail(&newxprt->sc_accept_q, &listen_xprt->sc_accept_q); + spin_unlock_bh(&listen_xprt->sc_lock); + + /* + * Can't use svc_xprt_received here because we are not on a + * rqstp thread + */ + set_bit(XPT_CONN, &listen_xprt->sc_xprt.xpt_flags); + listen_xprt->sc_xprt.xpt_pool = NULL; + svc_xprt_enqueue(&listen_xprt->sc_xprt); +} + +/* + * Handles events generated on the listening endpoint. These events will be + * either be incoming connect requests or adapter removal events. + */ +static int rdma_listen_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event) +{ + struct svcxprt_rdma *xprt = cma_id->context; + int ret = 0; + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " + "event=%d\n", cma_id, cma_id->context, event->event); + handle_connect_req(cma_id); + break; + + case RDMA_CM_EVENT_ESTABLISHED: + /* Accept complete */ + dprintk("svcrdma: Connection completed on LISTEN xprt=%p, " + "cm_id=%p\n", xprt, cma_id); + break; + + case RDMA_CM_EVENT_DEVICE_REMOVAL: + dprintk("svcrdma: Device removal xprt=%p, cm_id=%p\n", + xprt, cma_id); + if (xprt) + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + break; + + default: + dprintk("svcrdma: Unexpected event on listening endpoint %p, " + "event=%d\n", cma_id, event->event); + break; + } + + return ret; +} + +static int svc_rdma_copy_addr(struct sockaddr *dst, struct sockaddr *src) +{ + switch (src->sa_family) { + case AF_INET: + *((struct sockaddr_in *)dst) = *((struct sockaddr_in *)src); + return sizeof(struct sockaddr_in); + case AF_INET6: + *((struct sockaddr_in6 *)dst) = *((struct sockaddr_in6 *)src); + return sizeof(struct sockaddr_in6); + default: + return 0; + } +} + +static int rdma_cma_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event) +{ + struct svc_xprt *xprt = cma_id->context; + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + switch (event->event) { + case RDMA_CM_EVENT_ESTABLISHED: + /* Accept complete */ + dprintk("svcrdma: Connection completed on DTO xprt=%p, " + "cm_id=%p\n", xprt, cma_id); + xprt->xpt_remotelen = + svc_rdma_copy_addr((struct sockaddr *) + &xprt->xpt_remote, + &cma_id->route.addr.dst_addr); + (void)svc_rdma_copy_addr((struct sockaddr *) + &xprt->xpt_local, + &cma_id->route.addr.src_addr); + clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags); + svc_xprt_enqueue(xprt); + break; + case RDMA_CM_EVENT_DISCONNECTED: + dprintk("svcrdma: Disconnect 
on DTO xprt=%p, cm_id=%p\n", + xprt, cma_id); + if (xprt) { + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + } + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, " + "event=%d\n", cma_id, xprt, event->event); + if (xprt) { + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + } + break; + default: + dprintk("svcrdma: Unexpected event on DTO endpoint %p, " + "event=%d\n", cma_id, event->event); + break; + } + return 0; +} + +/* + * Create a listening RDMA service endpoint. + */ +static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, + struct sockaddr *sa, int salen, + int flags) +{ + struct rdma_cm_id *listen_id; + struct svcxprt_rdma *cma_xprt; + struct svc_xprt *xprt; + int ret; + + dprintk("svcrdma: Creating RDMA socket\n"); + + cma_xprt = rdma_create_xprt(serv, 1); + if (!cma_xprt) + return ERR_PTR(ENOMEM); + xprt = &cma_xprt->sc_xprt; + + listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP); + if (IS_ERR(listen_id)) { + rdma_destroy_xprt(cma_xprt); + dprintk("svcrdma: rdma_create_id failed = %ld\n", + PTR_ERR(listen_id)); + return (void *)listen_id; + } + ret = rdma_bind_addr(listen_id, sa); + if (ret) { + rdma_destroy_xprt(cma_xprt); + rdma_destroy_id(listen_id); + dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret); + return ERR_PTR(ret); + } + cma_xprt->sc_cm_id = listen_id; + + ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG); + if (ret) { + rdma_destroy_id(listen_id); + rdma_destroy_xprt(cma_xprt); + dprintk("svcrdma: rdma_listen failed = %d\n", ret); + } + + (void)svc_rdma_copy_addr((struct sockaddr *) + &cma_xprt->sc_xprt.xpt_local, + &listen_id->route.addr.src_addr); + return &cma_xprt->sc_xprt; +} + +/* + * This is the xpo_recvfrom function for listening endpoints. Its + * purpose is to accept incoming connections. The CMA callback handler + * has already created a new transport and attached it to the new CMA + * ID. + * + * There is a queue of pending connections hung on the listening + * transport. This queue contains the new svc_xprt structure. This + * function takes svc_xprt structures off the accept_q and completes + * the connection. 
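The listener setup in svc_rdma_create() above is the standard RDMA CM sequence: create a CM ID, bind it to the service address, then listen; connect requests subsequently arrive as CM events at rdma_listen_handler(). A user-space analogue of the same sequence using librdmacm (illustrative only; link with -lrdmacm, error handling trimmed, port number is just an example):

#include <rdma/rdma_cma.h>
#include <netinet/in.h>
#include <stdio.h>

int main(void)
{
	struct rdma_event_channel *ec = rdma_create_event_channel();
	struct rdma_cm_id *listen_id;
	struct rdma_cm_event *event;
	struct sockaddr_in sin = {
		.sin_family = AF_INET,
		.sin_port   = htons(20049),	/* example port */
		.sin_addr   = { .s_addr = INADDR_ANY },
	};

	if (!ec || rdma_create_id(ec, &listen_id, NULL, RDMA_PS_TCP))
		return 1;
	if (rdma_bind_addr(listen_id, (struct sockaddr *)&sin))
		return 1;
	if (rdma_listen(listen_id, 10))		/* cf. RPCRDMA_LISTEN_BACKLOG */
		return 1;

	/* In the kernel the CM invokes rdma_listen_handler(); in user space
	 * connect requests are read from the event channel instead. */
	while (rdma_get_cm_event(ec, &event) == 0) {
		if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST)
			printf("connect request on id %p\n", (void *)event->id);
		rdma_ack_cm_event(event);
	}
	return 0;
}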
+ */ +static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *listen_rdma; + struct svcxprt_rdma *newxprt = NULL; + struct rdma_conn_param conn_param; + struct ib_qp_init_attr qp_attr; + struct ib_device_attr devattr; + int ret; + int i; + + listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); + clear_bit(XPT_CONN, &xprt->xpt_flags); + /* Get the next entry off the accept list */ + spin_lock_bh(&listen_rdma->sc_lock); + if (!list_empty(&listen_rdma->sc_accept_q)) { + newxprt = list_entry(listen_rdma->sc_accept_q.next, + struct svcxprt_rdma, sc_accept_q); + list_del_init(&newxprt->sc_accept_q); + } + if (!list_empty(&listen_rdma->sc_accept_q)) + set_bit(XPT_CONN, &listen_rdma->sc_xprt.xpt_flags); + spin_unlock_bh(&listen_rdma->sc_lock); + if (!newxprt) + return NULL; + + dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n", + newxprt, newxprt->sc_cm_id); + + ret = ib_query_device(newxprt->sc_cm_id->device, &devattr); + if (ret) { + dprintk("svcrdma: could not query device attributes on " + "device %p, rc=%d\n", newxprt->sc_cm_id->device, ret); + goto errout; + } + + /* Qualify the transport resource defaults with the + * capabilities of this particular device */ + newxprt->sc_max_sge = min((size_t)devattr.max_sge, + (size_t)RPCSVC_MAXPAGES); + newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr, + (size_t)svcrdma_max_requests); + newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests; + + newxprt->sc_ord = min((size_t)devattr.max_qp_rd_atom, + (size_t)svcrdma_ord); + + newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device); + if (IS_ERR(newxprt->sc_pd)) { + dprintk("svcrdma: error creating PD for connect request\n"); + goto errout; + } + newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device, + sq_comp_handler, + cq_event_handler, + newxprt, + newxprt->sc_sq_depth, + 0); + if (IS_ERR(newxprt->sc_sq_cq)) { + dprintk("svcrdma: error creating SQ CQ for connect request\n"); + goto errout; + } + newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device, + rq_comp_handler, + cq_event_handler, + newxprt, + newxprt->sc_max_requests, + 0); + if (IS_ERR(newxprt->sc_rq_cq)) { + dprintk("svcrdma: error creating RQ CQ for connect request\n"); + goto errout; + } + + memset(&qp_attr, 0, sizeof qp_attr); + qp_attr.event_handler = qp_event_handler; + qp_attr.qp_context = &newxprt->sc_xprt; + qp_attr.cap.max_send_wr = newxprt->sc_sq_depth; + qp_attr.cap.max_recv_wr = newxprt->sc_max_requests; + qp_attr.cap.max_send_sge = newxprt->sc_max_sge; + qp_attr.cap.max_recv_sge = newxprt->sc_max_sge; + qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + qp_attr.qp_type = IB_QPT_RC; + qp_attr.send_cq = newxprt->sc_sq_cq; + qp_attr.recv_cq = newxprt->sc_rq_cq; + dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n" + " cm_id->device=%p, sc_pd->device=%p\n" + " cap.max_send_wr = %d\n" + " cap.max_recv_wr = %d\n" + " cap.max_send_sge = %d\n" + " cap.max_recv_sge = %d\n", + newxprt->sc_cm_id, newxprt->sc_pd, + newxprt->sc_cm_id->device, newxprt->sc_pd->device, + qp_attr.cap.max_send_wr, + qp_attr.cap.max_recv_wr, + qp_attr.cap.max_send_sge, + qp_attr.cap.max_recv_sge); + + ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr); + if (ret) { + /* + * XXX: This is a hack. 
We need a xx_request_qp interface + * that will adjust the qp_attr's with a best-effort + * number + */ + qp_attr.cap.max_send_sge -= 2; + qp_attr.cap.max_recv_sge -= 2; + ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, + &qp_attr); + if (ret) { + dprintk("svcrdma: failed to create QP, ret=%d\n", ret); + goto errout; + } + newxprt->sc_max_sge = qp_attr.cap.max_send_sge; + newxprt->sc_max_sge = qp_attr.cap.max_recv_sge; + newxprt->sc_sq_depth = qp_attr.cap.max_send_wr; + newxprt->sc_max_requests = qp_attr.cap.max_recv_wr; + } + newxprt->sc_qp = newxprt->sc_cm_id->qp; + + /* Register all of physical memory */ + newxprt->sc_phys_mr = ib_get_dma_mr(newxprt->sc_pd, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE); + if (IS_ERR(newxprt->sc_phys_mr)) { + dprintk("svcrdma: Failed to create DMA MR ret=%d\n", ret); + goto errout; + } + + /* Post receive buffers */ + for (i = 0; i < newxprt->sc_max_requests; i++) { + ret = svc_rdma_post_recv(newxprt); + if (ret) { + dprintk("svcrdma: failure posting receive buffers\n"); + goto errout; + } + } + + /* Swap out the handler */ + newxprt->sc_cm_id->event_handler = rdma_cma_handler; + + /* Accept Connection */ + set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags); + memset(&conn_param, 0, sizeof conn_param); + conn_param.responder_resources = 0; + conn_param.initiator_depth = newxprt->sc_ord; + ret = rdma_accept(newxprt->sc_cm_id, &conn_param); + if (ret) { + dprintk("svcrdma: failed to accept new connection, ret=%d\n", + ret); + goto errout; + } + + dprintk("svcrdma: new connection %p accepted with the following " + "attributes:\n" + " local_ip : %d.%d.%d.%d\n" + " local_port : %d\n" + " remote_ip : %d.%d.%d.%d\n" + " remote_port : %d\n" + " max_sge : %d\n" + " sq_depth : %d\n" + " max_requests : %d\n" + " ord : %d\n", + newxprt, + NIPQUAD(((struct sockaddr_in *)&newxprt->sc_cm_id-> + route.addr.src_addr)->sin_addr.s_addr), + ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id-> + route.addr.src_addr)->sin_port), + NIPQUAD(((struct sockaddr_in *)&newxprt->sc_cm_id-> + route.addr.dst_addr)->sin_addr.s_addr), + ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id-> + route.addr.dst_addr)->sin_port), + newxprt->sc_max_sge, + newxprt->sc_sq_depth, + newxprt->sc_max_requests, + newxprt->sc_ord); + + ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP); + ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP); + return &newxprt->sc_xprt; + + errout: + dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret); + rdma_destroy_id(newxprt->sc_cm_id); + rdma_destroy_xprt(newxprt); + return NULL; +} + +/* No per-request resources cached by the transport */ +static void svc_rdma_release_rqst(struct svc_rqst *rqstp) +{ +} + +/* Disable data ready events for this connection */ +static void svc_rdma_detach(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + unsigned long flags; + + dprintk("svc: svc_rdma_detach(%p)\n", xprt); + /* + * Shutdown the connection. This will ensure we don't get any + * more events from the provider. 
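The SQ sizing established here (sc_sq_depth, with sc_sq_count tracking work requests in flight) is what svc_rdma_has_wspace() and svc_rdma_send() below rely on: has_wspace refuses new work when fewer than a handful of slots remain or a sender is already waiting, and svc_rdma_send blocks until a completion frees a slot. A simplified user-space model of that accounting (a sketch only; mutex and condition variable stand in for the spinlock and sc_send_wait):

#include <pthread.h>

struct sq_state {
	pthread_mutex_t lock;
	pthread_cond_t  wait;
	int depth;	/* cf. sc_sq_depth */
	int count;	/* cf. sc_sq_count */
};

static void sq_acquire(struct sq_state *sq)
{
	pthread_mutex_lock(&sq->lock);
	while (sq->count == sq->depth)		/* SQ full: wait for a completion */
		pthread_cond_wait(&sq->wait, &sq->lock);
	sq->count++;				/* slot consumed by the posted WR */
	pthread_mutex_unlock(&sq->lock);
}

static void sq_release(struct sq_state *sq)	/* called when a send completes */
{
	pthread_mutex_lock(&sq->lock);
	sq->count--;
	pthread_cond_signal(&sq->wait);
	pthread_mutex_unlock(&sq->lock);
}

static int sq_has_wspace(struct sq_state *sq)
{
	/* room for a small reply, mirroring the "fewer than 3 slots" test */
	pthread_mutex_lock(&sq->lock);
	int ok = (sq->depth - sq->count) >= 3;
	pthread_mutex_unlock(&sq->lock);
	return ok;
}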
+ */ + rdma_disconnect(rdma->sc_cm_id); + rdma_destroy_id(rdma->sc_cm_id); + + /* We may already be on the DTO list */ + spin_lock_irqsave(&dto_lock, flags); + if (!list_empty(&rdma->sc_dto_q)) + list_del_init(&rdma->sc_dto_q); + spin_unlock_irqrestore(&dto_lock, flags); +} + +static void svc_rdma_free(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = (struct svcxprt_rdma *)xprt; + dprintk("svcrdma: svc_rdma_free(%p)\n", rdma); + rdma_destroy_xprt(rdma); + kfree(rdma); +} + +static void rdma_destroy_xprt(struct svcxprt_rdma *xprt) +{ + if (xprt->sc_qp) + ib_destroy_qp(xprt->sc_qp); + + if (xprt->sc_sq_cq) + ib_destroy_cq(xprt->sc_sq_cq); + + if (xprt->sc_rq_cq) + ib_destroy_cq(xprt->sc_rq_cq); + + if (xprt->sc_pd) + ib_dealloc_pd(xprt->sc_pd); + + destroy_context_cache(xprt->sc_ctxt_head); +} + +static int svc_rdma_has_wspace(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + + /* + * If there are fewer SQ WR available than required to send a + * simple response, return false. + */ + if ((rdma->sc_sq_depth - atomic_read(&rdma->sc_sq_count) < 3)) + return 0; + + /* + * ...or there are already waiters on the SQ, + * return false. + */ + if (waitqueue_active(&rdma->sc_send_wait)) + return 0; + + /* Otherwise return true. */ + return 1; +} + +int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) +{ + struct ib_send_wr *bad_wr; + int ret; + + if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) + return 0; + + BUG_ON(wr->send_flags != IB_SEND_SIGNALED); + BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op != + wr->opcode); + /* If the SQ is full, wait until an SQ entry is available */ + while (1) { + spin_lock_bh(&xprt->sc_lock); + if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) { + spin_unlock_bh(&xprt->sc_lock); + atomic_inc(&rdma_stat_sq_starve); + /* See if we can reap some SQ WR */ + sq_cq_reap(xprt); + + /* Wait until SQ WR available if SQ still full */ + wait_event(xprt->sc_send_wait, + atomic_read(&xprt->sc_sq_count) < + xprt->sc_sq_depth); + continue; + } + /* Bumped used SQ WR count and post */ + ret = ib_post_send(xprt->sc_qp, wr, &bad_wr); + if (!ret) + atomic_inc(&xprt->sc_sq_count); + else + dprintk("svcrdma: failed to post SQ WR rc=%d, " + "sc_sq_count=%d, sc_sq_depth=%d\n", + ret, atomic_read(&xprt->sc_sq_count), + xprt->sc_sq_depth); + spin_unlock_bh(&xprt->sc_lock); + break; + } + return ret; +} + +int svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, + enum rpcrdma_errcode err) +{ + struct ib_send_wr err_wr; + struct ib_sge sge; + struct page *p; + struct svc_rdma_op_ctxt *ctxt; + u32 *va; + int length; + int ret; + + p = svc_rdma_get_page(); + va = page_address(p); + + /* XDR encode error */ + length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va); + + /* Prepare SGE for local address */ + sge.addr = ib_dma_map_page(xprt->sc_cm_id->device, + p, 0, PAGE_SIZE, DMA_FROM_DEVICE); + sge.lkey = xprt->sc_phys_mr->lkey; + sge.length = length; + + ctxt = svc_rdma_get_context(xprt); + ctxt->count = 1; + ctxt->pages[0] = p; + + /* Prepare SEND WR */ + memset(&err_wr, 0, sizeof err_wr); + ctxt->wr_op = IB_WR_SEND; + err_wr.wr_id = (unsigned long)ctxt; + err_wr.sg_list = &sge; + err_wr.num_sge = 1; + err_wr.opcode = IB_WR_SEND; + err_wr.send_flags = IB_SEND_SIGNALED; + + /* Post It */ + ret = svc_rdma_send(xprt, &err_wr); + if (ret) { + dprintk("svcrdma: Error posting send = %d\n", ret); + svc_rdma_put_context(ctxt, 1); + } + + return ret; +} diff --git 
a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c new file mode 100644 index 0000000..bb007e7 --- /dev/null +++ b/net/sunrpc/svc_xprt.c @@ -0,0 +1,1030 @@ +/* + * linux/net/sunrpc/svc_xprt.c + * + * Author: Tom Tucker + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt); +static int svc_deferred_recv(struct svc_rqst *rqstp); +static struct cache_deferred_req *svc_defer(struct cache_req *req); +static void svc_age_temp_xprts(unsigned long closure); +/* apparently the "standard" is that clients close + * idle connections after 5 minutes, servers after + * 6 minutes + * http://www.connectathon.org/talks96/nfstcp.pdf + */ +static int svc_conn_age_period = 6*60; + +/* List of registered transport classes */ +static spinlock_t svc_xprt_class_lock = SPIN_LOCK_UNLOCKED; +static LIST_HEAD(svc_xprt_class_list); + +int svc_reg_xprt_class(struct svc_xprt_class *xcl) +{ + struct svc_xprt_class *cl; + int res = -EEXIST; + + dprintk("svc: Adding svc transport class '%s'\n", + xcl->xcl_name); + + INIT_LIST_HEAD(&xcl->xcl_list); + spin_lock(&svc_xprt_class_lock); + list_for_each_entry(cl, &svc_xprt_class_list, xcl_list) { + if (xcl == cl) + goto out; + } + list_add_tail(&xcl->xcl_list, &svc_xprt_class_list); + res = 0; +out: + spin_unlock(&svc_xprt_class_lock); + return res; +} +EXPORT_SYMBOL_GPL(svc_reg_xprt_class); + +int svc_unreg_xprt_class(struct svc_xprt_class *xcl) +{ + struct svc_xprt_class *cl; + int res = 0; + + dprintk("svc: Removing svc transport class '%s'\n", xcl->xcl_name); + + spin_lock(&svc_xprt_class_lock); + list_for_each_entry(cl, &svc_xprt_class_list, xcl_list) { + if (xcl == cl) { + list_del_init(&cl->xcl_list); + goto out; + } + } + res = -ENOENT; + out: + spin_unlock(&svc_xprt_class_lock); + return res; +} +EXPORT_SYMBOL_GPL(svc_unreg_xprt_class); + +/* + * Format the transport list for printing + */ +int svc_print_xprts(char *buf, int maxlen) +{ + struct list_head *le; + char tmpstr[80]; + int len = 0; + buf[0] = '\0'; + + spin_lock(&svc_xprt_class_lock); + list_for_each(le, &svc_xprt_class_list) { + int slen; + struct svc_xprt_class *xcl = + list_entry(le, struct svc_xprt_class, xcl_list); + + sprintf(tmpstr, "%s %d\n", xcl->xcl_name, xcl->xcl_max_payload); + slen = strlen(tmpstr); + if (len + slen > maxlen) + break; + len += slen; + strcat(buf, tmpstr); + } + spin_unlock(&svc_xprt_class_lock); + + return len; +} + +static void svc_xprt_free(struct kref *kref) +{ + struct svc_xprt *xprt = + container_of(kref, struct svc_xprt, xpt_ref); + struct module *owner = xprt->xpt_class->xcl_owner; + BUG_ON(atomic_read(&kref->refcount)); + if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags) + && xprt->xpt_auth_cache != NULL) + svcauth_unix_info_release(xprt->xpt_auth_cache); + xprt->xpt_ops->xpo_free(xprt); + module_put(owner); +} + +void svc_xprt_put(struct svc_xprt *xprt) +{ + kref_put(&xprt->xpt_ref, svc_xprt_free); +} +EXPORT_SYMBOL_GPL(svc_xprt_put); + +/* + * Called by transport drivers to initialize the transport independent + * portion of the transport instance. 
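A transport module uses the registration API above by filling in an ops vector and a class and registering the class at module load time; the shape mirrors the svc_rdma_class definition earlier in this series. A sketch with hypothetical "foo" names (the xpo_* handlers are assumed to be implemented elsewhere in the module):

/* Hypothetical transport module registration, modeled on svc_rdma_class. */
static struct svc_xprt_ops svc_foo_ops = {
	.xpo_create   = svc_foo_create,
	.xpo_recvfrom = svc_foo_recvfrom,
	.xpo_sendto   = svc_foo_sendto,
	/* ... remaining xpo_* methods ... */
};

static struct svc_xprt_class svc_foo_class = {
	.xcl_name        = "foo",
	.xcl_owner       = THIS_MODULE,
	.xcl_ops         = &svc_foo_ops,
	.xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
};

static int __init svc_foo_init(void)
{
	return svc_reg_xprt_class(&svc_foo_class);	/* -EEXIST if already listed */
}

static void __exit svc_foo_exit(void)
{
	svc_unreg_xprt_class(&svc_foo_class);
}

module_init(svc_foo_init);
module_exit(svc_foo_exit);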
+ */ +void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt, + struct svc_serv *serv) +{ + memset(xprt, 0, sizeof(*xprt)); + xprt->xpt_class = xcl; + xprt->xpt_ops = xcl->xcl_ops; + kref_init(&xprt->xpt_ref); + xprt->xpt_server = serv; + INIT_LIST_HEAD(&xprt->xpt_list); + INIT_LIST_HEAD(&xprt->xpt_ready); + INIT_LIST_HEAD(&xprt->xpt_deferred); + mutex_init(&xprt->xpt_mutex); + spin_lock_init(&xprt->xpt_lock); +} +EXPORT_SYMBOL_GPL(svc_xprt_init); + +static int svc_local_port(struct svc_xprt *xprt) +{ + int ret = -1; + switch (xprt->xpt_local.ss_family) { + case AF_INET: + ret = ntohs(((struct sockaddr_in *) + &xprt->xpt_local)->sin_port); + break; + case AF_INET6: + ret = ntohs(((struct sockaddr_in6 *) + &xprt->xpt_local)->sin6_port); + break; + } + dprintk("svc: local port for xprt %p is %d\n", xprt, ret); + return ret; +} + +int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, + int flags) +{ + struct svc_xprt_class *xcl; + int ret = -ENOENT; + struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_addr.s_addr = INADDR_ANY, + .sin_port = htons(port), + }; + dprintk("svc: creating transport %s[%d]\n", xprt_name, port); + spin_lock(&svc_xprt_class_lock); + list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) { + if (strcmp(xprt_name, xcl->xcl_name) == 0) { + spin_unlock(&svc_xprt_class_lock); + if (try_module_get(xcl->xcl_owner)) { + struct svc_xprt *newxprt; + newxprt = xcl->xcl_ops->xpo_create + (serv, + (struct sockaddr *)&sin, sizeof(sin), + flags); + if (IS_ERR(newxprt)) { + module_put(xcl->xcl_owner); + ret = PTR_ERR(newxprt); + } else { + clear_bit(XPT_TEMP, + &newxprt->xpt_flags); + spin_lock_bh(&serv->sv_lock); + list_add(&newxprt->xpt_list, + &serv->sv_permsocks); + spin_unlock_bh(&serv->sv_lock); + ret = svc_local_port(newxprt); + } + } + goto out; + } + } + spin_unlock(&svc_xprt_class_lock); + dprintk("svc: transport %s not found\n", xprt_name); + out: + return ret; +} +EXPORT_SYMBOL_GPL(svc_create_xprt); + +/* + * Queue up an idle server thread. Must have pool->sp_lock held. + * Note: this is really a stack rather than a queue, so that we only + * use as many different threads as we need, and the rest don't pollute + * the cache. + */ +static void svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp) +{ + list_add(&rqstp->rq_list, &pool->sp_threads); +} + +/* + * Dequeue an nfsd thread. Must have pool->sp_lock held. + */ +static void svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp) +{ + list_del(&rqstp->rq_list); +} + +/* + * Queue up a transport with data pending. If there are idle nfsd + * processes, wake 'em up. + * + */ +void svc_xprt_enqueue(struct svc_xprt *xprt) +{ + struct svc_serv *serv = xprt->xpt_server; + struct svc_pool *pool; + struct svc_rqst *rqstp; + int cpu; + + if (!(xprt->xpt_flags & + ((1<xpt_flags)) + return; + + cpu = get_cpu(); + pool = svc_pool_for_cpu(xprt->xpt_server, cpu); + put_cpu(); + + spin_lock_bh(&pool->sp_lock); + + if (!list_empty(&pool->sp_threads) && + !list_empty(&pool->sp_sockets)) + printk(KERN_ERR + "svc_xprt_enqueue: threads and xprt both waiting??\n"); + + if (test_bit(XPT_DEAD, &xprt->xpt_flags)) { + /* Don't enqueue dead transports */ + dprintk("svc: transport %p is dead, not enqueued\n", xprt); + goto out_unlock; + } + + /* Mark transport as busy. It will remain in this state until the + * server has processed all pending data and put the transport back + * on the idle list. 
We update XPT_BUSY atomically because + * it also guards against trying to enqueue the svc_sock twice. + */ + if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) { + /* Don't enqueue transport while already enqueued */ + dprintk("svc: transport %p busy, not enqueued\n", xprt); + goto out_unlock; + } + BUG_ON(xprt->xpt_pool != NULL); + xprt->xpt_pool = pool; + + /* Handle pending connection */ + if (test_bit(XPT_CONN, &xprt->xpt_flags)) + goto process; + + /* Handle close in-progress */ + if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) + goto process; + + /* Check if we have space to reply to a request */ + if (!xprt->xpt_ops->xpo_has_wspace(xprt)) { + /* Don't enqueue while not enough space for reply */ + dprintk("svc: no write space, transport %p not enqueued\n", xprt); + xprt->xpt_pool = NULL; + clear_bit(XPT_BUSY, &xprt->xpt_flags); + goto out_unlock; + } + + process: + if (!list_empty(&pool->sp_threads)) { + rqstp = list_entry(pool->sp_threads.next, + struct svc_rqst, + rq_list); + dprintk("svc: transport %p served by daemon %p\n", + xprt, rqstp); + svc_thread_dequeue(pool, rqstp); + if (rqstp->rq_xprt) + printk(KERN_ERR + "svc_xprt_enqueue: server %p, rq_xprt=%p!\n", + rqstp, rqstp->rq_xprt); + rqstp->rq_xprt = xprt; + svc_xprt_get(xprt); + rqstp->rq_reserved = serv->sv_max_mesg; + atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); + BUG_ON(xprt->xpt_pool != pool); + wake_up(&rqstp->rq_wait); + } else { + dprintk("svc: transport %p put into queue\n", xprt); + list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); + BUG_ON(xprt->xpt_pool != pool); + } + +out_unlock: + spin_unlock_bh(&pool->sp_lock); +} +EXPORT_SYMBOL_GPL(svc_xprt_enqueue); + +/* + * Dequeue the first transport. Must be called with the pool->sp_lock held. + */ +static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool) +{ + struct svc_xprt *xprt; + + if (list_empty(&pool->sp_sockets)) + return NULL; + + xprt = list_entry(pool->sp_sockets.next, + struct svc_xprt, xpt_ready); + list_del_init(&xprt->xpt_ready); + + dprintk("svc: transport %p dequeued, inuse=%d\n", + xprt, atomic_read(&xprt->xpt_ref.refcount)); + + return xprt; +} + +/* + * Having read something from a transport, check whether it + * needs to be re-enqueued. + * Note: XPT_DATA only gets cleared when a read-attempt finds + * no (or insufficient) data. + */ +void svc_xprt_received(struct svc_xprt *xprt) +{ + xprt->xpt_pool = NULL; + clear_bit(XPT_BUSY, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); +} +EXPORT_SYMBOL_GPL(svc_xprt_received); + +/** + * svc_reserve - change the space reserved for the reply to a request. + * @rqstp: The request in question + * @space: new max space to reserve + * + * Each request reserves some space on the output queue of the transport + * to make sure the reply fits. This function reduces that reserved + * space to be the amount of space used already, plus @space. + * + */ +void svc_reserve(struct svc_rqst *rqstp, int space) +{ + space += rqstp->rq_res.head[0].iov_len; + + if (space < rqstp->rq_reserved) { + struct svc_xprt *xprt = rqstp->rq_xprt; + atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved); + rqstp->rq_reserved = space; + + svc_xprt_enqueue(xprt); + } +} + +static void svc_xprt_release(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt = rqstp->rq_xprt; + + rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp); + + svc_free_res_pages(rqstp); + rqstp->rq_res.page_len = 0; + rqstp->rq_res.page_base = 0; + + /* Reset response buffer and release + * the reservation. 
+ * But first, check that enough space was reserved + * for the reply, otherwise we have a bug! + */ + if ((rqstp->rq_res.len) > rqstp->rq_reserved) + printk(KERN_ERR "RPC request reserved %d but used %d\n", + rqstp->rq_reserved, + rqstp->rq_res.len); + + rqstp->rq_res.head[0].iov_len = 0; + svc_reserve(rqstp, 0); + rqstp->rq_xprt = NULL; + + svc_xprt_put(xprt); +} + +/* + * External function to wake up a server waiting for data + * This really only makes sense for services like lockd + * which have exactly one thread anyway. + */ +void svc_wake_up(struct svc_serv *serv) +{ + struct svc_rqst *rqstp; + unsigned int i; + struct svc_pool *pool; + + for (i = 0; i < serv->sv_nrpools; i++) { + pool = &serv->sv_pools[i]; + + spin_lock_bh(&pool->sp_lock); + if (!list_empty(&pool->sp_threads)) { + rqstp = list_entry(pool->sp_threads.next, + struct svc_rqst, + rq_list); + dprintk("svc: daemon %p woken up.\n", rqstp); + /* + svc_thread_dequeue(pool, rqstp); + rqstp->rq_xprt = NULL; + */ + wake_up(&rqstp->rq_wait); + } + spin_unlock_bh(&pool->sp_lock); + } +} + +static void svc_check_conn_limits(struct svc_serv *serv) +{ + char buf[RPC_MAX_ADDRBUFLEN]; + + /* make sure that we don't have too many active connections. + * If we have, something must be dropped. + * + * There's no point in trying to do random drop here for + * DoS prevention. The NFS clients does 1 reconnect in 15 + * seconds. An attacker can easily beat that. + * + * The only somewhat efficient mechanism would be if drop + * old connections from the same IP first. + */ + if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) { + struct svc_xprt *xprt = NULL; + spin_lock_bh(&serv->sv_lock); + if (!list_empty(&serv->sv_tempsocks)) { + if (net_ratelimit()) { + /* Try to help the admin */ + printk(KERN_NOTICE "%s: too many open " + "connections, consider increasing the " + "number of nfsd threads\n", + serv->sv_name); + printk(KERN_NOTICE + "%s: last connection from %s\n", + serv->sv_name, buf); + } + /* + * Always select the oldest connection. It's not fair, + * but so is life + */ + xprt = list_entry(serv->sv_tempsocks.prev, + struct svc_xprt, + xpt_list); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_get(xprt); + } + spin_unlock_bh(&serv->sv_lock); + + if (xprt) { + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); + } + } +} + +static void svc_copy_addr(struct svc_rqst *rqstp, struct svc_xprt *xprt) +{ + struct sockaddr *sin; + + /* sock_recvmsg doesn't fill in the name/namelen, so we must.. + */ + memcpy(&rqstp->rq_addr, &xprt->xpt_remote, xprt->xpt_remotelen); + rqstp->rq_addrlen = xprt->xpt_remotelen; + + /* Destination address in request is needed for binding the + * source address in RPC callbacks later. + */ + sin = (struct sockaddr *)&xprt->xpt_local; + switch (sin->sa_family) { + case AF_INET: + rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr; + break; + case AF_INET6: + rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr; + break; + } +} + +/* + * Receive the next request on any transport. This code is carefully + * organised not to touch any cachelines in the shared svc_serv + * structure, only cachelines in the local svc_pool. 
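The reservation arithmetic in svc_reserve() above is easiest to see with concrete numbers: a request is initially charged the full sv_max_mesg against the transport's xpt_reserved, and once the real reply size is known the charge shrinks to what was actually used plus the requested headroom. A toy model with example sizes (illustrative only):

#include <stdio.h>

int main(void)
{
	int xpt_reserved = 0;		/* per-transport total of outstanding charges */
	int sv_max_mesg  = 1048576;	/* example 1 MB maximum message size */

	int rq_reserved = sv_max_mesg;	/* charged when the request is dequeued */
	xpt_reserved += rq_reserved;

	/* The reply turns out to need only the head plus 8 KB of page data. */
	int head_len = 200, space = 8192;
	int new_reserved = head_len + space;

	if (new_reserved < rq_reserved) {
		xpt_reserved -= rq_reserved - new_reserved;
		rq_reserved = new_reserved;
	}
	printf("rq_reserved=%d xpt_reserved=%d\n", rq_reserved, xpt_reserved);
	return 0;
}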
+ */ +int svc_recv(struct svc_rqst *rqstp, long timeout) +{ + struct svc_xprt *xprt = NULL; + struct svc_serv *serv = rqstp->rq_server; + struct svc_pool *pool = rqstp->rq_pool; + int len, i; + int pages; + struct xdr_buf *arg; + DECLARE_WAITQUEUE(wait, current); + + dprintk("svc: server %p waiting for data (to = %ld)\n", + rqstp, timeout); + + if (rqstp->rq_xprt) + printk(KERN_ERR + "svc_recv: service %p, transport not NULL!\n", + rqstp); + if (waitqueue_active(&rqstp->rq_wait)) + printk(KERN_ERR + "svc_recv: service %p, wait queue active!\n", + rqstp); + + + /* now allocate needed pages. If we get a failure, sleep briefly */ + pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE; + for (i = 0; i < pages ; i++) + while (rqstp->rq_pages[i] == NULL) { + struct page *p = alloc_page(GFP_KERNEL); + if (!p) + schedule_timeout_uninterruptible(msecs_to_jiffies(500)); + rqstp->rq_pages[i] = p; + } + rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */ + BUG_ON(pages >= RPCSVC_MAXPAGES); + + /* Make arg->head point to first page and arg->pages point to rest */ + arg = &rqstp->rq_arg; + arg->head[0].iov_base = page_address(rqstp->rq_pages[0]); + arg->head[0].iov_len = PAGE_SIZE; + arg->pages = rqstp->rq_pages + 1; + arg->page_base = 0; + /* save at least one page for response */ + arg->page_len = (pages-2)*PAGE_SIZE; + arg->len = (pages-1)*PAGE_SIZE; + arg->tail[0].iov_len = 0; + + try_to_freeze(); + cond_resched(); + if (signalled()) + return -EINTR; + + spin_lock_bh(&pool->sp_lock); + if ((xprt = svc_xprt_dequeue(pool)) != NULL) { + rqstp->rq_xprt = xprt; + svc_xprt_get(xprt); + rqstp->rq_reserved = serv->sv_max_mesg; + atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); + } else { + /* No data pending. Go to sleep */ + svc_thread_enqueue(pool, rqstp); + + /* + * We have to be able to interrupt this wait + * to bring down the daemons ... + */ + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&rqstp->rq_wait, &wait); + spin_unlock_bh(&pool->sp_lock); + + schedule_timeout(timeout); + + try_to_freeze(); + + spin_lock_bh(&pool->sp_lock); + remove_wait_queue(&rqstp->rq_wait, &wait); + + if (!(xprt = rqstp->rq_xprt)) { + svc_thread_dequeue(pool, rqstp); + spin_unlock_bh(&pool->sp_lock); + dprintk("svc: server %p, no data yet\n", rqstp); + return signalled()? 
-EINTR : -EAGAIN; + } + } + spin_unlock_bh(&pool->sp_lock); + + len = 0; + if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) { + dprintk("svc_recv: found XPT_CLOSE\n"); + svc_delete_xprt(xprt); + } else if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) { + struct svc_xprt *newxpt; + newxpt = xprt->xpt_ops->xpo_accept(xprt); + if (newxpt) { + /* + * We know this module_get will succeed because the + * listener holds a reference too + */ + __module_get(newxpt->xpt_class->xcl_owner); + svc_check_conn_limits(xprt->xpt_server); + spin_lock_bh(&serv->sv_lock); + set_bit(XPT_TEMP, &newxpt->xpt_flags); + list_add(&newxpt->xpt_list, &serv->sv_tempsocks); + serv->sv_tmpcnt++; + if (serv->sv_temptimer.function == NULL) { + /* setup timer to age temp transports */ + setup_timer(&serv->sv_temptimer, svc_age_temp_xprts, + (unsigned long)serv); + mod_timer(&serv->sv_temptimer, + jiffies + svc_conn_age_period * HZ); + } + spin_unlock_bh(&serv->sv_lock); + svc_xprt_received(newxpt); + } + svc_xprt_received(xprt); + } else { + dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", + rqstp, pool->sp_id, xprt, + atomic_read(&xprt->xpt_ref.refcount)); + + if ((rqstp->rq_deferred = svc_deferred_dequeue(xprt))) { + svc_xprt_received(xprt); + len = svc_deferred_recv(rqstp); + } else + len = xprt->xpt_ops->xpo_recvfrom(rqstp); + svc_copy_addr(rqstp, xprt); + dprintk("svc: got len=%d\n", len); + } + + /* No data, incomplete (TCP) read, or accept() */ + if (len == 0 || len == -EAGAIN) { + rqstp->rq_res.len = 0; + svc_xprt_release(rqstp); + return -EAGAIN; + } + clear_bit(XPT_OLD, &xprt->xpt_flags); + + rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp)); + rqstp->rq_chandle.defer = svc_defer; + + if (serv->sv_stats) + serv->sv_stats->netcnt++; + return len; +} + +/* + * Drop request + */ +void svc_drop(struct svc_rqst *rqstp) +{ + dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt); + svc_xprt_release(rqstp); +} + +/* + * Return reply to client. + */ +int svc_send(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt; + int len; + struct xdr_buf *xb; + + if ((xprt = rqstp->rq_xprt) == NULL) + return -EFAULT; + + /* release the receive skb before sending the reply */ + rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp); + + /* calculate over-all length */ + xb = & rqstp->rq_res; + xb->len = xb->head[0].iov_len + + xb->page_len + + xb->tail[0].iov_len; + + /* Grab mutex to serialize outgoing data. */ + mutex_lock(&xprt->xpt_mutex); + if (test_bit(XPT_DEAD, &xprt->xpt_flags)) + len = -ENOTCONN; + else + len = xprt->xpt_ops->xpo_sendto(rqstp); + mutex_unlock(&xprt->xpt_mutex); + svc_xprt_release(rqstp); + + if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) + return 0; + return len; +} + +/* + * Timer function to close old temporary transports, using + * a mark-and-sweep algorithm. + */ +static void svc_age_temp_xprts(unsigned long closure) +{ + struct svc_serv *serv = (struct svc_serv *)closure; + struct svc_xprt *xprt; + struct list_head *le, *next; + LIST_HEAD(to_be_aged); + + dprintk("svc_age_temp_xprts\n"); + + if (!spin_trylock_bh(&serv->sv_lock)) { + /* busy, try again 1 sec later */ + dprintk("svc_age_temp_xprts: busy\n"); + mod_timer(&serv->sv_temptimer, jiffies + HZ); + return; + } + + list_for_each_safe(le, next, &serv->sv_tempsocks) { + xprt = list_entry(le, struct svc_xprt, xpt_list); + + /* First time through, just mark it OLD. Second time + * through, close it. 
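The aging timer here is a classic two-pass mark-and-sweep: activity clears the OLD mark (svc_recv does clear_bit(XPT_OLD, ...) above), so a transport that is still marked when the timer fires again has been idle for a full period and is closed. A minimal stand-alone model of the two passes (illustrative only; the busy/refcount checks are omitted):

#include <stdio.h>

struct conn { int old; int close; };

static void age_pass(struct conn *c, int n)
{
	for (int i = 0; i < n; i++) {
		if (!c[i].old) {	/* first time through: just mark it OLD */
			c[i].old = 1;
			continue;
		}
		c[i].close = 1;		/* second time through: close it */
	}
}

int main(void)
{
	struct conn conns[2] = { { 0, 0 }, { 0, 0 } };

	age_pass(conns, 2);		/* both get marked OLD */
	conns[0].old = 0;		/* conn 0 saw traffic; its mark is cleared */
	age_pass(conns, 2);		/* only conn 1 is closed */
	printf("conn0 close=%d conn1 close=%d\n", conns[0].close, conns[1].close);
	return 0;
}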
*/ + if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags)) + continue; + if (atomic_read(&xprt->xpt_ref.refcount) > 1 + || test_bit(XPT_BUSY, &xprt->xpt_flags)) + continue; + svc_xprt_get(xprt); + list_move(le, &to_be_aged); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + set_bit(XPT_DETACHED, &xprt->xpt_flags); + } + spin_unlock_bh(&serv->sv_lock); + + while (!list_empty(&to_be_aged)) { + le = to_be_aged.next; + /* fiddling the xpt_list node is safe 'cos we're XPT_DETACHED */ + list_del_init(le); + xprt = list_entry(le, struct svc_xprt, xpt_list); + + dprintk("queuing xprt %p for closing\n", xprt); + + /* a thread will dequeue and close it soon */ + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); + } + + mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); +} + +/* + * Remove a dead transport + */ +void svc_delete_xprt(struct svc_xprt *xprt) +{ + struct svc_serv *serv; + + dprintk("svc: svc_delete_xprt(%p)\n", xprt); + + serv = xprt->xpt_server; + + xprt->xpt_ops->xpo_detach(xprt); + + spin_lock_bh(&serv->sv_lock); + + if (!test_and_set_bit(XPT_DETACHED, &xprt->xpt_flags)) + list_del_init(&xprt->xpt_list); + /* + * We used to delete the transport from whichever list + * it's sk_xprt.xpt_ready node was on, but we don't actually + * need to. This is because the only time we're called + * while still attached to a queue, the queue itself + * is about to be destroyed (in svc_destroy). + */ + if (!test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) { + BUG_ON(atomic_read(&xprt->xpt_ref.refcount) < 2); + svc_xprt_put(xprt); + if (test_bit(XPT_TEMP, &xprt->xpt_flags)) + serv->sv_tmpcnt--; + } + + spin_unlock_bh(&serv->sv_lock); +} + +void svc_close_xprt(struct svc_xprt *xprt) +{ + set_bit(XPT_CLOSE, &xprt->xpt_flags); + if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) + /* someone else will have to effect the close */ + return; + + svc_xprt_get(xprt); + svc_delete_xprt(xprt); + clear_bit(XPT_BUSY, &xprt->xpt_flags); + svc_xprt_put(xprt); +} +EXPORT_SYMBOL_GPL(svc_close_xprt); + +void svc_close_all(struct list_head *xprt_list) +{ + struct svc_xprt *xprt; + struct svc_xprt *tmp; + + list_for_each_entry_safe(xprt, tmp, xprt_list, xpt_list) { + set_bit(XPT_CLOSE, &xprt->xpt_flags); + if (test_bit(XPT_BUSY, &xprt->xpt_flags)) { + /* Waiting to be processed, but no threads left, + * So just remove it from the waiting list + */ + list_del_init(&xprt->xpt_ready); + clear_bit(XPT_BUSY, &xprt->xpt_flags); + } + svc_close_xprt(xprt); + } +} + +int svc_port_is_privileged(struct sockaddr *sin) +{ + switch (sin->sa_family) { + case AF_INET: + return ntohs(((struct sockaddr_in *)sin)->sin_port) + < PROT_SOCK; + case AF_INET6: + return ntohs(((struct sockaddr_in6 *)sin)->sin6_port) + < PROT_SOCK; + default: + return 0; + } +} + +/* + * Handle defer and revisit of requests + */ + +static void svc_revisit(struct cache_deferred_req *dreq, int too_many) +{ + struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle); + struct svc_xprt *xprt = dr->xprt; + + if (too_many) { + svc_xprt_put(xprt); + kfree(dr); + return; + } + dprintk("revisit queued\n"); + dr->xprt = NULL; + spin_lock(&xprt->xpt_lock); + list_add(&dr->handle.recent, &xprt->xpt_deferred); + spin_unlock(&xprt->xpt_lock); + set_bit(XPT_DEFERRED, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); +} + +/* + * Save the request off for later processing. The request buffer looks + * like this: + * + * + * + * This code can only handle requests that consist of an xprt-header + * and rpc-header. 
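The deferral machinery below boils down to snapshotting the raw request bytes into an allocated record and pasting them back into a fresh request when the deferred work is revisited. A user-space sketch of that save/replay idea (illustrative only; field and function names are hypothetical):

#include <stdlib.h>
#include <string.h>

struct deferred_req {
	size_t len;
	unsigned char args[];		/* copied request bytes */
};

/* Snapshot the request so it can be replayed later. */
static struct deferred_req *defer_save(const void *buf, size_t len)
{
	struct deferred_req *dr = malloc(sizeof(*dr) + len);

	if (!dr)
		return NULL;
	dr->len = len;
	memcpy(dr->args, buf, len);
	return dr;
}

/* Hand the saved bytes back to a new request buffer and free the record. */
static size_t defer_replay(struct deferred_req *dr, void *buf, size_t buflen)
{
	size_t n = dr->len < buflen ? dr->len : buflen;

	memcpy(buf, dr->args, n);
	free(dr);
	return n;
}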
+ */ +static struct cache_deferred_req *svc_defer(struct cache_req *req) +{ + struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); + struct svc_deferred_req *dr; + + if (rqstp->rq_arg.page_len) + return NULL; /* if more than a page, give up FIXME */ + if (rqstp->rq_deferred) { + dr = rqstp->rq_deferred; + rqstp->rq_deferred = NULL; + } else { + int skip; + int size; + /* FIXME maybe discard if size too large */ + size = sizeof(struct svc_deferred_req) + rqstp->rq_arg.len; + dr = kmalloc(size, GFP_KERNEL); + if (dr == NULL) + return NULL; + + dr->handle.owner = rqstp->rq_server; + dr->prot = rqstp->rq_prot; + memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen); + dr->addrlen = rqstp->rq_addrlen; + dr->daddr = rqstp->rq_daddr; + dr->argslen = rqstp->rq_arg.len>>2; + dr->xprt_hlen = rqstp->rq_xprt_hlen; + + /* back up head to the start of the buffer and copy */ + skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; + + memcpy(dr->args, rqstp->rq_arg.head[0].iov_base - skip, + dr->argslen << 2); + } + svc_xprt_get(rqstp->rq_xprt); + dr->xprt = rqstp->rq_xprt; + + dr->handle.revisit = svc_revisit; + return &dr->handle; +} + +/* + * recv data from a deferred request into an active one + */ +static int svc_deferred_recv(struct svc_rqst *rqstp) +{ + struct svc_deferred_req *dr = rqstp->rq_deferred; + + /* setup iov_base to point past the transport header */ + rqstp->rq_arg.head[0].iov_base = dr->args + (dr->xprt_hlen>>2); + /* The iov_len doesn't include the transport header bytes */ + rqstp->rq_arg.head[0].iov_len = (dr->argslen<<2) - dr->xprt_hlen; + rqstp->rq_arg.page_len = 0; + /* The rq_arg len includes the transport header bytes */ + rqstp->rq_arg.len = dr->argslen<<2; + rqstp->rq_prot = dr->prot; + memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen); + rqstp->rq_addrlen = dr->addrlen; + /* Save off transport header len in case we get deferred again */ + rqstp->rq_xprt_hlen = dr->xprt_hlen; + rqstp->rq_daddr = dr->daddr; + rqstp->rq_respages = rqstp->rq_pages; + return (dr->argslen<<2) - dr->xprt_hlen; +} + + +static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) +{ + struct svc_deferred_req *dr = NULL; + + if (!test_bit(XPT_DEFERRED, &xprt->xpt_flags)) + return NULL; + spin_lock(&xprt->xpt_lock); + clear_bit(XPT_DEFERRED, &xprt->xpt_flags); + if (!list_empty(&xprt->xpt_deferred)) { + dr = list_entry(xprt->xpt_deferred.next, + struct svc_deferred_req, + handle.recent); + list_del_init(&dr->handle.recent); + set_bit(XPT_DEFERRED, &xprt->xpt_flags); + } + spin_unlock(&xprt->xpt_lock); + return dr; +} + +/* + * Return the transport instance pointer for the endpoint accepting + * connections/peer traffic from the specified transport class, + * address family and port. + * + * AF_UNSPEC for the address family or zero for a port + * number are wild cards. + */ +struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name, + int af, int port) +{ + struct svc_xprt *xprt; + struct svc_xprt *found = NULL; + + /* Sanity check args */ + if (!serv || !xcl_name) + return found; + + spin_lock_bh(&serv->sv_lock); + list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) { + if (strcmp(xprt->xpt_class->xcl_name, xcl_name)) + continue; + if (af != AF_UNSPEC && af != xprt->xpt_local.ss_family) + continue; + if (port && port != svc_local_port(xprt)) + continue; + found = xprt; + break; + } + spin_unlock_bh(&serv->sv_lock); + return found; +} +EXPORT_SYMBOL_GPL(svc_find_xprt); + +/* + * Format a buffer with a list of the active transports. 
A zero for + * the buflen parameter disables target buffer overflow checking. + */ +int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen) +{ + struct svc_xprt *xprt; + char xprt_str[64]; + int totlen = 0; + int len; + + /* Sanity check args */ + if (!serv) + return 0; + + spin_lock_bh(&serv->sv_lock); + list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) { + len = snprintf(xprt_str, sizeof(xprt_str), + "%s %d\n", xprt->xpt_class->xcl_name, + svc_local_port(xprt)); + /* If the string was truncated, replace with error string */ + if (len >= sizeof(xprt_str)) + strcpy(xprt_str, "name-too-long\n"); + /* Don't overflow buffer */ + len = strlen(xprt_str); + if (buflen && (len + totlen >= buflen)) + break; + strcpy(buf+totlen, xprt_str); + totlen += len; + } + spin_unlock_bh(&serv->sv_lock); + return totlen; +} +EXPORT_SYMBOL_GPL(svc_xprt_names); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 4114794..6815157 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -384,41 +384,45 @@ void svcauth_unix_purge(void) static inline struct ip_map * ip_map_cached_get(struct svc_rqst *rqstp) { - struct ip_map *ipm; - struct svc_sock *svsk = rqstp->rq_sock; - spin_lock(&svsk->sk_lock); - ipm = svsk->sk_info_authunix; - if (ipm != NULL) { - if (!cache_valid(&ipm->h)) { - /* - * The entry has been invalidated since it was - * remembered, e.g. by a second mount from the - * same IP address. - */ - svsk->sk_info_authunix = NULL; - spin_unlock(&svsk->sk_lock); - cache_put(&ipm->h, &ip_map_cache); - return NULL; + struct ip_map *ipm = NULL; + struct svc_xprt *xprt = rqstp->rq_xprt; + + if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) { + spin_lock(&xprt->xpt_lock); + ipm = xprt->xpt_auth_cache; + if (ipm != NULL) { + if (!cache_valid(&ipm->h)) { + /* + * The entry has been invalidated since it was + * remembered, e.g. by a second mount from the + * same IP address. + */ + xprt->xpt_auth_cache = NULL; + spin_unlock(&xprt->xpt_lock); + cache_put(&ipm->h, &ip_map_cache); + return NULL; + } + cache_get(&ipm->h); } - cache_get(&ipm->h); + spin_unlock(&xprt->xpt_lock); } - spin_unlock(&svsk->sk_lock); return ipm; } static inline void ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm) { - struct svc_sock *svsk = rqstp->rq_sock; + struct svc_xprt *xprt = rqstp->rq_xprt; - spin_lock(&svsk->sk_lock); - if (svsk->sk_sock->type == SOCK_STREAM && - svsk->sk_info_authunix == NULL) { - /* newly cached, keep the reference */ - svsk->sk_info_authunix = ipm; - ipm = NULL; + if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) { + spin_lock(&xprt->xpt_lock); + if (xprt->xpt_auth_cache == NULL) { + /* newly cached, keep the reference */ + xprt->xpt_auth_cache = ipm; + ipm = NULL; + } + spin_unlock(&xprt->xpt_lock); } - spin_unlock(&svsk->sk_lock); if (ipm) cache_put(&ipm->h, &ip_map_cache); } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index c75bffe..34fdf5c 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -5,7 +5,7 @@ * * The server scheduling algorithm does not always distribute the load * evenly when servicing a single client. May need to modify the - * svc_sock_enqueue procedure... + * svc_xprt_enqueue procedure... * * TCP support is largely untested and may be a little slow. The problem * is that we currently do two separate recvfrom's, one for the 4-byte @@ -51,51 +51,44 @@ /* SMP locking strategy: * * svc_pool->sp_lock protects most of the fields of that pool. - * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. 
+ * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. * when both need to be taken (rare), svc_serv->sv_lock is first. * BKL protects svc_serv->sv_nrthread. * svc_sock->sk_lock protects the svc_sock->sk_deferred list * and the ->sk_info_authunix cache. - * svc_sock->sk_flags.SK_BUSY prevents a svc_sock being enqueued multiply. + * svc_sock->sk_xprt.xpt_flags.XPT_BUSY prevents a svc_sock being + * enqueued multiply. * * Some flags can be set to certain values at any time * providing that certain rules are followed: * - * SK_CONN, SK_DATA, can be set or cleared at any time. - * after a set, svc_sock_enqueue must be called. + * XPT_CONN, XPT_DATA, can be set or cleared at any time. + * after a set, svc_xprt_enqueue must be called. * after a clear, the socket must be read/accepted * if this succeeds, it must be set again. - * SK_CLOSE can set at any time. It is never cleared. - * sk_inuse contains a bias of '1' until SK_DEAD is set. - * so when sk_inuse hits zero, we know the socket is dead + * XPT_CLOSE can set at any time. It is never cleared. + * xpt_ref contains a bias of '1' until XPT_DEAD is set. + * so when xprt_ref hits zero, we know the transport is dead * and no-one is using it. - * SK_DEAD can only be set while SK_BUSY is held which ensures + * XPT_DEAD can only be set while XPT_BUSY is held which ensures * no other thread will be using the socket or will try to - * set SK_DEAD. + * set XPT_DEAD. * */ -#define RPCDBG_FACILITY RPCDBG_SVCSOCK +#define RPCDBG_FACILITY RPCDBG_SVCXPRT static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, int *errp, int flags); -static void svc_delete_socket(struct svc_sock *svsk); static void svc_udp_data_ready(struct sock *, int); static int svc_udp_recvfrom(struct svc_rqst *); static int svc_udp_sendto(struct svc_rqst *); -static void svc_close_socket(struct svc_sock *svsk); +static void svc_sock_detach(struct svc_xprt *); +static void svc_sock_free(struct svc_xprt *); -static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk); -static int svc_deferred_recv(struct svc_rqst *rqstp); -static struct cache_deferred_req *svc_defer(struct cache_req *req); - -/* apparently the "standard" is that clients close - * idle connections after 5 minutes, servers after - * 6 minutes - * http://www.connectathon.org/talks96/nfstcp.pdf - */ -static int svc_conn_age_period = 6*60; +static struct svc_xprt * +svc_create_socket(struct svc_serv *, int, struct sockaddr *, int, int); #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key svc_key[2]; @@ -104,16 +97,15 @@ static struct lock_class_key svc_slock_key[2]; static inline void svc_reclassify_socket(struct socket *sock) { struct sock *sk = sock->sk; - BUG_ON(sock_owned_by_user(sk)); switch (sk->sk_family) { case AF_INET: sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD", - &svc_slock_key[0], "sk_lock-AF_INET-NFSD", &svc_key[0]); + &svc_slock_key[0], "sk_xprt.xpt_lock-AF_INET-NFSD", &svc_key[0]); break; case AF_INET6: sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD", - &svc_slock_key[1], "sk_lock-AF_INET6-NFSD", &svc_key[1]); + &svc_slock_key[1], "sk_xprt.xpt_lock-AF_INET6-NFSD", &svc_key[1]); break; default: @@ -162,40 +154,21 @@ char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len) EXPORT_SYMBOL_GPL(svc_print_addr); /* - * Queue up an idle server thread. Must have pool->sp_lock held. 
- * Note: this is really a stack rather than a queue, so that we only - * use as many different threads as we need, and the rest don't pollute - * the cache. - */ -static inline void -svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp) -{ - list_add(&rqstp->rq_list, &pool->sp_threads); -} - -/* - * Dequeue an nfsd thread. Must have pool->sp_lock held. - */ -static inline void -svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp) -{ - list_del(&rqstp->rq_list); -} - -/* * Release an skbuff after use */ -static inline void +static void svc_release_skb(struct svc_rqst *rqstp) { - struct sk_buff *skb = rqstp->rq_skbuff; + struct sk_buff *skb = rqstp->rq_xprt_ctxt; struct svc_deferred_req *dr = rqstp->rq_deferred; if (skb) { - rqstp->rq_skbuff = NULL; + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + rqstp->rq_xprt_ctxt = NULL; dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); - skb_free_datagram(rqstp->rq_sock->sk_sk, skb); + skb_free_datagram(svsk->sk_sk, skb); } if (dr) { rqstp->rq_deferred = NULL; @@ -219,237 +192,6 @@ svc_sock_wspace(struct svc_sock *svsk) return wspace; } -/* - * Queue up a socket with data pending. If there are idle nfsd - * processes, wake 'em up. - * - */ -static void -svc_sock_enqueue(struct svc_sock *svsk) -{ - struct svc_serv *serv = svsk->sk_server; - struct svc_pool *pool; - struct svc_rqst *rqstp; - int cpu; - - if (!(svsk->sk_flags & - ( (1<sk_flags)) - return; - - cpu = get_cpu(); - pool = svc_pool_for_cpu(svsk->sk_server, cpu); - put_cpu(); - - spin_lock_bh(&pool->sp_lock); - - if (!list_empty(&pool->sp_threads) && - !list_empty(&pool->sp_sockets)) - printk(KERN_ERR - "svc_sock_enqueue: threads and sockets both waiting??\n"); - - if (test_bit(SK_DEAD, &svsk->sk_flags)) { - /* Don't enqueue dead sockets */ - dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk); - goto out_unlock; - } - - /* Mark socket as busy. It will remain in this state until the - * server has processed all pending data and put the socket back - * on the idle list. We update SK_BUSY atomically because - * it also guards against trying to enqueue the svc_sock twice. 
- */ - if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) { - /* Don't enqueue socket while already enqueued */ - dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk); - goto out_unlock; - } - BUG_ON(svsk->sk_pool != NULL); - svsk->sk_pool = pool; - - set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); - if (((atomic_read(&svsk->sk_reserved) + serv->sv_max_mesg)*2 - > svc_sock_wspace(svsk)) - && !test_bit(SK_CLOSE, &svsk->sk_flags) - && !test_bit(SK_CONN, &svsk->sk_flags)) { - /* Don't enqueue while not enough space for reply */ - dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n", - svsk->sk_sk, atomic_read(&svsk->sk_reserved)+serv->sv_max_mesg, - svc_sock_wspace(svsk)); - svsk->sk_pool = NULL; - clear_bit(SK_BUSY, &svsk->sk_flags); - goto out_unlock; - } - clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); - - - if (!list_empty(&pool->sp_threads)) { - rqstp = list_entry(pool->sp_threads.next, - struct svc_rqst, - rq_list); - dprintk("svc: socket %p served by daemon %p\n", - svsk->sk_sk, rqstp); - svc_thread_dequeue(pool, rqstp); - if (rqstp->rq_sock) - printk(KERN_ERR - "svc_sock_enqueue: server %p, rq_sock=%p!\n", - rqstp, rqstp->rq_sock); - rqstp->rq_sock = svsk; - atomic_inc(&svsk->sk_inuse); - rqstp->rq_reserved = serv->sv_max_mesg; - atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); - BUG_ON(svsk->sk_pool != pool); - wake_up(&rqstp->rq_wait); - } else { - dprintk("svc: socket %p put into queue\n", svsk->sk_sk); - list_add_tail(&svsk->sk_ready, &pool->sp_sockets); - BUG_ON(svsk->sk_pool != pool); - } - -out_unlock: - spin_unlock_bh(&pool->sp_lock); -} - -/* - * Dequeue the first socket. Must be called with the pool->sp_lock held. - */ -static inline struct svc_sock * -svc_sock_dequeue(struct svc_pool *pool) -{ - struct svc_sock *svsk; - - if (list_empty(&pool->sp_sockets)) - return NULL; - - svsk = list_entry(pool->sp_sockets.next, - struct svc_sock, sk_ready); - list_del_init(&svsk->sk_ready); - - dprintk("svc: socket %p dequeued, inuse=%d\n", - svsk->sk_sk, atomic_read(&svsk->sk_inuse)); - - return svsk; -} - -/* - * Having read something from a socket, check whether it - * needs to be re-enqueued. - * Note: SK_DATA only gets cleared when a read-attempt finds - * no (or insufficient) data. - */ -static inline void -svc_sock_received(struct svc_sock *svsk) -{ - svsk->sk_pool = NULL; - clear_bit(SK_BUSY, &svsk->sk_flags); - svc_sock_enqueue(svsk); -} - - -/** - * svc_reserve - change the space reserved for the reply to a request. - * @rqstp: The request in question - * @space: new max space to reserve - * - * Each request reserves some space on the output queue of the socket - * to make sure the reply fits. This function reduces that reserved - * space to be the amount of space used already, plus @space. - * - */ -void svc_reserve(struct svc_rqst *rqstp, int space) -{ - space += rqstp->rq_res.head[0].iov_len; - - if (space < rqstp->rq_reserved) { - struct svc_sock *svsk = rqstp->rq_sock; - atomic_sub((rqstp->rq_reserved - space), &svsk->sk_reserved); - rqstp->rq_reserved = space; - - svc_sock_enqueue(svsk); - } -} - -/* - * Release a socket after use. - */ -static inline void -svc_sock_put(struct svc_sock *svsk) -{ - if (atomic_dec_and_test(&svsk->sk_inuse)) { - BUG_ON(! 
test_bit(SK_DEAD, &svsk->sk_flags)); - - dprintk("svc: releasing dead socket\n"); - if (svsk->sk_sock->file) - sockfd_put(svsk->sk_sock); - else - sock_release(svsk->sk_sock); - if (svsk->sk_info_authunix != NULL) - svcauth_unix_info_release(svsk->sk_info_authunix); - kfree(svsk); - } -} - -static void -svc_sock_release(struct svc_rqst *rqstp) -{ - struct svc_sock *svsk = rqstp->rq_sock; - - svc_release_skb(rqstp); - - svc_free_res_pages(rqstp); - rqstp->rq_res.page_len = 0; - rqstp->rq_res.page_base = 0; - - - /* Reset response buffer and release - * the reservation. - * But first, check that enough space was reserved - * for the reply, otherwise we have a bug! - */ - if ((rqstp->rq_res.len) > rqstp->rq_reserved) - printk(KERN_ERR "RPC request reserved %d but used %d\n", - rqstp->rq_reserved, - rqstp->rq_res.len); - - rqstp->rq_res.head[0].iov_len = 0; - svc_reserve(rqstp, 0); - rqstp->rq_sock = NULL; - - svc_sock_put(svsk); -} - -/* - * External function to wake up a server waiting for data - * This really only makes sense for services like lockd - * which have exactly one thread anyway. - */ -void -svc_wake_up(struct svc_serv *serv) -{ - struct svc_rqst *rqstp; - unsigned int i; - struct svc_pool *pool; - - for (i = 0; i < serv->sv_nrpools; i++) { - pool = &serv->sv_pools[i]; - - spin_lock_bh(&pool->sp_lock); - if (!list_empty(&pool->sp_threads)) { - rqstp = list_entry(pool->sp_threads.next, - struct svc_rqst, - rq_list); - dprintk("svc: daemon %p woken up.\n", rqstp); - /* - svc_thread_dequeue(pool, rqstp); - rqstp->rq_sock = NULL; - */ - wake_up(&rqstp->rq_wait); - } - spin_unlock_bh(&pool->sp_lock); - } -} - union svc_pktinfo_u { struct in_pktinfo pkti; struct in6_pktinfo pkti6; @@ -459,7 +201,9 @@ union svc_pktinfo_u { static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) { - switch (rqstp->rq_sock->sk_sk->sk_family) { + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + switch (svsk->sk_sk->sk_family) { case AF_INET: { struct in_pktinfo *pki = CMSG_DATA(cmh); @@ -492,7 +236,8 @@ static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) { - struct svc_sock *svsk = rqstp->rq_sock; + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); struct socket *sock = svsk->sk_sock; int slen; union { @@ -565,7 +310,7 @@ svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) } out: dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n", - rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, + svsk, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf))); return len; @@ -602,7 +347,7 @@ svc_sock_names(char *buf, struct svc_serv *serv, char *toclose) if (!serv) return 0; spin_lock_bh(&serv->sv_lock); - list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) { + list_for_each_entry(svsk, &serv->sv_permsocks, sk_xprt.xpt_list) { int onelen = one_sock_name(buf+len, svsk); if (toclose && strcmp(toclose, buf+len) == 0) closesk = svsk; @@ -614,7 +359,7 @@ svc_sock_names(char *buf, struct svc_serv *serv, char *toclose) /* Should unregister with portmap, but you cannot * unregister just one protocol... 
*/ - svc_close_socket(closesk); + svc_close_xprt(&closesk->sk_xprt); else if (toclose) return -ENOENT; return len; @@ -641,37 +386,21 @@ svc_recv_available(struct svc_sock *svsk) static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, int buflen) { - struct svc_sock *svsk = rqstp->rq_sock; + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; - struct sockaddr *sin; int len; + /* UDP doesn't have a xprt header and TCP doesn't need to save it */ + rqstp->rq_xprt_hlen = 0; + len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen, msg.msg_flags); - /* sock_recvmsg doesn't fill in the name/namelen, so we must.. - */ - memcpy(&rqstp->rq_addr, &svsk->sk_remote, svsk->sk_remotelen); - rqstp->rq_addrlen = svsk->sk_remotelen; - - /* Destination address in request is needed for binding the - * source address in RPC callbacks later. - */ - sin = (struct sockaddr *)&svsk->sk_local; - switch (sin->sa_family) { - case AF_INET: - rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr; - break; - case AF_INET6: - rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr; - break; - } - dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", svsk, iov[0].iov_base, iov[0].iov_len, len); - return len; } @@ -711,9 +440,10 @@ svc_udp_data_ready(struct sock *sk, int count) if (svsk) { dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n", - svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags)); - set_bit(SK_DATA, &svsk->sk_flags); - svc_sock_enqueue(svsk); + svsk, sk, count, + test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); } if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible(sk->sk_sleep); @@ -729,8 +459,8 @@ svc_write_space(struct sock *sk) if (svsk) { dprintk("svc: socket %p(inet %p), write_space busy=%d\n", - svsk, sk, test_bit(SK_BUSY, &svsk->sk_flags)); - svc_sock_enqueue(svsk); + svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); + svc_xprt_enqueue(&svsk->sk_xprt); } if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) { @@ -743,7 +473,9 @@ svc_write_space(struct sock *sk) static inline void svc_udp_get_dest_address(struct svc_rqst *rqstp, struct cmsghdr *cmh) { - switch (rqstp->rq_sock->sk_sk->sk_family) { + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + switch (svsk->sk_sk->sk_family) { case AF_INET: { struct in_pktinfo *pki = CMSG_DATA(cmh); rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr; @@ -763,8 +495,9 @@ static inline void svc_udp_get_dest_address(struct svc_rqst *rqstp, static int svc_udp_recvfrom(struct svc_rqst *rqstp) { - struct svc_sock *svsk = rqstp->rq_sock; - struct svc_serv *serv = svsk->sk_server; + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + struct svc_serv *serv = svsk->sk_xprt.xpt_server; struct sk_buff *skb; union { struct cmsghdr hdr; @@ -779,7 +512,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) .msg_flags = MSG_DONTWAIT, }; - if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) + if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) /* udp sockets need large rcvbuf as all pending * requests are still in that buffer. 
sndbuf must * also be large enough that there is enough space @@ -792,17 +525,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) (serv->sv_nrthreads+3) * serv->sv_max_mesg, (serv->sv_nrthreads+3) * serv->sv_max_mesg); - if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { - svc_sock_received(svsk); - return svc_deferred_recv(rqstp); - } - - if (test_bit(SK_CLOSE, &svsk->sk_flags)) { - svc_delete_socket(svsk); - return 0; - } - - clear_bit(SK_DATA, &svsk->sk_flags); + clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); skb = NULL; err = kernel_recvmsg(svsk->sk_sock, &msg, NULL, 0, 0, MSG_PEEK | MSG_DONTWAIT); @@ -813,9 +536,9 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) if (err != -EAGAIN) { /* possibly an icmp error */ dprintk("svc: recvfrom returned error %d\n", -err); - set_bit(SK_DATA, &svsk->sk_flags); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); } - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); return -EAGAIN; } rqstp->rq_addrlen = sizeof(rqstp->rq_addr); @@ -825,12 +548,12 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) need that much accuracy */ } svsk->sk_sk->sk_stamp = skb->tstamp; - set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */ + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */ /* * Maybe more packets - kick another thread ASAP. */ - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); len = skb->len - sizeof(struct udphdr); rqstp->rq_arg.len = len; @@ -867,7 +590,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) skb_free_datagram(svsk->sk_sk, skb); return 0; } - rqstp->rq_skbuff = skb; + rqstp->rq_xprt_ctxt = skb; } rqstp->rq_arg.page_base = 0; @@ -901,26 +624,82 @@ svc_udp_sendto(struct svc_rqst *rqstp) } static void -svc_udp_init(struct svc_sock *svsk) +svc_udp_prep_reply_hdr(struct svc_rqst *rqstp) +{ +} + +static int +svc_udp_has_wspace(struct svc_xprt *xprt) +{ + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct svc_serv *serv = svsk->sk_xprt.xpt_server; + int required; + + /* + * Set the SOCK_NOSPACE flag before checking the available + * sock space. 
+ */ + set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg; + if (required*2 > sock_wspace(svsk->sk_sk)) + return 0; + clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + return 1; +} + +static struct svc_xprt * +svc_udp_accept(struct svc_xprt *xprt) +{ + BUG(); + return NULL; +} + +static struct svc_xprt * +svc_udp_create(struct svc_serv *serv, struct sockaddr *sa, int alen, int flags) +{ + return svc_create_socket(serv, IPPROTO_UDP, sa, alen, flags); +} + +static struct svc_xprt_ops svc_udp_ops = { + .xpo_create = svc_udp_create, + .xpo_recvfrom = svc_udp_recvfrom, + .xpo_sendto = svc_udp_sendto, + .xpo_release_rqst = svc_release_skb, + .xpo_detach = svc_sock_detach, + .xpo_free = svc_sock_free, + .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr, + .xpo_has_wspace = svc_udp_has_wspace, + .xpo_accept = svc_udp_accept, +}; + +static struct svc_xprt_class svc_udp_class = { + .xcl_name = "udp", + .xcl_owner = THIS_MODULE, + .xcl_ops = &svc_udp_ops, + .xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP, +}; + +static void +svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) { int one = 1; mm_segment_t oldfs; + svc_xprt_init(&svc_udp_class, &svsk->sk_xprt, serv); + clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); svsk->sk_sk->sk_data_ready = svc_udp_data_ready; svsk->sk_sk->sk_write_space = svc_write_space; - svsk->sk_recvfrom = svc_udp_recvfrom; - svsk->sk_sendto = svc_udp_sendto; /* initialise setting must have enough space to * receive and respond to one request. * svc_udp_recvfrom will re-adjust if necessary */ svc_sock_setbufsize(svsk->sk_sock, - 3 * svsk->sk_server->sv_max_mesg, - 3 * svsk->sk_server->sv_max_mesg); + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg, + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg); - set_bit(SK_DATA, &svsk->sk_flags); /* might have come in before data_ready set up */ - set_bit(SK_CHNGBUF, &svsk->sk_flags); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* might have come in before data_ready set up */ + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); oldfs = get_fs(); set_fs(KERNEL_DS); @@ -954,8 +733,8 @@ svc_tcp_listen_data_ready(struct sock *sk, int count_unused) */ if (sk->sk_state == TCP_LISTEN) { if (svsk) { - set_bit(SK_CONN, &svsk->sk_flags); - svc_sock_enqueue(svsk); + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); } else printk("svc: socket %p: no user data\n", sk); } @@ -978,8 +757,8 @@ svc_tcp_state_change(struct sock *sk) if (!svsk) printk("svc: socket %p: no user data\n", sk); else { - set_bit(SK_CLOSE, &svsk->sk_flags); - svc_sock_enqueue(svsk); + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); } if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible_all(sk->sk_sleep); @@ -993,36 +772,23 @@ svc_tcp_data_ready(struct sock *sk, int count) dprintk("svc: socket %p TCP data ready (svsk %p)\n", sk, sk->sk_user_data); if (svsk) { - set_bit(SK_DATA, &svsk->sk_flags); - svc_sock_enqueue(svsk); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); } if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible(sk->sk_sleep); } -static inline int svc_port_is_privileged(struct sockaddr *sin) -{ - switch (sin->sa_family) { - case AF_INET: - return ntohs(((struct sockaddr_in *)sin)->sin_port) - < PROT_SOCK; - case AF_INET6: - return ntohs(((struct sockaddr_in6 *)sin)->sin6_port) - < PROT_SOCK; - default: - return 0; - } -} - /* * Accept a TCP connection */ -static void 
-svc_tcp_accept(struct svc_sock *svsk) +static struct svc_xprt * +svc_tcp_accept(struct svc_xprt *xprt) { + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); struct sockaddr_storage addr; struct sockaddr *sin = (struct sockaddr *) &addr; - struct svc_serv *serv = svsk->sk_server; + struct svc_serv *serv = svsk->sk_xprt.xpt_server; struct socket *sock = svsk->sk_sock; struct socket *newsock; struct svc_sock *newsvsk; @@ -1031,9 +797,9 @@ svc_tcp_accept(struct svc_sock *svsk) dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); if (!sock) - return; + return NULL; - clear_bit(SK_CONN, &svsk->sk_flags); + clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); err = kernel_accept(sock, &newsock, O_NONBLOCK); if (err < 0) { if (err == -ENOMEM) @@ -1042,11 +808,11 @@ svc_tcp_accept(struct svc_sock *svsk) else if (err != -EAGAIN && net_ratelimit()) printk(KERN_WARNING "%s: accept failed (err %d)!\n", serv->sv_name, -err); - return; + return NULL; } - set_bit(SK_CONN, &svsk->sk_flags); - svc_sock_enqueue(svsk); + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); err = kernel_getpeername(newsock, sin, &slen); if (err < 0) { @@ -1077,70 +843,23 @@ svc_tcp_accept(struct svc_sock *svsk) if (!(newsvsk = svc_setup_socket(serv, newsock, &err, (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY)))) goto failed; - memcpy(&newsvsk->sk_remote, sin, slen); - newsvsk->sk_remotelen = slen; + memcpy(&newsvsk->sk_xprt.xpt_remote, sin, slen); + newsvsk->sk_xprt.xpt_remotelen = slen; err = kernel_getsockname(newsock, sin, &slen); if (unlikely(err < 0)) { dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err); slen = offsetof(struct sockaddr, sa_data); } - memcpy(&newsvsk->sk_local, sin, slen); - - svc_sock_received(newsvsk); - - /* make sure that we don't have too many active connections. - * If we have, something must be dropped. - * - * There's no point in trying to do random drop here for - * DoS prevention. The NFS clients does 1 reconnect in 15 - * seconds. An attacker can easily beat that. - * - * The only somewhat efficient mechanism would be if drop - * old connections from the same IP first. But right now - * we don't even record the client IP in svc_sock. - */ - if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) { - struct svc_sock *svsk = NULL; - spin_lock_bh(&serv->sv_lock); - if (!list_empty(&serv->sv_tempsocks)) { - if (net_ratelimit()) { - /* Try to help the admin */ - printk(KERN_NOTICE "%s: too many open TCP " - "sockets, consider increasing the " - "number of nfsd threads\n", - serv->sv_name); - printk(KERN_NOTICE - "%s: last TCP connect from %s\n", - serv->sv_name, __svc_print_addr(sin, - buf, sizeof(buf))); - } - /* - * Always select the oldest socket. 
It's not fair, - * but so is life - */ - svsk = list_entry(serv->sv_tempsocks.prev, - struct svc_sock, - sk_list); - set_bit(SK_CLOSE, &svsk->sk_flags); - atomic_inc(&svsk->sk_inuse); - } - spin_unlock_bh(&serv->sv_lock); - - if (svsk) { - svc_sock_enqueue(svsk); - svc_sock_put(svsk); - } - - } + memcpy(&newsvsk->sk_xprt.xpt_local, sin, slen); if (serv->sv_stats) serv->sv_stats->nettcpconn++; - return; + return &newsvsk->sk_xprt; failed: sock_release(newsock); - return; + return NULL; } /* @@ -1149,34 +868,19 @@ failed: static int svc_tcp_recvfrom(struct svc_rqst *rqstp) { - struct svc_sock *svsk = rqstp->rq_sock; - struct svc_serv *serv = svsk->sk_server; + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + struct svc_serv *serv = svsk->sk_xprt.xpt_server; int len; struct kvec *vec; int pnum, vlen; dprintk("svc: tcp_recv %p data %d conn %d close %d\n", - svsk, test_bit(SK_DATA, &svsk->sk_flags), - test_bit(SK_CONN, &svsk->sk_flags), - test_bit(SK_CLOSE, &svsk->sk_flags)); - - if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { - svc_sock_received(svsk); - return svc_deferred_recv(rqstp); - } + svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags), + test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags), + test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); - if (test_bit(SK_CLOSE, &svsk->sk_flags)) { - svc_delete_socket(svsk); - return 0; - } - - if (svsk->sk_sk->sk_state == TCP_LISTEN) { - svc_tcp_accept(svsk); - svc_sock_received(svsk); - return 0; - } - - if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) + if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) /* sndbuf needs to have room for one request * per thread, otherwise we can stall even when the * network isn't a bottleneck. @@ -1193,7 +897,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) (serv->sv_nrthreads+3) * serv->sv_max_mesg, 3 * serv->sv_max_mesg); - clear_bit(SK_DATA, &svsk->sk_flags); + clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* Receive data. If we haven't got the record length yet, get * the next four bytes. 
Otherwise try to gobble up as much as @@ -1212,7 +916,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) if (len < want) { dprintk("svc: short recvfrom while reading record length (%d of %lu)\n", len, want); - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); return -EAGAIN; /* record header not complete */ } @@ -1248,11 +952,11 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) if (len < svsk->sk_reclen) { dprintk("svc: incomplete TCP record (%d of %d)\n", len, svsk->sk_reclen); - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); return -EAGAIN; /* record not complete */ } len = svsk->sk_reclen; - set_bit(SK_DATA, &svsk->sk_flags); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); vec = rqstp->rq_vec; vec[0] = rqstp->rq_arg.head[0]; @@ -1281,30 +985,30 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; } - rqstp->rq_skbuff = NULL; + rqstp->rq_xprt_ctxt = NULL; rqstp->rq_prot = IPPROTO_TCP; /* Reset TCP read info */ svsk->sk_reclen = 0; svsk->sk_tcplen = 0; - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); if (serv->sv_stats) serv->sv_stats->nettcpcnt++; return len; err_delete: - svc_delete_socket(svsk); + svc_delete_xprt(&svsk->sk_xprt); return -EAGAIN; error: if (len == -EAGAIN) { dprintk("RPC: TCP recvfrom got EAGAIN\n"); - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); } else { printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", - svsk->sk_server->sv_name, -len); + svsk->sk_xprt.xpt_server->sv_name, -len); goto err_delete; } @@ -1328,35 +1032,103 @@ svc_tcp_sendto(struct svc_rqst *rqstp) reclen = htonl(0x80000000|((xbufp->len ) - 4)); memcpy(xbufp->head[0].iov_base, &reclen, 4); - if (test_bit(SK_DEAD, &rqstp->rq_sock->sk_flags)) + if (test_bit(XPT_DEAD, &rqstp->rq_xprt->xpt_flags)) return -ENOTCONN; sent = svc_sendto(rqstp, &rqstp->rq_res); if (sent != xbufp->len) { printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", - rqstp->rq_sock->sk_server->sv_name, + rqstp->rq_xprt->xpt_server->sv_name, (sent<0)?"got error":"sent only", sent, xbufp->len); - set_bit(SK_CLOSE, &rqstp->rq_sock->sk_flags); - svc_sock_enqueue(rqstp->rq_sock); + set_bit(XPT_CLOSE, &rqstp->rq_xprt->xpt_flags); + svc_xprt_enqueue(rqstp->rq_xprt); sent = -EAGAIN; } return sent; } +/* + * Setup response header. TCP has a 4B record length field. + */ static void -svc_tcp_init(struct svc_sock *svsk) +svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp) +{ + struct kvec *resv = &rqstp->rq_res.head[0]; + + /* tcp needs a space for the record length... */ + svc_putnl(resv, 0); +} + +static int +svc_tcp_has_wspace(struct svc_xprt *xprt) +{ + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct svc_serv *serv = svsk->sk_xprt.xpt_server; + int required; + + /* + * Set the SOCK_NOSPACE flag before checking the available + * sock space. 
+ */ + set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg; + if (required*2 > sk_stream_wspace(svsk->sk_sk)) + return 0; + clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + return 1; +} + +static struct svc_xprt * +svc_tcp_create(struct svc_serv *serv, struct sockaddr *sa, int alen, int flags) +{ + return svc_create_socket(serv, IPPROTO_TCP, sa, alen, flags); +} + +static struct svc_xprt_ops svc_tcp_ops = { + .xpo_create = svc_tcp_create, + .xpo_recvfrom = svc_tcp_recvfrom, + .xpo_sendto = svc_tcp_sendto, + .xpo_release_rqst = svc_release_skb, + .xpo_detach = svc_sock_detach, + .xpo_free = svc_sock_free, + .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr, + .xpo_has_wspace = svc_tcp_has_wspace, + .xpo_accept = svc_tcp_accept, +}; + +static struct svc_xprt_class svc_tcp_class = { + .xcl_name = "tcp", + .xcl_owner = THIS_MODULE, + .xcl_ops = &svc_tcp_ops, + .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, +}; + +void svc_init_xprt_sock(void) +{ + svc_reg_xprt_class(&svc_tcp_class); + svc_reg_xprt_class(&svc_udp_class); +} + +void svc_cleanup_xprt_sock(void) +{ + svc_unreg_xprt_class(&svc_tcp_class); + svc_unreg_xprt_class(&svc_udp_class); +} + +static void +svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) { struct sock *sk = svsk->sk_sk; struct tcp_sock *tp = tcp_sk(sk); - svsk->sk_recvfrom = svc_tcp_recvfrom; - svsk->sk_sendto = svc_tcp_sendto; - + svc_xprt_init(&svc_tcp_class, &svsk->sk_xprt, serv); + set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); if (sk->sk_state == TCP_LISTEN) { dprintk("setting up TCP socket for listening\n"); + set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); sk->sk_data_ready = svc_tcp_listen_data_ready; - set_bit(SK_CONN, &svsk->sk_flags); + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); } else { dprintk("setting up TCP socket for reading\n"); sk->sk_state_change = svc_tcp_state_change; @@ -1373,13 +1145,13 @@ svc_tcp_init(struct svc_sock *svsk) * svc_tcp_recvfrom will re-adjust if necessary */ svc_sock_setbufsize(svsk->sk_sock, - 3 * svsk->sk_server->sv_max_mesg, - 3 * svsk->sk_server->sv_max_mesg); + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg, + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg); - set_bit(SK_CHNGBUF, &svsk->sk_flags); - set_bit(SK_DATA, &svsk->sk_flags); + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); if (sk->sk_state != TCP_ESTABLISHED) - set_bit(SK_CLOSE, &svsk->sk_flags); + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); } } @@ -1395,232 +1167,18 @@ svc_sock_update_bufs(struct svc_serv *serv) spin_lock_bh(&serv->sv_lock); list_for_each(le, &serv->sv_permsocks) { struct svc_sock *svsk = - list_entry(le, struct svc_sock, sk_list); - set_bit(SK_CHNGBUF, &svsk->sk_flags); + list_entry(le, struct svc_sock, sk_xprt.xpt_list); + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); } list_for_each(le, &serv->sv_tempsocks) { struct svc_sock *svsk = - list_entry(le, struct svc_sock, sk_list); - set_bit(SK_CHNGBUF, &svsk->sk_flags); + list_entry(le, struct svc_sock, sk_xprt.xpt_list); + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); } spin_unlock_bh(&serv->sv_lock); } /* - * Receive the next request on any socket. This code is carefully - * organised not to touch any cachelines in the shared svc_serv - * structure, only cachelines in the local svc_pool. 
- */ -int -svc_recv(struct svc_rqst *rqstp, long timeout) -{ - struct svc_sock *svsk = NULL; - struct svc_serv *serv = rqstp->rq_server; - struct svc_pool *pool = rqstp->rq_pool; - int len, i; - int pages; - struct xdr_buf *arg; - DECLARE_WAITQUEUE(wait, current); - - dprintk("svc: server %p waiting for data (to = %ld)\n", - rqstp, timeout); - - if (rqstp->rq_sock) - printk(KERN_ERR - "svc_recv: service %p, socket not NULL!\n", - rqstp); - if (waitqueue_active(&rqstp->rq_wait)) - printk(KERN_ERR - "svc_recv: service %p, wait queue active!\n", - rqstp); - - - /* now allocate needed pages. If we get a failure, sleep briefly */ - pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE; - for (i=0; i < pages ; i++) - while (rqstp->rq_pages[i] == NULL) { - struct page *p = alloc_page(GFP_KERNEL); - if (!p) - schedule_timeout_uninterruptible(msecs_to_jiffies(500)); - rqstp->rq_pages[i] = p; - } - rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */ - BUG_ON(pages >= RPCSVC_MAXPAGES); - - /* Make arg->head point to first page and arg->pages point to rest */ - arg = &rqstp->rq_arg; - arg->head[0].iov_base = page_address(rqstp->rq_pages[0]); - arg->head[0].iov_len = PAGE_SIZE; - arg->pages = rqstp->rq_pages + 1; - arg->page_base = 0; - /* save at least one page for response */ - arg->page_len = (pages-2)*PAGE_SIZE; - arg->len = (pages-1)*PAGE_SIZE; - arg->tail[0].iov_len = 0; - - try_to_freeze(); - cond_resched(); - if (signalled()) - return -EINTR; - - spin_lock_bh(&pool->sp_lock); - if ((svsk = svc_sock_dequeue(pool)) != NULL) { - rqstp->rq_sock = svsk; - atomic_inc(&svsk->sk_inuse); - rqstp->rq_reserved = serv->sv_max_mesg; - atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); - } else { - /* No data pending. Go to sleep */ - svc_thread_enqueue(pool, rqstp); - - /* - * We have to be able to interrupt this wait - * to bring down the daemons ... - */ - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&rqstp->rq_wait, &wait); - spin_unlock_bh(&pool->sp_lock); - - schedule_timeout(timeout); - - try_to_freeze(); - - spin_lock_bh(&pool->sp_lock); - remove_wait_queue(&rqstp->rq_wait, &wait); - - if (!(svsk = rqstp->rq_sock)) { - svc_thread_dequeue(pool, rqstp); - spin_unlock_bh(&pool->sp_lock); - dprintk("svc: server %p, no data yet\n", rqstp); - return signalled()? -EINTR : -EAGAIN; - } - } - spin_unlock_bh(&pool->sp_lock); - - dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n", - rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse)); - len = svsk->sk_recvfrom(rqstp); - dprintk("svc: got len=%d\n", len); - - /* No data, incomplete (TCP) read, or accept() */ - if (len == 0 || len == -EAGAIN) { - rqstp->rq_res.len = 0; - svc_sock_release(rqstp); - return -EAGAIN; - } - svsk->sk_lastrecv = get_seconds(); - clear_bit(SK_OLD, &svsk->sk_flags); - - rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp)); - rqstp->rq_chandle.defer = svc_defer; - - if (serv->sv_stats) - serv->sv_stats->netcnt++; - return len; -} - -/* - * Drop request - */ -void -svc_drop(struct svc_rqst *rqstp) -{ - dprintk("svc: socket %p dropped request\n", rqstp->rq_sock); - svc_sock_release(rqstp); -} - -/* - * Return reply to client. 
- */ -int -svc_send(struct svc_rqst *rqstp) -{ - struct svc_sock *svsk; - int len; - struct xdr_buf *xb; - - if ((svsk = rqstp->rq_sock) == NULL) { - printk(KERN_WARNING "NULL socket pointer in %s:%d\n", - __FILE__, __LINE__); - return -EFAULT; - } - - /* release the receive skb before sending the reply */ - svc_release_skb(rqstp); - - /* calculate over-all length */ - xb = & rqstp->rq_res; - xb->len = xb->head[0].iov_len + - xb->page_len + - xb->tail[0].iov_len; - - /* Grab svsk->sk_mutex to serialize outgoing data. */ - mutex_lock(&svsk->sk_mutex); - if (test_bit(SK_DEAD, &svsk->sk_flags)) - len = -ENOTCONN; - else - len = svsk->sk_sendto(rqstp); - mutex_unlock(&svsk->sk_mutex); - svc_sock_release(rqstp); - - if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) - return 0; - return len; -} - -/* - * Timer function to close old temporary sockets, using - * a mark-and-sweep algorithm. - */ -static void -svc_age_temp_sockets(unsigned long closure) -{ - struct svc_serv *serv = (struct svc_serv *)closure; - struct svc_sock *svsk; - struct list_head *le, *next; - LIST_HEAD(to_be_aged); - - dprintk("svc_age_temp_sockets\n"); - - if (!spin_trylock_bh(&serv->sv_lock)) { - /* busy, try again 1 sec later */ - dprintk("svc_age_temp_sockets: busy\n"); - mod_timer(&serv->sv_temptimer, jiffies + HZ); - return; - } - - list_for_each_safe(le, next, &serv->sv_tempsocks) { - svsk = list_entry(le, struct svc_sock, sk_list); - - if (!test_and_set_bit(SK_OLD, &svsk->sk_flags)) - continue; - if (atomic_read(&svsk->sk_inuse) > 1 || test_bit(SK_BUSY, &svsk->sk_flags)) - continue; - atomic_inc(&svsk->sk_inuse); - list_move(le, &to_be_aged); - set_bit(SK_CLOSE, &svsk->sk_flags); - set_bit(SK_DETACHED, &svsk->sk_flags); - } - spin_unlock_bh(&serv->sv_lock); - - while (!list_empty(&to_be_aged)) { - le = to_be_aged.next; - /* fiddling the sk_list node is safe 'cos we're SK_DETACHED */ - list_del_init(le); - svsk = list_entry(le, struct svc_sock, sk_list); - - dprintk("queuing svsk %p for closing, %lu seconds old\n", - svsk, get_seconds() - svsk->sk_lastrecv); - - /* a thread will dequeue and close it soon */ - svc_sock_enqueue(svsk); - svc_sock_put(svsk); - } - - mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); -} - -/* * Initialize socket for RPC use and create svc_sock struct * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF. 
*/ @@ -1631,7 +1189,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); - int is_temporary = flags & SVC_SOCK_TEMPORARY; dprintk("svc: svc_setup_socket %p\n", sock); if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { @@ -1651,44 +1208,19 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, return NULL; } - set_bit(SK_BUSY, &svsk->sk_flags); + set_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags); inet->sk_user_data = svsk; svsk->sk_sock = sock; svsk->sk_sk = inet; svsk->sk_ostate = inet->sk_state_change; svsk->sk_odata = inet->sk_data_ready; svsk->sk_owspace = inet->sk_write_space; - svsk->sk_server = serv; - atomic_set(&svsk->sk_inuse, 1); - svsk->sk_lastrecv = get_seconds(); - spin_lock_init(&svsk->sk_lock); - INIT_LIST_HEAD(&svsk->sk_deferred); - INIT_LIST_HEAD(&svsk->sk_ready); - mutex_init(&svsk->sk_mutex); /* Initialize the socket */ if (sock->type == SOCK_DGRAM) - svc_udp_init(svsk); + svc_udp_init(svsk, serv); else - svc_tcp_init(svsk); - - spin_lock_bh(&serv->sv_lock); - if (is_temporary) { - set_bit(SK_TEMP, &svsk->sk_flags); - list_add(&svsk->sk_list, &serv->sv_tempsocks); - serv->sv_tmpcnt++; - if (serv->sv_temptimer.function == NULL) { - /* setup timer to age temp sockets */ - setup_timer(&serv->sv_temptimer, svc_age_temp_sockets, - (unsigned long)serv); - mod_timer(&serv->sv_temptimer, - jiffies + svc_conn_age_period * HZ); - } - } else { - clear_bit(SK_TEMP, &svsk->sk_flags); - list_add(&svsk->sk_list, &serv->sv_permsocks); - } - spin_unlock_bh(&serv->sv_lock); + svc_tcp_init(svsk, serv); dprintk("svc: svc_setup_socket created %p (inet %p)\n", svsk, svsk->sk_sk); @@ -1717,9 +1249,18 @@ int svc_addsock(struct svc_serv *serv, else { svsk = svc_setup_socket(serv, so, &err, SVC_SOCK_DEFAULTS); if (svsk) { - svc_sock_received(svsk); + int salen; + (void)kernel_getsockname(svsk->sk_sock, + (struct sockaddr *) + &svsk->sk_xprt.xpt_local, + &salen); + svc_xprt_received(&svsk->sk_xprt); err = 0; } + clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags); + spin_lock_bh(&serv->sv_lock); + list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks); + spin_unlock_bh(&serv->sv_lock); } if (err) { sockfd_put(so); @@ -1733,14 +1274,18 @@ EXPORT_SYMBOL_GPL(svc_addsock); /* * Create socket for RPC service. */ -static int svc_create_socket(struct svc_serv *serv, int protocol, - struct sockaddr *sin, int len, int flags) +static struct svc_xprt * +svc_create_socket(struct svc_serv *serv, int protocol, + struct sockaddr *sin, int len, int flags) { struct svc_sock *svsk; struct socket *sock; int error; int type; char buf[RPC_MAX_ADDRBUFLEN]; + struct sockaddr_storage addr; + struct sockaddr *newsin = (struct sockaddr *)&addr; + int newlen; dprintk("svc: svc_create_socket(%s, %d, %s)\n", serv->sv_program->pg_name, protocol, @@ -1749,13 +1294,13 @@ static int svc_create_socket(struct svc_serv *serv, int protocol, if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) { printk(KERN_WARNING "svc: only UDP and TCP " "sockets supported\n"); - return -EINVAL; + return ERR_PTR(-EINVAL); } type = (protocol == IPPROTO_UDP)? 
SOCK_DGRAM : SOCK_STREAM; error = sock_create_kern(sin->sa_family, type, protocol, &sock); if (error < 0) - return error; + return ERR_PTR(error); svc_reclassify_socket(sock); @@ -1765,203 +1310,58 @@ static int svc_create_socket(struct svc_serv *serv, int protocol, if (error < 0) goto bummer; + newlen = len; + error = kernel_getsockname(sock, newsin, &newlen); + if (error < 0) + goto bummer; + if (protocol == IPPROTO_TCP) { if ((error = kernel_listen(sock, 64)) < 0) goto bummer; } if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) { - svc_sock_received(svsk); - return ntohs(inet_sk(svsk->sk_sk)->sport); + memcpy(&svsk->sk_xprt.xpt_local, newsin, newlen); + svc_xprt_received(&svsk->sk_xprt); + return (struct svc_xprt *)svsk; } bummer: dprintk("svc: svc_create_socket error = %d\n", -error); sock_release(sock); - return error; + return ERR_PTR(error); } /* - * Remove a dead socket + * Detach the svc_sock from the socket so that no + * more callbacks occur. */ static void -svc_delete_socket(struct svc_sock *svsk) +svc_sock_detach(struct svc_xprt *xprt) { - struct svc_serv *serv; - struct sock *sk; - - dprintk("svc: svc_delete_socket(%p)\n", svsk); + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct sock *sk = svsk->sk_sk; - serv = svsk->sk_server; - sk = svsk->sk_sk; + dprintk("svc: svc_sock_detach(%p)\n", svsk); + /* put back the old socket callbacks */ sk->sk_state_change = svsk->sk_ostate; sk->sk_data_ready = svsk->sk_odata; sk->sk_write_space = svsk->sk_owspace; - - spin_lock_bh(&serv->sv_lock); - - if (!test_and_set_bit(SK_DETACHED, &svsk->sk_flags)) - list_del_init(&svsk->sk_list); - /* - * We used to delete the svc_sock from whichever list - * it's sk_ready node was on, but we don't actually - * need to. This is because the only time we're called - * while still attached to a queue, the queue itself - * is about to be destroyed (in svc_destroy). 
- */ - if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) { - BUG_ON(atomic_read(&svsk->sk_inuse)<2); - atomic_dec(&svsk->sk_inuse); - if (test_bit(SK_TEMP, &svsk->sk_flags)) - serv->sv_tmpcnt--; - } - - spin_unlock_bh(&serv->sv_lock); -} - -static void svc_close_socket(struct svc_sock *svsk) -{ - set_bit(SK_CLOSE, &svsk->sk_flags); - if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) - /* someone else will have to effect the close */ - return; - - atomic_inc(&svsk->sk_inuse); - svc_delete_socket(svsk); - clear_bit(SK_BUSY, &svsk->sk_flags); - svc_sock_put(svsk); -} - -void svc_force_close_socket(struct svc_sock *svsk) -{ - set_bit(SK_CLOSE, &svsk->sk_flags); - if (test_bit(SK_BUSY, &svsk->sk_flags)) { - /* Waiting to be processed, but no threads left, - * So just remove it from the waiting list - */ - list_del_init(&svsk->sk_ready); - clear_bit(SK_BUSY, &svsk->sk_flags); - } - svc_close_socket(svsk); -} - -/** - * svc_makesock - Make a socket for nfsd and lockd - * @serv: RPC server structure - * @protocol: transport protocol to use - * @port: port to use - * @flags: requested socket characteristics - * - */ -int svc_makesock(struct svc_serv *serv, int protocol, unsigned short port, - int flags) -{ - struct sockaddr_in sin = { - .sin_family = AF_INET, - .sin_addr.s_addr = INADDR_ANY, - .sin_port = htons(port), - }; - - dprintk("svc: creating socket proto = %d\n", protocol); - return svc_create_socket(serv, protocol, (struct sockaddr *) &sin, - sizeof(sin), flags); -} - -/* - * Handle defer and revisit of requests - */ - -static void svc_revisit(struct cache_deferred_req *dreq, int too_many) -{ - struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle); - struct svc_sock *svsk; - - if (too_many) { - svc_sock_put(dr->svsk); - kfree(dr); - return; - } - dprintk("revisit queued\n"); - svsk = dr->svsk; - dr->svsk = NULL; - spin_lock(&svsk->sk_lock); - list_add(&dr->handle.recent, &svsk->sk_deferred); - spin_unlock(&svsk->sk_lock); - set_bit(SK_DEFERRED, &svsk->sk_flags); - svc_sock_enqueue(svsk); - svc_sock_put(svsk); -} - -static struct cache_deferred_req * -svc_defer(struct cache_req *req) -{ - struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); - int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len); - struct svc_deferred_req *dr; - - if (rqstp->rq_arg.page_len) - return NULL; /* if more than a page, give up FIXME */ - if (rqstp->rq_deferred) { - dr = rqstp->rq_deferred; - rqstp->rq_deferred = NULL; - } else { - int skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; - /* FIXME maybe discard if size too large */ - dr = kmalloc(size, GFP_KERNEL); - if (dr == NULL) - return NULL; - - dr->handle.owner = rqstp->rq_server; - dr->prot = rqstp->rq_prot; - memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen); - dr->addrlen = rqstp->rq_addrlen; - dr->daddr = rqstp->rq_daddr; - dr->argslen = rqstp->rq_arg.len >> 2; - memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2); - } - atomic_inc(&rqstp->rq_sock->sk_inuse); - dr->svsk = rqstp->rq_sock; - - dr->handle.revisit = svc_revisit; - return &dr->handle; } /* - * recv data from a deferred request into an active one + * Free the svc_sock's socket resources and the svc_sock itself. 
*/ -static int svc_deferred_recv(struct svc_rqst *rqstp) -{ - struct svc_deferred_req *dr = rqstp->rq_deferred; - - rqstp->rq_arg.head[0].iov_base = dr->args; - rqstp->rq_arg.head[0].iov_len = dr->argslen<<2; - rqstp->rq_arg.page_len = 0; - rqstp->rq_arg.len = dr->argslen<<2; - rqstp->rq_prot = dr->prot; - memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen); - rqstp->rq_addrlen = dr->addrlen; - rqstp->rq_daddr = dr->daddr; - rqstp->rq_respages = rqstp->rq_pages; - return dr->argslen<<2; -} - - -static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk) +static void +svc_sock_free(struct svc_xprt *xprt) { - struct svc_deferred_req *dr = NULL; + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + dprintk("svc: svc_sock_free(%p)\n", svsk); - if (!test_bit(SK_DEFERRED, &svsk->sk_flags)) - return NULL; - spin_lock(&svsk->sk_lock); - clear_bit(SK_DEFERRED, &svsk->sk_flags); - if (!list_empty(&svsk->sk_deferred)) { - dr = list_entry(svsk->sk_deferred.next, - struct svc_deferred_req, - handle.recent); - list_del_init(&dr->handle.recent); - set_bit(SK_DEFERRED, &svsk->sk_flags); - } - spin_unlock(&svsk->sk_lock); - return dr; + if (svsk->sk_sock->file) + sockfd_put(svsk->sk_sock); + else + sock_release(svsk->sk_sock); + kfree(svsk); } diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index 2be714e..54641a3 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -18,6 +18,7 @@ #include #include #include +#include /* * Declare the debug flags here @@ -27,6 +28,8 @@ unsigned int nfs_debug; unsigned int nfsd_debug; unsigned int nlm_debug; +char xprt_buf[128]; + #ifdef RPC_DEBUG static struct ctl_table_header *sunrpc_table_header; @@ -48,6 +51,32 @@ rpc_unregister_sysctl(void) } } +static int proc_do_xprt(ctl_table *table, int write, struct file *file, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char tmpbuf[sizeof(xprt_buf)]; + int len; + if ((*ppos && !write) || !*lenp) { + *lenp = 0; + return 0; + } + if (write) + return -EINVAL; + else { + + len = svc_print_xprts(tmpbuf, sizeof(tmpbuf)); + if (!access_ok(VERIFY_WRITE, buffer, len)) + return -EFAULT; + + if (__copy_to_user(buffer, tmpbuf, len)) + return -EFAULT; + } + + *lenp -= len; + *ppos += len; + return 0; +} + static int proc_dodebug(ctl_table *table, int write, struct file *file, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -140,6 +169,13 @@ static ctl_table debug_table[] = { .mode = 0644, .proc_handler = &proc_dodebug }, + { + .procname = "transports", + .data = xprt_buf, + .maxlen = sizeof(xprt_buf), + .mode = 0444, + .proc_handler = &proc_do_xprt, + }, { .ctl_name = 0 } }; diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index fdc5e6d..31bd346 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -93,11 +93,13 @@ xdr_encode_string(__be32 *p, const char *string) } __be32 * -xdr_decode_string_inplace(__be32 *p, char **sp, int *lenp, int maxlen) +xdr_decode_string_inplace(__be32 *p, char **sp, + unsigned int *lenp, unsigned int maxlen) { - unsigned int len; + u32 len; - if ((len = ntohl(*p++)) > maxlen) + len = ntohl(*p++); + if (len > maxlen) return NULL; *lenp = len; *sp = (char *) p;
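
For readers following the transport-switch conversion above, the server-side plumbing a new transport needs is small: implement the xpo_* callbacks, wrap them in a struct svc_xprt_ops and a struct svc_xprt_class, and register the class at module load, exactly as svc_init_xprt_sock()/svc_cleanup_xprt_sock() do for the socket transports in this patch. The sketch below is illustrative only: the ops/class layout and the svc_reg_xprt_class()/svc_unreg_xprt_class() calls mirror the svc_udp_ops and svc_tcp_class definitions in the diff, while the "example" transport name, the myxprt_* callbacks and the payload limit are hypothetical placeholders that are not part of this series.

/*
 * Illustrative sketch only: "example" and the myxprt_* callbacks are
 * hypothetical.  The structure mirrors svc_udp_ops/svc_tcp_class above
 * and assumes the svc_xprt.h header introduced by this series.
 */
#include <linux/module.h>
#include <linux/sunrpc/svc_xprt.h>

/*
 * Transport-specific work lives in these callbacks; their bodies are
 * omitted here.  xpo_create/xpo_accept return a struct svc_xprt that is
 * embedded in the transport's private structure and initialized with
 * svc_xprt_init(), the same way svc_udp_init()/svc_tcp_init() set up a
 * svc_sock's sk_xprt in the patch.
 */
static struct svc_xprt *myxprt_create(struct svc_serv *serv,
				      struct sockaddr *sa, int salen,
				      int flags);
static struct svc_xprt *myxprt_accept(struct svc_xprt *xprt);
static int myxprt_recvfrom(struct svc_rqst *rqstp);
static int myxprt_sendto(struct svc_rqst *rqstp);
static void myxprt_release_rqst(struct svc_rqst *rqstp);
static void myxprt_detach(struct svc_xprt *xprt);
static void myxprt_free(struct svc_xprt *xprt);
static void myxprt_prep_reply_hdr(struct svc_rqst *rqstp);
static int myxprt_has_wspace(struct svc_xprt *xprt);

static struct svc_xprt_ops myxprt_ops = {
	.xpo_create		= myxprt_create,
	.xpo_accept		= myxprt_accept,
	.xpo_recvfrom		= myxprt_recvfrom,
	.xpo_sendto		= myxprt_sendto,
	.xpo_release_rqst	= myxprt_release_rqst,
	.xpo_detach		= myxprt_detach,
	.xpo_free		= myxprt_free,
	.xpo_prep_reply_hdr	= myxprt_prep_reply_hdr,
	.xpo_has_wspace		= myxprt_has_wspace,
};

static struct svc_xprt_class myxprt_class = {
	.xcl_name		= "example",
	.xcl_owner		= THIS_MODULE,
	.xcl_ops		= &myxprt_ops,
	.xcl_max_payload	= 32768,	/* hypothetical limit */
};

static int __init myxprt_init(void)
{
	/* register with the svc transport switch, as svc_init_xprt_sock() does */
	svc_reg_xprt_class(&myxprt_class);
	return 0;
}

static void __exit myxprt_exit(void)
{
	svc_unreg_xprt_class(&myxprt_class);
}

module_init(myxprt_init);
module_exit(myxprt_exit);
MODULE_LICENSE("GPL");

Once registered this way, the class should show up in the read-only /proc/sys/sunrpc/transports file added at the end of the patch (proc_do_xprt() formats the listing via svc_print_xprts()), which gives a quick way to confirm which transport modules the server has loaded.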