GIT 1e28855867c31925e2ffa3a8acf16bb3ad8b634c git://git.linux-nfs.org/pub/linux/nfs-2.6.git commit 1e28855867c31925e2ffa3a8acf16bb3ad8b634c Author: Trond Myklebust Date: Sun Feb 26 00:54:33 2006 -0500 lockd: Add helper for *_RES callbacks Signed-off-by: Trond Myklebust commit ca91c90591da17cbd97faf0d40ea7f09c41c518d Author: Trond Myklebust Date: Sun Feb 26 00:54:33 2006 -0500 NLM: Add nlmclnt_release_call Add a helper function to simplify the freeing of NLM client requests. Signed-off-by: Trond Myklebust commit 5410b7146ab304f3d5042643d1b58346b008a702 Author: Trond Myklebust Date: Sun Feb 26 00:54:32 2006 -0500 NLM: Fix nlmclnt_test to not copy private part of locks The struct file_lock does not carry a properly initialised lock, so don't copy it as if it were. Signed-off-by: Trond Myklebust commit b62279dbb338a28fcfb3db5503f343b47ed0f270 Author: Trond Myklebust Date: Sun Feb 26 00:54:32 2006 -0500 NLM: Simplify client locks Signed-off-by: Trond Myklebust commit 72107867e60d69f192131324bce5bbba35ca84f5 Author: Trond Myklebust Date: Sun Feb 26 00:54:31 2006 -0500 NFS: O_DIRECT needs to use a completion Now that we have aio writes, it is possible for dreq->outstanding to be zero, but for the I/O not to have completed. Convert struct nfs_direct_req to use a completion to signal when the I/O is done. Signed-off-by: Trond Myklebust commit a429e3e20b09ed137c723ac014fe9c29cd566798 Author: Trond Myklebust Date: Sun Feb 26 00:54:31 2006 -0500 NFS: Clean up nfs_get_user_pages Signed-off-by: Trond Myklebust commit b06c6c098078bd5501a389407b3fd0854f9eadb6 Author: Chuck Lever Date: Sun Feb 26 00:54:30 2006 -0500 NFS: fix compiler warnings on 64-bit platforms Introduced by NFS aio+dio patches. Test plan: Compile kernel with CONFIG_NFS enabled on 64-bit hardware. 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit 5fe677942e79c4b0bf1c6937a0827f74a6f35e46 Author: Chuck Lever Date: Sun Feb 26 00:54:30 2006 -0500 SUNRPC: fix compile warnings on 64-bit platforms Introduced by NFS metrics patch. Test plan: Compile kernel with CONFIG_NFS enabled on a 64-bit platform. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit fea3a1a5ebfd99d6175f642153b47e17e241c4d1 Author: Trond Myklebust Date: Sun Feb 26 00:54:29 2006 -0500 NLM: nlmclnt_cancel_callback should accept NLM_LCK_DENIED errors NLM_LCK_DENIED is a valid error return for an NLM_CANCEL call by the client. Signed-off-by: Trond Myklebust commit d09828c3e21dae5d176cf12302c503171bedca07 Author: Trond Myklebust Date: Sun Feb 26 00:54:29 2006 -0500 lockd: Fix Oopses due to list manipulation errors. The patch "stop abusing file_lock_list" introduces a couple of bugs since the locks may be copied and need to be removed from the lists when they are destroyed. Signed-off-by: Trond Myklebust commit 54a8d59712241450ac187bff12cf168814deaa0c Author: Christoph Hellwig Date: Sun Feb 26 00:54:29 2006 -0500 lockd: stop abusing file_lock_list Currently lockd directly accesses the file_lock_list from fs/locks.c. It does so to mark locks granted or reclaimable. This is very suboptimal, because a) lockd needs to poke into locks.c internals, and b) it needs to iterate over all locks in the system for marking locks granted or reclaimable. This patch adds lists for granted and reclaimable locks to the nlm_host structure instead, and adds locks to those. nlmclnt_lock: now adds the lock to h_granted instead of setting the NFS_LCK_GRANTED, still O(1) nlmclnt_mark_reclaim: goes away completely, replaced by a list_splice_init. 
Complexity reduced from O(locks in the system) to O(1) reclaimer: iterates over h_reclaim now, complexity reduced from O(locks in the system) to O(locks per nlm_host) Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust commit ddfee955be4cf359d16198e50f0f4d4f9b53d478 Author: Trond Myklebust Date: Sun Feb 26 00:54:28 2006 -0500 lockd: Make lockd use rpc_new_client() instead of rpc_create_client When doing NLM_GRANTED requests, lockd may end up blocking if we use rpc_create_client() due to the synchronous call to rpc_ping(). Instead, use rpc_new_client(). Signed-off-by: Trond Myklebust commit 128529ce28f4bc457a3f6075c5456f2007083bb4 Author: Trond Myklebust Date: Sun Feb 26 00:54:28 2006 -0500 lockd: Make nlmsvc_create_block() use nlmsvc_lookup_host() Currently it uses nlmclnt_lookup_host(), which puts the resulting host structure on a different list. Signed-off-by: Trond Myklebust commit e4bed7d8aa4e60c41bd80f5d06e18e934d5e7ff7 Author: Trond Myklebust Date: Sun Feb 26 00:54:27 2006 -0500 lockd: Clean up of the server-side GRANTED code Signed-off-by: Trond Myklebust commit 1fda510ffaadb460a9b3146abfac8ae6428ee9d8 Author: Trond Myklebust Date: Sun Feb 26 00:54:27 2006 -0500 lockd: Add refcounting to struct nlm_block Otherwise, the block may disappear from underneath us when in nlmsvc_retry_blocked. Signed-off-by: Trond Myklebust commit ff9562f93f8d2d16902c466c7eb8df730bacafc9 Author: Trond Myklebust Date: Sun Feb 26 00:54:26 2006 -0500 lockd: Fix server-side lock blocking code Signed-off-by: Trond Myklebust commit 3385487dd88ed727d27bd810e1cc44a1ababae4b Author: Trond Myklebust Date: Sun Feb 26 00:54:26 2006 -0500 lockd: posix_test_lock() should not call locks_copy_lock() The caller of posix_test_lock() should never need to look at the lock private data, so do not copy that information. This also means that there is no need to call the fl_release_private methods. 
Signed-off-by: Trond Myklebust commit 52139b08ed8ae08b1ac2e39cee0228173146eda9 Author: Trond Myklebust Date: Sun Feb 26 00:54:25 2006 -0500 NFS: Uninline nfs_writedata_(alloc|free) and nfs_readdata_(alloc|free) Signed-off-by: Trond Myklebust commit d7ca06d8f02300276e292fdb90eee10c4ff5419a Author: Trond Myklebust Date: Sun Feb 26 00:54:25 2006 -0500 NFS: Debugging code for nfs_direct_(read|write)_schedule() Make sure that we're doing our list accounting correctly. Signed-off-by: Trond Myklebust commit 5e1f2987ee5cf64308d97fe739d04ee9c0c131d0 Author: Trond Myklebust Date: Sun Feb 26 00:54:25 2006 -0500 NFS: O_DIRECT async IO may lose context The struct nfs_direct_req currently keeps a pointer to the file descriptor without referencing it. This may cause problems if the parent process is killed. The nfs_open_context should normally have all the information that we're currently using the filp for, and unlike fput(), is safe to release from an rpciod process context. Signed-off-by: Trond Myklebust commit a817f871c072d4a76fb6c29aecfb64801e4e9626 Author: Trond Myklebust Date: Sun Feb 26 00:54:24 2006 -0500 nfs: Use UNSTABLE + COMMIT for NFS O_DIRECT writes Currently NFS O_DIRECT writes use FILE_SYNC so that a COMMIT is not necessary. This simplifies the internal logic, but this could be a difficult workload for some servers. Instead, let's send UNSTABLE writes, and after they all complete, send a COMMIT for the dirty range. After the COMMIT returns successfully, then do the wake_up or fire off aio_complete(). Test plan: Async direct I/O tests against Solaris (or any server that requires committed unstable writes). Reboot server during test. Based on an earlier patch by Chuck Lever Signed-off-by: Trond Myklebust commit 64dee7a67c3d5a6489d62edfd45686755ab8d296 Author: Trond Myklebust Date: Sun Feb 26 00:54:24 2006 -0500 NFS: Make nfs_commit_alloc() extern We need to use nfs_commit_alloc() in fs/nfs/direct.c. 
Signed-off-by: Trond Myklebust commit 1587a1bf94b518879a8492661f7b3920a6654239 Author: Chuck Lever Date: Sun Feb 26 00:54:23 2006 -0500 NFS: fix data_update accounting in NFS direct I/O path ^C against "iozone -I" is hitting the assertion in nfs_clear_inode(). Test plan: "iozone -i0 -I -a -c" against a slow server, then control C. This should not cause an oops. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit cf972b80fb93540d64b1b1a803b3626827d8b786 Author: Chuck Lever Date: Sun Feb 26 00:54:23 2006 -0500 NFS: Replace atomic_t variables in nfs_direct_req with a single spin lock Three atomic_t variables cause a lot of bus locking. Because they are all used in the same places in the code, just use a single spin lock. Now that the atomic_t variables are gone, we can remove the request size limitation since the code no longer depends on the limited width of atomic_t on some platforms. Test plan: Compile with CONFIG_NFS and CONFIG_NFS_DIRECTIO enabled. Millions of fsx operations, iozone, OraSim. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit fbc8585232bfe0c50aceb8eefdffd5a8f9abd65f Author: Chuck Lever Date: Sun Feb 26 00:54:22 2006 -0500 NFS: clean up comments and tab damage in direct.c Clean up tab damage and comments. Replace "file_offset" with more commonly used "pos". Test plan: Compile with CONFIG_NFS and CONFIG_NFS_DIRECTIO enabled. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit 4003c9d4f998c906c5d1759c3a78947e7f459d75 Author: Chuck Lever Date: Sun Feb 26 00:54:22 2006 -0500 NFS: support EIOCBQUEUED return in direct write path For async iocb's, the NFS direct write path now returns EIOCBQUEUED, and calls aio_complete when all the requested writes are finished. The synchronous part of the NFS direct write path behaves exactly as it was before. Shared mapped NFS files will have some coherency difficulties when accessed concurrently with aio+dio. 
Will need to explore how this is handled in the local file system case. Test plan: aio-stress with "-O". OraSim. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit 8cc691f4af1df6337cdf91b80e9d4302f835bb58 Author: Chuck Lever Date: Sun Feb 26 00:54:21 2006 -0500 NFS: make iocb available everywhere in direct write path Pass the iocb argument all the way down to the direct write request scheduler, and make it available in nfs_direct_write_result. Test plan: Compile the kernel with CONFIG_NFS and CONFIG_NFS_DIRECTIO enabled. Millions of fsx-odirect ops. OraSim. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit 0a4f7f68747b7cf48ffae7cbc7975af5204fd7f7 Author: Chuck Lever Date: Sun Feb 26 00:54:21 2006 -0500 NFS: remove support for multi-segment iovs in the direct write path Eliminate the persistent use of automatic storage in all parts of the NFS client's direct write path to pave the way for introducing support for aio against files opened with the O_DIRECT flag. Test plan: Compile the kernel with CONFIG_NFS and CONFIG_NFS_DIRECTIO enabled. Millions of fsx-odirect ops. OraSim. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit 44fedbe57789e85f7ea499d4133b4d04c353dceb Author: Chuck Lever Date: Sun Feb 26 00:54:21 2006 -0500 NFS: make direct write path generate write requests concurrently Duplicate infrastructure from direct read path that will allow write path to generate multiple write requests concurrently. This will enable us to add support for aio in this path. Temporarily we will lose the ability to do UNSTABLE writes followed by a COMMIT in the direct write path. However, all applications I am aware of that use NFS O_DIRECT currently write in relatively small chunks, so this should not be inconvenient in any way. Test plan: Millions of fsx-odirect ops. OraSim. 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit 3143feb2e75414f20a3f6ab2dc89db8af4f8dbdc Author: Chuck Lever Date: Sun Feb 26 00:54:20 2006 -0500 NFS: create common routine for handling direct I/O completion Factor out the common piece of completing an NFS direct I/O request. Test plan: Compile kernel with CONFIG_NFS and CONFIG_NFS_DIRECTIO enabled. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit ffa599e5aea3b844fb96621ae8f704c4fb2fc757 Author: Chuck Lever Date: Sun Feb 26 00:54:20 2006 -0500 NFS: create common routine for allocating nfs_direct_req Factor out a small common piece of the path that allocates nfs_direct_req structures. Test plan: Compile kernel with CONFIG_NFS and CONFIG_NFS_DIRECTIO enabled. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit f3d2d40a3287b84c857c1ae9a7c1c785c0222e1a Author: Chuck Lever Date: Sun Feb 26 00:54:19 2006 -0500 NFS: create common routine for waiting for direct I/O to complete We're about to add asynchrony to the NFS direct write path. Begin by abstracting out the common pieces in the read path. The first piece is nfs_direct_read_wait, which works the same whether the process is waiting for a read or a write. Test plan: Compile kernel with CONFIG_NFS and CONFIG_NFS_DIRECTIO enabled. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit 4ccda478a87de4e1ccbfebe67a7cec09fe62e49c Author: Chuck Lever Date: Sun Feb 26 00:54:19 2006 -0500 NFS: support EIOCBQUEUED return in direct read path For async iocb's, the NFS direct read path should return EIOCBQUEUED and call aio_complete when all the requested reads are finished. The synchronous part of the NFS direct read path behaves exactly as it was before. Test plan: aio-stress with "-O". OraSim. 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit 354a4fa45bfb9e48325732f66a7b389ca812cc4e Author: Chuck Lever Date: Sun Feb 26 00:54:18 2006 -0500 NFS: make iocb available everywhere in direct read path Pass the iocb argument all the way down to the direct read request scheduler, and make it available in nfs_direct_read_result. Test plan: Compile the kernel with CONFIG_NFS and CONFIG_NFS_DIRECTIO enabled. Millions of fsx-odirect ops. OraSim. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit e6cf0d0d7bb9bb44e4b3ae61bc72ce399e0e8e95 Author: Chuck Lever Date: Sun Feb 26 00:54:18 2006 -0500 NFS: remove support for multi-segment iovs in the direct read path Eliminate the persistent use of automatic storage in all parts of the NFS client's direct read path to pave the way for introducing support for aio against files opened with the O_DIRECT flag. Test plan: Compile the kernel with CONFIG_NFS and CONFIG_NFS_DIRECTIO enabled. Millions of fsx-odirect ops. OraSim. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit cc6b176dd56a4dd1f5c07c5348d90b4d2da651bf Author: Chuck Lever Date: Sun Feb 26 00:54:18 2006 -0500 NFS: use size_t type for holding rsize bytes in NFS O_DIRECT read path size_t is used for holding byte counts, so use it for variables storing rsize. Note that the write path will be updated as we add support for async O_DIRECT writes. Test plan: Need to verify that existing comparisons against new size_t variables behave correctly. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit 00e3b72d1832f7be04bdc1fca01de57afa115fae Author: Chuck Lever Date: Sun Feb 26 00:54:17 2006 -0500 NFS: update comments and function definitions in fs/nfs/direct.c Update to latest coding style standards. Remove block comments on statically defined functions, and place function definitions all on one line. Test plan: Compile kernel with CONFIG_NFS and CONFIG_NFS_DIRECTIO. 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit ff3993ca427de248e947f32296ced7a907b5ccca Author: Chuck Lever Date: Sun Feb 26 00:54:17 2006 -0500 NFS: clean up NFS client's a_ops->direct_IO method The NFS client's a_ops->direct_IO method, nfs_direct_IO, is required to be present to allow NFS files to be opened with O_DIRECT, but is never called because the NFS client shunts reads and writes to files opened with O_DIRECT directly to its own routines. Gut the nfs_direct_IO function. This eliminates the only part of the NFS client's direct I/O path that requires support for multi-segment iovs, allowing further simplification in subsequent patches. Test plan: Compile the kernel with CONFIG_NFS and CONFIG_NFS_DIRECTIO enabled. Millions of fsx-odirect ops. OraSim. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit 7bdcb20eedb87eaa04f330ce19c88a47eb69dcc9 Author: Trond Myklebust Date: Sun Feb 26 00:54:16 2006 -0500 NFS: Cleanup of NFS read code Same callback hierarchy inversion as for the NFS write calls. This patch is not strictly speaking needed by the O_DIRECT code, but avoids confusing differences between the asynchronous read and write code. Signed-off-by: Trond Myklebust commit b652315b8764a126626d9dcde7417ac1571c98a3 Author: Trond Myklebust Date: Sun Feb 26 00:54:16 2006 -0500 NFS: Cleanup of NFS write code in preparation for asynchronous o_direct This patch inverts the callback hierarchy for NFS write calls. Instead of having the NFSv2/v3/v4-specific code set up the RPC callback ops, we allow the original caller to do so. This allows for more flexibility w.r.t. how to set up and tear down the nfs_write_data structure while still allowing the NFSv3/v4 code to perform error handling. 
The greater flexibility is needed by the asynchronous O_DIRECT code, which wants to be able to hold on to the original nfs_write_data structures after the WRITE RPC call has completed in order to be able to replay them if the COMMIT call determines that the server has rebooted. Signed-off-by: Trond Myklebust commit 3a7585f5f4a36c583472bdfcd06445b8af430466 Author: J. Bruce Fields Date: Sun Feb 26 00:54:15 2006 -0500 lockd: Remove FL_LOCKD flag Currently lockd identifies its own locks using the FL_LOCKD flag. This doesn't scale well to multiple lock managers--if we did this in nfsv4 too, for example, we'd be left with only one free flag bit. Instead, we just check whether the file manager ops (fl_lmops) set on this lock are our own. The only use for this is in nlm_traverse_locks, which uses it to find locks that need cleaning up when freeing a host or a file. In the long run it might be nice to do reference counting instead of traversing all the locks like this.... Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust commit 65203679dedcf8d9b5903c0d992eda589fb3c39e Author: Andy Adamson Date: Sun Feb 26 00:54:15 2006 -0500 locks,lockd: fix race in nlmsvc_testlock posix_test_lock() returns a pointer to a struct file_lock which is unprotected and can be removed while in use by the caller. Move the conflicting lock from the return to a parameter, and copy the conflicting lock. In most cases the caller ends up putting the copy of the conflicting lock on the stack. On i386, sizeof(struct file_lock) appears to be about 100 bytes. We're assuming that's reasonable. Signed-off-by: Andy Adamson Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust commit 3b45055d5da1c42a92fc8fd6093b835a9bd3ecbb Author: Andy Adamson Date: Sun Feb 26 00:54:14 2006 -0500 locks: remove unused posix_block_lock posix_lock_file() is used to add a blocked lock to Lockd's block, so posix_block_lock() is no longer needed. Signed-off-by: Andy Adamson Signed-off-by: J. 
Bruce Fields Signed-off-by: Trond Myklebust commit b5c3eca92b96b7c0ad9634862bde7be3f497beb9 Author: Andy Adamson Date: Sun Feb 26 00:54:14 2006 -0500 lockd: make nlmsvc_lock use only posix_lock_file Reorganize nlmsvc_lock() to make full use of posix_lock_file(), which does everything nlmsvc_lock() needs - no need to call posix_test_lock(), posix_locks_deadlock(), or posix_block_lock() separately. Signed-off-by: Andy Adamson Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust commit 5dcf97752803055e8af8bfce2d5a666481eb45b4 Author: Andy Adamson Date: Sun Feb 26 00:54:14 2006 -0500 lockd: simplify nlmsvc_grant_blocked Reorganize nlmsvc_grant_blocked() to make full use of posix_lock_file(). Note that there's no need for separate calls to posix_test_lock(), posix_locks_deadlock(), or posix_block_lock(). Signed-off-by: Andy Adamson Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust commit 6b3562af209bf7ab61b14324f3bbe7e908b2e608 Author: Andy Adamson Date: Sun Feb 26 00:54:13 2006 -0500 lockd: clean up nlmsvc_lock Slightly more consistent dprintk error reporting, consolidate some up()'s. Signed-off-by: Andy Adamson Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust commit 671ec0a131bc4632fb78c4a9928ef790a7d49fe8 Author: Chuck Lever Date: Sun Feb 26 00:54:13 2006 -0500 NFS: directory trace messages Reuse NFSDBG_DIRCACHE and NFSDBG_LOOKUPCACHE to provide additional diagnostic messages that trace the operation of the NFS client's directory behavior. A few new messages are now generated when NFSDBG_VFS is active, as well, to trace normal VFS activity. This compromise provides better trace debugging for those who use pre-built kernels, without adding a lot of extra noise to the standard debug settings. Test-plan: Enable NFS trace debugging with flags 1, 2, or 4. You should be able to see different types of trace messages with each flag setting. 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit 842db74b61dd1cce3a094a7d5907d559e62fdb3a Author: Chuck Lever Date: Sun Feb 26 00:54:12 2006 -0500 SUNRPC: minor cleanup RPC_DEBUG_DATA no longer needed in net/sunrpc/xprt.c. Test plan: Compile kernel with CONFIG_NFS enabled. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit a7c8d8bb19476bcc8be27c56c4fdf86878ff460d Author: Chuck Lever Date: Sun Feb 26 00:54:12 2006 -0500 SUNRPC: eliminate rpc_call() Clean-up: replace rpc_call() helper with direct call to rpc_call_sync. This makes NFSv2 and NFSv3 synchronous calls more computationally efficient, and reduces stack consumption in functions that used to invoke rpc_call more than once. Test plan: Compile kernel with CONFIG_NFS enabled. Connectathon on NFS version 2, version 3, and version 4 mount points. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit aace71104677d3267c51993520513f959da885a4 Author: Chuck Lever Date: Sun Feb 26 00:54:11 2006 -0500 SUNRPC: display human-readable procedure name in rpc_iostats output Add fields to the rpc_procinfo struct that allow the display of a human-readable name for each procedure in the rpc_iostats output. Also fix it so that the NFSv4 stats are broken up correctly by sub-procedure number. NFSv4 uses only two real RPC procedures: NULL, and COMPOUND. Test plan: Mount with NFSv2, NFSv3, and NFSv4, and do "cat /proc/self/mountstats". Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit 46881796a7541b29b08b944ea64588f6de6b12ff Author: Chuck Lever Date: Sun Feb 26 00:54:11 2006 -0500 NFS: add RPC I/O statistics to /proc/self/mountstats NFS client now shows various RPC I/O metrics in /proc/self/mountstats. Test plan: Mount/umount while doing "cat /proc/self/mountstats", multiple iterations of connectathon locking suite. Test with NFS version 2, 3, and 4. 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit 1938ebdef5edea1454f6e4739fdf1857c0ddcfbf Author: Chuck Lever Date: Sun Feb 26 00:54:10 2006 -0500 SUNRPC: provide a mechanism for collecting stats in the RPC client Add a simple mechanism for collecting stats in the RPC client. Stats are tabulated during xprt_release. Note that per_cpu shenanigans are not required here because the RPC client already serializes on the transport write lock. Test plan: Compile kernel with CONFIG_NFS enabled. Basic performance regression testing with high-speed networking and high performance server. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit 7dd9281b61a7e1d1611b4d55e5829610ebd7780c Author: Chuck Lever Date: Sun Feb 26 00:54:10 2006 -0500 SUNRPC: introduce per-task RPC iostats Account for various things that occur while an RPC task is executed. Separate timers for RPC round trip and RPC execution time show how long RPC requests wait in queue before being sent. Eventually these will be accumulated at xprt_release time in one place where they can be viewed from userland. Test plan: Compile kernel with CONFIG_NFS enabled. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit 7cae0b57aa996eebe7d53282fdac3675d1e63f60 Author: Chuck Lever Date: Sun Feb 26 00:54:10 2006 -0500 SUNRPC: add a handful of per-xprt counters Monitor generic transport events. Add a transport switch callout to format transport counters for export to user-land. Test plan: Compile kernel with CONFIG_NFS enabled. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit 4a309e1ec7a39b29a4bbd44ee6432f07ba153542 Author: Chuck Lever Date: Sun Feb 26 00:54:09 2006 -0500 SUNRPC: track length of RPC wait queues RPC wait queue length will eventually be exported to userland via the RPC iostats interface. Test plan: Compile kernel with CONFIG_NFS enabled. 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit 1e2b67c3505df90a3b3f4f160e5c4bd5d9b06776 Author: Chuck Lever Date: Sun Feb 26 00:54:09 2006 -0500 NFS: report how long an NFS file system has been mounted Add a field in nfs_server to record a timestamp when a mount succeeds. Report the number of seconds the file system has been mounted via nfs_show_stats(). Test plan: Mount an NFS file system, watch the mountstats reports and compare with clock time. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit 3420039b93ceb1811a1acf4205c373b77a9de09b Author: Chuck Lever Date: Sun Feb 26 00:54:08 2006 -0500 NFS: add hooks to account for NFSERR_JUKEBOX errors Make an inode or an nfs_server struct available in the logic that handles JUKEBOX/DELAY type errors so the NFS client can account for them. This patch is split out from the main nfs iostat patch to highlight minor architectural changes required to support this statistic. Test plan: None. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit f9f38a15337bc8614dc607c55a5cb3737253fcbe Author: Chuck Lever Date: Sun Feb 26 00:54:08 2006 -0500 NFS: add I/O performance counters Invoke the byte and event counter macros where we want to count bytes and events. Clean-up: fix a possible NULL dereference in nfs_lock, and simplify nfs_file_open. Test-plan: fsx and iozone on UP and SMP systems, with and without pre-emption. Watch for memory overwrite bugs, and performance loss (significantly more CPU required per op). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit 22505bd829bf44ba005a53b0053f3eace4dc262b Author: Chuck Lever Date: Sun Feb 26 00:54:07 2006 -0500 NFS: introduce mechanism for tracking NFS client metrics Add a per-superblock performance counter facility to the NFS client. This facility mimics the counters available for block devices and for networking. Expose these new counters via the new /proc/self/mountstats interface. 
Thanks to Andrew Morton and Trond Myklebust for their review and comments. Test plan: fsx and iozone on UP and SMP systems, with and without pre-emption. Watch for memory overwrite bugs, and performance loss (significantly more CPU required per op). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit 182902e78bf9ef8e7314dce5ce2197c91f1c6925 Author: Chuck Lever Date: Sun Feb 26 00:54:07 2006 -0500 NFS: clean up some mount options Get rid of "lock" and "posix", and spell out "vers=". Test plan: None. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit ad3c1b47b41f19caff1bd555f7cf05f9c09cbace Author: Chuck Lever Date: Sun Feb 26 00:54:06 2006 -0500 NFS: show retransmit settings when displaying mount options Sometimes it's important to know the exact RPC retransmit settings the kernel is using for an NFS mount point. Add this facility to the NFS client's show_options method. Test plan: Set various retransmit settings via the mount command, and check that the settings are reflected in /proc/mounts. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit 8596a3f93ea9e2dcae1121b22b14da9cdc6bd2a9 Author: Chuck Lever Date: Sun Feb 26 00:54:06 2006 -0500 VFS: New /proc file /proc/self/mountstats Create a new file under /proc/self, called mountstats, where mounted file systems can export information (configuration options, performance counters, and so on). Use a mechanism similar to /proc/mounts and s_ops->show_options. This mechanism does not violate namespace security, and is safe to use while other processes are unmounting file systems. Thanks to Mike Waychison for his review and comments. Test-plan: Test concurrent mount/unmount operations while cat'ing /proc/self/mountstats. 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust commit d57002502881ae995cb97b3867d12503bd48632b Author: Levent Serinol Date: Sun Feb 26 00:54:05 2006 -0500 SUNRPC: more verbose output for rpc auth weak error This patch adds server ip address to be printed out when "server requires stronger authentication" error occurred. Signed-off-by: Levent Serinol Signed-off-by: Trond Myklebust commit ca03147251bc52765a6f62a7734c28f7c7c0e9e5 Author: Goldwyn Rodrigues Date: Sun Feb 26 00:54:05 2006 -0500 NFS: Code comments update in NFS read_cache_mtime is no longer used in nfs_inode. This patch removes references of read_cache_mtime in the code comments. Signed-off-by: Goldwyn Rodrigues Signed-off-by: Trond Myklebust commit b09ea2e98e0a9979162fd06c0117de5a527d7df6 Author: Ingo Molnar Date: Sun Feb 26 00:54:05 2006 -0500 NFS: sem2mutex idmap.c semaphore to mutex conversion. the conversion was generated via scripts, and the result was validated automatically via a script as well. build and boot tested. Signed-off-by: Ingo Molnar Signed-off-by: Trond Myklebust commit 1ccfa35b8fb2016d4dd79b9ea6c32cc80fdfab47 Author: Eric Sesterhenn Date: Sun Feb 26 00:54:04 2006 -0500 NFS: kzalloc conversion in fs/nfs this converts fs/nfs to kzalloc() usage. 
compile tested with make allyesconfig Signed-off-by: Eric Sesterhenn Signed-off-by: Trond Myklebust commit 0e3b69dd16d1c3a89e709bcc2e3118c87b2c3b8f Author: Trond Myklebust Date: Sun Feb 26 00:54:04 2006 -0500 NFSv4: Kill braindead gcc warnings nfs4_open_revalidate: 'res' may be used uninitialized nfs4_callback_compound: ‘hdr_res.nops’ may be used uninitialized 'op_nr’ may be used uninitialized encode_getattr_res: ‘savep’ may be used uninitialized Signed-off-by: Trond Myklebust commit 756d43190cf19cb41c129fba5b600b9418b640d8 Author: Trond Myklebust Date: Sun Feb 26 00:54:03 2006 -0500 NFSv4: Do not call rpciod_down() before call to destroy_nfsv4_state() The reason is that the idmapper cleanup may call flush_workqueue() on rpciod_workqueue. Signed-off-by: Trond Myklebust commit 2f8b1bd97a1cc37e2b19441b19dc63e9d3a82ca3 Author: Trond Myklebust Date: Sun Feb 26 00:54:03 2006 -0500 SUNRPC: Ensure that rpc_mkpipe returns a refcounted dentry If not, we cannot guarantee that idmap->idmap_dentry, gss_auth->dentry and clnt->cl_dentry are valid dentries. Signed-off-by: Trond Myklebust commit 68c38b5b83a9a50d58d5fbbd3fdb9bbbda55fe55 Author: Trond Myklebust Date: Sun Feb 26 00:54:02 2006 -0500 SUNRPC: Run rpci->queue_timeout on the rpciod workqueue instead of generic Signed-off-by: Trond Myklebust commit c26b8b32e0c0a128095a95ded4e952c7393c1cc8 Author: Olaf Kirch Date: Sun Feb 26 00:54:02 2006 -0500 SUNRPC: Auto-load RPC authentication kernel modules This patch adds a request_module call to rpcauth_create which will try to auto-load the kernel module for the requested authentication flavor. For kernels with modular sunrpc, this reduces the admin overhead for the user. Signed-off-by: Olaf Kirch Signed-off-by: Trond Myklebust commit 9fbc256bee4b711f164d197fcef6dfb661b51525 Author: Trond Myklebust Date: Sun Feb 26 00:54:02 2006 -0500 NFS: reduce the number of false cache invalidations. 
Signed-off-by: Trond Myklebust commit 698dea5e1844a086242f775068492bcdf9986e1a Author: Jesper Juhl Date: Sun Feb 26 00:54:01 2006 -0500 NFS: "const static" vs "static const" in nfs4 My previous "const static" vs "static const" cleanup missed a single case, patch below takes care of it. Signed-off-by: Jesper Juhl Signed-off-by: Trond Myklebust commit 546224f3c01abbb695d55ba4acfa622871937924 Author: Trond Myklebust Date: Sun Feb 26 00:54:01 2006 -0500 NFSv4: Don't invalidate cached attributes if change attribute is unchanged Signed-off-by: Trond Myklebust commit f32275936040419817b28f4b60a0c3f1e2f2466e Author: Trond Myklebust Date: Sun Feb 26 00:54:00 2006 -0500 NFS: writes should not clobber utimes() calls Ensure that we flush out writes in the case when someone calls utimes() in order to set the file times. Signed-off-by: Trond Myklebust commit 40a69a3c115e8935b9f185d931b5a26675fdac62 Author: Trond Myklebust Date: Sun Feb 26 00:54:00 2006 -0500 lockd: Don't expose the process pid to the NLM server Instead we use the nlm_lockowner->pid. Signed-off-by: Trond Myklebust commit d35e35b4749e45bf0e37a2618047e6f736b65347 Author: Trond Myklebust Date: Sun Feb 26 00:54:00 2006 -0500 NLM: nlm_alloc_call should not immediately fail on signal Currently, nlm_alloc_call tests for a signal before it even tries to allocate memory. Fix it so that it tries at least once. Signed-off-by: Trond Myklebust commit 2c09dbcbdb0ba80f2561e4f351ef385e8c44bd0d Author: Trond Myklebust Date: Sun Feb 26 00:53:59 2006 -0500 VFS: Fix __posix_lock_file() copy of private lock area The struct file_lock->fl_u area must be copied using the fl_copy_lock() operation. Signed-off-by: Trond Myklebust commit d1b9ed669737a5a0e1f6c3f1058e91432a999ffa Author: Trond Myklebust Date: Sun Feb 26 00:53:59 2006 -0500 NLM: Ensure we do not Oops in the case of an unlock In theory, NLM specs assure us that the server will only reply LCK_GRANTED or LCK_DENIED_GRACE_PERIOD to our NLM_UNLOCK request. 
In practice, we should not assume this to be the case, and the code will currently Oops if we do. Signed-off-by: Trond Myklebust commit 301833719b740cd5225ea74db683a8d3940cbd25 Author: Neil Brown Date: Sun Feb 26 00:53:58 2006 -0500 NFS: Fix buglet in fs/nfs/write.c I've been reading through fs/nfs/write.c trying to track down a bug that seems to be related to pages losing a refcount and getting freed too early (you interested in detail??) and I spotted a little bug which the following patch should fix. Signed-off-by: Neil Brown Signed-off-by: Trond Myklebust commit 955df0b8648f89ee91ddfa31654b523fbdce5bde Author: Trond Myklebust Date: Sun Feb 26 00:53:56 2006 -0500 NFS: Avoid races between writebacks and truncation Currently, there is no serialisation between NFS asynchronous writebacks and truncation at the page level due to the fact that nfs_sync_inode() cannot lock the pages that it is about to write out. This means that it is possible to be flushing out data (and calling something like set_page_writeback()) while the page cache is busy evicting the page. Oops... Use the hooks provided in try_to_release_page() to ensure that dirty pages are always written back to storage before we evict them. Signed-off-by: Trond Myklebust commit 977fe9d26b414e8410998bc457b57e14fdeb4d3a Author: Trond Myklebust Date: Sun Feb 26 00:53:56 2006 -0500 NFS: Fix a potential panic in O_DIRECT Based on an original patch by Mike O'Connor and Greg Banks of SGI. Mike states: A normal user can panic an NFS client and cause a local DoS with 'judicious'(?) use of O_DIRECT. Any O_DIRECT write to an NFS file where the user buffer starts with a valid mapped page and contains an unmapped page, will crash in this way. I haven't followed the code, but O_DIRECT reads with similar user buffers will probably also crash albeit in different ways. 
Details: when nfs_get_user_pages() calls get_user_pages(), it detects and correctly handles get_user_pages() returning an error, which happens if the first page covered by the user buffer's address range is unmapped. However, if the first page is mapped but some subsequent page isn't, get_user_pages() will return a positive number which is less than the number of pages requested (this behaviour is sort of analogous to a short write() call and appears to be intentional). nfs_get_user_pages() doesn't detect this and hands off the array of pages (whose last few elements are random rubbish from the newly allocated array memory) to its caller, whence they go to nfs_direct_write_seg(), which then totally ignores the nr_pages it's given, and calculates its own idea of how many pages are in the array from the user buffer length. Needless to say, when it comes to transmit those uninitialised page* pointers, we see a crash in the network stack. Signed-off-by: Trond Myklebust --- diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index da6354b..bce7444 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -44,32 +44,25 @@ static LIST_HEAD(nlm_blocked); /* * Queue up a lock for blocking so that the GRANTED request can see it */ -int nlmclnt_prepare_block(struct nlm_rqst *req, struct nlm_host *host, struct file_lock *fl) +struct nlm_wait *nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *fl) { struct nlm_wait *block; - BUG_ON(req->a_block != NULL); block = kmalloc(sizeof(*block), GFP_KERNEL); - if (block == NULL) - return -ENOMEM; - block->b_host = host; - block->b_lock = fl; - init_waitqueue_head(&block->b_wait); - block->b_status = NLM_LCK_BLOCKED; - - list_add(&block->b_list, &nlm_blocked); - req->a_block = block; - - return 0; + if (block != NULL) { + block->b_host = host; + block->b_lock = fl; + init_waitqueue_head(&block->b_wait); + block->b_status = NLM_LCK_BLOCKED; + list_add(&block->b_list, &nlm_blocked); + } + return block; } -void 
nlmclnt_finish_block(struct nlm_rqst *req) +void nlmclnt_finish_block(struct nlm_wait *block) { - struct nlm_wait *block = req->a_block; - if (block == NULL) return; - req->a_block = NULL; list_del(&block->b_list); kfree(block); } @@ -77,15 +70,14 @@ void nlmclnt_finish_block(struct nlm_rqs /* * Block on a lock */ -long nlmclnt_block(struct nlm_rqst *req, long timeout) +int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) { - struct nlm_wait *block = req->a_block; long ret; /* A borken server might ask us to block even if we didn't * request it. Just say no! */ - if (!req->a_args.block) + if (block == NULL) return -EAGAIN; /* Go to sleep waiting for GRANT callback. Some servers seem @@ -99,13 +91,10 @@ long nlmclnt_block(struct nlm_rqst *req, ret = wait_event_interruptible_timeout(block->b_wait, block->b_status != NLM_LCK_BLOCKED, timeout); - - if (block->b_status != NLM_LCK_BLOCKED) { - req->a_res.status = block->b_status; - block->b_status = NLM_LCK_BLOCKED; - } - - return ret; + if (ret < 0) + return -ERESTARTSYS; + req->a_res.status = block->b_status; + return 0; } /* @@ -125,7 +114,15 @@ u32 nlmclnt_grant(const struct sockaddr_ list_for_each_entry(block, &nlm_blocked, b_list) { struct file_lock *fl_blocked = block->b_lock; - if (!nlm_compare_locks(fl_blocked, fl)) + if (fl_blocked->fl_start != fl->fl_start) + continue; + if (fl_blocked->fl_end != fl->fl_end) + continue; + /* + * Careful! The NLM server will return the 32-bit "pid" that + * we put on the wire: in this case the lockowner "pid". + */ + if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) continue; if (!nlm_cmp_addr(&block->b_host->h_addr, addr)) continue; @@ -147,34 +144,6 @@ u32 nlmclnt_grant(const struct sockaddr_ */ /* - * Mark the locks for reclaiming. - * FIXME: In 2.5 we don't want to iterate through any global file_lock_list. - * Maintain NLM lock reclaiming lists in the nlm_host instead. 
- */ -static -void nlmclnt_mark_reclaim(struct nlm_host *host) -{ - struct file_lock *fl; - struct inode *inode; - struct list_head *tmp; - - list_for_each(tmp, &file_lock_list) { - fl = list_entry(tmp, struct file_lock, fl_link); - - inode = fl->fl_file->f_dentry->d_inode; - if (inode->i_sb->s_magic != NFS_SUPER_MAGIC) - continue; - if (fl->fl_u.nfs_fl.owner == NULL) - continue; - if (fl->fl_u.nfs_fl.owner->host != host) - continue; - if (!(fl->fl_u.nfs_fl.flags & NFS_LCK_GRANTED)) - continue; - fl->fl_u.nfs_fl.flags |= NFS_LCK_RECLAIM; - } -} - -/* * Someone has sent us an SM_NOTIFY. Ensure we bind to the new port number, * that we mark locks for reclaiming, and that we bump the pseudo NSM state. */ @@ -186,7 +155,12 @@ void nlmclnt_prepare_reclaim(struct nlm_ host->h_state++; host->h_nextrebind = 0; nlm_rebind_host(host); - nlmclnt_mark_reclaim(host); + + /* + * Mark the locks for reclaiming. + */ + list_splice_init(&host->h_granted, &host->h_reclaim); + dprintk("NLM: reclaiming locks for host %s", host->h_name); } @@ -215,9 +189,7 @@ reclaimer(void *ptr) { struct nlm_host *host = (struct nlm_host *) ptr; struct nlm_wait *block; - struct list_head *tmp; - struct file_lock *fl; - struct inode *inode; + struct file_lock *fl, *next; daemonize("%s-reclaim", host->h_name); allow_signal(SIGKILL); @@ -229,23 +201,13 @@ reclaimer(void *ptr) /* First, reclaim all locks that have been marked. 
*/ restart: - list_for_each(tmp, &file_lock_list) { - fl = list_entry(tmp, struct file_lock, fl_link); + list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) { + list_del_init(&fl->fl_u.nfs_fl.list); - inode = fl->fl_file->f_dentry->d_inode; - if (inode->i_sb->s_magic != NFS_SUPER_MAGIC) - continue; - if (fl->fl_u.nfs_fl.owner == NULL) - continue; - if (fl->fl_u.nfs_fl.owner->host != host) - continue; - if (!(fl->fl_u.nfs_fl.flags & NFS_LCK_RECLAIM)) - continue; - - fl->fl_u.nfs_fl.flags &= ~NFS_LCK_RECLAIM; - nlmclnt_reclaim(host, fl); if (signalled()) - break; + continue; + if (nlmclnt_reclaim(host, fl) == 0) + list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted); goto restart; } diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 220058d..f96e381 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -132,59 +132,18 @@ static void nlmclnt_setlockargs(struct n memcpy(&lock->fh, NFS_FH(fl->fl_file->f_dentry->d_inode), sizeof(struct nfs_fh)); lock->caller = system_utsname.nodename; lock->oh.data = req->a_owner; - lock->oh.len = sprintf(req->a_owner, "%d@%s", - current->pid, system_utsname.nodename); - locks_copy_lock(&lock->fl, fl); + lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s", + (unsigned int)fl->fl_u.nfs_fl.owner->pid, + system_utsname.nodename); + lock->svid = fl->fl_u.nfs_fl.owner->pid; + lock->fl.fl_start = fl->fl_start; + lock->fl.fl_end = fl->fl_end; + lock->fl.fl_type = fl->fl_type; } static void nlmclnt_release_lockargs(struct nlm_rqst *req) { - struct file_lock *fl = &req->a_args.lock.fl; - - if (fl->fl_ops && fl->fl_ops->fl_release_private) - fl->fl_ops->fl_release_private(fl); -} - -/* - * Initialize arguments for GRANTED call. The nlm_rqst structure - * has been cleared already. 
- */ -int -nlmclnt_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock) -{ - locks_copy_lock(&call->a_args.lock.fl, &lock->fl); - memcpy(&call->a_args.lock.fh, &lock->fh, sizeof(call->a_args.lock.fh)); - call->a_args.lock.caller = system_utsname.nodename; - call->a_args.lock.oh.len = lock->oh.len; - - /* set default data area */ - call->a_args.lock.oh.data = call->a_owner; - - if (lock->oh.len > NLMCLNT_OHSIZE) { - void *data = kmalloc(lock->oh.len, GFP_KERNEL); - if (!data) { - nlmclnt_freegrantargs(call); - return 0; - } - call->a_args.lock.oh.data = (u8 *) data; - } - - memcpy(call->a_args.lock.oh.data, lock->oh.data, lock->oh.len); - return 1; -} - -void -nlmclnt_freegrantargs(struct nlm_rqst *call) -{ - struct file_lock *fl = &call->a_args.lock.fl; - /* - * Check whether we allocated memory for the owner. - */ - if (call->a_args.lock.oh.data != (u8 *) call->a_owner) { - kfree(call->a_args.lock.oh.data); - } - if (fl->fl_ops && fl->fl_ops->fl_release_private) - fl->fl_ops->fl_release_private(fl); + BUG_ON(req->a_args.lock.fl.fl_ops != NULL); } /* @@ -193,9 +152,8 @@ nlmclnt_freegrantargs(struct nlm_rqst *c int nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl) { - struct nfs_server *nfssrv = NFS_SERVER(inode); struct nlm_host *host; - struct nlm_rqst reqst, *call = &reqst; + struct nlm_rqst *call; sigset_t oldset; unsigned long flags; int status, proto, vers; @@ -209,23 +167,17 @@ nlmclnt_proc(struct inode *inode, int cm /* Retrieve transport protocol from NFS client */ proto = NFS_CLIENT(inode)->cl_xprt->prot; - if (!(host = nlmclnt_lookup_host(NFS_ADDR(inode), proto, vers))) + host = nlmclnt_lookup_host(NFS_ADDR(inode), proto, vers); + if (host == NULL) return -ENOLCK; - /* Create RPC client handle if not there, and copy soft - * and intr flags from NFS client. 
*/ - if (host->h_rpcclnt == NULL) { - struct rpc_clnt *clnt; - - /* Bind an rpc client to this host handle (does not - * perform a portmapper lookup) */ - if (!(clnt = nlm_bind_host(host))) { - status = -ENOLCK; - goto done; - } - clnt->cl_softrtry = nfssrv->client->cl_softrtry; - clnt->cl_intr = nfssrv->client->cl_intr; - } + call = nlm_alloc_call(host); + if (call == NULL) + return -ENOMEM; + + nlmclnt_locks_init_private(fl, host); + /* Set up the argument struct */ + nlmclnt_setlockargs(call, fl); /* Keep the old signal mask */ spin_lock_irqsave(&current->sighand->siglock, flags); @@ -238,26 +190,10 @@ nlmclnt_proc(struct inode *inode, int cm && (current->flags & PF_EXITING)) { sigfillset(&current->blocked); /* Mask all signals */ recalc_sigpending(); - spin_unlock_irqrestore(&current->sighand->siglock, flags); - call = nlmclnt_alloc_call(); - if (!call) { - status = -ENOMEM; - goto out_restore; - } call->a_flags = RPC_TASK_ASYNC; - } else { - spin_unlock_irqrestore(&current->sighand->siglock, flags); - memset(call, 0, sizeof(*call)); - locks_init_lock(&call->a_args.lock.fl); - locks_init_lock(&call->a_res.lock.fl); } - call->a_host = host; - - nlmclnt_locks_init_private(fl, host); - - /* Set up the argument struct */ - nlmclnt_setlockargs(call, fl); + spin_unlock_irqrestore(&current->sighand->siglock, flags); if (IS_SETLK(cmd) || IS_SETLKW(cmd)) { if (fl->fl_type != F_UNLCK) { @@ -270,41 +206,58 @@ nlmclnt_proc(struct inode *inode, int cm else status = -EINVAL; - out_restore: + fl->fl_ops->fl_release_private(fl); + fl->fl_ops = NULL; + spin_lock_irqsave(&current->sighand->siglock, flags); current->blocked = oldset; recalc_sigpending(); spin_unlock_irqrestore(&current->sighand->siglock, flags); -done: dprintk("lockd: clnt proc returns %d\n", status); - nlm_release_host(host); return status; } EXPORT_SYMBOL(nlmclnt_proc); /* * Allocate an NLM RPC call struct + * + * Note: the caller must hold a reference to host. In case of failure, + * this reference will be released. 
*/ -struct nlm_rqst * -nlmclnt_alloc_call(void) +struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) { struct nlm_rqst *call; - while (!signalled()) { - call = (struct nlm_rqst *) kmalloc(sizeof(struct nlm_rqst), GFP_KERNEL); - if (call) { - memset(call, 0, sizeof(*call)); + for(;;) { + call = kzalloc(sizeof(*call), GFP_KERNEL); + if (call != NULL) { locks_init_lock(&call->a_args.lock.fl); locks_init_lock(&call->a_res.lock.fl); + call->a_host = host; return call; } - printk("nlmclnt_alloc_call: failed, waiting for memory\n"); + if (signalled()) + break; + printk("nlm_alloc_call: failed, waiting for memory\n"); schedule_timeout_interruptible(5*HZ); } + nlm_release_host(host); return NULL; } +void nlm_release_call(struct nlm_rqst *call) +{ + nlm_release_host(call->a_host); + nlmclnt_release_lockargs(call); + kfree(call); +} + +static void nlmclnt_rpc_release(void *data) +{ + return nlm_release_call(data); +} + static int nlm_wait_on_grace(wait_queue_head_t *queue) { DEFINE_WAIT(wait); @@ -401,57 +354,45 @@ in_grace_period: /* * Generic NLM call, async version. */ -int nlmsvc_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) +static int __nlm_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message *msg, const struct rpc_call_ops *tk_ops) { struct nlm_host *host = req->a_host; struct rpc_clnt *clnt; - struct rpc_message msg = { - .rpc_argp = &req->a_args, - .rpc_resp = &req->a_res, - }; - int status; + int status = -ENOLCK; dprintk("lockd: call procedure %d on %s (async)\n", (int)proc, host->h_name); /* If we have no RPC client yet, create one. 
*/ - if ((clnt = nlm_bind_host(host)) == NULL) - return -ENOLCK; - msg.rpc_proc = &clnt->cl_procinfo[proc]; + clnt = nlm_bind_host(host); + if (clnt == NULL) + goto out_err; + msg->rpc_proc = &clnt->cl_procinfo[proc]; /* bootstrap and kick off the async RPC call */ - status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, tk_ops, req); - + status = rpc_call_async(clnt, msg, RPC_TASK_ASYNC, tk_ops, req); + if (status == 0) + return 0; +out_err: + nlm_release_call(req); return status; } -static int nlmclnt_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) +int nlm_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) { - struct nlm_host *host = req->a_host; - struct rpc_clnt *clnt; - struct nlm_args *argp = &req->a_args; - struct nlm_res *resp = &req->a_res; struct rpc_message msg = { - .rpc_argp = argp, - .rpc_resp = resp, + .rpc_argp = &req->a_args, + .rpc_resp = &req->a_res, }; - int status; - - dprintk("lockd: call procedure %d on %s (async)\n", - (int)proc, host->h_name); - - /* If we have no RPC client yet, create one. 
*/ - if ((clnt = nlm_bind_host(host)) == NULL) - return -ENOLCK; - msg.rpc_proc = &clnt->cl_procinfo[proc]; + return __nlm_async_call(req, proc, &msg, tk_ops); +} - /* Increment host refcount */ - nlm_get_host(host); - /* bootstrap and kick off the async RPC call */ - status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, tk_ops, req); - if (status < 0) - nlm_release_host(host); - return status; +int nlm_async_reply(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) +{ + struct rpc_message msg = { + .rpc_argp = &req->a_res, + }; + return __nlm_async_call(req, proc, &msg, tk_ops); } /* @@ -463,36 +404,41 @@ nlmclnt_test(struct nlm_rqst *req, struc int status; status = nlmclnt_call(req, NLMPROC_TEST); - nlmclnt_release_lockargs(req); if (status < 0) - return status; + goto out; - status = req->a_res.status; - if (status == NLM_LCK_GRANTED) { - fl->fl_type = F_UNLCK; - } if (status == NLM_LCK_DENIED) { - /* - * Report the conflicting lock back to the application. - */ - locks_copy_lock(fl, &req->a_res.lock.fl); - fl->fl_pid = 0; - } else { - return nlm_stat_to_errno(req->a_res.status); + switch (req->a_res.status) { + case NLM_LCK_GRANTED: + fl->fl_type = F_UNLCK; + break; + case NLM_LCK_DENIED: + /* + * Report the conflicting lock back to the application. 
+ */ + fl->fl_start = req->a_res.lock.fl.fl_start; + fl->fl_end = req->a_res.lock.fl.fl_start; + fl->fl_type = req->a_res.lock.fl.fl_type; + fl->fl_pid = 0; + break; + default: + status = nlm_stat_to_errno(req->a_res.status); } - - return 0; +out: + nlm_release_call(req); + return status; } static void nlmclnt_locks_copy_lock(struct file_lock *new, struct file_lock *fl) { - memcpy(&new->fl_u.nfs_fl, &fl->fl_u.nfs_fl, sizeof(new->fl_u.nfs_fl)); - nlm_get_lockowner(new->fl_u.nfs_fl.owner); + new->fl_u.nfs_fl.state = fl->fl_u.nfs_fl.state; + new->fl_u.nfs_fl.owner = nlm_get_lockowner(fl->fl_u.nfs_fl.owner); + list_add_tail(&new->fl_u.nfs_fl.list, &fl->fl_u.nfs_fl.owner->host->h_granted); } static void nlmclnt_locks_release_private(struct file_lock *fl) { + list_del(&fl->fl_u.nfs_fl.list); nlm_put_lockowner(fl->fl_u.nfs_fl.owner); - fl->fl_ops = NULL; } static struct file_lock_operations nlmclnt_lock_ops = { @@ -504,8 +450,8 @@ static void nlmclnt_locks_init_private(s { BUG_ON(fl->fl_ops != NULL); fl->fl_u.nfs_fl.state = 0; - fl->fl_u.nfs_fl.flags = 0; fl->fl_u.nfs_fl.owner = nlm_find_lockowner(host, fl->fl_owner); + INIT_LIST_HEAD(&fl->fl_u.nfs_fl.list); fl->fl_ops = &nlmclnt_lock_ops; } @@ -552,57 +498,52 @@ nlmclnt_lock(struct nlm_rqst *req, struc { struct nlm_host *host = req->a_host; struct nlm_res *resp = &req->a_res; - long timeout; - int status; + struct nlm_wait *block = NULL; + int status = -ENOLCK; if (!host->h_monitored && nsm_monitor(host) < 0) { printk(KERN_NOTICE "lockd: failed to monitor %s\n", host->h_name); - status = -ENOLCK; goto out; } - if (req->a_args.block) { - status = nlmclnt_prepare_block(req, host, fl); - if (status < 0) - goto out; - } + block = nlmclnt_prepare_block(host, fl); for(;;) { status = nlmclnt_call(req, NLMPROC_LOCK); if (status < 0) goto out_unblock; - if (resp->status != NLM_LCK_BLOCKED) + if (!req->a_args.block) break; - /* Wait on an NLM blocking lock */ - timeout = nlmclnt_block(req, NLMCLNT_POLL_TIMEOUT); /* Did a reclaimer 
thread notify us of a server reboot? */ if (resp->status == NLM_LCK_DENIED_GRACE_PERIOD) continue; if (resp->status != NLM_LCK_BLOCKED) break; - if (timeout >= 0) - continue; - /* We were interrupted. Send a CANCEL request to the server + /* Wait on an NLM blocking lock */ + status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT); + /* if we were interrupted. Send a CANCEL request to the server * and exit */ - status = (int)timeout; - goto out_unblock; + if (status < 0) + goto out_unblock; + if (resp->status != NLM_LCK_BLOCKED) + break; } if (resp->status == NLM_LCK_GRANTED) { fl->fl_u.nfs_fl.state = host->h_state; - fl->fl_u.nfs_fl.flags |= NFS_LCK_GRANTED; fl->fl_flags |= FL_SLEEP; + /* Ensure the resulting lock will get added to granted list */ do_vfs_lock(fl); } status = nlm_stat_to_errno(resp->status); out_unblock: - nlmclnt_finish_block(req); + nlmclnt_finish_block(block); /* Cancel the blocked request if it is still pending */ if (resp->status == NLM_LCK_BLOCKED) nlmclnt_cancel(host, req->a_args.block, fl); out: - nlmclnt_release_lockargs(req); + nlm_release_call(req); return status; } @@ -658,38 +599,31 @@ nlmclnt_unlock(struct nlm_rqst *req, str struct nlm_res *resp = &req->a_res; int status; - /* Clean the GRANTED flag now so the lock doesn't get - * reclaimed while we're stuck in the unlock call. */ - fl->fl_u.nfs_fl.flags &= ~NFS_LCK_GRANTED; - - if (req->a_flags & RPC_TASK_ASYNC) { - status = nlmclnt_async_call(req, NLMPROC_UNLOCK, - &nlmclnt_unlock_ops); - /* Hrmf... Do the unlock early since locks_remove_posix() - * really expects us to free the lock synchronously */ - do_vfs_lock(fl); - if (status < 0) { - nlmclnt_release_lockargs(req); - kfree(req); - } - return status; - } + /* + * Note: the server is supposed to either grant us the unlock + * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either + * case, we want to unlock. 
+ */ + do_vfs_lock(fl); + + if (req->a_flags & RPC_TASK_ASYNC) + return nlm_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); status = nlmclnt_call(req, NLMPROC_UNLOCK); - nlmclnt_release_lockargs(req); if (status < 0) - return status; + goto out; - do_vfs_lock(fl); + status = 0; if (resp->status == NLM_LCK_GRANTED) - return 0; + goto out; if (resp->status != NLM_LCK_DENIED_NOLOCKS) printk("lockd: unexpected unlock status: %d\n", resp->status); - /* What to do now? I'm out of my depth... */ - - return -ENOLCK; + status = -ENOLCK; +out: + nlm_release_call(req); + return status; } static void nlmclnt_unlock_callback(struct rpc_task *task, void *data) @@ -711,9 +645,6 @@ static void nlmclnt_unlock_callback(stru if (status != NLM_LCK_GRANTED) printk(KERN_WARNING "lockd: unexpected unlock status: %d\n", status); die: - nlm_release_host(req->a_host); - nlmclnt_release_lockargs(req); - kfree(req); return; retry_rebind: nlm_rebind_host(req->a_host); @@ -723,6 +654,7 @@ die: static const struct rpc_call_ops nlmclnt_unlock_ops = { .rpc_call_done = nlmclnt_unlock_callback, + .rpc_release = nlmclnt_rpc_release, }; /* @@ -744,20 +676,15 @@ static int nlmclnt_cancel(struct nlm_hos recalc_sigpending(); spin_unlock_irqrestore(&current->sighand->siglock, flags); - req = nlmclnt_alloc_call(); + req = nlm_alloc_call(nlm_get_host(host)); if (!req) return -ENOMEM; - req->a_host = host; req->a_flags = RPC_TASK_ASYNC; nlmclnt_setlockargs(req, fl); req->a_args.block = block; - status = nlmclnt_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops); - if (status < 0) { - nlmclnt_release_lockargs(req); - kfree(req); - } + status = nlm_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops); spin_lock_irqsave(&current->sighand->siglock, flags); current->blocked = oldset; @@ -786,6 +713,7 @@ static void nlmclnt_cancel_callback(stru switch (req->a_res.status) { case NLM_LCK_GRANTED: case NLM_LCK_DENIED_GRACE_PERIOD: + case NLM_LCK_DENIED: /* Everything's good */ break; case NLM_LCK_DENIED_NOLOCKS: @@ 
-797,9 +725,6 @@ static void nlmclnt_cancel_callback(stru } die: - nlm_release_host(req->a_host); - nlmclnt_release_lockargs(req); - kfree(req); return; retry_cancel: @@ -813,6 +738,7 @@ retry_cancel: static const struct rpc_call_ops nlmclnt_cancel_ops = { .rpc_call_done = nlmclnt_cancel_callback, + .rpc_release = nlmclnt_rpc_release, }; /* diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 82f7a0b..112ebf8 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -123,6 +123,8 @@ nlm_lookup_host(int server, struct socka nlm_hosts[hash] = host; INIT_LIST_HEAD(&host->h_lockowners); spin_lock_init(&host->h_lock); + INIT_LIST_HEAD(&host->h_granted); + INIT_LIST_HEAD(&host->h_reclaim); if (++nrhosts > NLM_HOST_MAX) next_gc = 0; @@ -191,11 +193,12 @@ nlm_bind_host(struct nlm_host *host) xprt->resvport = 1; /* NLM requires a reserved port */ /* Existing NLM servers accept AUTH_UNIX only */ - clnt = rpc_create_client(xprt, host->h_name, &nlm_program, + clnt = rpc_new_client(xprt, host->h_name, &nlm_program, host->h_version, RPC_AUTH_UNIX); if (IS_ERR(clnt)) goto forgetit; clnt->cl_autobind = 1; /* turn on pmap queries */ + clnt->cl_softrtry = 1; /* All queries are soft */ host->h_rpcclnt = clnt; } @@ -242,8 +245,12 @@ void nlm_release_host(struct nlm_host *h { if (host != NULL) { dprintk("lockd: release host %s\n", host->h_name); - atomic_dec(&host->h_count); BUG_ON(atomic_read(&host->h_count) < 0); + if (atomic_dec_and_test(&host->h_count)) { + BUG_ON(!list_empty(&host->h_lockowners)); + BUG_ON(!list_empty(&host->h_granted)); + BUG_ON(!list_empty(&host->h_reclaim)); + } } } @@ -331,7 +338,6 @@ nlm_gc_hosts(void) rpc_destroy_client(host->h_rpcclnt); } } - BUG_ON(!list_empty(&host->h_lockowners)); kfree(host); nrhosts--; } diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 0edc03e..5dd52b7 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -35,6 +35,10 @@ nsm_mon_unmon(struct nlm_host *host, u32 struct rpc_clnt *clnt; int status; struct nsm_args args; + struct rpc_message 
msg = { + .rpc_argp = &args, + .rpc_resp = res, + }; clnt = nsm_create(); if (IS_ERR(clnt)) { @@ -49,7 +53,8 @@ nsm_mon_unmon(struct nlm_host *host, u32 args.proc = NLMPROC_NSM_NOTIFY; memset(res, 0, sizeof(*res)); - status = rpc_call(clnt, proc, &args, res, 0); + msg.rpc_proc = &clnt->cl_procinfo[proc]; + status = rpc_call_sync(clnt, &msg, 0); if (status < 0) printk(KERN_DEBUG "nsm_mon_unmon: rpc failed, status=%d\n", status); @@ -214,12 +219,16 @@ static struct rpc_procinfo nsm_procedure .p_encode = (kxdrproc_t) xdr_encode_mon, .p_decode = (kxdrproc_t) xdr_decode_stat_res, .p_bufsiz = MAX(SM_mon_sz, SM_monres_sz) << 2, + .p_statidx = SM_MON, + .p_name = "MONITOR", }, [SM_UNMON] = { .p_proc = SM_UNMON, .p_encode = (kxdrproc_t) xdr_encode_unmon, .p_decode = (kxdrproc_t) xdr_decode_stat, .p_bufsiz = MAX(SM_mon_id_sz, SM_unmonres_sz) << 2, + .p_statidx = SM_UNMON, + .p_name = "UNMONITOR", }, }; diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index b10f913..a2dd9cc 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -21,10 +21,6 @@ #define NLMDBG_FACILITY NLMDBG_CLIENT -static u32 nlm4svc_callback(struct svc_rqst *, u32, struct nlm_res *); - -static const struct rpc_call_ops nlm4svc_callback_ops; - /* * Obtain client and file from arguments */ @@ -234,83 +230,89 @@ nlm4svc_proc_granted(struct svc_rqst *rq } /* + * This is the generic lockd callback for async RPC calls + */ +static void nlm4svc_callback_exit(struct rpc_task *task, void *data) +{ + dprintk("lockd: %4d callback returned %d\n", task->tk_pid, + -task->tk_status); +} + +static void nlm4svc_callback_release(void *data) +{ + nlm_release_call(data); +} + +static const struct rpc_call_ops nlm4svc_callback_ops = { + .rpc_call_done = nlm4svc_callback_exit, + .rpc_release = nlm4svc_callback_release, +}; + +/* * `Async' versions of the above service routines. They aren't really, * because we send the callback before the reply proper. I hope this * doesn't break any clients. 
*/ -static int -nlm4svc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp, - void *resp) +static int nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp, + int (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res *)) { - struct nlm_res res; - u32 stat; + struct nlm_host *host; + struct nlm_rqst *call; + int stat; - dprintk("lockd: TEST_MSG called\n"); - memset(&res, 0, sizeof(res)); + host = nlmsvc_lookup_host(rqstp); + if (host == NULL) + return rpc_system_err; + + call = nlm_alloc_call(host); + if (call == NULL) + return rpc_system_err; - if ((stat = nlm4svc_proc_test(rqstp, argp, &res)) == 0) - stat = nlm4svc_callback(rqstp, NLMPROC_TEST_RES, &res); - return stat; + stat = func(rqstp, argp, &call->a_res); + if (stat != 0) { + nlm_release_call(call); + return stat; + } + + call->a_flags = RPC_TASK_ASYNC; + if (nlm_async_reply(call, proc, &nlm4svc_callback_ops) < 0) + return rpc_system_err; + return rpc_success; } -static int -nlm4svc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, +static int nlm4svc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp, void *resp) { - struct nlm_res res; - u32 stat; + dprintk("lockd: TEST_MSG called\n"); + return nlm4svc_callback(rqstp, NLMPROC_TEST_RES, argp, nlm4svc_proc_test); +} +static int nlm4svc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ dprintk("lockd: LOCK_MSG called\n"); - memset(&res, 0, sizeof(res)); - - if ((stat = nlm4svc_proc_lock(rqstp, argp, &res)) == 0) - stat = nlm4svc_callback(rqstp, NLMPROC_LOCK_RES, &res); - return stat; + return nlm4svc_callback(rqstp, NLMPROC_LOCK_RES, argp, nlm4svc_proc_lock); } -static int -nlm4svc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp, +static int nlm4svc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp, void *resp) { - struct nlm_res res; - u32 stat; - dprintk("lockd: CANCEL_MSG called\n"); - memset(&res, 0, sizeof(res)); - - if ((stat = 
nlm4svc_proc_cancel(rqstp, argp, &res)) == 0) - stat = nlm4svc_callback(rqstp, NLMPROC_CANCEL_RES, &res); - return stat; + return nlm4svc_callback(rqstp, NLMPROC_CANCEL_RES, argp, nlm4svc_proc_cancel); } -static int -nlm4svc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, +static int nlm4svc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, void *resp) { - struct nlm_res res; - u32 stat; - dprintk("lockd: UNLOCK_MSG called\n"); - memset(&res, 0, sizeof(res)); - - if ((stat = nlm4svc_proc_unlock(rqstp, argp, &res)) == 0) - stat = nlm4svc_callback(rqstp, NLMPROC_UNLOCK_RES, &res); - return stat; + return nlm4svc_callback(rqstp, NLMPROC_UNLOCK_RES, argp, nlm4svc_proc_unlock); } -static int -nlm4svc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp, +static int nlm4svc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp, void *resp) { - struct nlm_res res; - u32 stat; - dprintk("lockd: GRANTED_MSG called\n"); - memset(&res, 0, sizeof(res)); - - if ((stat = nlm4svc_proc_granted(rqstp, argp, &res)) == 0) - stat = nlm4svc_callback(rqstp, NLMPROC_GRANTED_RES, &res); - return stat; + return nlm4svc_callback(rqstp, NLMPROC_GRANTED_RES, argp, nlm4svc_proc_granted); } /* @@ -472,55 +474,6 @@ nlm4svc_proc_granted_res(struct svc_rqst /* - * This is the generic lockd callback for async RPC calls - */ -static u32 -nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_res *resp) -{ - struct nlm_host *host; - struct nlm_rqst *call; - - if (!(call = nlmclnt_alloc_call())) - return rpc_system_err; - - host = nlmclnt_lookup_host(&rqstp->rq_addr, - rqstp->rq_prot, rqstp->rq_vers); - if (!host) { - kfree(call); - return rpc_system_err; - } - - call->a_flags = RPC_TASK_ASYNC; - call->a_host = host; - memcpy(&call->a_args, resp, sizeof(*resp)); - - if (nlmsvc_async_call(call, proc, &nlm4svc_callback_ops) < 0) - goto error; - - return rpc_success; - error: - kfree(call); - nlm_release_host(host); - return rpc_system_err; -} - 
-static void nlm4svc_callback_exit(struct rpc_task *task, void *data) -{ - struct nlm_rqst *call = data; - - if (task->tk_status < 0) { - dprintk("lockd: %4d callback failed (errno = %d)\n", - task->tk_pid, -task->tk_status); - } - nlm_release_host(call->a_host); - kfree(call); -} - -static const struct rpc_call_ops nlm4svc_callback_ops = { - .rpc_call_done = nlm4svc_callback_exit, -}; - -/* * NLM Server procedures. */ diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 9cfced6..1432d52 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -39,9 +39,12 @@ #define nlm_deadlock nlm_lck_denied #endif +static void nlmsvc_release_block(struct nlm_block *block); static void nlmsvc_insert_block(struct nlm_block *block, unsigned long); static int nlmsvc_remove_block(struct nlm_block *block); +static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock); +static void nlmsvc_freegrantargs(struct nlm_rqst *call); static const struct rpc_call_ops nlmsvc_grant_ops; /* @@ -58,6 +61,7 @@ nlmsvc_insert_block(struct nlm_block *bl struct nlm_block **bp, *b; dprintk("lockd: nlmsvc_insert_block(%p, %ld)\n", block, when); + kref_get(&block->b_count); if (block->b_queued) nlmsvc_remove_block(block); bp = &nlm_blocked; @@ -90,6 +94,7 @@ nlmsvc_remove_block(struct nlm_block *bl if (b == block) { *bp = block->b_next; block->b_queued = 0; + nlmsvc_release_block(block); return 1; } } @@ -112,17 +117,18 @@ nlmsvc_lookup_block(struct nlm_file *fil (long long)lock->fl.fl_start, (long long)lock->fl.fl_end, lock->fl.fl_type); for (head = &nlm_blocked; (block = *head) != 0; head = &block->b_next) { - fl = &block->b_call.a_args.lock.fl; + fl = &block->b_call->a_args.lock.fl; dprintk("lockd: check f=%p pd=%d %Ld-%Ld ty=%d cookie=%s\n", block->b_file, fl->fl_pid, (long long)fl->fl_start, (long long)fl->fl_end, fl->fl_type, - nlmdbg_cookie2a(&block->b_call.a_args.cookie)); + nlmdbg_cookie2a(&block->b_call->a_args.cookie)); if (block->b_file == file && nlm_compare_locks(fl, 
&lock->fl)) { if (remove) { *head = block->b_next; block->b_queued = 0; } + kref_get(&block->b_count); return block; } } @@ -150,11 +156,13 @@ nlmsvc_find_block(struct nlm_cookie *coo for (block = nlm_blocked; block; block = block->b_next) { dprintk("cookie: head of blocked queue %p, block %p\n", nlm_blocked, block); - if (nlm_cookie_match(&block->b_call.a_args.cookie,cookie) + if (nlm_cookie_match(&block->b_call->a_args.cookie,cookie) && nlm_cmp_addr(sin, &block->b_host->h_addr)) break; } + if (block != NULL) + kref_get(&block->b_count); return block; } @@ -174,27 +182,30 @@ nlmsvc_create_block(struct svc_rqst *rqs { struct nlm_block *block; struct nlm_host *host; - struct nlm_rqst *call; + struct nlm_rqst *call = NULL; /* Create host handle for callback */ - host = nlmclnt_lookup_host(&rqstp->rq_addr, - rqstp->rq_prot, rqstp->rq_vers); + host = nlmsvc_lookup_host(rqstp); if (host == NULL) return NULL; + call = nlm_alloc_call(host); + if (call == NULL) + return NULL; + /* Allocate memory for block, and initialize arguments */ - if (!(block = (struct nlm_block *) kmalloc(sizeof(*block), GFP_KERNEL))) + block = kzalloc(sizeof(*block), GFP_KERNEL); + if (block == NULL) goto failed; - memset(block, 0, sizeof(*block)); - locks_init_lock(&block->b_call.a_args.lock.fl); - locks_init_lock(&block->b_call.a_res.lock.fl); + kref_init(&block->b_count); - if (!nlmclnt_setgrantargs(&block->b_call, lock)) + if (!nlmsvc_setgrantargs(call, lock)) goto failed_free; /* Set notifier function for VFS, and init args */ - block->b_call.a_args.lock.fl.fl_lmops = &nlmsvc_lock_operations; - block->b_call.a_args.cookie = *cookie; /* see above */ + call->a_args.lock.fl.fl_flags |= FL_SLEEP; + call->a_args.lock.fl.fl_lmops = &nlmsvc_lock_operations; + call->a_args.cookie = *cookie; /* see above */ dprintk("lockd: created block %p...\n", block); @@ -208,16 +219,16 @@ nlmsvc_create_block(struct svc_rqst *rqs file->f_blocks = block; /* Set up RPC arguments for callback */ - call = 
&block->b_call; - call->a_host = host; + block->b_call = call; call->a_flags = RPC_TASK_ASYNC; + call->a_block = block; return block; failed_free: kfree(block); failed: - nlm_release_host(host); + nlm_release_call(call); return NULL; } @@ -227,28 +238,24 @@ failed: * It is the caller's responsibility to check whether the file * can be closed hereafter. */ -static int -nlmsvc_delete_block(struct nlm_block *block, int unlock) +static int nlmsvc_unlink_block(struct nlm_block *block) { - struct file_lock *fl = &block->b_call.a_args.lock.fl; - struct nlm_file *file = block->b_file; - struct nlm_block **bp; - int status = 0; - - dprintk("lockd: deleting block %p...\n", block); + int status; + dprintk("lockd: unlinking block %p...\n", block); /* Remove block from list */ + status = posix_unblock_lock(block->b_file->f_file, &block->b_call->a_args.lock.fl); nlmsvc_remove_block(block); - if (unlock) - status = posix_unblock_lock(file->f_file, fl); + return status; +} - /* If the block is in the middle of a GRANT callback, - * don't kill it yet. 
*/ - if (block->b_incall) { - nlmsvc_insert_block(block, NLM_NEVER); - block->b_done = 1; - return status; - } +static void nlmsvc_free_block(struct kref *kref) +{ + struct nlm_block *block = container_of(kref, struct nlm_block, b_count); + struct nlm_file *file = block->b_file; + struct nlm_block **bp; + + dprintk("lockd: freeing block %p...\n", block); /* Remove block from file's list of blocks */ for (bp = &file->f_blocks; *bp; bp = &(*bp)->b_fnext) { @@ -258,11 +265,15 @@ nlmsvc_delete_block(struct nlm_block *bl } } - if (block->b_host) - nlm_release_host(block->b_host); - nlmclnt_freegrantargs(&block->b_call); + nlmsvc_freegrantargs(block->b_call); + nlm_release_call(block->b_call); kfree(block); - return status; +} + +static void nlmsvc_release_block(struct nlm_block *block) +{ + if (block != NULL) + kref_put(&block->b_count, nlmsvc_free_block); } /* @@ -282,7 +293,7 @@ nlmsvc_traverse_blocks(struct nlm_host * block->b_host->h_inuse = 1; else if (action == NLM_ACT_UNLOCK) { if (host == NULL || host == block->b_host) - nlmsvc_delete_block(block, 1); + nlmsvc_unlink_block(block); } } up(&file->f_sema); @@ -290,6 +301,38 @@ nlmsvc_traverse_blocks(struct nlm_host * } /* + * Initialize arguments for GRANTED call. The nlm_rqst structure + * has been cleared already. 
+ */ +static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock) +{ + locks_copy_lock(&call->a_args.lock.fl, &lock->fl); + memcpy(&call->a_args.lock.fh, &lock->fh, sizeof(call->a_args.lock.fh)); + call->a_args.lock.caller = system_utsname.nodename; + call->a_args.lock.oh.len = lock->oh.len; + + /* set default data area */ + call->a_args.lock.oh.data = call->a_owner; + call->a_args.lock.svid = lock->fl.fl_pid; + + if (lock->oh.len > NLMCLNT_OHSIZE) { + void *data = kmalloc(lock->oh.len, GFP_KERNEL); + if (!data) + return 0; + call->a_args.lock.oh.data = (u8 *) data; + } + + memcpy(call->a_args.lock.oh.data, lock->oh.data, lock->oh.len); + return 1; +} + +static void nlmsvc_freegrantargs(struct nlm_rqst *call) +{ + if (call->a_args.lock.oh.data != call->a_owner) + kfree(call->a_args.lock.oh.data); +} + +/* * Attempt to establish a lock, and if it can't be granted, block it * if required. */ @@ -297,9 +340,9 @@ u32 nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_lock *lock, int wait, struct nlm_cookie *cookie) { - struct file_lock *conflock; - struct nlm_block *block; + struct nlm_block *block, *newblock = NULL; int error; + u32 ret; dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n", file->f_file->f_dentry->d_inode->i_sb->s_id, @@ -310,69 +353,65 @@ nlmsvc_lock(struct svc_rqst *rqstp, stru wait); - /* Get existing block (in case client is busy-waiting) */ - block = nlmsvc_lookup_block(file, lock, 0); - - lock->fl.fl_flags |= FL_LOCKD; - + lock->fl.fl_flags &= ~FL_SLEEP; again: /* Lock file against concurrent access */ down(&file->f_sema); + /* Get existing block (in case client is busy-waiting) */ + block = nlmsvc_lookup_block(file, lock, 0); + if (block == NULL) { + if (newblock != NULL) + lock = &newblock->b_call->a_args.lock; + } else + lock = &block->b_call->a_args.lock; - if (!(conflock = posix_test_lock(file->f_file, &lock->fl))) { - error = posix_lock_file(file->f_file, &lock->fl); + error = 
posix_lock_file(file->f_file, &lock->fl); + lock->fl.fl_flags &= ~FL_SLEEP; - if (block) - nlmsvc_delete_block(block, 0); - up(&file->f_sema); + dprintk("lockd: posix_lock_file returned %d\n", error); - dprintk("lockd: posix_lock_file returned %d\n", -error); - switch(-error) { + switch(error) { case 0: - return nlm_granted; - case EDEADLK: - return nlm_deadlock; - case EAGAIN: - return nlm_lck_denied; + ret = nlm_granted; + goto out; + case -EAGAIN: + break; + case -EDEADLK: + ret = nlm_deadlock; + goto out; default: /* includes ENOLCK */ - return nlm_lck_denied_nolocks; - } + ret = nlm_lck_denied_nolocks; + goto out; } - if (!wait) { - up(&file->f_sema); - return nlm_lck_denied; - } + ret = nlm_lck_denied; + if (!wait) + goto out; - if (posix_locks_deadlock(&lock->fl, conflock)) { - up(&file->f_sema); - return nlm_deadlock; - } + ret = nlm_lck_blocked; + if (block != NULL) + goto out; /* If we don't have a block, create and initialize it. Then * retry because we may have slept in kmalloc. */ /* We have to release f_sema as nlmsvc_create_block may try to * to claim it while doing host garbage collection */ - if (block == NULL) { + if (newblock == NULL) { up(&file->f_sema); dprintk("lockd: blocking on this lock (allocating).\n"); - if (!(block = nlmsvc_create_block(rqstp, file, lock, cookie))) + if (!(newblock = nlmsvc_create_block(rqstp, file, lock, cookie))) return nlm_lck_denied_nolocks; goto again; } /* Append to list of blocked */ - nlmsvc_insert_block(block, NLM_NEVER); - - if (list_empty(&block->b_call.a_args.lock.fl.fl_block)) { - /* Now add block to block list of the conflicting lock - if we haven't done so. 
*/ - dprintk("lockd: blocking on this lock.\n"); - posix_block_lock(conflock, &block->b_call.a_args.lock.fl); - } - + nlmsvc_insert_block(newblock, NLM_NEVER); +out: up(&file->f_sema); - return nlm_lck_blocked; + nlmsvc_release_block(newblock); + nlmsvc_release_block(block); + dprintk("lockd: nlmsvc_lock returned %u\n", ret); + return ret; } /* @@ -382,8 +421,6 @@ u32 nlmsvc_testlock(struct nlm_file *file, struct nlm_lock *lock, struct nlm_lock *conflock) { - struct file_lock *fl; - dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n", file->f_file->f_dentry->d_inode->i_sb->s_id, file->f_file->f_dentry->d_inode->i_ino, @@ -391,13 +428,14 @@ nlmsvc_testlock(struct nlm_file *file, s (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); - if ((fl = posix_test_lock(file->f_file, &lock->fl)) != NULL) { + if (posix_test_lock(file->f_file, &lock->fl, &conflock->fl)) { dprintk("lockd: conflicting lock(ty=%d, %Ld-%Ld)\n", - fl->fl_type, (long long)fl->fl_start, - (long long)fl->fl_end); + conflock->fl.fl_type, + (long long)conflock->fl.fl_start, + (long long)conflock->fl.fl_end); conflock->caller = "somehost"; /* FIXME */ conflock->oh.len = 0; /* don't return OH info */ - conflock->fl = *fl; + conflock->svid = conflock->fl.fl_pid; return nlm_lck_denied; } @@ -453,8 +491,10 @@ nlmsvc_cancel_blocked(struct nlm_file *f (long long)lock->fl.fl_end); down(&file->f_sema); - if ((block = nlmsvc_lookup_block(file, lock, 1)) != NULL) - status = nlmsvc_delete_block(block, 1); + if ((block = nlmsvc_lookup_block(file, lock, 1)) != NULL) { + status = nlmsvc_unlink_block(block); + nlmsvc_release_block(block); + } up(&file->f_sema); return status ? 
nlm_lck_denied : nlm_granted; } @@ -473,7 +513,7 @@ nlmsvc_notify_blocked(struct file_lock * dprintk("lockd: VFS unblock notification for block %p\n", fl); for (bp = &nlm_blocked; (block = *bp) != 0; bp = &block->b_next) { - if (nlm_compare_locks(&block->b_call.a_args.lock.fl, fl)) { + if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { nlmsvc_insert_block(block, 0); svc_wake_up(block->b_daemon); return; @@ -508,8 +548,7 @@ static void nlmsvc_grant_blocked(struct nlm_block *block) { struct nlm_file *file = block->b_file; - struct nlm_lock *lock = &block->b_call.a_args.lock; - struct file_lock *conflock; + struct nlm_lock *lock = &block->b_call->a_args.lock; int error; dprintk("lockd: grant blocked lock %p\n", block); @@ -518,7 +557,7 @@ nlmsvc_grant_blocked(struct nlm_block *b down(&file->f_sema); /* Unlink block request from list */ - nlmsvc_remove_block(block); + nlmsvc_unlink_block(block); /* If b_granted is true this means we've been here before. * Just retry the grant callback, possibly refreshing the RPC @@ -529,41 +568,38 @@ nlmsvc_grant_blocked(struct nlm_block *b } /* Try the lock operation again */ - if ((conflock = posix_test_lock(file->f_file, &lock->fl)) != NULL) { - /* Bummer, we blocked again */ + lock->fl.fl_flags |= FL_SLEEP; + error = posix_lock_file(file->f_file, &lock->fl); + lock->fl.fl_flags &= ~FL_SLEEP; + + switch (error) { + case 0: + break; + case -EAGAIN: dprintk("lockd: lock still blocked\n"); nlmsvc_insert_block(block, NLM_NEVER); - posix_block_lock(conflock, &lock->fl); - up(&file->f_sema); - return; - } - - /* Alright, no conflicting lock. Now lock it for real. If the - * following yields an error, this is most probably due to low - * memory. Retry the lock in a few seconds. 
- */ - if ((error = posix_lock_file(file->f_file, &lock->fl)) < 0) { + goto out_unlock; + default: printk(KERN_WARNING "lockd: unexpected error %d in %s!\n", -error, __FUNCTION__); nlmsvc_insert_block(block, 10 * HZ); - up(&file->f_sema); - return; + goto out_unlock; } callback: /* Lock was granted by VFS. */ dprintk("lockd: GRANTing blocked lock.\n"); block->b_granted = 1; - block->b_incall = 1; /* Schedule next grant callback in 30 seconds */ nlmsvc_insert_block(block, 30 * HZ); /* Call the client */ - nlm_get_host(block->b_call.a_host); - if (nlmsvc_async_call(&block->b_call, NLMPROC_GRANTED_MSG, + kref_get(&block->b_count); + if (nlm_async_call(block->b_call, NLMPROC_GRANTED_MSG, &nlmsvc_grant_ops) < 0) - nlm_release_host(block->b_call.a_host); + nlmsvc_release_block(block); +out_unlock: up(&file->f_sema); } @@ -578,20 +614,10 @@ callback: static void nlmsvc_grant_callback(struct rpc_task *task, void *data) { struct nlm_rqst *call = data; - struct nlm_block *block; + struct nlm_block *block = call->a_block; unsigned long timeout; - struct sockaddr_in *peer_addr = RPC_PEERADDR(task->tk_client); dprintk("lockd: GRANT_MSG RPC callback\n"); - dprintk("callback: looking for cookie %s, host (%u.%u.%u.%u)\n", - nlmdbg_cookie2a(&call->a_args.cookie), - NIPQUAD(peer_addr->sin_addr.s_addr)); - if (!(block = nlmsvc_find_block(&call->a_args.cookie, peer_addr))) { - dprintk("lockd: no block for cookie %s, host (%u.%u.%u.%u)\n", - nlmdbg_cookie2a(&call->a_args.cookie), - NIPQUAD(peer_addr->sin_addr.s_addr)); - return; - } /* Technically, we should down the file semaphore here. 
Since we * move the block towards the head of the queue only, no harm @@ -608,13 +634,16 @@ static void nlmsvc_grant_callback(struct } nlmsvc_insert_block(block, timeout); svc_wake_up(block->b_daemon); - block->b_incall = 0; +} - nlm_release_host(call->a_host); +void nlmsvc_grant_release(void *data) +{ + nlmsvc_release_block(data); } static const struct rpc_call_ops nlmsvc_grant_ops = { .rpc_call_done = nlmsvc_grant_callback, + .rpc_release = nlmsvc_grant_release, }; /* @@ -636,23 +665,19 @@ nlmsvc_grant_reply(struct svc_rqst *rqst file->f_count++; down(&file->f_sema); - block = nlmsvc_find_block(cookie, &rqstp->rq_addr); if (block) { if (status == NLM_LCK_DENIED_GRACE_PERIOD) { /* Try again in a couple of seconds */ nlmsvc_insert_block(block, 10 * HZ); - up(&file->f_sema); } else { /* Lock is now held by client, or has been rejected. * In both cases, the block should be removed. */ - up(&file->f_sema); - if (status == NLM_LCK_GRANTED) - nlmsvc_delete_block(block, 0); - else - nlmsvc_delete_block(block, 1); + nlmsvc_unlink_block(block); } } + up(&file->f_sema); nlm_release_file(file); + nlmsvc_release_block(block); } /* @@ -675,10 +700,12 @@ nlmsvc_retry_blocked(void) break; dprintk("nlmsvc_retry_blocked(%p, when=%ld, done=%d)\n", block, block->b_when, block->b_done); + kref_get(&block->b_count); if (block->b_done) - nlmsvc_delete_block(block, 0); + nlmsvc_unlink_block(block); else nlmsvc_grant_blocked(block); + nlmsvc_release_block(block); } if ((block = nlm_blocked) && block->b_when != NLM_NEVER) diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 35681d9..d210cf3 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -22,10 +22,6 @@ #define NLMDBG_FACILITY NLMDBG_CLIENT -static u32 nlmsvc_callback(struct svc_rqst *, u32, struct nlm_res *); - -static const struct rpc_call_ops nlmsvc_callback_ops; - #ifdef CONFIG_LOCKD_V4 static u32 cast_to_nlm(u32 status, u32 vers) @@ -262,83 +258,91 @@ nlmsvc_proc_granted(struct svc_rqst *rqs } /* + * This is the 
generic lockd callback for async RPC calls + */ +static void nlmsvc_callback_exit(struct rpc_task *task, void *data) +{ + dprintk("lockd: %4d callback returned %d\n", task->tk_pid, + -task->tk_status); +} + +static void nlmsvc_callback_release(void *data) +{ + nlm_release_call(data); +} + +static const struct rpc_call_ops nlmsvc_callback_ops = { + .rpc_call_done = nlmsvc_callback_exit, + .rpc_release = nlmsvc_callback_release, +}; + +/* * `Async' versions of the above service routines. They aren't really, * because we send the callback before the reply proper. I hope this * doesn't break any clients. */ -static int -nlmsvc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp, - void *resp) +static int nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp, + int (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res *)) { - struct nlm_res res; - u32 stat; + struct nlm_host *host; + struct nlm_rqst *call; + int stat; - dprintk("lockd: TEST_MSG called\n"); - memset(&res, 0, sizeof(res)); + host = nlmsvc_lookup_host(rqstp); + if (host == NULL) + return rpc_system_err; - if ((stat = nlmsvc_proc_test(rqstp, argp, &res)) == 0) - stat = nlmsvc_callback(rqstp, NLMPROC_TEST_RES, &res); - return stat; + call = nlm_alloc_call(host); + if (call == NULL) + return rpc_system_err; + + stat = func(rqstp, argp, &call->a_res); + if (stat != 0) { + nlm_release_call(call); + return stat; + } + + call->a_flags = RPC_TASK_ASYNC; + if (nlm_async_reply(call, proc, &nlmsvc_callback_ops) < 0) + return rpc_system_err; + return rpc_success; } -static int -nlmsvc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, +static int nlmsvc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp, void *resp) { - struct nlm_res res; - u32 stat; + dprintk("lockd: TEST_MSG called\n"); + return nlmsvc_callback(rqstp, NLMPROC_TEST_RES, argp, nlmsvc_proc_test); +} +static int nlmsvc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ 
dprintk("lockd: LOCK_MSG called\n"); - memset(&res, 0, sizeof(res)); - - if ((stat = nlmsvc_proc_lock(rqstp, argp, &res)) == 0) - stat = nlmsvc_callback(rqstp, NLMPROC_LOCK_RES, &res); - return stat; + return nlmsvc_callback(rqstp, NLMPROC_LOCK_RES, argp, nlmsvc_proc_lock); } -static int -nlmsvc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp, +static int nlmsvc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp, void *resp) { - struct nlm_res res; - u32 stat; - dprintk("lockd: CANCEL_MSG called\n"); - memset(&res, 0, sizeof(res)); - - if ((stat = nlmsvc_proc_cancel(rqstp, argp, &res)) == 0) - stat = nlmsvc_callback(rqstp, NLMPROC_CANCEL_RES, &res); - return stat; + return nlmsvc_callback(rqstp, NLMPROC_CANCEL_RES, argp, nlmsvc_proc_cancel); } static int nlmsvc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, void *resp) { - struct nlm_res res; - u32 stat; - dprintk("lockd: UNLOCK_MSG called\n"); - memset(&res, 0, sizeof(res)); - - if ((stat = nlmsvc_proc_unlock(rqstp, argp, &res)) == 0) - stat = nlmsvc_callback(rqstp, NLMPROC_UNLOCK_RES, &res); - return stat; + return nlmsvc_callback(rqstp, NLMPROC_UNLOCK_RES, argp, nlmsvc_proc_unlock); } static int nlmsvc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp, void *resp) { - struct nlm_res res; - u32 stat; - dprintk("lockd: GRANTED_MSG called\n"); - memset(&res, 0, sizeof(res)); - - if ((stat = nlmsvc_proc_granted(rqstp, argp, &res)) == 0) - stat = nlmsvc_callback(rqstp, NLMPROC_GRANTED_RES, &res); - return stat; + return nlmsvc_callback(rqstp, NLMPROC_GRANTED_RES, argp, nlmsvc_proc_granted); } /* @@ -497,55 +501,6 @@ nlmsvc_proc_granted_res(struct svc_rqst } /* - * This is the generic lockd callback for async RPC calls - */ -static u32 -nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_res *resp) -{ - struct nlm_host *host; - struct nlm_rqst *call; - - if (!(call = nlmclnt_alloc_call())) - return rpc_system_err; - - host = 
nlmclnt_lookup_host(&rqstp->rq_addr, - rqstp->rq_prot, rqstp->rq_vers); - if (!host) { - kfree(call); - return rpc_system_err; - } - - call->a_flags = RPC_TASK_ASYNC; - call->a_host = host; - memcpy(&call->a_args, resp, sizeof(*resp)); - - if (nlmsvc_async_call(call, proc, &nlmsvc_callback_ops) < 0) - goto error; - - return rpc_success; - error: - nlm_release_host(host); - kfree(call); - return rpc_system_err; -} - -static void nlmsvc_callback_exit(struct rpc_task *task, void *data) -{ - struct nlm_rqst *call = data; - - if (task->tk_status < 0) { - dprintk("lockd: %4d callback failed (errno = %d)\n", - task->tk_pid, -task->tk_status); - } - nlm_release_host(call->a_host); - kfree(call); -} - -static const struct rpc_call_ops nlmsvc_callback_ops = { - .rpc_call_done = nlmsvc_callback_exit, -}; - -/* * NLM Server procedures. */ diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 62f4a38..601e5b3 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -182,7 +182,7 @@ nlm_traverse_locks(struct nlm_host *host again: file->f_locks = 0; for (fl = inode->i_flock; fl; fl = fl->fl_next) { - if (!(fl->fl_flags & FL_LOCKD)) + if (fl->fl_lmops != &nlmsvc_lock_operations) continue; /* update current lock count */ diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 200fbda..766ce06 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -131,10 +131,11 @@ nlm_decode_lock(u32 *p, struct nlm_lock || !(p = nlm_decode_fh(p, &lock->fh)) || !(p = nlm_decode_oh(p, &lock->oh))) return NULL; + lock->svid = ntohl(*p++); locks_init_lock(fl); fl->fl_owner = current->files; - fl->fl_pid = ntohl(*p++); + fl->fl_pid = (pid_t)lock->svid; fl->fl_flags = FL_POSIX; fl->fl_type = F_RDLCK; /* as good as anything else */ start = ntohl(*p++); @@ -174,7 +175,7 @@ nlm_encode_lock(u32 *p, struct nlm_lock else len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1); - *p++ = htonl(fl->fl_pid); + *p++ = htonl(lock->svid); *p++ = htonl(start); *p++ = htonl(len); @@ -197,7 +198,7 @@ nlm_encode_testres(u32 
*p, struct nlm_re struct file_lock *fl = &resp->lock.fl; *p++ = (fl->fl_type == F_RDLCK)? xdr_zero : xdr_one; - *p++ = htonl(fl->fl_pid); + *p++ = htonl(resp->lock.svid); /* Encode owner handle. */ if (!(p = xdr_encode_netobj(p, &resp->lock.oh))) @@ -298,7 +299,8 @@ nlmsvc_decode_shareargs(struct svc_rqst memset(lock, 0, sizeof(*lock)); locks_init_lock(&lock->fl); - lock->fl.fl_pid = ~(u32) 0; + lock->svid = ~(u32) 0; + lock->fl.fl_pid = (pid_t)lock->svid; if (!(p = nlm_decode_cookie(p, &argp->cookie)) || !(p = xdr_decode_string_inplace(p, &lock->caller, @@ -415,7 +417,8 @@ nlmclt_decode_testres(struct rpc_rqst *r memset(&resp->lock, 0, sizeof(resp->lock)); locks_init_lock(fl); excl = ntohl(*p++); - fl->fl_pid = ntohl(*p++); + resp->lock.svid = ntohl(*p++); + fl->fl_pid = (pid_t)resp->lock.svid; if (!(p = nlm_decode_oh(p, &resp->lock.oh))) return -EIO; @@ -543,7 +546,9 @@ nlmclt_decode_res(struct rpc_rqst *req, .p_proc = NLMPROC_##proc, \ .p_encode = (kxdrproc_t) nlmclt_encode_##argtype, \ .p_decode = (kxdrproc_t) nlmclt_decode_##restype, \ - .p_bufsiz = MAX(NLM_##argtype##_sz, NLM_##restype##_sz) << 2 \ + .p_bufsiz = MAX(NLM_##argtype##_sz, NLM_##restype##_sz) << 2, \ + .p_statidx = NLMPROC_##proc, \ + .p_name = #proc, \ } static struct rpc_procinfo nlm_procedures[] = { diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index fdcf105..36eb175 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -130,10 +130,11 @@ nlm4_decode_lock(u32 *p, struct nlm_lock || !(p = nlm4_decode_fh(p, &lock->fh)) || !(p = nlm4_decode_oh(p, &lock->oh))) return NULL; + lock->svid = ntohl(*p++); locks_init_lock(fl); fl->fl_owner = current->files; - fl->fl_pid = ntohl(*p++); + fl->fl_pid = (pid_t)lock->svid; fl->fl_flags = FL_POSIX; fl->fl_type = F_RDLCK; /* as good as anything else */ p = xdr_decode_hyper(p, &start); @@ -167,7 +168,7 @@ nlm4_encode_lock(u32 *p, struct nlm_lock || (fl->fl_end > NLM4_OFFSET_MAX && fl->fl_end != OFFSET_MAX)) return NULL; - *p++ = htonl(fl->fl_pid); + *p++ = 
htonl(lock->svid); start = loff_t_to_s64(fl->fl_start); if (fl->fl_end == OFFSET_MAX) @@ -198,7 +199,7 @@ nlm4_encode_testres(u32 *p, struct nlm_r struct file_lock *fl = &resp->lock.fl; *p++ = (fl->fl_type == F_RDLCK)? xdr_zero : xdr_one; - *p++ = htonl(fl->fl_pid); + *p++ = htonl(resp->lock.svid); /* Encode owner handle. */ if (!(p = xdr_encode_netobj(p, &resp->lock.oh))) @@ -212,8 +213,8 @@ nlm4_encode_testres(u32 *p, struct nlm_r p = xdr_encode_hyper(p, start); p = xdr_encode_hyper(p, len); - dprintk("xdr: encode_testres (status %d pid %d type %d start %Ld end %Ld)\n", - resp->status, fl->fl_pid, fl->fl_type, + dprintk("xdr: encode_testres (status %u pid %d type %d start %Ld end %Ld)\n", + resp->status, (int)resp->lock.svid, fl->fl_type, (long long)fl->fl_start, (long long)fl->fl_end); } @@ -303,7 +304,8 @@ nlm4svc_decode_shareargs(struct svc_rqst memset(lock, 0, sizeof(*lock)); locks_init_lock(&lock->fl); - lock->fl.fl_pid = ~(u32) 0; + lock->svid = ~(u32) 0; + lock->fl.fl_pid = (pid_t)lock->svid; if (!(p = nlm4_decode_cookie(p, &argp->cookie)) || !(p = xdr_decode_string_inplace(p, &lock->caller, @@ -420,7 +422,8 @@ nlm4clt_decode_testres(struct rpc_rqst * memset(&resp->lock, 0, sizeof(resp->lock)); locks_init_lock(fl); excl = ntohl(*p++); - fl->fl_pid = ntohl(*p++); + resp->lock.svid = ntohl(*p++); + fl->fl_pid = (pid_t)resp->lock.svid; if (!(p = nlm4_decode_oh(p, &resp->lock.oh))) return -EIO; @@ -548,7 +551,9 @@ nlm4clt_decode_res(struct rpc_rqst *req, .p_proc = NLMPROC_##proc, \ .p_encode = (kxdrproc_t) nlm4clt_encode_##argtype, \ .p_decode = (kxdrproc_t) nlm4clt_decode_##restype, \ - .p_bufsiz = MAX(NLM4_##argtype##_sz, NLM4_##restype##_sz) << 2 \ + .p_bufsiz = MAX(NLM4_##argtype##_sz, NLM4_##restype##_sz) << 2, \ + .p_statidx = NLMPROC_##proc, \ + .p_name = #proc, \ } static struct rpc_procinfo nlm4_procedures[] = { diff --git a/fs/locks.c b/fs/locks.c index 909eab8..56f996e 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -139,10 +139,7 @@ int 
lease_break_time = 45; #define for_each_lock(inode, lockp) \ for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next) -LIST_HEAD(file_lock_list); - -EXPORT_SYMBOL(file_lock_list); - +static LIST_HEAD(file_lock_list); static LIST_HEAD(blocked_list); static kmem_cache_t *filelock_cache; @@ -153,6 +150,21 @@ static struct file_lock *locks_alloc_loc return kmem_cache_alloc(filelock_cache, SLAB_KERNEL); } +static void locks_release_private(struct file_lock *fl) +{ + if (fl->fl_ops) { + if (fl->fl_ops->fl_release_private) + fl->fl_ops->fl_release_private(fl); + fl->fl_ops = NULL; + } + if (fl->fl_lmops) { + if (fl->fl_lmops->fl_release_private) + fl->fl_lmops->fl_release_private(fl); + fl->fl_lmops = NULL; + } + +} + /* Free a lock which is not in use. */ static void locks_free_lock(struct file_lock *fl) { @@ -169,18 +181,7 @@ static void locks_free_lock(struct file_ if (!list_empty(&fl->fl_link)) panic("Attempting to free lock on active lock list"); - if (fl->fl_ops) { - if (fl->fl_ops->fl_release_private) - fl->fl_ops->fl_release_private(fl); - fl->fl_ops = NULL; - } - - if (fl->fl_lmops) { - if (fl->fl_lmops->fl_release_private) - fl->fl_lmops->fl_release_private(fl); - fl->fl_lmops = NULL; - } - + locks_release_private(fl); kmem_cache_free(filelock_cache, fl); } @@ -218,24 +219,46 @@ static void init_once(void *foo, kmem_ca locks_init_lock(lock); } +static void locks_copy_private(struct file_lock *new, struct file_lock *fl) +{ + if (fl->fl_ops) { + if (fl->fl_ops->fl_copy_lock) + fl->fl_ops->fl_copy_lock(new, fl); + new->fl_ops = fl->fl_ops; + } + if (fl->fl_lmops) { + if (fl->fl_lmops->fl_copy_lock) + fl->fl_lmops->fl_copy_lock(new, fl); + new->fl_lmops = fl->fl_lmops; + } +} + /* * Initialize a new lock from an existing file_lock structure. 
*/ -void locks_copy_lock(struct file_lock *new, struct file_lock *fl) +static void __locks_copy_lock(struct file_lock *new, const struct file_lock *fl) { new->fl_owner = fl->fl_owner; new->fl_pid = fl->fl_pid; - new->fl_file = fl->fl_file; + new->fl_file = NULL; new->fl_flags = fl->fl_flags; new->fl_type = fl->fl_type; new->fl_start = fl->fl_start; new->fl_end = fl->fl_end; + new->fl_ops = NULL; + new->fl_lmops = NULL; +} + +void locks_copy_lock(struct file_lock *new, struct file_lock *fl) +{ + locks_release_private(new); + + __locks_copy_lock(new, fl); + new->fl_file = fl->fl_file; new->fl_ops = fl->fl_ops; new->fl_lmops = fl->fl_lmops; - if (fl->fl_ops && fl->fl_ops->fl_copy_lock) - fl->fl_ops->fl_copy_lock(new, fl); - if (fl->fl_lmops && fl->fl_lmops->fl_copy_lock) - fl->fl_lmops->fl_copy_lock(new, fl); + + locks_copy_private(new, fl); } EXPORT_SYMBOL(locks_copy_lock); @@ -654,8 +677,9 @@ static int locks_block_on_timeout(struct return result; } -struct file_lock * -posix_test_lock(struct file *filp, struct file_lock *fl) +int +posix_test_lock(struct file *filp, struct file_lock *fl, + struct file_lock *conflock) { struct file_lock *cfl; @@ -666,9 +690,13 @@ posix_test_lock(struct file *filp, struc if (posix_locks_conflict(cfl, fl)) break; } + if (cfl) { + __locks_copy_lock(conflock, cfl); + unlock_kernel(); + return 1; + } unlock_kernel(); - - return (cfl); + return 0; } EXPORT_SYMBOL(posix_test_lock); @@ -904,7 +932,8 @@ static int __posix_lock_file(struct inod fl->fl_start = request->fl_start; fl->fl_end = request->fl_end; fl->fl_type = request->fl_type; - fl->fl_u = request->fl_u; + locks_release_private(fl); + locks_copy_private(fl, request); request = fl; added = 1; } @@ -1544,7 +1573,7 @@ asmlinkage long sys_flock(unsigned int f */ int fcntl_getlk(struct file *filp, struct flock __user *l) { - struct file_lock *fl, file_lock; + struct file_lock *fl, cfl, file_lock; struct flock flock; int error; @@ -1568,7 +1597,7 @@ int fcntl_getlk(struct file *filp, 
struc else fl = (file_lock.fl_type == F_UNLCK ? NULL : &file_lock); } else { - fl = posix_test_lock(filp, &file_lock); + fl = (posix_test_lock(filp, &file_lock, &cfl) ? &cfl : NULL); } flock.l_type = F_UNLCK; @@ -1698,7 +1727,7 @@ out: */ int fcntl_getlk64(struct file *filp, struct flock64 __user *l) { - struct file_lock *fl, file_lock; + struct file_lock *fl, cfl, file_lock; struct flock64 flock; int error; @@ -1722,7 +1751,7 @@ int fcntl_getlk64(struct file *filp, str else fl = (file_lock.fl_type == F_UNLCK ? NULL : &file_lock); } else { - fl = posix_test_lock(filp, &file_lock); + fl = (posix_test_lock(filp, &file_lock, &cfl) ? &cfl : NULL); } flock.l_type = F_UNLCK; @@ -1936,21 +1965,6 @@ void locks_remove_flock(struct file *fil } /** - * posix_block_lock - blocks waiting for a file lock - * @blocker: the lock which is blocking - * @waiter: the lock which conflicts and has to wait - * - * lockd needs to block waiting for locks. - */ -void -posix_block_lock(struct file_lock *blocker, struct file_lock *waiter) -{ - locks_insert_block(blocker, waiter); -} - -EXPORT_SYMBOL(posix_block_lock); - -/** * posix_unblock_lock - stop waiting for a file lock * @filp: how the file was opened * @waiter: the lock which was waiting diff --git a/fs/namespace.c b/fs/namespace.c index 058a448..70bba4b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -399,6 +399,44 @@ struct seq_operations mounts_op = { .show = show_vfsmnt }; +static int show_vfsstat(struct seq_file *m, void *v) +{ + struct vfsmount *mnt = v; + int err = 0; + + /* device */ + if (mnt->mnt_devname) { + seq_puts(m, "device "); + mangle(m, mnt->mnt_devname); + } else + seq_puts(m, "no device"); + + /* mount point */ + seq_puts(m, " mounted on "); + seq_path(m, mnt, mnt->mnt_root, " \t\n\\"); + seq_putc(m, ' '); + + /* file system type */ + seq_puts(m, "with fstype "); + mangle(m, mnt->mnt_sb->s_type->name); + + /* optional statistics */ + if (mnt->mnt_sb->s_op->show_stats) { + seq_putc(m, ' '); + err = 
mnt->mnt_sb->s_op->show_stats(m, mnt); + } + + seq_putc(m, '\n'); + return err; +} + +struct seq_operations mountstats_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_vfsstat, +}; + /** * may_umount_tree - check if a mount tree is busy * @mnt: root of mount tree diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 7c33b9a..05c38cf 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -330,7 +330,7 @@ static unsigned encode_op_hdr(struct xdr static unsigned encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, const struct cb_getattrres *res) { - uint32_t *savep; + uint32_t *savep = NULL; unsigned status = res->status; if (unlikely(status != 0)) @@ -358,23 +358,26 @@ static unsigned process_op(struct svc_rq struct xdr_stream *xdr_in, void *argp, struct xdr_stream *xdr_out, void *resp) { - struct callback_op *op; - unsigned int op_nr; + struct callback_op *op = &callback_ops[0]; + unsigned int op_nr = OP_CB_ILLEGAL; unsigned int status = 0; long maxlen; unsigned res; dprintk("%s: start\n", __FUNCTION__); status = decode_op_hdr(xdr_in, &op_nr); - if (unlikely(status != 0)) { - op_nr = OP_CB_ILLEGAL; - op = &callback_ops[0]; - } else if (unlikely(op_nr != OP_CB_GETATTR && op_nr != OP_CB_RECALL)) { - op_nr = OP_CB_ILLEGAL; - op = &callback_ops[0]; - status = htonl(NFS4ERR_OP_ILLEGAL); - } else - op = &callback_ops[op_nr]; + if (likely(status == 0)) { + switch (op_nr) { + case OP_CB_GETATTR: + case OP_CB_RECALL: + op = &callback_ops[op_nr]; + break; + default: + op_nr = OP_CB_ILLEGAL; + op = &callback_ops[0]; + status = htonl(NFS4ERR_OP_ILLEGAL); + } + } maxlen = xdr_out->end - xdr_out->p; if (maxlen > 0 && maxlen < PAGE_SIZE) { @@ -416,6 +419,7 @@ static int nfs4_callback_compound(struct decode_compound_hdr_arg(&xdr_in, &hdr_arg); hdr_res.taglen = hdr_arg.taglen; hdr_res.tag = hdr_arg.tag; + hdr_res.nops = NULL; encode_compound_hdr_res(&xdr_out, &hdr_res); for (;;) { diff --git a/fs/nfs/dir.c 
b/fs/nfs/dir.c index a1554be..609185a 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -34,6 +34,7 @@ #include "nfs4_fs.h" #include "delegation.h" +#include "iostat.h" #define NFS_PARANOIA 1 /* #define NFS_DEBUG_VERBOSE 1 */ @@ -129,6 +130,9 @@ nfs_opendir(struct inode *inode, struct { int res = 0; + dfprintk(VFS, "NFS: opendir(%s/%ld)\n", + inode->i_sb->s_id, inode->i_ino); + lock_kernel(); /* Call generic open code in order to cache credentials */ if (!res) @@ -172,7 +176,9 @@ int nfs_readdir_filler(nfs_readdir_descr unsigned long timestamp; int error; - dfprintk(VFS, "NFS: nfs_readdir_filler() reading cookie %Lu into page %lu.\n", (long long)desc->entry->cookie, page->index); + dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n", + __FUNCTION__, (long long)desc->entry->cookie, + page->index); again: timestamp = jiffies; @@ -244,7 +250,8 @@ int find_dirent(nfs_readdir_descriptor_t status; while((status = dir_decode(desc)) == 0) { - dfprintk(VFS, "NFS: found cookie %Lu\n", (unsigned long long)entry->cookie); + dfprintk(DIRCACHE, "NFS: %s: examining cookie %Lu\n", + __FUNCTION__, (unsigned long long)entry->cookie); if (entry->prev_cookie == *desc->dir_cookie) break; if (loop_count++ > 200) { @@ -252,7 +259,6 @@ int find_dirent(nfs_readdir_descriptor_t schedule(); } } - dfprintk(VFS, "NFS: find_dirent() returns %d\n", status); return status; } @@ -276,7 +282,8 @@ int find_dirent_index(nfs_readdir_descri if (status) break; - dfprintk(VFS, "NFS: found cookie %Lu at index %Ld\n", (unsigned long long)entry->cookie, desc->current_index); + dfprintk(DIRCACHE, "NFS: found cookie %Lu at index %Ld\n", + (unsigned long long)entry->cookie, desc->current_index); if (desc->file->f_pos == desc->current_index) { *desc->dir_cookie = entry->cookie; @@ -288,7 +295,6 @@ int find_dirent_index(nfs_readdir_descri schedule(); } } - dfprintk(VFS, "NFS: find_dirent_index() returns %d\n", status); return status; } @@ -303,7 +309,9 @@ int find_dirent_page(nfs_readdir_descrip 
struct page *page; int status; - dfprintk(VFS, "NFS: find_dirent_page() searching directory page %ld\n", desc->page_index); + dfprintk(DIRCACHE, "NFS: %s: searching page %ld for target %Lu\n", + __FUNCTION__, desc->page_index, + (long long) *desc->dir_cookie); page = read_cache_page(inode->i_mapping, desc->page_index, (filler_t *)nfs_readdir_filler, desc); @@ -324,7 +332,7 @@ int find_dirent_page(nfs_readdir_descrip if (status < 0) dir_page_release(desc); out: - dfprintk(VFS, "NFS: find_dirent_page() returns %d\n", status); + dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __FUNCTION__, status); return status; read_error: page_cache_release(page); @@ -346,13 +354,15 @@ int readdir_search_pagecache(nfs_readdir /* Always search-by-index from the beginning of the cache */ if (*desc->dir_cookie == 0) { - dfprintk(VFS, "NFS: readdir_search_pagecache() searching for offset %Ld\n", (long long)desc->file->f_pos); + dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for offset %Ld\n", + (long long)desc->file->f_pos); desc->page_index = 0; desc->entry->cookie = desc->entry->prev_cookie = 0; desc->entry->eof = 0; desc->current_index = 0; } else - dfprintk(VFS, "NFS: readdir_search_pagecache() searching for cookie %Lu\n", (unsigned long long)*desc->dir_cookie); + dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for cookie %Lu\n", + (unsigned long long)*desc->dir_cookie); for (;;) { res = find_dirent_page(desc); @@ -365,7 +375,8 @@ int readdir_search_pagecache(nfs_readdir schedule(); } } - dfprintk(VFS, "NFS: readdir_search_pagecache() returned %d\n", res); + + dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __FUNCTION__, res); return res; } @@ -390,7 +401,8 @@ int nfs_do_filldir(nfs_readdir_descripto int loop_count = 0, res; - dfprintk(VFS, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n", (long long)entry->cookie); + dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n", + (unsigned long long)entry->cookie); for(;;) { unsigned 
d_type = DT_UNKNOWN; @@ -427,7 +439,8 @@ int nfs_do_filldir(nfs_readdir_descripto dir_page_release(desc); if (dentry != NULL) dput(dentry); - dfprintk(VFS, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", (unsigned long long)*desc->dir_cookie, res); + dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", + (unsigned long long)*desc->dir_cookie, res); return res; } @@ -453,7 +466,8 @@ int uncached_readdir(nfs_readdir_descrip struct page *page = NULL; int status; - dfprintk(VFS, "NFS: uncached_readdir() searching for cookie %Lu\n", (unsigned long long)*desc->dir_cookie); + dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", + (unsigned long long)*desc->dir_cookie); page = alloc_page(GFP_HIGHUSER); if (!page) { @@ -485,7 +499,8 @@ int uncached_readdir(nfs_readdir_descrip desc->entry->cookie = desc->entry->prev_cookie = 0; desc->entry->eof = 0; out: - dfprintk(VFS, "NFS: uncached_readdir() returns %d\n", status); + dfprintk(DIRCACHE, "NFS: %s: returns %d\n", + __FUNCTION__, status); return status; out_release: dir_page_release(desc); @@ -507,6 +522,11 @@ static int nfs_readdir(struct file *filp struct nfs_fattr fattr; long res; + dfprintk(VFS, "NFS: readdir(%s/%s) starting at cookie %Lu\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + (long long)filp->f_pos); + nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); + lock_kernel(); res = nfs_revalidate_inode(NFS_SERVER(inode), inode); @@ -566,9 +586,12 @@ static int nfs_readdir(struct file *filp } } unlock_kernel(); - if (res < 0) - return res; - return 0; + if (res > 0) + res = 0; + dfprintk(VFS, "NFS: readdir(%s/%s) returns %ld\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + res); + return res; } loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) @@ -599,6 +622,10 @@ out: */ int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) { + dfprintk(VFS, "NFS: fsync_dir(%s/%s) datasync %d\n", + 
dentry->d_parent->d_name.name, dentry->d_name.name, + datasync); + return 0; } @@ -713,6 +740,7 @@ static int nfs_lookup_revalidate(struct parent = dget_parent(dentry); lock_kernel(); dir = parent->d_inode; + nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); inode = dentry->d_inode; if (!inode) { @@ -722,8 +750,9 @@ static int nfs_lookup_revalidate(struct } if (is_bad_inode(inode)) { - dfprintk(VFS, "nfs_lookup_validate: %s/%s has dud inode\n", - dentry->d_parent->d_name.name, dentry->d_name.name); + dfprintk(LOOKUPCACHE, "%s: %s/%s has dud inode\n", + __FUNCTION__, dentry->d_parent->d_name.name, + dentry->d_name.name); goto out_bad; } @@ -755,6 +784,9 @@ static int nfs_lookup_revalidate(struct out_valid: unlock_kernel(); dput(parent); + dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n", + __FUNCTION__, dentry->d_parent->d_name.name, + dentry->d_name.name); return 1; out_zap_parent: nfs_zap_caches(dir); @@ -771,6 +803,9 @@ out_zap_parent: d_drop(dentry); unlock_kernel(); dput(parent); + dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", + __FUNCTION__, dentry->d_parent->d_name.name, + dentry->d_name.name); return 0; } @@ -844,6 +879,7 @@ static struct dentry *nfs_lookup(struct dfprintk(VFS, "NFS: lookup(%s/%s)\n", dentry->d_parent->d_name.name, dentry->d_name.name); + nfs_inc_stats(dir, NFSIOS_VFSLOOKUP); res = ERR_PTR(-ENAMETOOLONG); if (dentry->d_name.len > NFS_SERVER(dir)->namelen) @@ -912,6 +948,9 @@ static struct dentry *nfs_atomic_lookup( struct dentry *res = NULL; int error; + dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + /* Check that we are indeed trying to open this file */ if (!is_atomic_open(dir, nd)) goto no_open; @@ -1119,8 +1158,8 @@ static int nfs_create(struct inode *dir, int error; int open_flags = 0; - dfprintk(VFS, "NFS: create(%s/%ld, %s\n", dir->i_sb->s_id, - dir->i_ino, dentry->d_name.name); + dfprintk(VFS, "NFS: create(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 
attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; @@ -1153,8 +1192,8 @@ nfs_mknod(struct inode *dir, struct dent struct iattr attr; int status; - dfprintk(VFS, "NFS: mknod(%s/%ld, %s\n", dir->i_sb->s_id, - dir->i_ino, dentry->d_name.name); + dfprintk(VFS, "NFS: mknod(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); if (!new_valid_dev(rdev)) return -EINVAL; @@ -1186,8 +1225,8 @@ static int nfs_mkdir(struct inode *dir, struct iattr attr; int error; - dfprintk(VFS, "NFS: mkdir(%s/%ld, %s\n", dir->i_sb->s_id, - dir->i_ino, dentry->d_name.name); + dfprintk(VFS, "NFS: mkdir(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); attr.ia_valid = ATTR_MODE; attr.ia_mode = mode | S_IFDIR; @@ -1212,8 +1251,8 @@ static int nfs_rmdir(struct inode *dir, { int error; - dfprintk(VFS, "NFS: rmdir(%s/%ld, %s\n", dir->i_sb->s_id, - dir->i_ino, dentry->d_name.name); + dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); lock_kernel(); nfs_begin_data_update(dir); @@ -1241,6 +1280,7 @@ static int nfs_sillyrename(struct inode dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n", dentry->d_parent->d_name.name, dentry->d_name.name, atomic_read(&dentry->d_count)); + nfs_inc_stats(dir, NFSIOS_SILLYRENAME); #ifdef NFS_PARANOIA if (!dentry->d_inode) @@ -1268,8 +1308,8 @@ dentry->d_parent->d_name.name, dentry->d sillycounter++; sprintf(suffix, "%*.*x", countersize, countersize, sillycounter); - dfprintk(VFS, "trying to rename %s to %s\n", - dentry->d_name.name, silly); + dfprintk(VFS, "NFS: trying to rename %s to %s\n", + dentry->d_name.name, silly); sdentry = lookup_one_len(silly, dentry->d_parent, slen); /* @@ -1640,6 +1680,8 @@ int nfs_permission(struct inode *inode, struct rpc_cred *cred; int res = 0; + nfs_inc_stats(inode, NFSIOS_VFSACCESS); + if (mask == 0) goto out; /* Is this sys_access() ? 
*/ @@ -1679,13 +1721,15 @@ force_lookup: res = PTR_ERR(cred); unlock_kernel(); out: + dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n", + inode->i_sb->s_id, inode->i_ino, mask, res); return res; out_notsup: res = nfs_revalidate_inode(NFS_SERVER(inode), inode); if (res == 0) res = generic_permission(inode, mask, NULL); unlock_kernel(); - return res; + goto out; } /* diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 04ab2fc..193ef4c 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -7,11 +7,11 @@ * * There are important applications whose performance or correctness * depends on uncached access to file data. Database clusters - * (multiple copies of the same instance running on separate hosts) + * (multiple copies of the same instance running on separate hosts) * implement their own cache coherency protocol that subsumes file - * system cache protocols. Applications that process datasets - * considerably larger than the client's memory do not always benefit - * from a local cache. A streaming video server, for instance, has no + * system cache protocols. Applications that process datasets + * considerably larger than the client's memory do not always benefit + * from a local cache. A streaming video server, for instance, has no * need to cache the contents of a file. 
* * When an application requests uncached I/O, all read and write requests @@ -34,6 +34,7 @@ * 08 Jun 2003 Port to 2.5 APIs --cel * 31 Mar 2004 Handle direct I/O without VFS support --cel * 15 Sep 2004 Parallel async reads --cel + * 04 May 2005 support O_DIRECT with aio --cel * */ @@ -54,8 +55,9 @@ #include #include +#include "iostat.h" + #define NFSDBG_FACILITY NFSDBG_VFS -#define MAX_DIRECTIO_SIZE (4096UL << PAGE_SHIFT) static kmem_cache_t *nfs_direct_cachep; @@ -64,38 +66,78 @@ static kmem_cache_t *nfs_direct_cachep; */ struct nfs_direct_req { struct kref kref; /* release manager */ - struct list_head list; /* nfs_read_data structs */ - wait_queue_head_t wait; /* wait for i/o completion */ + + /* I/O parameters */ + struct list_head list, /* nfs_read/write_data structs */ + rewrite_list; /* saved nfs_write_data structs */ + struct nfs_open_context *ctx; /* file open context info */ + struct kiocb * iocb; /* controlling i/o request */ + struct inode * inode; /* target file of i/o */ + unsigned long user_addr; /* location of user's buffer */ + size_t user_count; /* total bytes to move */ + loff_t pos; /* starting offset in file */ struct page ** pages; /* pages in our buffer */ unsigned int npages; /* count of pages */ - atomic_t complete, /* i/os we're waiting for */ - count, /* bytes actually processed */ + + /* completion state */ + spinlock_t lock; /* protect completion state */ + int outstanding; /* i/os we're waiting for */ + ssize_t count, /* bytes actually processed */ error; /* any reported error */ + struct completion completion; /* wait for i/o completion */ + + /* commit state */ + struct nfs_write_data * commit_data; /* special write_data for commits */ + int flags; +#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */ +#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */ + struct nfs_writeverf verf; /* unstable write verifier */ }; +static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync); 
+static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode); /** - * nfs_get_user_pages - find and set up pages underlying user's buffer - * rw: direction (read or write) - * user_addr: starting address of this segment of user's buffer - * count: size of this segment - * @pages: returned array of page struct pointers underlying user's buffer - */ -static inline int -nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, - struct page ***pages) + * nfs_direct_IO - NFS address space operation for direct I/O + * @rw: direction (read or write) + * @iocb: target I/O control block + * @iov: array of vectors that define I/O buffer + * @pos: offset in file to begin the operation + * @nr_segs: size of iovec array + * + * The presence of this routine in the address space ops vector means + * the NFS client supports direct I/O. However, we shunt off direct + * read and write requests before the VFS gets them, so this method + * should never be called. + */ +ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) +{ + struct dentry *dentry = iocb->ki_filp->f_dentry; + + dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n", + dentry->d_name.name, (long long) pos, nr_segs); + + return -EINVAL; +} + +static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty) +{ + int i; + for (i = 0; i < npages; i++) { + struct page *page = pages[i]; + if (do_dirty && !PageCompound(page)) + set_page_dirty_lock(page); + page_cache_release(page); + } + kfree(pages); +} + +static inline int nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, struct page ***pages) { int result = -ENOMEM; unsigned long page_count; size_t array_size; - /* set an arbitrary limit to prevent type overflow */ - /* XXX: this can probably be as large as INT_MAX */ - if (size > MAX_DIRECTIO_SIZE) { - *pages = NULL; - return -EFBIG; - } - page_count = (user_addr + size + PAGE_SIZE - 1) >> 
PAGE_SHIFT; page_count -= user_addr >> PAGE_SHIFT; @@ -107,66 +149,117 @@ nfs_get_user_pages(int rw, unsigned long page_count, (rw == READ), 0, *pages, NULL); up_read(&current->mm->mmap_sem); + if (result != page_count) { + /* + * If we got fewer pages than expected from + * get_user_pages(), the user buffer runs off the + * end of a mapping; return EFAULT. + */ + if (result >= 0) { + nfs_free_user_pages(*pages, result, 0); + result = -EFAULT; + } else + kfree(*pages); + *pages = NULL; + } } return result; } -/** - * nfs_free_user_pages - tear down page struct array - * @pages: array of page struct pointers underlying target buffer - * @npages: number of pages in the array - * @do_dirty: dirty the pages as we release them - */ -static void -nfs_free_user_pages(struct page **pages, int npages, int do_dirty) +static inline struct nfs_direct_req *nfs_direct_req_alloc(void) { - int i; - for (i = 0; i < npages; i++) { - struct page *page = pages[i]; - if (do_dirty && !PageCompound(page)) - set_page_dirty_lock(page); - page_cache_release(page); - } - kfree(pages); + struct nfs_direct_req *dreq; + + dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL); + if (!dreq) + return NULL; + + kref_init(&dreq->kref); + init_completion(&dreq->completion); + INIT_LIST_HEAD(&dreq->list); + INIT_LIST_HEAD(&dreq->rewrite_list); + dreq->iocb = NULL; + dreq->ctx = NULL; + spin_lock_init(&dreq->lock); + dreq->outstanding = 0; + dreq->count = 0; + dreq->error = 0; + dreq->flags = 0; + + return dreq; } -/** - * nfs_direct_req_release - release nfs_direct_req structure for direct read - * @kref: kref object embedded in an nfs_direct_req structure - * - */ static void nfs_direct_req_release(struct kref *kref) { struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); + + if (dreq->ctx != NULL) + put_nfs_open_context(dreq->ctx); kmem_cache_free(nfs_direct_cachep, dreq); } -/** - * nfs_direct_read_alloc - allocate nfs_read_data structures for direct read - * @count: count of 
bytes for the read request - * @rsize: local rsize setting +/* + * Collects and returns the final error value/byte-count. + */ +static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq) +{ + ssize_t result = -EIOCBQUEUED; + + /* Async requests don't wait here */ + if (dreq->iocb) + goto out; + + result = wait_for_completion_interruptible(&dreq->completion); + + if (!result) + result = dreq->error; + if (!result) + result = dreq->count; + +out: + kref_put(&dreq->kref, nfs_direct_req_release); + return (ssize_t) result; +} + +/* + * We must hold a reference to all the pages in this direct read request + * until the RPCs complete. This could be long *after* we are woken up in + * nfs_direct_wait (for instance, if someone hits ^C on a slow server). * + * In addition, synchronous I/O uses a stack-allocated iocb. Thus we + * can't trust the iocb is still valid here if this is a synchronous + * request. If the waiter is woken prematurely, the iocb is long gone. + */ +static void nfs_direct_complete(struct nfs_direct_req *dreq) +{ + nfs_free_user_pages(dreq->pages, dreq->npages, 1); + + if (dreq->iocb) { + long res = (long) dreq->error; + if (!res) + res = (long) dreq->count; + aio_complete(dreq->iocb, res, 0); + } + complete_all(&dreq->completion); + + kref_put(&dreq->kref, nfs_direct_req_release); +} + +/* * Note we also set the number of requests we have in the dreq when we are * done. This prevents races with I/O completion so we will always wait * until all requests have been dispatched and completed. 
*/ -static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, unsigned int rsize) +static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, size_t rsize) { struct list_head *list; struct nfs_direct_req *dreq; - unsigned int reads = 0; unsigned int rpages = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL); + dreq = nfs_direct_req_alloc(); if (!dreq) return NULL; - kref_init(&dreq->kref); - init_waitqueue_head(&dreq->wait); - INIT_LIST_HEAD(&dreq->list); - atomic_set(&dreq->count, 0); - atomic_set(&dreq->error, 0); - list = &dreq->list; for(;;) { struct nfs_read_data *data = nfs_readdata_alloc(rpages); @@ -186,72 +279,70 @@ static struct nfs_direct_req *nfs_direct list_add(&data->pages, list); data->req = (struct nfs_page *) dreq; - reads++; + dreq->outstanding++; if (nbytes <= rsize) break; nbytes -= rsize; } kref_get(&dreq->kref); - atomic_set(&dreq->complete, reads); return dreq; } -/** - * nfs_direct_read_result - handle a read reply for a direct read request - * @data: address of NFS READ operation control block - * @status: status of this NFS READ operation - * - * We must hold a reference to all the pages in this direct read request - * until the RPCs complete. This could be long *after* we are woken up in - * nfs_direct_read_wait (for instance, if someone hits ^C on a slow server). 
- */ -static void nfs_direct_read_result(struct nfs_read_data *data, int status) +static void nfs_direct_read_result(struct rpc_task *task, void *calldata) { + struct nfs_read_data *data = calldata; struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; - if (likely(status >= 0)) - atomic_add(data->res.count, &dreq->count); + if (nfs_readpage_result(task, data) != 0) + return; + + spin_lock(&dreq->lock); + + if (likely(task->tk_status >= 0)) + dreq->count += data->res.count; else - atomic_set(&dreq->error, status); + dreq->error = task->tk_status; - if (unlikely(atomic_dec_and_test(&dreq->complete))) { - nfs_free_user_pages(dreq->pages, dreq->npages, 1); - wake_up(&dreq->wait); - kref_put(&dreq->kref, nfs_direct_req_release); + if (--dreq->outstanding) { + spin_unlock(&dreq->lock); + return; } + + spin_unlock(&dreq->lock); + nfs_direct_complete(dreq); } -/** - * nfs_direct_read_schedule - dispatch NFS READ operations for a direct read - * @dreq: address of nfs_direct_req struct for this request - * @inode: target inode - * @ctx: target file open context - * @user_addr: starting address of this segment of user's buffer - * @count: size of this segment - * @file_offset: offset in file to begin the operation - * +static const struct rpc_call_ops nfs_read_direct_ops = { + .rpc_call_done = nfs_direct_read_result, + .rpc_release = nfs_readdata_release, +}; + +/* * For each nfs_read_data struct that was allocated on the list, dispatch * an NFS READ operation */ -static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, - struct inode *inode, struct nfs_open_context *ctx, - unsigned long user_addr, size_t count, loff_t file_offset) +static void nfs_direct_read_schedule(struct nfs_direct_req *dreq) { + struct nfs_open_context *ctx = dreq->ctx; + struct inode *inode = ctx->dentry->d_inode; struct list_head *list = &dreq->list; struct page **pages = dreq->pages; + size_t count = dreq->user_count; + loff_t pos = dreq->pos; + size_t rsize = 
NFS_SERVER(inode)->rsize; unsigned int curpage, pgbase; - unsigned int rsize = NFS_SERVER(inode)->rsize; curpage = 0; - pgbase = user_addr & ~PAGE_MASK; + pgbase = dreq->user_addr & ~PAGE_MASK; do { struct nfs_read_data *data; - unsigned int bytes; + size_t bytes; bytes = rsize; if (count < rsize) bytes = count; + BUG_ON(list_empty(list)); data = list_entry(list->next, struct nfs_read_data, pages); list_del_init(&data->pages); @@ -259,7 +350,7 @@ static void nfs_direct_read_schedule(str data->cred = ctx->cred; data->args.fh = NFS_FH(inode); data->args.context = ctx; - data->args.offset = file_offset; + data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = &pages[curpage]; data->args.count = bytes; @@ -267,77 +358,38 @@ static void nfs_direct_read_schedule(str data->res.eof = 0; data->res.count = bytes; + rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC, + &nfs_read_direct_ops, data); NFS_PROTO(inode)->read_setup(data); data->task.tk_cookie = (unsigned long) inode; - data->complete = nfs_direct_read_result; lock_kernel(); rpc_execute(&data->task); unlock_kernel(); - dfprintk(VFS, "NFS: %4d initiated direct read call (req %s/%Ld, %u bytes @ offset %Lu)\n", + dfprintk(VFS, "NFS: %5u initiated direct read call (req %s/%Ld, %zu bytes @ offset %Lu)\n", data->task.tk_pid, inode->i_sb->s_id, (long long)NFS_FILEID(inode), bytes, (unsigned long long)data->args.offset); - file_offset += bytes; + pos += bytes; pgbase += bytes; curpage += pgbase >> PAGE_SHIFT; pgbase &= ~PAGE_MASK; count -= bytes; } while (count != 0); + BUG_ON(!list_empty(list)); } -/** - * nfs_direct_read_wait - wait for I/O completion for direct reads - * @dreq: request on which we are to wait - * @intr: whether or not this wait can be interrupted - * - * Collects and returns the final error value/byte-count. 
- */ -static ssize_t nfs_direct_read_wait(struct nfs_direct_req *dreq, int intr) -{ - int result = 0; - - if (intr) { - result = wait_event_interruptible(dreq->wait, - (atomic_read(&dreq->complete) == 0)); - } else { - wait_event(dreq->wait, (atomic_read(&dreq->complete) == 0)); - } - - if (!result) - result = atomic_read(&dreq->error); - if (!result) - result = atomic_read(&dreq->count); - - kref_put(&dreq->kref, nfs_direct_req_release); - return (ssize_t) result; -} - -/** - * nfs_direct_read_seg - Read in one iov segment. Generate separate - * read RPCs for each "rsize" bytes. - * @inode: target inode - * @ctx: target file open context - * @user_addr: starting address of this segment of user's buffer - * @count: size of this segment - * @file_offset: offset in file to begin the operation - * @pages: array of addresses of page structs defining user's buffer - * @nr_pages: number of pages in the array - * - */ -static ssize_t nfs_direct_read_seg(struct inode *inode, - struct nfs_open_context *ctx, unsigned long user_addr, - size_t count, loff_t file_offset, struct page **pages, - unsigned int nr_pages) +static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, unsigned int nr_pages) { ssize_t result; sigset_t oldset; + struct inode *inode = iocb->ki_filp->f_mapping->host; struct rpc_clnt *clnt = NFS_CLIENT(inode); struct nfs_direct_req *dreq; @@ -345,284 +397,350 @@ static ssize_t nfs_direct_read_seg(struc if (!dreq) return -ENOMEM; + dreq->user_addr = user_addr; + dreq->user_count = count; + dreq->pos = pos; dreq->pages = pages; dreq->npages = nr_pages; + dreq->inode = inode; + dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data); + if (!is_sync_kiocb(iocb)) + dreq->iocb = iocb; + nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count); rpc_clnt_sigmask(clnt, &oldset); - nfs_direct_read_schedule(dreq, inode, ctx, user_addr, count, - file_offset); - result = 
nfs_direct_read_wait(dreq, clnt->cl_intr); + nfs_direct_read_schedule(dreq); + result = nfs_direct_wait(dreq); rpc_clnt_sigunmask(clnt, &oldset); return result; } -/** - * nfs_direct_read - For each iov segment, map the user's buffer - * then generate read RPCs. - * @inode: target inode - * @ctx: target file open context - * @iov: array of vectors that define I/O buffer - * file_offset: offset in file to begin the operation - * nr_segs: size of iovec array - * - * We've already pushed out any non-direct writes so that this read - * will see them when we read from the server. - */ -static ssize_t -nfs_direct_read(struct inode *inode, struct nfs_open_context *ctx, - const struct iovec *iov, loff_t file_offset, - unsigned long nr_segs) -{ - ssize_t tot_bytes = 0; - unsigned long seg = 0; - - while ((seg < nr_segs) && (tot_bytes >= 0)) { - ssize_t result; - int page_count; - struct page **pages; - const struct iovec *vec = &iov[seg++]; - unsigned long user_addr = (unsigned long) vec->iov_base; - size_t size = vec->iov_len; - - page_count = nfs_get_user_pages(READ, user_addr, size, &pages); - if (page_count < 0) { - nfs_free_user_pages(pages, 0, 0); - if (tot_bytes > 0) - break; - return page_count; - } +static void nfs_direct_free_writedata(struct nfs_direct_req *dreq) +{ + list_splice_init(&dreq->rewrite_list, &dreq->list); + while (!list_empty(&dreq->list)) { + struct nfs_write_data *data = list_entry(dreq->list.next, struct nfs_write_data, pages); + list_del(&data->pages); + nfs_writedata_release(data); + } +} - result = nfs_direct_read_seg(inode, ctx, user_addr, size, - file_offset, pages, page_count); +#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) +{ + struct list_head *pos; - if (result <= 0) { - if (tot_bytes > 0) - break; - return result; - } - tot_bytes += result; - file_offset += result; - if (result < size) - break; + list_splice_init(&dreq->rewrite_list, &dreq->list); + 
list_for_each(pos, &dreq->list) + dreq->outstanding++; + dreq->count = 0; + + nfs_direct_write_schedule(dreq, FLUSH_STABLE); +} + +static void nfs_direct_commit_result(struct rpc_task *task, void *calldata) +{ + struct nfs_write_data *data = calldata; + struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; + + /* Call the NFS version-specific code */ + if (NFS_PROTO(data->inode)->commit_done(task, data) != 0) + return; + if (unlikely(task->tk_status < 0)) { + dreq->error = task->tk_status; + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + } + if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) { + dprintk("NFS: %5u commit verify failed\n", task->tk_pid); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; } - return tot_bytes; + dprintk("NFS: %5u commit returned %d\n", task->tk_pid, task->tk_status); + nfs_direct_write_complete(dreq, data->inode); } -/** - * nfs_direct_write_seg - Write out one iov segment. Generate separate - * write RPCs for each "wsize" bytes, then commit. - * @inode: target inode - * @ctx: target file open context - * user_addr: starting address of this segment of user's buffer - * count: size of this segment - * file_offset: offset in file to begin the operation - * @pages: array of addresses of page structs defining user's buffer - * nr_pages: size of pages array - */ -static ssize_t nfs_direct_write_seg(struct inode *inode, - struct nfs_open_context *ctx, unsigned long user_addr, - size_t count, loff_t file_offset, struct page **pages, - int nr_pages) -{ - const unsigned int wsize = NFS_SERVER(inode)->wsize; - size_t request; - int curpage, need_commit; - ssize_t result, tot_bytes; - struct nfs_writeverf first_verf; - struct nfs_write_data *wdata; +static const struct rpc_call_ops nfs_commit_direct_ops = { + .rpc_call_done = nfs_direct_commit_result, + .rpc_release = nfs_commit_release, +}; - wdata = nfs_writedata_alloc(NFS_SERVER(inode)->wpages); - if (!wdata) - return -ENOMEM; +static void nfs_direct_commit_schedule(struct 
nfs_direct_req *dreq) +{ + struct nfs_write_data *data = dreq->commit_data; + struct rpc_task *task = &data->task; - wdata->inode = inode; - wdata->cred = ctx->cred; - wdata->args.fh = NFS_FH(inode); - wdata->args.context = ctx; - wdata->args.stable = NFS_UNSTABLE; - if (IS_SYNC(inode) || NFS_PROTO(inode)->version == 2 || count <= wsize) - wdata->args.stable = NFS_FILE_SYNC; - wdata->res.fattr = &wdata->fattr; - wdata->res.verf = &wdata->verf; + data->inode = dreq->inode; + data->cred = dreq->ctx->cred; - nfs_begin_data_update(inode); -retry: - need_commit = 0; - tot_bytes = 0; - curpage = 0; - request = count; - wdata->args.pgbase = user_addr & ~PAGE_MASK; - wdata->args.offset = file_offset; - do { - wdata->args.count = request; - if (wdata->args.count > wsize) - wdata->args.count = wsize; - wdata->args.pages = &pages[curpage]; - - dprintk("NFS: direct write: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n", - wdata->args.count, (long long) wdata->args.offset, - user_addr + tot_bytes, wdata->args.pgbase, curpage); + data->args.fh = NFS_FH(data->inode); + data->args.offset = dreq->pos; + data->args.count = dreq->user_count; + data->res.count = 0; + data->res.fattr = &data->fattr; + data->res.verf = &data->verf; - lock_kernel(); - result = NFS_PROTO(inode)->write(wdata); - unlock_kernel(); + rpc_init_task(&data->task, NFS_CLIENT(dreq->inode), RPC_TASK_ASYNC, + &nfs_commit_direct_ops, data); + NFS_PROTO(data->inode)->commit_setup(data, 0); - if (result <= 0) { - if (tot_bytes > 0) - break; - goto out; - } + data->task.tk_priority = RPC_PRIORITY_NORMAL; + data->task.tk_cookie = (unsigned long)data->inode; + /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ + dreq->commit_data = NULL; + + dprintk("NFS: %5u initiated commit call\n", task->tk_pid); - if (tot_bytes == 0) - memcpy(&first_verf.verifier, &wdata->verf.verifier, - sizeof(first_verf.verifier)); - if (wdata->verf.committed != NFS_FILE_SYNC) { - need_commit = 1; - if (memcmp(&first_verf.verifier, 
&wdata->verf.verifier, - sizeof(first_verf.verifier))) - goto sync_retry; - } + lock_kernel(); + rpc_execute(&data->task); + unlock_kernel(); +} - tot_bytes += result; +static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) +{ + int flags = dreq->flags; - /* in case of a short write: stop now, let the app recover */ - if (result < wdata->args.count) + dreq->flags = 0; + switch (flags) { + case NFS_ODIRECT_DO_COMMIT: + nfs_direct_commit_schedule(dreq); break; + case NFS_ODIRECT_RESCHED_WRITES: + nfs_direct_write_reschedule(dreq); + break; + default: + nfs_end_data_update(inode); + if (dreq->commit_data != NULL) + nfs_commit_free(dreq->commit_data); + nfs_direct_free_writedata(dreq); + nfs_direct_complete(dreq); + } +} - wdata->args.offset += result; - wdata->args.pgbase += result; - curpage += wdata->args.pgbase >> PAGE_SHIFT; - wdata->args.pgbase &= ~PAGE_MASK; - request -= result; - } while (request != 0); +static void nfs_alloc_commit_data(struct nfs_direct_req *dreq) +{ + dreq->commit_data = nfs_commit_alloc(0); + if (dreq->commit_data != NULL) + dreq->commit_data->req = (struct nfs_page *) dreq; +} +#else +static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq) +{ + dreq->commit_data = NULL; +} - /* - * Commit data written so far, even in the event of an error - */ - if (need_commit) { - wdata->args.count = tot_bytes; - wdata->args.offset = file_offset; +static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) +{ + nfs_end_data_update(inode); + nfs_direct_free_writedata(dreq); + nfs_direct_complete(dreq); +} +#endif - lock_kernel(); - result = NFS_PROTO(inode)->commit(wdata); - unlock_kernel(); +static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize) +{ + struct list_head *list; + struct nfs_direct_req *dreq; + unsigned int wpages = (wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + dreq = nfs_direct_req_alloc(); + if (!dreq) + return NULL; + + list = 
&dreq->list; + for(;;) { + struct nfs_write_data *data = nfs_writedata_alloc(wpages); - if (result < 0 || memcmp(&first_verf.verifier, - &wdata->verf.verifier, - sizeof(first_verf.verifier)) != 0) - goto sync_retry; + if (unlikely(!data)) { + while (!list_empty(list)) { + data = list_entry(list->next, + struct nfs_write_data, pages); + list_del(&data->pages); + nfs_writedata_free(data); + } + kref_put(&dreq->kref, nfs_direct_req_release); + return NULL; + } + + INIT_LIST_HEAD(&data->pages); + list_add(&data->pages, list); + + data->req = (struct nfs_page *) dreq; + dreq->outstanding++; + if (nbytes <= wsize) + break; + nbytes -= wsize; } - result = tot_bytes; -out: - nfs_end_data_update(inode); - nfs_writedata_free(wdata); - return result; + nfs_alloc_commit_data(dreq); -sync_retry: - wdata->args.stable = NFS_FILE_SYNC; - goto retry; + kref_get(&dreq->kref); + return dreq; } -/** - * nfs_direct_write - For each iov segment, map the user's buffer - * then generate write and commit RPCs. - * @inode: target inode - * @ctx: target file open context - * @iov: array of vectors that define I/O buffer - * file_offset: offset in file to begin the operation - * nr_segs: size of iovec array - * - * Upon return, generic_file_direct_IO invalidates any cached pages - * that non-direct readers might access, so they will pick up these - * writes immediately. 
- */ -static ssize_t nfs_direct_write(struct inode *inode, - struct nfs_open_context *ctx, const struct iovec *iov, - loff_t file_offset, unsigned long nr_segs) -{ - ssize_t tot_bytes = 0; - unsigned long seg = 0; - - while ((seg < nr_segs) && (tot_bytes >= 0)) { - ssize_t result; - int page_count; - struct page **pages; - const struct iovec *vec = &iov[seg++]; - unsigned long user_addr = (unsigned long) vec->iov_base; - size_t size = vec->iov_len; - - page_count = nfs_get_user_pages(WRITE, user_addr, size, &pages); - if (page_count < 0) { - nfs_free_user_pages(pages, 0, 0); - if (tot_bytes > 0) - break; - return page_count; - } +static void nfs_direct_write_result(struct rpc_task *task, void *calldata) +{ + struct nfs_write_data *data = calldata; + struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; + int status = task->tk_status; + + if (nfs_writeback_done(task, data) != 0) + return; + + spin_lock(&dreq->lock); - result = nfs_direct_write_seg(inode, ctx, user_addr, size, - file_offset, pages, page_count); - nfs_free_user_pages(pages, page_count, 0); + if (likely(status >= 0)) + dreq->count += data->res.count; + else + dreq->error = task->tk_status; - if (result <= 0) { - if (tot_bytes > 0) + if (data->res.verf->committed != NFS_FILE_SYNC) { + switch (dreq->flags) { + case 0: + memcpy(&dreq->verf, &data->verf, sizeof(dreq->verf)); + dreq->flags = NFS_ODIRECT_DO_COMMIT; break; - return result; + case NFS_ODIRECT_DO_COMMIT: + if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) { + dprintk("NFS: %5u write verify failed\n", task->tk_pid); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + } } - tot_bytes += result; - file_offset += result; - if (result < size) - break; } - return tot_bytes; + /* In case we have to resend */ + data->args.stable = NFS_FILE_SYNC; + + spin_unlock(&dreq->lock); } -/** - * nfs_direct_IO - NFS address space operation for direct I/O - * rw: direction (read or write) - * @iocb: target I/O control block - * @iov: array of 
vectors that define I/O buffer - * file_offset: offset in file to begin the operation - * nr_segs: size of iovec array - * +/* + * NB: Return the value of the first error return code. Subsequent + * errors after the first one are ignored. */ -ssize_t -nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t file_offset, unsigned long nr_segs) +static void nfs_direct_write_release(void *calldata) { - ssize_t result = -EINVAL; - struct file *file = iocb->ki_filp; - struct nfs_open_context *ctx; - struct dentry *dentry = file->f_dentry; - struct inode *inode = dentry->d_inode; + struct nfs_write_data *data = calldata; + struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; - /* - * No support for async yet - */ + spin_lock(&dreq->lock); + if (--dreq->outstanding) { + spin_unlock(&dreq->lock); + return; + } + spin_unlock(&dreq->lock); + + nfs_direct_write_complete(dreq, data->inode); +} + +static const struct rpc_call_ops nfs_write_direct_ops = { + .rpc_call_done = nfs_direct_write_result, + .rpc_release = nfs_direct_write_release, +}; + +/* + * For each nfs_write_data struct that was allocated on the list, dispatch + * an NFS WRITE operation + */ +static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync) +{ + struct nfs_open_context *ctx = dreq->ctx; + struct inode *inode = ctx->dentry->d_inode; + struct list_head *list = &dreq->list; + struct page **pages = dreq->pages; + size_t count = dreq->user_count; + loff_t pos = dreq->pos; + size_t wsize = NFS_SERVER(inode)->wsize; + unsigned int curpage, pgbase; + + curpage = 0; + pgbase = dreq->user_addr & ~PAGE_MASK; + do { + struct nfs_write_data *data; + size_t bytes; + + bytes = wsize; + if (count < wsize) + bytes = count; + + BUG_ON(list_empty(list)); + data = list_entry(list->next, struct nfs_write_data, pages); + list_move_tail(&data->pages, &dreq->rewrite_list); + + data->inode = inode; + data->cred = ctx->cred; + data->args.fh = NFS_FH(inode); + data->args.context = 
ctx; + data->args.offset = pos; + data->args.pgbase = pgbase; + data->args.pages = &pages[curpage]; + data->args.count = bytes; + data->res.fattr = &data->fattr; + data->res.count = bytes; + data->res.verf = &data->verf; + + rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC, + &nfs_write_direct_ops, data); + NFS_PROTO(inode)->write_setup(data, sync); + + data->task.tk_priority = RPC_PRIORITY_NORMAL; + data->task.tk_cookie = (unsigned long) inode; + + lock_kernel(); + rpc_execute(&data->task); + unlock_kernel(); + + dfprintk(VFS, "NFS: %5u initiated direct write call (req %s/%Ld, %zu bytes @ offset %Lu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + bytes, + (unsigned long long)data->args.offset); + + pos += bytes; + pgbase += bytes; + curpage += pgbase >> PAGE_SHIFT; + pgbase &= ~PAGE_MASK; + + count -= bytes; + } while (count != 0); + BUG_ON(!list_empty(list)); +} + +static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, int nr_pages) +{ + ssize_t result; + sigset_t oldset; + struct inode *inode = iocb->ki_filp->f_mapping->host; + struct rpc_clnt *clnt = NFS_CLIENT(inode); + struct nfs_direct_req *dreq; + size_t wsize = NFS_SERVER(inode)->wsize; + int sync = 0; + + dreq = nfs_direct_write_alloc(count, wsize); + if (!dreq) + return -ENOMEM; + if (dreq->commit_data == NULL || count < wsize) + sync = FLUSH_STABLE; + + dreq->user_addr = user_addr; + dreq->user_count = count; + dreq->pos = pos; + dreq->pages = pages; + dreq->npages = nr_pages; + dreq->inode = inode; + dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data); if (!is_sync_kiocb(iocb)) - return result; + dreq->iocb = iocb; + + nfs_add_stats(inode, NFSIOS_DIRECTWRITTENBYTES, count); + + nfs_begin_data_update(inode); + + rpc_clnt_sigmask(clnt, &oldset); + nfs_direct_write_schedule(dreq, sync); + result = nfs_direct_wait(dreq); + rpc_clnt_sigunmask(clnt, &oldset); 
- ctx = (struct nfs_open_context *)file->private_data; - switch (rw) { - case READ: - dprintk("NFS: direct_IO(read) (%s) off/no(%Lu/%lu)\n", - dentry->d_name.name, file_offset, nr_segs); - - result = nfs_direct_read(inode, ctx, iov, - file_offset, nr_segs); - break; - case WRITE: - dprintk("NFS: direct_IO(write) (%s) off/no(%Lu/%lu)\n", - dentry->d_name.name, file_offset, nr_segs); - - result = nfs_direct_write(inode, ctx, iov, - file_offset, nr_segs); - break; - default: - break; - } return result; } @@ -630,49 +748,40 @@ nfs_direct_IO(int rw, struct kiocb *iocb * nfs_file_direct_read - file direct read operation for NFS files * @iocb: target I/O control block * @buf: user's buffer into which to read data - * count: number of bytes to read - * pos: byte offset in file where reading starts + * @count: number of bytes to read + * @pos: byte offset in file where reading starts * * We use this function for direct reads instead of calling * generic_file_aio_read() in order to avoid gfar's check to see if * the request starts before the end of the file. For that check * to work, we must generate a GETATTR before each direct read, and * even then there is a window between the GETATTR and the subsequent - * READ where the file size could change. So our preference is simply + * READ where the file size could change. Our preference is simply * to do all reads the application wants, and the server will take * care of managing the end of file boundary. - * + * * This function also eliminates unnecessarily updating the file's * atime locally, as the NFS server sets the file's atime, and this * client must read the updated atime from the server back into its * cache. 
*/ -ssize_t -nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos) +ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos) { ssize_t retval = -EINVAL; - loff_t *ppos = &iocb->ki_pos; + int page_count; + struct page **pages; struct file *file = iocb->ki_filp; - struct nfs_open_context *ctx = - (struct nfs_open_context *) file->private_data; struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - struct iovec iov = { - .iov_base = buf, - .iov_len = count, - }; dprintk("nfs: direct read(%s/%s, %lu@%Ld)\n", file->f_dentry->d_parent->d_name.name, file->f_dentry->d_name.name, (unsigned long) count, (long long) pos); - if (!is_sync_kiocb(iocb)) - goto out; if (count < 0) goto out; retval = -EFAULT; - if (!access_ok(VERIFY_WRITE, iov.iov_base, iov.iov_len)) + if (!access_ok(VERIFY_WRITE, buf, count)) goto out; retval = 0; if (!count) @@ -682,9 +791,16 @@ nfs_file_direct_read(struct kiocb *iocb, if (retval) goto out; - retval = nfs_direct_read(inode, ctx, &iov, pos, 1); + retval = nfs_get_user_pages(READ, (unsigned long) buf, + count, &pages); + if (retval < 0) + goto out; + page_count = retval; + + retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos, + pages, page_count); if (retval > 0) - *ppos = pos + retval; + iocb->ki_pos = pos + retval; out: return retval; @@ -694,8 +810,8 @@ out: * nfs_file_direct_write - file direct write operation for NFS files * @iocb: target I/O control block * @buf: user's buffer from which to write data - * count: number of bytes to write - * pos: byte offset in file where writing starts + * @count: number of bytes to write + * @pos: byte offset in file where writing starts * * We use this function for direct writes instead of calling * generic_file_aio_write() in order to avoid taking the inode @@ -715,28 +831,19 @@ out: * Note that O_APPEND is not supported for NFS direct writes, as there * is no atomic O_APPEND write facility in 
the NFS protocol. */ -ssize_t -nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos) +ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos) { ssize_t retval; + int page_count; + struct page **pages; struct file *file = iocb->ki_filp; - struct nfs_open_context *ctx = - (struct nfs_open_context *) file->private_data; struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - struct iovec iov = { - .iov_base = (char __user *)buf, - }; dfprintk(VFS, "nfs: direct write(%s/%s, %lu@%Ld)\n", file->f_dentry->d_parent->d_name.name, file->f_dentry->d_name.name, (unsigned long) count, (long long) pos); - retval = -EINVAL; - if (!is_sync_kiocb(iocb)) - goto out; - retval = generic_write_checks(file, &pos, &count, 0); if (retval) goto out; @@ -747,19 +854,35 @@ nfs_file_direct_write(struct kiocb *iocb retval = 0; if (!count) goto out; - iov.iov_len = count, retval = -EFAULT; - if (!access_ok(VERIFY_READ, iov.iov_base, iov.iov_len)) + if (!access_ok(VERIFY_READ, buf, count)) goto out; retval = nfs_sync_mapping(mapping); if (retval) goto out; - retval = nfs_direct_write(inode, ctx, &iov, pos, 1); + retval = nfs_get_user_pages(WRITE, (unsigned long) buf, + count, &pages); + if (retval < 0) + goto out; + page_count = retval; + + retval = nfs_direct_write(iocb, (unsigned long) buf, count, + pos, pages, page_count); + + /* + * XXX: nfs_end_data_update() already ensures this file's + * cached data is subsequently invalidated. Do we really + * need to call invalidate_inode_pages2() again here? + * + * For aio writes, this invalidation will almost certainly + * occur before the writes complete. Kind of racey. 
+ */ if (mapping->nrpages) invalidate_inode_pages2(mapping); + if (retval > 0) iocb->ki_pos = pos + retval; @@ -767,6 +890,10 @@ out: return retval; } +/** + * nfs_init_directcache - create a slab cache for nfs_direct_req structures + * + */ int nfs_init_directcache(void) { nfs_direct_cachep = kmem_cache_create("nfs_direct_cache", @@ -779,6 +906,10 @@ int nfs_init_directcache(void) return 0; } +/** + * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures + * + */ void nfs_destroy_directcache(void) { if (kmem_cache_destroy(nfs_direct_cachep)) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 7a79fbe..6bcbc4d 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -32,6 +32,7 @@ #include #include "delegation.h" +#include "iostat.h" #define NFSDBG_FACILITY NFSDBG_FILE @@ -102,18 +103,15 @@ static int nfs_check_flags(int flags) static int nfs_file_open(struct inode *inode, struct file *filp) { - struct nfs_server *server = NFS_SERVER(inode); - int (*open)(struct inode *, struct file *); int res; res = nfs_check_flags(filp->f_flags); if (res) return res; + nfs_inc_stats(inode, NFSIOS_VFSOPEN); lock_kernel(); - /* Do NFSv4 open() call */ - if ((open = server->rpc_ops->file_open) != NULL) - res = open(inode, filp); + res = NFS_SERVER(inode)->rpc_ops->file_open(inode, filp); unlock_kernel(); return res; } @@ -124,6 +122,7 @@ nfs_file_release(struct inode *inode, st /* Ensure that dirty pages are flushed out with the right creds */ if (filp->f_mode & FMODE_WRITE) filemap_fdatawrite(filp->f_mapping); + nfs_inc_stats(inode, NFSIOS_VFSRELEASE); return NFS_PROTO(inode)->file_release(inode, filp); } @@ -199,6 +198,7 @@ nfs_file_flush(struct file *file) if ((file->f_mode & FMODE_WRITE) == 0) return 0; + nfs_inc_stats(inode, NFSIOS_VFSFLUSH); lock_kernel(); /* Ensure that data+attribute caches are up to date after close() */ status = nfs_wb_all(inode); @@ -229,6 +229,7 @@ nfs_file_read(struct kiocb *iocb, char _ (unsigned long) count, (unsigned long) pos); result = 
nfs_revalidate_file(inode, iocb->ki_filp); + nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, count); if (!result) result = generic_file_aio_read(iocb, buf, count, pos); return result; @@ -282,6 +283,7 @@ nfs_fsync(struct file *file, struct dent dfprintk(VFS, "nfs: fsync(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); + nfs_inc_stats(inode, NFSIOS_VFSFSYNC); lock_kernel(); status = nfs_wb_all(inode); if (!status) { @@ -316,6 +318,17 @@ static int nfs_commit_write(struct file return status; } +static int nfs_invalidate_page(struct page *page, unsigned long offset) +{ + /* FIXME: we really should cancel any unstarted writes on this page */ + return 1; +} + +static int nfs_release_page(struct page *page, gfp_t gfp) +{ + return !nfs_wb_page(page->mapping->host, page); +} + struct address_space_operations nfs_file_aops = { .readpage = nfs_readpage, .readpages = nfs_readpages, @@ -324,6 +337,8 @@ struct address_space_operations nfs_file .writepages = nfs_writepages, .prepare_write = nfs_prepare_write, .commit_write = nfs_commit_write, + .invalidatepage = nfs_invalidate_page, + .releasepage = nfs_release_page, #ifdef CONFIG_NFS_DIRECTIO .direct_IO = nfs_direct_IO, #endif @@ -365,6 +380,7 @@ nfs_file_write(struct kiocb *iocb, const if (!count) goto out; + nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count); result = generic_file_aio_write(iocb, buf, count, pos); out: return result; @@ -376,15 +392,17 @@ out_swapfile: static int do_getlk(struct file *filp, int cmd, struct file_lock *fl) { - struct file_lock *cfl; + struct file_lock cfl; struct inode *inode = filp->f_mapping->host; int status = 0; lock_kernel(); /* Try local locking first */ - cfl = posix_test_lock(filp, fl); - if (cfl != NULL) { - locks_copy_lock(fl, cfl); + if (posix_test_lock(filp, fl, &cfl)) { + fl->fl_start = cfl.fl_start; + fl->fl_end = cfl.fl_end; + fl->fl_type = cfl.fl_type; + fl->fl_pid = cfl.fl_pid; goto out; } @@ -504,9 +522,7 @@ static int nfs_lock(struct file *filp, i inode->i_sb->s_id, 
inode->i_ino, fl->fl_type, fl->fl_flags, (long long)fl->fl_start, (long long)fl->fl_end); - - if (!inode) - return -EINVAL; + nfs_inc_stats(inode, NFSIOS_VFSLOCK); /* No mandatory locks over NFS */ if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID && @@ -531,9 +547,6 @@ static int nfs_flock(struct file *filp, inode->i_sb->s_id, inode->i_ino, fl->fl_type, fl->fl_flags); - if (!inode) - return -EINVAL; - /* * No BSD flocks over NFS allowed. * Note: we could try to fake a POSIX lock request here by diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 821edd3..3fab5b0 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -35,6 +35,7 @@ */ #include +#include #include #include #include @@ -74,8 +75,8 @@ struct idmap { struct dentry *idmap_dentry; wait_queue_head_t idmap_wq; struct idmap_msg idmap_im; - struct semaphore idmap_lock; /* Serializes upcalls */ - struct semaphore idmap_im_lock; /* Protects the hashtable */ + struct mutex idmap_lock; /* Serializes upcalls */ + struct mutex idmap_im_lock; /* Protects the hashtable */ struct idmap_hashtable idmap_user_hash; struct idmap_hashtable idmap_group_hash; }; @@ -101,11 +102,9 @@ nfs_idmap_new(struct nfs4_client *clp) if (clp->cl_idmap != NULL) return; - if ((idmap = kmalloc(sizeof(*idmap), GFP_KERNEL)) == NULL) + if ((idmap = kzalloc(sizeof(*idmap), GFP_KERNEL)) == NULL) return; - memset(idmap, 0, sizeof(*idmap)); - snprintf(idmap->idmap_path, sizeof(idmap->idmap_path), "%s/idmap", clp->cl_rpcclient->cl_pathname); @@ -116,8 +115,8 @@ nfs_idmap_new(struct nfs4_client *clp) return; } - init_MUTEX(&idmap->idmap_lock); - init_MUTEX(&idmap->idmap_im_lock); + mutex_init(&idmap->idmap_lock); + mutex_init(&idmap->idmap_im_lock); init_waitqueue_head(&idmap->idmap_wq); idmap->idmap_user_hash.h_type = IDMAP_TYPE_USER; idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP; @@ -132,6 +131,8 @@ nfs_idmap_delete(struct nfs4_client *clp if (!idmap) return; + dput(idmap->idmap_dentry); + idmap->idmap_dentry = NULL; 
rpc_unlink(idmap->idmap_path); clp->cl_idmap = NULL; kfree(idmap); @@ -232,8 +233,8 @@ nfs_idmap_id(struct idmap *idmap, struct if (namelen >= IDMAP_NAMESZ) return -EINVAL; - down(&idmap->idmap_lock); - down(&idmap->idmap_im_lock); + mutex_lock(&idmap->idmap_lock); + mutex_lock(&idmap->idmap_im_lock); he = idmap_lookup_name(h, name, namelen); if (he != NULL) { @@ -259,11 +260,11 @@ nfs_idmap_id(struct idmap *idmap, struct } set_current_state(TASK_UNINTERRUPTIBLE); - up(&idmap->idmap_im_lock); + mutex_unlock(&idmap->idmap_im_lock); schedule(); current->state = TASK_RUNNING; remove_wait_queue(&idmap->idmap_wq, &wq); - down(&idmap->idmap_im_lock); + mutex_lock(&idmap->idmap_im_lock); if (im->im_status & IDMAP_STATUS_SUCCESS) { *id = im->im_id; @@ -272,8 +273,8 @@ nfs_idmap_id(struct idmap *idmap, struct out: memset(im, 0, sizeof(*im)); - up(&idmap->idmap_im_lock); - up(&idmap->idmap_lock); + mutex_unlock(&idmap->idmap_im_lock); + mutex_unlock(&idmap->idmap_lock); return (ret); } @@ -293,8 +294,8 @@ nfs_idmap_name(struct idmap *idmap, stru im = &idmap->idmap_im; - down(&idmap->idmap_lock); - down(&idmap->idmap_im_lock); + mutex_lock(&idmap->idmap_lock); + mutex_lock(&idmap->idmap_im_lock); he = idmap_lookup_id(h, id); if (he != 0) { @@ -320,11 +321,11 @@ nfs_idmap_name(struct idmap *idmap, stru } set_current_state(TASK_UNINTERRUPTIBLE); - up(&idmap->idmap_im_lock); + mutex_unlock(&idmap->idmap_im_lock); schedule(); current->state = TASK_RUNNING; remove_wait_queue(&idmap->idmap_wq, &wq); - down(&idmap->idmap_im_lock); + mutex_lock(&idmap->idmap_im_lock); if (im->im_status & IDMAP_STATUS_SUCCESS) { if ((len = strnlen(im->im_name, IDMAP_NAMESZ)) == 0) @@ -335,8 +336,8 @@ nfs_idmap_name(struct idmap *idmap, stru out: memset(im, 0, sizeof(*im)); - up(&idmap->idmap_im_lock); - up(&idmap->idmap_lock); + mutex_unlock(&idmap->idmap_im_lock); + mutex_unlock(&idmap->idmap_lock); return ret; } @@ -380,7 +381,7 @@ idmap_pipe_downcall(struct file *filp, c if (copy_from_user(&im_in, 
src, mlen) != 0) return (-EFAULT); - down(&idmap->idmap_im_lock); + mutex_lock(&idmap->idmap_im_lock); ret = mlen; im->im_status = im_in.im_status; @@ -440,7 +441,7 @@ idmap_pipe_downcall(struct file *filp, c idmap_update_entry(he, im_in.im_name, namelen_in, im_in.im_id); ret = mlen; out: - up(&idmap->idmap_im_lock); + mutex_unlock(&idmap->idmap_im_lock); return ret; } @@ -452,10 +453,10 @@ idmap_pipe_destroy_msg(struct rpc_pipe_m if (msg->errno >= 0) return; - down(&idmap->idmap_im_lock); + mutex_lock(&idmap->idmap_im_lock); im->im_status = IDMAP_STATUS_LOOKUPFAIL; wake_up(&idmap->idmap_wq); - up(&idmap->idmap_im_lock); + mutex_unlock(&idmap->idmap_im_lock); } /* diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index a77ee95..22606ba 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -42,6 +43,7 @@ #include "nfs4_fs.h" #include "callback.h" #include "delegation.h" +#include "iostat.h" #define NFSDBG_FACILITY NFSDBG_VFS #define NFS_PARANOIA 1 @@ -65,6 +67,7 @@ static void nfs_clear_inode(struct inode static void nfs_umount_begin(struct super_block *); static int nfs_statfs(struct super_block *, struct kstatfs *); static int nfs_show_options(struct seq_file *, struct vfsmount *); +static int nfs_show_stats(struct seq_file *, struct vfsmount *); static void nfs_zap_acl_cache(struct inode *); static struct rpc_program nfs_program; @@ -78,6 +81,7 @@ static struct super_operations nfs_sops .clear_inode = nfs_clear_inode, .umount_begin = nfs_umount_begin, .show_options = nfs_show_options, + .show_stats = nfs_show_stats, }; /* @@ -290,6 +294,15 @@ nfs_sb_init(struct super_block *sb, rpc_ } sb->s_root->d_op = server->rpc_ops->dentry_ops; + server->io_stats = nfs_alloc_iostats(); + if (!server->io_stats) { + no_root_error = -ENOMEM; + goto out_no_root; + } + + /* mount time stamp, in seconds */ + server->mount_time = jiffies; + /* Get some general file system info */ if (server->namelen == 0 && 
server->rpc_ops->pathconf(server, &server->fh, &pathinfo) >= 0) @@ -396,6 +409,9 @@ nfs_create_client(struct nfs_server *ser nfs_init_timeout_values(&timeparms, proto, data->timeo, data->retrans); + server->retrans_timeo = timeparms.to_initval; + server->retrans_count = timeparms.to_retries; + /* create transport and client */ xprt = xprt_create_proto(proto, &server->addr, &timeparms); if (IS_ERR(xprt)) { @@ -579,7 +595,7 @@ nfs_statfs(struct super_block *sb, struc } -static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) +static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults) { static struct proc_nfs_info { int flag; @@ -588,28 +604,26 @@ static int nfs_show_options(struct seq_f } nfs_info[] = { { NFS_MOUNT_SOFT, ",soft", ",hard" }, { NFS_MOUNT_INTR, ",intr", "" }, - { NFS_MOUNT_POSIX, ",posix", "" }, { NFS_MOUNT_NOCTO, ",nocto", "" }, { NFS_MOUNT_NOAC, ",noac", "" }, - { NFS_MOUNT_NONLM, ",nolock", ",lock" }, + { NFS_MOUNT_NONLM, ",nolock", "" }, { NFS_MOUNT_NOACL, ",noacl", "" }, { 0, NULL, NULL } }; struct proc_nfs_info *nfs_infop; - struct nfs_server *nfss = NFS_SB(mnt->mnt_sb); char buf[12]; char *proto; - seq_printf(m, ",v%d", nfss->rpc_ops->version); + seq_printf(m, ",vers=%d", nfss->rpc_ops->version); seq_printf(m, ",rsize=%d", nfss->rsize); seq_printf(m, ",wsize=%d", nfss->wsize); - if (nfss->acregmin != 3*HZ) + if (nfss->acregmin != 3*HZ || showdefaults) seq_printf(m, ",acregmin=%d", nfss->acregmin/HZ); - if (nfss->acregmax != 60*HZ) + if (nfss->acregmax != 60*HZ || showdefaults) seq_printf(m, ",acregmax=%d", nfss->acregmax/HZ); - if (nfss->acdirmin != 30*HZ) + if (nfss->acdirmin != 30*HZ || showdefaults) seq_printf(m, ",acdirmin=%d", nfss->acdirmin/HZ); - if (nfss->acdirmax != 60*HZ) + if (nfss->acdirmax != 60*HZ || showdefaults) seq_printf(m, ",acdirmax=%d", nfss->acdirmax/HZ); for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) { if (nfss->flags & nfs_infop->flag) @@ -629,8 +643,96 @@ 
static int nfs_show_options(struct seq_f proto = buf; } seq_printf(m, ",proto=%s", proto); + seq_printf(m, ",timeo=%lu", 10U * nfss->retrans_timeo / HZ); + seq_printf(m, ",retrans=%u", nfss->retrans_count); +} + +static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) +{ + struct nfs_server *nfss = NFS_SB(mnt->mnt_sb); + + nfs_show_mount_options(m, nfss, 0); + seq_puts(m, ",addr="); seq_escape(m, nfss->hostname, " \t\n\\"); + + return 0; +} + +static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) +{ + int i, cpu; + struct nfs_server *nfss = NFS_SB(mnt->mnt_sb); + struct rpc_auth *auth = nfss->client->cl_auth; + struct nfs_iostats totals = { }; + + seq_printf(m, "statvers=%s", NFS_IOSTAT_VERS); + + /* + * Display all mount option settings + */ + seq_printf(m, "\n\topts:\t"); + seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw"); + seq_puts(m, mnt->mnt_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : ""); + seq_puts(m, mnt->mnt_sb->s_flags & MS_NOATIME ? ",noatime" : ""); + seq_puts(m, mnt->mnt_sb->s_flags & MS_NODIRATIME ? 
",nodiratime" : ""); + nfs_show_mount_options(m, nfss, 1); + + seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ); + + seq_printf(m, "\n\tcaps:\t"); + seq_printf(m, "caps=0x%x", nfss->caps); + seq_printf(m, ",wtmult=%d", nfss->wtmult); + seq_printf(m, ",dtsize=%d", nfss->dtsize); + seq_printf(m, ",bsize=%d", nfss->bsize); + seq_printf(m, ",namelen=%d", nfss->namelen); + +#ifdef CONFIG_NFS_V4 + if (nfss->rpc_ops->version == 4) { + seq_printf(m, "\n\tnfsv4:\t"); + seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); + seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); + seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); + } +#endif + + /* + * Display security flavor in effect for this mount + */ + seq_printf(m, "\n\tsec:\tflavor=%d", auth->au_ops->au_flavor); + if (auth->au_flavor) + seq_printf(m, ",pseudoflavor=%d", auth->au_flavor); + + /* + * Display superblock I/O counters + */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + struct nfs_iostats *stats; + + if (!cpu_possible(cpu)) + continue; + + preempt_disable(); + stats = per_cpu_ptr(nfss->io_stats, cpu); + + for (i = 0; i < __NFSIOS_COUNTSMAX; i++) + totals.events[i] += stats->events[i]; + for (i = 0; i < __NFSIOS_BYTESMAX; i++) + totals.bytes[i] += stats->bytes[i]; + + preempt_enable(); + } + + seq_printf(m, "\n\tevents:\t"); + for (i = 0; i < __NFSIOS_COUNTSMAX; i++) + seq_printf(m, "%lu ", totals.events[i]); + seq_printf(m, "\n\tbytes:\t"); + for (i = 0; i < __NFSIOS_BYTESMAX; i++) + seq_printf(m, "%Lu ", totals.bytes[i]); + seq_printf(m, "\n"); + + rpc_print_iostats(m, nfss->client); + return 0; } @@ -660,6 +762,8 @@ static void nfs_zap_caches_locked(struct struct nfs_inode *nfsi = NFS_I(inode); int mode = inode->i_mode; + nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); + NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode); NFS_ATTRTIMEO_UPDATE(inode) = jiffies; @@ -847,6 +951,8 @@ nfs_setattr(struct dentry *dentry, struc struct nfs_fattr fattr; int error; + nfs_inc_stats(inode, NFSIOS_VFSSETATTR); + if 
(attr->ia_valid & ATTR_SIZE) { if (!S_ISREG(inode->i_mode) || attr->ia_size == i_size_read(inode)) attr->ia_valid &= ~ATTR_SIZE; @@ -859,11 +965,9 @@ nfs_setattr(struct dentry *dentry, struc lock_kernel(); nfs_begin_data_update(inode); - /* Write all dirty data if we're changing file permissions or size */ - if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE)) != 0) { - filemap_write_and_wait(inode->i_mapping); - nfs_wb_all(inode); - } + /* Write all dirty data */ + filemap_write_and_wait(inode->i_mapping); + nfs_wb_all(inode); /* * Return any delegations if we're going to change ACLs */ @@ -902,6 +1006,7 @@ void nfs_setattr_update_inode(struct ino spin_unlock(&inode->i_lock); } if ((attr->ia_valid & ATTR_SIZE) != 0) { + nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); inode->i_size = attr->ia_size; vmtruncate(inode, attr->ia_size); } @@ -1185,6 +1290,7 @@ int nfs_attribute_timeout(struct inode * */ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) { + nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); if (!(NFS_I(inode)->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) && !nfs_attribute_timeout(inode)) return NFS_STALE(inode) ? -ESTALE : 0; @@ -1201,6 +1307,7 @@ void nfs_revalidate_mapping(struct inode struct nfs_inode *nfsi = NFS_I(inode); if (nfsi->cache_validity & NFS_INO_INVALID_DATA) { + nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); if (S_ISREG(inode->i_mode)) nfs_sync_mapping(mapping); invalidate_inode_pages2(mapping); @@ -1299,39 +1406,37 @@ static int nfs_check_inode_attributes(st if ((fattr->valid & NFS_ATTR_FATTR) == 0) return 0; + /* Has the inode gone and changed behind our back? */ + if (nfsi->fileid != fattr->fileid + || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { + return -EIO; + } + /* Are we in the process of updating data on the server? 
*/ data_unstable = nfs_caches_unstable(inode); /* Do atomic weak cache consistency updates */ nfs_wcc_update_inode(inode, fattr); - if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && - nfsi->change_attr != fattr->change_attr) { + if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0) { + if (nfsi->change_attr == fattr->change_attr) + goto out; nfsi->cache_validity |= NFS_INO_INVALID_ATTR; if (!data_unstable) nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE; } - /* Has the inode gone and changed behind our back? */ - if (nfsi->fileid != fattr->fileid - || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { - return -EIO; - } - - cur_size = i_size_read(inode); - new_isize = nfs_size_to_loff_t(fattr->size); - /* Verify a few of the more important attributes */ if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { nfsi->cache_validity |= NFS_INO_INVALID_ATTR; if (!data_unstable) nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE; } - if (cur_size != new_isize) { - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; - if (nfsi->npages == 0) - nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE; - } + + cur_size = i_size_read(inode); + new_isize = nfs_size_to_loff_t(fattr->size); + if (cur_size != new_isize && nfsi->npages == 0) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; /* Have any file permissions changed? 
*/ if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) @@ -1343,6 +1448,7 @@ static int nfs_check_inode_attributes(st if (inode->i_nlink != fattr->nlink) nfsi->cache_validity |= NFS_INO_INVALID_ATTR; +out: if (!timespec_equal(&inode->i_atime, &fattr->atime)) nfsi->cache_validity |= NFS_INO_INVALID_ATIME; @@ -1481,15 +1587,6 @@ static int nfs_update_inode(struct inode nfsi->cache_change_attribute = jiffies; } - if ((fattr->valid & NFS_ATTR_FATTR_V4) - && nfsi->change_attr != fattr->change_attr) { - dprintk("NFS: change_attr change on server for file %s/%ld\n", - inode->i_sb->s_id, inode->i_ino); - nfsi->change_attr = fattr->change_attr; - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - nfsi->cache_change_attribute = jiffies; - } - /* If ctime has changed we should definitely clear access+acl caches */ if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; @@ -1519,8 +1616,20 @@ static int nfs_update_inode(struct inode inode->i_blksize = fattr->du.nfs2.blocksize; } + if ((fattr->valid & NFS_ATTR_FATTR_V4)) { + if (nfsi->change_attr != fattr->change_attr) { + dprintk("NFS: change_attr change on server for file %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); + nfsi->change_attr = fattr->change_attr; + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + nfsi->cache_change_attribute = jiffies; + } else + invalid &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA); + } + /* Update attrtimeo value if we're out of the unstable period */ if (invalid & NFS_INO_INVALID_ATTR) { + nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = jiffies; } else if (time_after(jiffies, nfsi->attrtimeo_timestamp+nfsi->attrtimeo)) { @@ -1637,10 +1746,9 @@ static struct super_block *nfs_get_sb(st #endif /* CONFIG_NFS_V3 */ s = ERR_PTR(-ENOMEM); - server = kmalloc(sizeof(struct 
nfs_server), GFP_KERNEL); + server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); if (!server) goto out_err; - memset(server, 0, sizeof(struct nfs_server)); /* Zero out the NFS state stuff */ init_nfsv4_state(server); server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL); @@ -1738,6 +1846,7 @@ static struct super_operations nfs4_sops .clear_inode = nfs4_clear_inode, .umount_begin = nfs_umount_begin, .show_options = nfs_show_options, + .show_stats = nfs_show_stats, }; /* @@ -1800,6 +1909,9 @@ static int nfs4_fill_super(struct super_ nfs_init_timeout_values(&timeparms, data->proto, data->timeo, data->retrans); + server->retrans_timeo = timeparms.to_initval; + server->retrans_count = timeparms.to_retries; + clp = nfs4_get_client(&server->addr.sin_addr); if (!clp) { dprintk("%s: failed to create NFS4 client.\n", __FUNCTION__); @@ -1941,10 +2053,9 @@ static struct super_block *nfs4_get_sb(s return ERR_PTR(-EINVAL); } - server = kmalloc(sizeof(struct nfs_server), GFP_KERNEL); + server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); if (!server) return ERR_PTR(-ENOMEM); - memset(server, 0, sizeof(struct nfs_server)); /* Zero out the NFS state stuff */ init_nfsv4_state(server); server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL); @@ -2009,6 +2120,7 @@ out_err: out_free: kfree(server->mnt_path); kfree(server->hostname); + nfs_free_iostats(server->io_stats); kfree(server); return s; } @@ -2024,10 +2136,11 @@ static void nfs4_kill_super(struct super if (server->client != NULL && !IS_ERR(server->client)) rpc_shutdown_client(server->client); - rpciod_down(); /* release rpciod */ destroy_nfsv4_state(server); + rpciod_down(); + kfree(server->hostname); kfree(server); } diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h new file mode 100644 index 0000000..7a74951 --- /dev/null +++ b/fs/nfs/iostat.h @@ -0,0 +1,163 @@ +/* + * linux/fs/nfs/iostat.h + * + * Declarations for NFS client per-mount statistics + * + * Copyright (C) 2005, 2006 
Chuck Lever + * + * NFS client per-mount statistics provide information about the health of + * the NFS client and the health of each NFS mount point. Generally these + * are not for detailed problem diagnosis, but simply to indicate that there + * is a problem. + * + * These counters are not meant to be human-readable, but are meant to be + * integrated into system monitoring tools such as "sar" and "iostat". As + * such, the counters are sampled by the tools over time, and are never + * zeroed after a file system is mounted. Moving averages can be computed + * by the tools by taking the difference between two instantaneous samples + * and dividing that by the time between the samples. + */ + +#ifndef _NFS_IOSTAT +#define _NFS_IOSTAT + +#define NFS_IOSTAT_VERS "1.0" + +/* + * NFS byte counters + * + * 1. SERVER - the number of payload bytes read from or written to the + * server by the NFS client via an NFS READ or WRITE request. + * + * 2. NORMAL - the number of bytes read or written by applications via + * the read(2) and write(2) system call interfaces. + * + * 3. DIRECT - the number of bytes read or written from files opened + * with the O_DIRECT flag. + * + * These counters give a view of the data throughput into and out of the NFS + * client. Comparing the number of bytes requested by an application with the + * number of bytes the client requests from the server can provide an + * indication of client efficiency (per-op, cache hits, etc). + * + * These counters can also help characterize which access methods are in + * use. DIRECT by itself shows whether there is any O_DIRECT traffic. + * NORMAL + DIRECT shows how much data is going through the system call + * interface. A large amount of SERVER traffic without much NORMAL or + * DIRECT traffic shows that applications are using mapped files. + * + * NFS page counters + * + * These count the number of pages read or written via nfs_readpage(), + * nfs_readpages(), or their write equivalents. 
+ */ +enum nfs_stat_bytecounters { + NFSIOS_NORMALREADBYTES = 0, + NFSIOS_NORMALWRITTENBYTES, + NFSIOS_DIRECTREADBYTES, + NFSIOS_DIRECTWRITTENBYTES, + NFSIOS_SERVERREADBYTES, + NFSIOS_SERVERWRITTENBYTES, + NFSIOS_READPAGES, + NFSIOS_WRITEPAGES, + __NFSIOS_BYTESMAX, +}; + +/* + * NFS event counters + * + * These counters provide a low-overhead way of monitoring client activity + * without enabling NFS trace debugging. The counters show the rate at + * which VFS requests are made, and how often the client invalidates its + * data and attribute caches. This allows system administrators to monitor + * such things as how close-to-open is working, and answer questions such + * as "why are there so many GETATTR requests on the wire?" + * + * They also count anomalous events such as short reads and writes, silly + * renames due to close-after-delete, and operations that change the size + * of a file (such operations can often be the source of data corruption + * if applications aren't using file locking properly). 
+ */ +enum nfs_stat_eventcounters { + NFSIOS_INODEREVALIDATE = 0, + NFSIOS_DENTRYREVALIDATE, + NFSIOS_DATAINVALIDATE, + NFSIOS_ATTRINVALIDATE, + NFSIOS_VFSOPEN, + NFSIOS_VFSLOOKUP, + NFSIOS_VFSACCESS, + NFSIOS_VFSUPDATEPAGE, + NFSIOS_VFSREADPAGE, + NFSIOS_VFSREADPAGES, + NFSIOS_VFSWRITEPAGE, + NFSIOS_VFSWRITEPAGES, + NFSIOS_VFSGETDENTS, + NFSIOS_VFSSETATTR, + NFSIOS_VFSFLUSH, + NFSIOS_VFSFSYNC, + NFSIOS_VFSLOCK, + NFSIOS_VFSRELEASE, + NFSIOS_CONGESTIONWAIT, + NFSIOS_SETATTRTRUNC, + NFSIOS_EXTENDWRITE, + NFSIOS_SILLYRENAME, + NFSIOS_SHORTREAD, + NFSIOS_SHORTWRITE, + NFSIOS_DELAY, + __NFSIOS_COUNTSMAX, +}; + +#ifdef __KERNEL__ + +#include <linux/percpu.h> +#include <linux/cache.h> + +struct nfs_iostats { + unsigned long long bytes[__NFSIOS_BYTESMAX]; + unsigned long events[__NFSIOS_COUNTSMAX]; +} ____cacheline_aligned; + +static inline void nfs_inc_server_stats(struct nfs_server *server, enum nfs_stat_eventcounters stat) +{ + struct nfs_iostats *iostats; + int cpu; + + cpu = get_cpu(); + iostats = per_cpu_ptr(server->io_stats, cpu); + iostats->events[stat] ++; + put_cpu_no_resched(); +} + +static inline void nfs_inc_stats(struct inode *inode, enum nfs_stat_eventcounters stat) +{ + nfs_inc_server_stats(NFS_SERVER(inode), stat); +} + +static inline void nfs_add_server_stats(struct nfs_server *server, enum nfs_stat_bytecounters stat, unsigned long addend) +{ + struct nfs_iostats *iostats; + int cpu; + + cpu = get_cpu(); + iostats = per_cpu_ptr(server->io_stats, cpu); + iostats->bytes[stat] += addend; + put_cpu_no_resched(); +} + +static inline void nfs_add_stats(struct inode *inode, enum nfs_stat_bytecounters stat, unsigned long addend) +{ + nfs_add_server_stats(NFS_SERVER(inode), stat, addend); +} + +static inline struct nfs_iostats *nfs_alloc_iostats(void) +{ + return alloc_percpu(struct nfs_iostats); +} + +static inline void nfs_free_iostats(struct nfs_iostats *stats) +{ + free_percpu(stats); +} + +#endif +#endif diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index db99b8f..c44d87b 100644 
--- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -49,9 +49,12 @@ nfsroot_mount(struct sockaddr_in *addr, struct mnt_fhstatus result = { .fh = fh }; + struct rpc_message msg = { + .rpc_argp = path, + .rpc_resp = &result, + }; char hostname[32]; int status; - int call; dprintk("NFS: nfs_mount(%08x:%s)\n", (unsigned)ntohl(addr->sin_addr.s_addr), path); @@ -61,8 +64,12 @@ nfsroot_mount(struct sockaddr_in *addr, if (IS_ERR(mnt_clnt)) return PTR_ERR(mnt_clnt); - call = (version == NFS_MNT3_VERSION) ? MOUNTPROC3_MNT : MNTPROC_MNT; - status = rpc_call(mnt_clnt, call, path, &result, 0); + if (version == NFS_MNT3_VERSION) + msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT]; + else + msg.rpc_proc = &mnt_clnt->cl_procinfo[MNTPROC_MNT]; + + status = rpc_call_sync(mnt_clnt, &msg, 0); return status < 0? status : (result.status? -EACCES : 0); } @@ -137,6 +144,8 @@ static struct rpc_procinfo mnt_procedure .p_encode = (kxdrproc_t) xdr_encode_dirpath, .p_decode = (kxdrproc_t) xdr_decode_fhstatus, .p_bufsiz = MNT_dirpath_sz << 2, + .p_statidx = MNTPROC_MNT, + .p_name = "MOUNT", }, }; @@ -146,6 +155,8 @@ static struct rpc_procinfo mnt3_procedur .p_encode = (kxdrproc_t) xdr_encode_dirpath, .p_decode = (kxdrproc_t) xdr_decode_fhstatus3, .p_bufsiz = MNT_dirpath_sz << 2, + .p_statidx = MOUNTPROC3_MNT, + .p_name = "MOUNT", }, }; diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 7fc0560..8cdc792 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -682,7 +682,9 @@ nfs_stat_to_errno(int stat) .p_encode = (kxdrproc_t) nfs_xdr_##argtype, \ .p_decode = (kxdrproc_t) nfs_xdr_##restype, \ .p_bufsiz = MAX(NFS_##argtype##_sz,NFS_##restype##_sz) << 2, \ - .p_timer = timer \ + .p_timer = timer, \ + .p_statidx = NFSPROC_##proc, \ + .p_name = #proc, \ } struct rpc_procinfo nfs_procedures[] = { PROC(GETATTR, fhandle, attrstat, 1), diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 6a5bbc0..3328787 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -190,6 +190,10 @@ struct 
posix_acl *nfs3_proc_getacl(struc struct nfs3_getaclres res = { .fattr = &fattr, }; + struct rpc_message msg = { + .rpc_argp = &args, + .rpc_resp = &res, + }; struct posix_acl *acl; int status, count; @@ -218,8 +222,8 @@ struct posix_acl *nfs3_proc_getacl(struc return NULL; dprintk("NFS call getacl\n"); - status = rpc_call(server->client_acl, ACLPROC3_GETACL, - &args, &res, 0); + msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL]; + status = rpc_call_sync(server->client_acl, &msg, 0); dprintk("NFS reply getacl: %d\n", status); /* pages may have been allocated at the xdr layer. */ @@ -286,6 +290,10 @@ static int nfs3_proc_setacls(struct inod .acl_access = acl, .pages = pages, }; + struct rpc_message msg = { + .rpc_argp = &args, + .rpc_resp = &fattr, + }; int status, count; status = -EOPNOTSUPP; @@ -306,8 +314,8 @@ static int nfs3_proc_setacls(struct inod dprintk("NFS call setacl\n"); nfs_begin_data_update(inode); - status = rpc_call(server->client_acl, ACLPROC3_SETACL, - &args, &fattr, 0); + msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; + status = rpc_call_sync(server->client_acl, &msg, 0); spin_lock(&inode->i_lock); NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS; spin_unlock(&inode->i_lock); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index ed67567..cf186f0 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -19,6 +19,8 @@ #include #include +#include "iostat.h" + #define NFSDBG_FACILITY NFSDBG_PROC extern struct rpc_procinfo nfs3_procedures[]; @@ -41,27 +43,14 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, return res; } -static inline int -nfs3_rpc_call_wrapper(struct rpc_clnt *clnt, u32 proc, void *argp, void *resp, int flags) -{ - struct rpc_message msg = { - .rpc_proc = &clnt->cl_procinfo[proc], - .rpc_argp = argp, - .rpc_resp = resp, - }; - return nfs3_rpc_wrapper(clnt, &msg, flags); -} - -#define rpc_call(clnt, proc, argp, resp, flags) \ - nfs3_rpc_call_wrapper(clnt, proc, argp, resp, flags) -#define 
rpc_call_sync(clnt, msg, flags) \ - nfs3_rpc_wrapper(clnt, msg, flags) +#define rpc_call_sync(clnt, msg, flags) nfs3_rpc_wrapper(clnt, msg, flags) static int -nfs3_async_handle_jukebox(struct rpc_task *task) +nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode) { if (task->tk_status != -EJUKEBOX) return 0; + nfs_inc_stats(inode, NFSIOS_DELAY); task->tk_status = 0; rpc_restart_call(task); rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); @@ -72,14 +61,21 @@ static int do_proc_get_root(struct rpc_clnt *client, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_FSINFO], + .rpc_argp = fhandle, + .rpc_resp = info, + }; int status; dprintk("%s: call fsinfo\n", __FUNCTION__); nfs_fattr_init(info->fattr); - status = rpc_call(client, NFS3PROC_FSINFO, fhandle, info, 0); + status = rpc_call_sync(client, &msg, 0); dprintk("%s: reply fsinfo: %d\n", __FUNCTION__, status); if (!(info->fattr->valid & NFS_ATTR_FATTR)) { - status = rpc_call(client, NFS3PROC_GETATTR, fhandle, info->fattr, 0); + msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; + msg.rpc_resp = info->fattr; + status = rpc_call_sync(client, &msg, 0); dprintk("%s: reply getattr: %d\n", __FUNCTION__, status); } return status; @@ -107,12 +103,16 @@ static int nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR], + .rpc_argp = fhandle, + .rpc_resp = fattr, + }; int status; dprintk("NFS call getattr\n"); nfs_fattr_init(fattr); - status = rpc_call(server->client, NFS3PROC_GETATTR, - fhandle, fattr, 0); + status = rpc_call_sync(server->client, &msg, 0); dprintk("NFS reply getattr: %d\n", status); return status; } @@ -126,11 +126,16 @@ nfs3_proc_setattr(struct dentry *dentry, .fh = NFS_FH(inode), .sattr = sattr, }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_SETATTR], + .rpc_argp = &arg, + .rpc_resp = 
fattr, + }; int status; dprintk("NFS call setattr\n"); nfs_fattr_init(fattr); - status = rpc_call(NFS_CLIENT(inode), NFS3PROC_SETATTR, &arg, fattr, 0); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (status == 0) nfs_setattr_update_inode(inode, sattr); dprintk("NFS reply setattr: %d\n", status); @@ -152,15 +157,23 @@ nfs3_proc_lookup(struct inode *dir, stru .fh = fhandle, .fattr = fattr }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_LOOKUP], + .rpc_argp = &arg, + .rpc_resp = &res, + }; int status; dprintk("NFS call lookup %s\n", name->name); nfs_fattr_init(&dir_attr); nfs_fattr_init(fattr); - status = rpc_call(NFS_CLIENT(dir), NFS3PROC_LOOKUP, &arg, &res, 0); - if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) - status = rpc_call(NFS_CLIENT(dir), NFS3PROC_GETATTR, - fhandle, fattr, 0); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) { + msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; + msg.rpc_argp = fhandle; + msg.rpc_resp = fattr; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + } dprintk("NFS reply lookup: %d\n", status); if (status >= 0) status = nfs_refresh_inode(dir, &dir_attr); @@ -180,7 +193,7 @@ static int nfs3_proc_access(struct inode .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS], .rpc_argp = &arg, .rpc_resp = &res, - .rpc_cred = entry->cred + .rpc_cred = entry->cred, }; int mode = entry->mask; int status; @@ -226,12 +239,16 @@ static int nfs3_proc_readlink(struct ino .pglen = pglen, .pages = &page }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_READLINK], + .rpc_argp = &args, + .rpc_resp = &fattr, + }; int status; dprintk("NFS call readlink\n"); nfs_fattr_init(&fattr); - status = rpc_call(NFS_CLIENT(inode), NFS3PROC_READLINK, - &args, &fattr, 0); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); nfs_refresh_inode(inode, &fattr); dprintk("NFS reply readlink: %d\n", status); return status; @@ -327,6 +344,11 @@ 
nfs3_proc_create(struct inode *dir, stru .fh = &fhandle, .fattr = &fattr }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_CREATE], + .rpc_argp = &arg, + .rpc_resp = &res, + }; mode_t mode = sattr->ia_mode; int status; @@ -343,8 +365,8 @@ nfs3_proc_create(struct inode *dir, stru again: nfs_fattr_init(&dir_attr); nfs_fattr_init(&fattr); - status = rpc_call(NFS_CLIENT(dir), NFS3PROC_CREATE, &arg, &res, 0); - nfs_post_op_update_inode(dir, &dir_attr); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_refresh_inode(dir, &dir_attr); /* If the server doesn't support the exclusive creation semantics, * try again with simple 'guarded' mode. */ @@ -447,7 +469,7 @@ nfs3_proc_unlink_done(struct dentry *dir struct rpc_message *msg = &task->tk_msg; struct nfs_fattr *dir_attr; - if (nfs3_async_handle_jukebox(task)) + if (nfs3_async_handle_jukebox(task, dir->d_inode)) return 1; if (msg->rpc_argp) { dir_attr = (struct nfs_fattr*)msg->rpc_resp; @@ -474,12 +496,17 @@ nfs3_proc_rename(struct inode *old_dir, .fromattr = &old_dir_attr, .toattr = &new_dir_attr }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME], + .rpc_argp = &arg, + .rpc_resp = &res, + }; int status; dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); nfs_fattr_init(&old_dir_attr); nfs_fattr_init(&new_dir_attr); - status = rpc_call(NFS_CLIENT(old_dir), NFS3PROC_RENAME, &arg, &res, 0); + status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); nfs_post_op_update_inode(old_dir, &old_dir_attr); nfs_post_op_update_inode(new_dir, &new_dir_attr); dprintk("NFS reply rename: %d\n", status); @@ -500,12 +527,17 @@ nfs3_proc_link(struct inode *inode, stru .dir_attr = &dir_attr, .fattr = &fattr }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_LINK], + .rpc_argp = &arg, + .rpc_resp = &res, + }; int status; dprintk("NFS call link %s\n", name->name); nfs_fattr_init(&dir_attr); nfs_fattr_init(&fattr); - status = rpc_call(NFS_CLIENT(inode), 
NFS3PROC_LINK, &arg, &res, 0); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); nfs_post_op_update_inode(dir, &dir_attr); nfs_post_op_update_inode(inode, &fattr); dprintk("NFS reply link: %d\n", status); @@ -531,6 +563,11 @@ nfs3_proc_symlink(struct inode *dir, str .fh = fhandle, .fattr = fattr }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK], + .rpc_argp = &arg, + .rpc_resp = &res, + }; int status; if (path->len > NFS3_MAXPATHLEN) @@ -538,7 +575,7 @@ nfs3_proc_symlink(struct inode *dir, str dprintk("NFS call symlink %s -> %s\n", name->name, path->name); nfs_fattr_init(&dir_attr); nfs_fattr_init(fattr); - status = rpc_call(NFS_CLIENT(dir), NFS3PROC_SYMLINK, &arg, &res, 0); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_post_op_update_inode(dir, &dir_attr); dprintk("NFS reply symlink: %d\n", status); return status; @@ -560,6 +597,11 @@ nfs3_proc_mkdir(struct inode *dir, struc .fh = &fhandle, .fattr = &fattr }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR], + .rpc_argp = &arg, + .rpc_resp = &res, + }; int mode = sattr->ia_mode; int status; @@ -569,7 +611,7 @@ nfs3_proc_mkdir(struct inode *dir, struc nfs_fattr_init(&dir_attr); nfs_fattr_init(&fattr); - status = rpc_call(NFS_CLIENT(dir), NFS3PROC_MKDIR, &arg, &res, 0); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_post_op_update_inode(dir, &dir_attr); if (status != 0) goto out; @@ -591,11 +633,16 @@ nfs3_proc_rmdir(struct inode *dir, struc .name = name->name, .len = name->len }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR], + .rpc_argp = &arg, + .rpc_resp = &dir_attr, + }; int status; dprintk("NFS call rmdir %s\n", name->name); nfs_fattr_init(&dir_attr); - status = rpc_call(NFS_CLIENT(dir), NFS3PROC_RMDIR, &arg, &dir_attr, 0); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_post_op_update_inode(dir, &dir_attr); dprintk("NFS reply rmdir: %d\n", status); return status; @@ -672,6 +719,11 @@ 
nfs3_proc_mknod(struct inode *dir, struc .fh = &fh, .fattr = &fattr }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD], + .rpc_argp = &arg, + .rpc_resp = &res, + }; mode_t mode = sattr->ia_mode; int status; @@ -690,7 +742,7 @@ nfs3_proc_mknod(struct inode *dir, struc nfs_fattr_init(&dir_attr); nfs_fattr_init(&fattr); - status = rpc_call(NFS_CLIENT(dir), NFS3PROC_MKNOD, &arg, &res, 0); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_post_op_update_inode(dir, &dir_attr); if (status != 0) goto out; @@ -707,11 +759,16 @@ static int nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *stat) { + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_FSSTAT], + .rpc_argp = fhandle, + .rpc_resp = stat, + }; int status; dprintk("NFS call fsstat\n"); nfs_fattr_init(stat->fattr); - status = rpc_call(server->client, NFS3PROC_FSSTAT, fhandle, stat, 0); + status = rpc_call_sync(server->client, &msg, 0); dprintk("NFS reply statfs: %d\n", status); return status; } @@ -720,11 +777,16 @@ static int nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_FSINFO], + .rpc_argp = fhandle, + .rpc_resp = info, + }; int status; dprintk("NFS call fsinfo\n"); nfs_fattr_init(info->fattr); - status = rpc_call(server->client_sys, NFS3PROC_FSINFO, fhandle, info, 0); + status = rpc_call_sync(server->client_sys, &msg, 0); dprintk("NFS reply fsinfo: %d\n", status); return status; } @@ -733,40 +795,34 @@ static int nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_pathconf *info) { + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_PATHCONF], + .rpc_argp = fhandle, + .rpc_resp = info, + }; int status; dprintk("NFS call pathconf\n"); nfs_fattr_init(info->fattr); - status = rpc_call(server->client, NFS3PROC_PATHCONF, fhandle, info, 0); + status = 
rpc_call_sync(server->client, &msg, 0); dprintk("NFS reply pathconf: %d\n", status); return status; } extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int); -static void nfs3_read_done(struct rpc_task *task, void *calldata) +static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data) { - struct nfs_read_data *data = calldata; - - if (nfs3_async_handle_jukebox(task)) - return; + if (nfs3_async_handle_jukebox(task, data->inode)) + return -EAGAIN; /* Call back common NFS readpage processing */ if (task->tk_status >= 0) nfs_refresh_inode(data->inode, &data->fattr); - nfs_readpage_result(task, calldata); + return 0; } -static const struct rpc_call_ops nfs3_read_ops = { - .rpc_call_done = nfs3_read_done, - .rpc_release = nfs_readdata_release, -}; - -static void -nfs3_proc_read_setup(struct nfs_read_data *data) +static void nfs3_proc_read_setup(struct nfs_read_data *data) { - struct rpc_task *task = &data->task; - struct inode *inode = data->inode; - int flags; struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_READ], .rpc_argp = &data->args, @@ -774,37 +830,20 @@ nfs3_proc_read_setup(struct nfs_read_dat .rpc_cred = data->cred, }; - /* N.B. Do we need to test? Never called for swapfile inode */ - flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0); - - /* Finalize the task. 
*/ - rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs3_read_ops, data); - rpc_call_setup(task, &msg, 0); + rpc_call_setup(&data->task, &msg, 0); } -static void nfs3_write_done(struct rpc_task *task, void *calldata) +static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data) { - struct nfs_write_data *data = calldata; - - if (nfs3_async_handle_jukebox(task)) - return; + if (nfs3_async_handle_jukebox(task, data->inode)) + return -EAGAIN; if (task->tk_status >= 0) nfs_post_op_update_inode(data->inode, data->res.fattr); - nfs_writeback_done(task, calldata); + return 0; } -static const struct rpc_call_ops nfs3_write_ops = { - .rpc_call_done = nfs3_write_done, - .rpc_release = nfs_writedata_release, -}; - -static void -nfs3_proc_write_setup(struct nfs_write_data *data, int how) +static void nfs3_proc_write_setup(struct nfs_write_data *data, int how) { - struct rpc_task *task = &data->task; - struct inode *inode = data->inode; - int stable; - int flags; struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_WRITE], .rpc_argp = &data->args, @@ -812,45 +851,28 @@ nfs3_proc_write_setup(struct nfs_write_d .rpc_cred = data->cred, }; + data->args.stable = NFS_UNSTABLE; if (how & FLUSH_STABLE) { - if (!NFS_I(inode)->ncommit) - stable = NFS_FILE_SYNC; - else - stable = NFS_DATA_SYNC; - } else - stable = NFS_UNSTABLE; - data->args.stable = stable; - - /* Set the initial flags for the task. */ - flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; + data->args.stable = NFS_FILE_SYNC; + if (NFS_I(data->inode)->ncommit) + data->args.stable = NFS_DATA_SYNC; + } /* Finalize the task. 
*/ - rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs3_write_ops, data); - rpc_call_setup(task, &msg, 0); + rpc_call_setup(&data->task, &msg, 0); } -static void nfs3_commit_done(struct rpc_task *task, void *calldata) +static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data) { - struct nfs_write_data *data = calldata; - - if (nfs3_async_handle_jukebox(task)) - return; + if (nfs3_async_handle_jukebox(task, data->inode)) + return -EAGAIN; if (task->tk_status >= 0) nfs_post_op_update_inode(data->inode, data->res.fattr); - nfs_commit_done(task, calldata); + return 0; } -static const struct rpc_call_ops nfs3_commit_ops = { - .rpc_call_done = nfs3_commit_done, - .rpc_release = nfs_commit_release, -}; - -static void -nfs3_proc_commit_setup(struct nfs_write_data *data, int how) +static void nfs3_proc_commit_setup(struct nfs_write_data *data, int how) { - struct rpc_task *task = &data->task; - struct inode *inode = data->inode; - int flags; struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT], .rpc_argp = &data->args, @@ -858,12 +880,7 @@ nfs3_proc_commit_setup(struct nfs_write_ .rpc_cred = data->cred, }; - /* Set the initial flags for the task. */ - flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; - - /* Finalize the task. 
*/ - rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs3_commit_ops, data); - rpc_call_setup(task, &msg, 0); + rpc_call_setup(&data->task, &msg, 0); } static int @@ -902,8 +919,11 @@ struct nfs_rpc_ops nfs_v3_clientops = { .pathconf = nfs3_proc_pathconf, .decode_dirent = nfs3_decode_dirent, .read_setup = nfs3_proc_read_setup, + .read_done = nfs3_read_done, .write_setup = nfs3_proc_write_setup, + .write_done = nfs3_write_done, .commit_setup = nfs3_proc_commit_setup, + .commit_done = nfs3_commit_done, .file_open = nfs_open, .file_release = nfs_release, .lock = nfs3_proc_lock, diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index b6c0b50..2d8701a 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1109,7 +1109,9 @@ nfs3_xdr_setaclres(struct rpc_rqst *req, .p_encode = (kxdrproc_t) nfs3_xdr_##argtype, \ .p_decode = (kxdrproc_t) nfs3_xdr_##restype, \ .p_bufsiz = MAX(NFS3_##argtype##_sz,NFS3_##restype##_sz) << 2, \ - .p_timer = timer \ + .p_timer = timer, \ + .p_statidx = NFS3PROC_##proc, \ + .p_name = #proc, \ } struct rpc_procinfo nfs3_procedures[] = { @@ -1150,6 +1152,7 @@ static struct rpc_procinfo nfs3_acl_proc .p_decode = (kxdrproc_t) nfs3_xdr_getaclres, .p_bufsiz = MAX(ACL3_getaclargs_sz, ACL3_getaclres_sz) << 2, .p_timer = 1, + .p_name = "GETACL", }, [ACLPROC3_SETACL] = { .p_proc = ACLPROC3_SETACL, @@ -1157,6 +1160,7 @@ static struct rpc_procinfo nfs3_acl_proc .p_decode = (kxdrproc_t) nfs3_xdr_setaclres, .p_bufsiz = MAX(ACL3_setaclargs_sz, ACL3_setaclres_sz) << 2, .p_timer = 0, + .p_name = "SETACL", }, }; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 984ca34..2144129 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -51,6 +51,7 @@ #include "nfs4_fs.h" #include "delegation.h" +#include "iostat.h" #define NFSDBG_FACILITY NFSDBG_PROC @@ -908,7 +909,7 @@ out_put_state_owner: static struct nfs4_state *nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred *cred) { struct nfs4_exception exception = { }; - struct nfs4_state *res; + 
struct nfs4_state *res = ERR_PTR(-EIO); int err; do { @@ -2344,75 +2345,50 @@ static int nfs4_proc_pathconf(struct nfs return err; } -static void nfs4_read_done(struct rpc_task *task, void *calldata) +static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) { - struct nfs_read_data *data = calldata; - struct inode *inode = data->inode; + struct nfs_server *server = NFS_SERVER(data->inode); - if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { + if (nfs4_async_handle_error(task, server) == -EAGAIN) { rpc_restart_call(task); - return; + return -EAGAIN; } if (task->tk_status > 0) - renew_lease(NFS_SERVER(inode), data->timestamp); - /* Call back common NFS readpage processing */ - nfs_readpage_result(task, calldata); + renew_lease(server, data->timestamp); + return 0; } -static const struct rpc_call_ops nfs4_read_ops = { - .rpc_call_done = nfs4_read_done, - .rpc_release = nfs_readdata_release, -}; - -static void -nfs4_proc_read_setup(struct nfs_read_data *data) +static void nfs4_proc_read_setup(struct nfs_read_data *data) { - struct rpc_task *task = &data->task; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ], .rpc_argp = &data->args, .rpc_resp = &data->res, .rpc_cred = data->cred, }; - struct inode *inode = data->inode; - int flags; data->timestamp = jiffies; - /* N.B. Do we need to test? Never called for swapfile inode */ - flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0); - - /* Finalize the task. 
*/ - rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs4_read_ops, data); - rpc_call_setup(task, &msg, 0); + rpc_call_setup(&data->task, &msg, 0); } -static void nfs4_write_done(struct rpc_task *task, void *calldata) +static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) { - struct nfs_write_data *data = calldata; struct inode *inode = data->inode; if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { rpc_restart_call(task); - return; + return -EAGAIN; } if (task->tk_status >= 0) { renew_lease(NFS_SERVER(inode), data->timestamp); nfs_post_op_update_inode(inode, data->res.fattr); } - /* Call back common NFS writeback processing */ - nfs_writeback_done(task, calldata); + return 0; } -static const struct rpc_call_ops nfs4_write_ops = { - .rpc_call_done = nfs4_write_done, - .rpc_release = nfs_writedata_release, -}; - -static void -nfs4_proc_write_setup(struct nfs_write_data *data, int how) +static void nfs4_proc_write_setup(struct nfs_write_data *data, int how) { - struct rpc_task *task = &data->task; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE], .rpc_argp = &data->args, @@ -2422,7 +2398,6 @@ nfs4_proc_write_setup(struct nfs_write_d struct inode *inode = data->inode; struct nfs_server *server = NFS_SERVER(inode); int stable; - int flags; if (how & FLUSH_STABLE) { if (!NFS_I(inode)->ncommit) @@ -2437,57 +2412,37 @@ nfs4_proc_write_setup(struct nfs_write_d data->timestamp = jiffies; - /* Set the initial flags for the task. */ - flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; - /* Finalize the task. 
*/ - rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs4_write_ops, data); - rpc_call_setup(task, &msg, 0); + rpc_call_setup(&data->task, &msg, 0); } -static void nfs4_commit_done(struct rpc_task *task, void *calldata) +static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) { - struct nfs_write_data *data = calldata; struct inode *inode = data->inode; if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { rpc_restart_call(task); - return; + return -EAGAIN; } if (task->tk_status >= 0) nfs_post_op_update_inode(inode, data->res.fattr); - /* Call back common NFS writeback processing */ - nfs_commit_done(task, calldata); + return 0; } -static const struct rpc_call_ops nfs4_commit_ops = { - .rpc_call_done = nfs4_commit_done, - .rpc_release = nfs_commit_release, -}; - -static void -nfs4_proc_commit_setup(struct nfs_write_data *data, int how) +static void nfs4_proc_commit_setup(struct nfs_write_data *data, int how) { - struct rpc_task *task = &data->task; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT], .rpc_argp = &data->args, .rpc_resp = &data->res, .rpc_cred = data->cred, }; - struct inode *inode = data->inode; - struct nfs_server *server = NFS_SERVER(inode); - int flags; + struct nfs_server *server = NFS_SERVER(data->inode); data->args.bitmask = server->attr_bitmask; data->res.server = server; - /* Set the initial flags for the task. */ - flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; - - /* Finalize the task. 
*/ - rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs4_commit_ops, data); - rpc_call_setup(task, &msg, 0); + rpc_call_setup(&data->task, &msg, 0); } /* @@ -2755,8 +2710,10 @@ nfs4_async_handle_error(struct rpc_task rpc_wake_up_task(task); task->tk_status = 0; return -EAGAIN; - case -NFS4ERR_GRACE: case -NFS4ERR_DELAY: + nfs_inc_server_stats((struct nfs_server *) server, + NFSIOS_DELAY); + case -NFS4ERR_GRACE: rpc_delay(task, NFS4_POLL_RETRY_MAX); task->tk_status = 0; return -EAGAIN; @@ -2958,7 +2915,7 @@ static void nfs4_delegreturn_release(voi kfree(calldata); } -const static struct rpc_call_ops nfs4_delegreturn_ops = { +static const struct rpc_call_ops nfs4_delegreturn_ops = { .rpc_call_prepare = nfs4_delegreturn_prepare, .rpc_call_done = nfs4_delegreturn_done, .rpc_release = nfs4_delegreturn_release, @@ -3644,8 +3601,11 @@ struct nfs_rpc_ops nfs_v4_clientops = { .pathconf = nfs4_proc_pathconf, .decode_dirent = nfs4_decode_dirent, .read_setup = nfs4_proc_read_setup, + .read_done = nfs4_read_done, .write_setup = nfs4_proc_write_setup, + .write_done = nfs4_write_done, .commit_setup = nfs4_proc_commit_setup, + .commit_done = nfs4_commit_done, .file_open = nfs_open, .file_release = nfs_release, .lock = nfs4_proc_lock, diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 4bbf5ef..b956753 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4344,6 +4344,8 @@ nfs_stat_to_errno(int stat) .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ .p_bufsiz = MAX(NFS4_##argtype##_sz,NFS4_##restype##_sz) << 2, \ + .p_statidx = NFSPROC4_CLNT_##proc, \ + .p_name = #proc, \ } struct rpc_procinfo nfs4_procedures[] = { diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index d53857b..d6e076c 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -85,6 +85,10 @@ nfs_create_request(struct nfs_open_conte atomic_set(&req->wb_complete, 0); req->wb_index = page->index; page_cache_get(page); + BUG_ON(PagePrivate(page)); + 
BUG_ON(!PageLocked(page)); + BUG_ON(page->mapping->host != inode); + SetPagePrivate(page); req->wb_offset = offset; req->wb_pgbase = offset; req->wb_bytes = count; @@ -147,8 +151,10 @@ void nfs_clear_page_writeback(struct nfs */ void nfs_clear_request(struct nfs_page *req) { - if (req->wb_page) { - page_cache_release(req->wb_page); + struct page *page = req->wb_page; + if (page != NULL) { + ClearPagePrivate(page); + page_cache_release(page); req->wb_page = NULL; } } diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index f5150d7..9dd85ca 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -58,16 +58,23 @@ nfs_proc_get_root(struct nfs_server *ser { struct nfs_fattr *fattr = info->fattr; struct nfs2_fsstat fsinfo; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_GETATTR], + .rpc_argp = fhandle, + .rpc_resp = fattr, + }; int status; dprintk("%s: call getattr\n", __FUNCTION__); nfs_fattr_init(fattr); - status = rpc_call(server->client_sys, NFSPROC_GETATTR, fhandle, fattr, 0); + status = rpc_call_sync(server->client_sys, &msg, 0); dprintk("%s: reply getattr: %d\n", __FUNCTION__, status); if (status) return status; dprintk("%s: call statfs\n", __FUNCTION__); - status = rpc_call(server->client_sys, NFSPROC_STATFS, fhandle, &fsinfo, 0); + msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS]; + msg.rpc_resp = &fsinfo; + status = rpc_call_sync(server->client_sys, &msg, 0); dprintk("%s: reply statfs: %d\n", __FUNCTION__, status); if (status) return status; @@ -90,12 +97,16 @@ static int nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_GETATTR], + .rpc_argp = fhandle, + .rpc_resp = fattr, + }; int status; dprintk("NFS call getattr\n"); nfs_fattr_init(fattr); - status = rpc_call(server->client, NFSPROC_GETATTR, - fhandle, fattr, 0); + status = rpc_call_sync(server->client, &msg, 0); dprintk("NFS reply getattr: %d\n", status); return status; } @@ -109,6 +120,11 @@ 
nfs_proc_setattr(struct dentry *dentry, .fh = NFS_FH(inode), .sattr = sattr }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_SETATTR], + .rpc_argp = &arg, + .rpc_resp = fattr, + }; int status; /* Mask out the non-modebit related stuff from attr->ia_mode */ @@ -116,7 +132,7 @@ nfs_proc_setattr(struct dentry *dentry, dprintk("NFS call setattr\n"); nfs_fattr_init(fattr); - status = rpc_call(NFS_CLIENT(inode), NFSPROC_SETATTR, &arg, fattr, 0); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (status == 0) nfs_setattr_update_inode(inode, sattr); dprintk("NFS reply setattr: %d\n", status); @@ -136,11 +152,16 @@ nfs_proc_lookup(struct inode *dir, struc .fh = fhandle, .fattr = fattr }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_LOOKUP], + .rpc_argp = &arg, + .rpc_resp = &res, + }; int status; dprintk("NFS call lookup %s\n", name->name); nfs_fattr_init(fattr); - status = rpc_call(NFS_CLIENT(dir), NFSPROC_LOOKUP, &arg, &res, 0); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); dprintk("NFS reply lookup: %d\n", status); return status; } @@ -154,10 +175,14 @@ static int nfs_proc_readlink(struct inod .pglen = pglen, .pages = &page }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_READLINK], + .rpc_argp = &args, + }; int status; dprintk("NFS call readlink\n"); - status = rpc_call(NFS_CLIENT(inode), NFSPROC_READLINK, &args, NULL, 0); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); dprintk("NFS reply readlink: %d\n", status); return status; } @@ -233,11 +258,16 @@ nfs_proc_create(struct inode *dir, struc .fh = &fhandle, .fattr = &fattr }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_CREATE], + .rpc_argp = &arg, + .rpc_resp = &res, + }; int status; nfs_fattr_init(&fattr); dprintk("NFS call create %s\n", dentry->d_name.name); - status = rpc_call(NFS_CLIENT(dir), NFSPROC_CREATE, &arg, &res, 0); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); if (status == 0) status = 
nfs_instantiate(dentry, &fhandle, &fattr); dprintk("NFS reply create: %d\n", status); @@ -263,6 +293,11 @@ nfs_proc_mknod(struct inode *dir, struct .fh = &fhandle, .fattr = &fattr }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_CREATE], + .rpc_argp = &arg, + .rpc_resp = &res, + }; int status, mode; dprintk("NFS call mknod %s\n", dentry->d_name.name); @@ -277,13 +312,13 @@ nfs_proc_mknod(struct inode *dir, struct } nfs_fattr_init(&fattr); - status = rpc_call(NFS_CLIENT(dir), NFSPROC_CREATE, &arg, &res, 0); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); if (status == -EINVAL && S_ISFIFO(mode)) { sattr->ia_mode = mode; nfs_fattr_init(&fattr); - status = rpc_call(NFS_CLIENT(dir), NFSPROC_CREATE, &arg, &res, 0); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); } if (status == 0) status = nfs_instantiate(dentry, &fhandle, &fattr); @@ -302,8 +337,6 @@ nfs_proc_remove(struct inode *dir, struc struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_REMOVE], .rpc_argp = &arg, - .rpc_resp = NULL, - .rpc_cred = NULL }; int status; @@ -355,10 +388,14 @@ nfs_proc_rename(struct inode *old_dir, s .toname = new_name->name, .tolen = new_name->len }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_RENAME], + .rpc_argp = &arg, + }; int status; dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); - status = rpc_call(NFS_CLIENT(old_dir), NFSPROC_RENAME, &arg, NULL, 0); + status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); nfs_mark_for_revalidate(old_dir); nfs_mark_for_revalidate(new_dir); dprintk("NFS reply rename: %d\n", status); @@ -374,10 +411,14 @@ nfs_proc_link(struct inode *inode, struc .toname = name->name, .tolen = name->len }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_LINK], + .rpc_argp = &arg, + }; int status; dprintk("NFS call link %s\n", name->name); - status = rpc_call(NFS_CLIENT(inode), NFSPROC_LINK, &arg, NULL, 0); + status = 
rpc_call_sync(NFS_CLIENT(inode), &msg, 0); nfs_mark_for_revalidate(inode); nfs_mark_for_revalidate(dir); dprintk("NFS reply link: %d\n", status); @@ -397,6 +438,10 @@ nfs_proc_symlink(struct inode *dir, stru .tolen = path->len, .sattr = sattr }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_SYMLINK], + .rpc_argp = &arg, + }; int status; if (path->len > NFS2_MAXPATHLEN) @@ -404,7 +449,7 @@ nfs_proc_symlink(struct inode *dir, stru dprintk("NFS call symlink %s -> %s\n", name->name, path->name); nfs_fattr_init(fattr); fhandle->size = 0; - status = rpc_call(NFS_CLIENT(dir), NFSPROC_SYMLINK, &arg, NULL, 0); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); dprintk("NFS reply symlink: %d\n", status); return status; @@ -425,11 +470,16 @@ nfs_proc_mkdir(struct inode *dir, struct .fh = &fhandle, .fattr = &fattr }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_MKDIR], + .rpc_argp = &arg, + .rpc_resp = &res, + }; int status; dprintk("NFS call mkdir %s\n", dentry->d_name.name); nfs_fattr_init(&fattr); - status = rpc_call(NFS_CLIENT(dir), NFSPROC_MKDIR, &arg, &res, 0); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); if (status == 0) status = nfs_instantiate(dentry, &fhandle, &fattr); @@ -445,10 +495,14 @@ nfs_proc_rmdir(struct inode *dir, struct .name = name->name, .len = name->len }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_RMDIR], + .rpc_argp = &arg, + }; int status; dprintk("NFS call rmdir %s\n", name->name); - status = rpc_call(NFS_CLIENT(dir), NFSPROC_RMDIR, &arg, NULL, 0); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); dprintk("NFS reply rmdir: %d\n", status); return status; @@ -470,13 +524,12 @@ nfs_proc_readdir(struct dentry *dentry, .fh = NFS_FH(dir), .cookie = cookie, .count = count, - .pages = &page + .pages = &page, }; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_READDIR], .rpc_argp 
= &arg, - .rpc_resp = NULL, - .rpc_cred = cred + .rpc_cred = cred, }; int status; @@ -495,11 +548,16 @@ nfs_proc_statfs(struct nfs_server *serve struct nfs_fsstat *stat) { struct nfs2_fsstat fsinfo; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_STATFS], + .rpc_argp = fhandle, + .rpc_resp = &fsinfo, + }; int status; dprintk("NFS call statfs\n"); nfs_fattr_init(stat->fattr); - status = rpc_call(server->client, NFSPROC_STATFS, fhandle, &fsinfo, 0); + status = rpc_call_sync(server->client, &msg, 0); dprintk("NFS reply statfs: %d\n", status); if (status) goto out; @@ -518,11 +576,16 @@ nfs_proc_fsinfo(struct nfs_server *serve struct nfs_fsinfo *info) { struct nfs2_fsstat fsinfo; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_STATFS], + .rpc_argp = fhandle, + .rpc_resp = &fsinfo, + }; int status; dprintk("NFS call fsinfo\n"); nfs_fattr_init(info->fattr); - status = rpc_call(server->client, NFSPROC_STATFS, fhandle, &fsinfo, 0); + status = rpc_call_sync(server->client, &msg, 0); dprintk("NFS reply fsinfo: %d\n", status); if (status) goto out; @@ -550,10 +613,8 @@ nfs_proc_pathconf(struct nfs_server *ser extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int); -static void nfs_read_done(struct rpc_task *task, void *calldata) +static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) { - struct nfs_read_data *data = calldata; - if (task->tk_status >= 0) { nfs_refresh_inode(data->inode, data->res.fattr); /* Emulate the eof flag, which isn't normally needed in NFSv2 @@ -562,20 +623,11 @@ static void nfs_read_done(struct rpc_tas if (data->args.offset + data->args.count >= data->res.fattr->size) data->res.eof = 1; } - nfs_readpage_result(task, calldata); + return 0; } -static const struct rpc_call_ops nfs_read_ops = { - .rpc_call_done = nfs_read_done, - .rpc_release = nfs_readdata_release, -}; - -static void -nfs_proc_read_setup(struct nfs_read_data *data) +static void nfs_proc_read_setup(struct nfs_read_data 
*data) { - struct rpc_task *task = &data->task; - struct inode *inode = data->inode; - int flags; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_READ], .rpc_argp = &data->args, @@ -583,34 +635,18 @@ nfs_proc_read_setup(struct nfs_read_data .rpc_cred = data->cred, }; - /* N.B. Do we need to test? Never called for swapfile inode */ - flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0); - - /* Finalize the task. */ - rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs_read_ops, data); - rpc_call_setup(task, &msg, 0); + rpc_call_setup(&data->task, &msg, 0); } -static void nfs_write_done(struct rpc_task *task, void *calldata) +static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) { - struct nfs_write_data *data = calldata; - if (task->tk_status >= 0) nfs_post_op_update_inode(data->inode, data->res.fattr); - nfs_writeback_done(task, calldata); + return 0; } -static const struct rpc_call_ops nfs_write_ops = { - .rpc_call_done = nfs_write_done, - .rpc_release = nfs_writedata_release, -}; - -static void -nfs_proc_write_setup(struct nfs_write_data *data, int how) +static void nfs_proc_write_setup(struct nfs_write_data *data, int how) { - struct rpc_task *task = &data->task; - struct inode *inode = data->inode; - int flags; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_WRITE], .rpc_argp = &data->args, @@ -621,12 +657,8 @@ nfs_proc_write_setup(struct nfs_write_da /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */ data->args.stable = NFS_FILE_SYNC; - /* Set the initial flags for the task. */ - flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; - /* Finalize the task. 
*/ - rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs_write_ops, data); - rpc_call_setup(task, &msg, 0); + rpc_call_setup(&data->task, &msg, 0); } static void @@ -672,7 +704,9 @@ struct nfs_rpc_ops nfs_v2_clientops = { .pathconf = nfs_proc_pathconf, .decode_dirent = nfs_decode_dirent, .read_setup = nfs_proc_read_setup, + .read_done = nfs_read_done, .write_setup = nfs_proc_write_setup, + .write_done = nfs_write_done, .commit_setup = nfs_proc_commit_setup, .file_open = nfs_open, .file_release = nfs_release, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 05eb43f..3961524 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -31,17 +31,49 @@ #include +#include "iostat.h" + #define NFSDBG_FACILITY NFSDBG_PAGECACHE static int nfs_pagein_one(struct list_head *, struct inode *); -static void nfs_readpage_result_partial(struct nfs_read_data *, int); -static void nfs_readpage_result_full(struct nfs_read_data *, int); +static const struct rpc_call_ops nfs_read_partial_ops; +static const struct rpc_call_ops nfs_read_full_ops; static kmem_cache_t *nfs_rdata_cachep; -mempool_t *nfs_rdata_mempool; +static mempool_t *nfs_rdata_mempool; #define MIN_POOL_READ (32) +struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) +{ + struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, SLAB_NOFS); + + if (p) { + memset(p, 0, sizeof(*p)); + INIT_LIST_HEAD(&p->pages); + if (pagecount < NFS_PAGEVEC_SIZE) + p->pagevec = &p->page_array[0]; + else { + size_t size = ++pagecount * sizeof(struct page *); + p->pagevec = kmalloc(size, GFP_NOFS); + if (p->pagevec) { + memset(p->pagevec, 0, size); + } else { + mempool_free(p, nfs_rdata_mempool); + p = NULL; + } + } + } + return p; +} + +void nfs_readdata_free(struct nfs_read_data *p) +{ + if (p && (p->pagevec != &p->page_array[0])) + kfree(p->pagevec); + mempool_free(p, nfs_rdata_mempool); +} + void nfs_readdata_release(void *data) { nfs_readdata_free(data); @@ -133,6 +165,8 @@ static int nfs_readpage_sync(struct nfs_ } count -= result; 
rdata->args.pgbase += result; + nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, result); + /* Note: result == 0 should only happen if we're caching * a write that extends the file and punches a hole. */ @@ -196,9 +230,11 @@ static void nfs_readpage_release(struct * Set up the NFS read request struct */ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, + const struct rpc_call_ops *call_ops, unsigned int count, unsigned int offset) { struct inode *inode; + int flags; data->req = req; data->inode = inode = req->wb_context->dentry->d_inode; @@ -216,6 +252,9 @@ static void nfs_read_rpcsetup(struct nfs data->res.eof = 0; nfs_fattr_init(&data->fattr); + /* Set up the initial task struct. */ + flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0); + rpc_init_task(&data->task, NFS_CLIENT(inode), flags, call_ops, data); NFS_PROTO(inode)->read_setup(data); data->task.tk_cookie = (unsigned long)inode; @@ -303,14 +342,15 @@ static int nfs_pagein_multi(struct list_ list_del_init(&data->pages); data->pagevec[0] = page; - data->complete = nfs_readpage_result_partial; if (nbytes > rsize) { - nfs_read_rpcsetup(req, data, rsize, offset); + nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, + rsize, offset); offset += rsize; nbytes -= rsize; } else { - nfs_read_rpcsetup(req, data, nbytes, offset); + nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, + nbytes, offset); nbytes = 0; } nfs_execute_read(data); @@ -356,8 +396,7 @@ static int nfs_pagein_one(struct list_he } req = nfs_list_entry(data->pages.next); - data->complete = nfs_readpage_result_full; - nfs_read_rpcsetup(req, data, count, 0); + nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0); nfs_execute_read(data); return 0; @@ -391,12 +430,15 @@ nfs_pagein_list(struct list_head *head, /* * Handle a read reply that fills part of a page. 
*/ -static void nfs_readpage_result_partial(struct nfs_read_data *data, int status) +static void nfs_readpage_result_partial(struct rpc_task *task, void *calldata) { + struct nfs_read_data *data = calldata; struct nfs_page *req = data->req; struct page *page = req->wb_page; - if (status >= 0) { + if (nfs_readpage_result(task, data) != 0) + return; + if (task->tk_status >= 0) { unsigned int request = data->args.count; unsigned int result = data->res.count; @@ -415,20 +457,28 @@ static void nfs_readpage_result_partial( } } +static const struct rpc_call_ops nfs_read_partial_ops = { + .rpc_call_done = nfs_readpage_result_partial, + .rpc_release = nfs_readdata_release, +}; + /* * This is the callback from RPC telling us whether a reply was * received or some error occurred (timeout or socket shutdown). */ -static void nfs_readpage_result_full(struct nfs_read_data *data, int status) +static void nfs_readpage_result_full(struct rpc_task *task, void *calldata) { + struct nfs_read_data *data = calldata; unsigned int count = data->res.count; + if (nfs_readpage_result(task, data) != 0) + return; while (!list_empty(&data->pages)) { struct nfs_page *req = nfs_list_entry(data->pages.next); struct page *page = req->wb_page; nfs_list_remove_request(req); - if (status >= 0) { + if (task->tk_status >= 0) { if (count < PAGE_CACHE_SIZE) { if (count < req->wb_bytes) memclear_highpage_flush(page, @@ -444,22 +494,33 @@ static void nfs_readpage_result_full(str } } +static const struct rpc_call_ops nfs_read_full_ops = { + .rpc_call_done = nfs_readpage_result_full, + .rpc_release = nfs_readdata_release, +}; + /* * This is the callback from RPC telling us whether a reply was * received or some error occurred (timeout or socket shutdown). 
*/ -void nfs_readpage_result(struct rpc_task *task, void *calldata) +int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data) { - struct nfs_read_data *data = calldata; struct nfs_readargs *argp = &data->args; struct nfs_readres *resp = &data->res; - int status = task->tk_status; + int status; dprintk("NFS: %4d nfs_readpage_result, (status %d)\n", - task->tk_pid, status); + task->tk_pid, task->tk_status); + + status = NFS_PROTO(data->inode)->read_done(task, data); + if (status != 0) + return status; + + nfs_add_stats(data->inode, NFSIOS_SERVERREADBYTES, resp->count); /* Is this a short read? */ if (task->tk_status >= 0 && resp->count < argp->count && !resp->eof) { + nfs_inc_stats(data->inode, NFSIOS_SHORTREAD); /* Has the server at least made some progress? */ if (resp->count != 0) { /* Yes, so retry the read at the end of the data */ @@ -467,14 +528,14 @@ void nfs_readpage_result(struct rpc_task argp->pgbase += resp->count; argp->count -= resp->count; rpc_restart_call(task); - return; + return -EAGAIN; } task->tk_status = -EIO; } spin_lock(&data->inode->i_lock); NFS_I(data->inode)->cache_validity |= NFS_INO_INVALID_ATIME; spin_unlock(&data->inode->i_lock); - data->complete(data, status); + return 0; } /* @@ -491,6 +552,9 @@ int nfs_readpage(struct file *file, stru dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", page, PAGE_CACHE_SIZE, page->index); + nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); + nfs_add_stats(inode, NFSIOS_READPAGES, 1); + /* * Try to flush any pending writes to the file.. 
* @@ -570,6 +634,7 @@ int nfs_readpages(struct file *filp, str inode->i_sb->s_id, (long long)NFS_FILEID(inode), nr_pages); + nfs_inc_stats(inode, NFSIOS_VFSREADPAGES); if (filp == NULL) { desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ); @@ -582,6 +647,8 @@ int nfs_readpages(struct file *filp, str if (!list_empty(&head)) { int err = nfs_pagein_list(&head, server->rpages); - if (!ret) + if (!ret) { /* keep the stats update and the assignment under one guard */ + nfs_add_stats(inode, NFSIOS_READPAGES, err); ret = err; + } } put_nfs_open_context(desc.ctx); diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index a65c7b5..0e28189 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -163,10 +163,9 @@ nfs_async_unlink(struct dentry *dentry) struct rpc_clnt *clnt = NFS_CLIENT(dir->d_inode); int status = -ENOMEM; - data = kmalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) goto out; - memset(data, 0, sizeof(*data)); data->cred = rpcauth_lookupcred(clnt->cl_auth, 0); if (IS_ERR(data->cred)) { diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 9449b68..647e321 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -63,6 +63,7 @@ #include #include "delegation.h" +#include "iostat.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE @@ -76,20 +77,21 @@ static struct nfs_page * nfs_update_requ struct inode *, struct page *, unsigned int, unsigned int); -static void nfs_writeback_done_partial(struct nfs_write_data *, int); -static void nfs_writeback_done_full(struct nfs_write_data *, int); static int nfs_wait_on_write_congestion(struct address_space *, int); static int nfs_wait_on_requests(struct inode *, unsigned long, unsigned int); static int nfs_flush_inode(struct inode *inode, unsigned long idx_start, unsigned int npages, int how); +static const struct rpc_call_ops nfs_write_partial_ops; +static const struct rpc_call_ops nfs_write_full_ops; +static const struct rpc_call_ops nfs_commit_ops; static kmem_cache_t *nfs_wdata_cachep; -mempool_t *nfs_wdata_mempool; +static mempool_t *nfs_wdata_mempool; static mempool_t
*nfs_commit_mempool; static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion); -static inline struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount) +struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount) { struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, SLAB_NOFS); @@ -100,11 +102,39 @@ static inline struct nfs_write_data *nfs p->pagevec = &p->page_array[0]; else { size_t size = ++pagecount * sizeof(struct page *); + p->pagevec = kzalloc(size, GFP_NOFS); + if (!p->pagevec) { + mempool_free(p, nfs_commit_mempool); + p = NULL; + } + } + } + return p; +} + +void nfs_commit_free(struct nfs_write_data *p) +{ + if (p && (p->pagevec != &p->page_array[0])) + kfree(p->pagevec); + mempool_free(p, nfs_commit_mempool); +} + +struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) +{ + struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, SLAB_NOFS); + + if (p) { + memset(p, 0, sizeof(*p)); + INIT_LIST_HEAD(&p->pages); + if (pagecount < NFS_PAGEVEC_SIZE) + p->pagevec = &p->page_array[0]; + else { + size_t size = ++pagecount * sizeof(struct page *); p->pagevec = kmalloc(size, GFP_NOFS); if (p->pagevec) { memset(p->pagevec, 0, size); } else { - mempool_free(p, nfs_commit_mempool); + mempool_free(p, nfs_wdata_mempool); p = NULL; } } @@ -112,11 +142,11 @@ static inline struct nfs_write_data *nfs return p; } -static inline void nfs_commit_free(struct nfs_write_data *p) +void nfs_writedata_free(struct nfs_write_data *p) { if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); - mempool_free(p, nfs_commit_mempool); + mempool_free(p, nfs_wdata_mempool); } void nfs_writedata_release(void *wdata) @@ -136,6 +166,7 @@ static void nfs_grow_file(struct page *p end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count); if (i_size >= end) return; + nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); i_size_write(inode, end); } @@ -225,6 +256,7 @@ static int nfs_writepage_sync(struct nfs wdata->args.pgbase += result; written += result; 
count -= result; + nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, result); } while (count); /* Update file length */ nfs_grow_file(page, offset, written); @@ -281,6 +313,9 @@ int nfs_writepage(struct page *page, str int priority = wb_priority(wbc); int err; + nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); + nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); + /* * Note: We need to ensure that we have a reference to the inode * if we are to do asynchronous writes. If not, waiting @@ -345,6 +380,8 @@ int nfs_writepages(struct address_space struct inode *inode = mapping->host; int err; + nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); + err = generic_writepages(mapping, wbc); if (err) return err; @@ -356,6 +393,7 @@ int nfs_writepages(struct address_space err = nfs_flush_inode(inode, 0, 0, wb_priority(wbc)); if (err < 0) goto out; + nfs_add_stats(inode, NFSIOS_WRITEPAGES, err); wbc->nr_to_write -= err; if (!wbc->nonblocking && wbc->sync_mode == WB_SYNC_ALL) { err = nfs_wait_on_requests(inode, 0, 0); @@ -598,6 +636,9 @@ static int nfs_wait_on_write_congestion( if (!bdi_write_congested(bdi)) return 0; + + nfs_inc_stats(mapping->host, NFSIOS_CONGESTIONWAIT); + if (intr) { struct rpc_clnt *clnt = NFS_CLIENT(mapping->host); sigset_t oldset; @@ -653,8 +694,11 @@ static struct nfs_page * nfs_update_requ spin_unlock(&nfsi->req_lock); error = nfs_wait_on_request(req); nfs_release_request(req); - if (error < 0) + if (error < 0) { + if (new) + nfs_release_request(new); return ERR_PTR(error); + } continue; } spin_unlock(&nfsi->req_lock); @@ -748,6 +792,8 @@ int nfs_updatepage(struct file *file, st struct nfs_page *req; int status = 0; + nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); + dprintk("NFS: nfs_updatepage(%s/%s %d@%Ld)\n", file->f_dentry->d_parent->d_name.name, file->f_dentry->d_name.name, count, @@ -857,10 +903,12 @@ static inline int flush_task_priority(in */ static void nfs_write_rpcsetup(struct nfs_page *req, struct nfs_write_data *data, + const struct rpc_call_ops *call_ops, unsigned 
int count, unsigned int offset, int how) { struct inode *inode; + int flags; /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ @@ -881,6 +929,9 @@ static void nfs_write_rpcsetup(struct nf data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); + /* Set up the initial task struct. */ + flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; + rpc_init_task(&data->task, NFS_CLIENT(inode), flags, call_ops, data); NFS_PROTO(inode)->write_setup(data, how); data->task.tk_priority = flush_task_priority(how); @@ -944,14 +995,15 @@ static int nfs_flush_multi(struct list_h list_del_init(&data->pages); data->pagevec[0] = page; - data->complete = nfs_writeback_done_partial; if (nbytes > wsize) { - nfs_write_rpcsetup(req, data, wsize, offset, how); + nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, + wsize, offset, how); offset += wsize; nbytes -= wsize; } else { - nfs_write_rpcsetup(req, data, nbytes, offset, how); + nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, + nbytes, offset, how); nbytes = 0; } nfs_execute_write(data); @@ -1005,9 +1057,8 @@ static int nfs_flush_one(struct list_hea } req = nfs_list_entry(data->pages.next); - data->complete = nfs_writeback_done_full; /* Set up the argument struct */ - nfs_write_rpcsetup(req, data, count, 0, how); + nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how); nfs_execute_write(data); return 0; @@ -1051,8 +1102,9 @@ nfs_flush_list(struct list_head *head, i /* * Handle a write reply that flushed part of a page. 
*/ -static void nfs_writeback_done_partial(struct nfs_write_data *data, int status) +static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) { + struct nfs_write_data *data = calldata; struct nfs_page *req = data->req; struct page *page = req->wb_page; @@ -1062,11 +1114,14 @@ static void nfs_writeback_done_partial(s req->wb_bytes, (long long)req_offset(req)); - if (status < 0) { + if (nfs_writeback_done(task, data) != 0) + return; + + if (task->tk_status < 0) { ClearPageUptodate(page); SetPageError(page); - req->wb_context->error = status; - dprintk(", error = %d\n", status); + req->wb_context->error = task->tk_status; + dprintk(", error = %d\n", task->tk_status); } else { #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) if (data->verf.committed < NFS_FILE_SYNC) { @@ -1087,6 +1142,11 @@ static void nfs_writeback_done_partial(s nfs_writepage_release(req); } +static const struct rpc_call_ops nfs_write_partial_ops = { + .rpc_call_done = nfs_writeback_done_partial, + .rpc_release = nfs_writedata_release, +}; + /* * Handle a write reply that flushes a whole page. * @@ -1094,11 +1154,15 @@ static void nfs_writeback_done_partial(s * writebacks since the page->count is kept > 1 for as long * as the page has a write request pending. */ -static void nfs_writeback_done_full(struct nfs_write_data *data, int status) +static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) { + struct nfs_write_data *data = calldata; struct nfs_page *req; struct page *page; + if (nfs_writeback_done(task, data) != 0) + return; + /* Update attributes as result of writeback. 
*/ while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); @@ -1111,13 +1175,13 @@ static void nfs_writeback_done_full(stru req->wb_bytes, (long long)req_offset(req)); - if (status < 0) { + if (task->tk_status < 0) { ClearPageUptodate(page); SetPageError(page); - req->wb_context->error = status; + req->wb_context->error = task->tk_status; end_page_writeback(page); nfs_inode_remove_request(req); - dprintk(", error = %d\n", status); + dprintk(", error = %d\n", task->tk_status); goto next; } end_page_writeback(page); @@ -1139,18 +1203,30 @@ static void nfs_writeback_done_full(stru } } +static const struct rpc_call_ops nfs_write_full_ops = { + .rpc_call_done = nfs_writeback_done_full, + .rpc_release = nfs_writedata_release, +}; + + /* * This function is called when the WRITE call is complete. */ -void nfs_writeback_done(struct rpc_task *task, void *calldata) +int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) { - struct nfs_write_data *data = calldata; struct nfs_writeargs *argp = &data->args; struct nfs_writeres *resp = &data->res; + int status; dprintk("NFS: %4d nfs_writeback_done (status %d)\n", task->tk_pid, task->tk_status); + /* Call the NFS version-specific code */ + status = NFS_PROTO(data->inode)->write_done(task, data); + if (status != 0) + return status; + nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) if (resp->verf->committed < argp->stable && task->tk_status >= 0) { /* We tried a write call, but the server did not @@ -1176,6 +1252,8 @@ void nfs_writeback_done(struct rpc_task if (task->tk_status >= 0 && resp->count < argp->count) { static unsigned long complain; + nfs_inc_stats(data->inode, NFSIOS_SHORTWRITE); + /* Has the server at least made some progress? */ if (resp->count != 0) { /* Was this an NFSv2 write or an NFSv3 stable write? 
*/ @@ -1191,7 +1269,7 @@ void nfs_writeback_done(struct rpc_task argp->stable = NFS_FILE_SYNC; } rpc_restart_call(task); - return; + return -EAGAIN; } if (time_before(complain, jiffies)) { printk(KERN_WARNING @@ -1202,11 +1280,7 @@ void nfs_writeback_done(struct rpc_task /* Can't do anything about it except throw an error. */ task->tk_status = -EIO; } - - /* - * Process the nfs_page list - */ - data->complete(data, task->tk_status); + return 0; } @@ -1220,10 +1294,12 @@ void nfs_commit_release(void *wdata) * Set up the argument/result storage required for the RPC call. */ static void nfs_commit_rpcsetup(struct list_head *head, - struct nfs_write_data *data, int how) + struct nfs_write_data *data, + int how) { struct nfs_page *first; struct inode *inode; + int flags; /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ @@ -1243,7 +1319,10 @@ static void nfs_commit_rpcsetup(struct l data->res.fattr = &data->fattr; data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); - + + /* Set up the initial task struct. */ + flags = (how & FLUSH_SYNC) ? 
0 : RPC_TASK_ASYNC; + rpc_init_task(&data->task, NFS_CLIENT(inode), flags, &nfs_commit_ops, data); NFS_PROTO(inode)->commit_setup(data, how); data->task.tk_priority = flush_task_priority(how); @@ -1284,7 +1363,7 @@ nfs_commit_list(struct inode *inode, str /* * COMMIT call returned */ -void nfs_commit_done(struct rpc_task *task, void *calldata) +static void nfs_commit_done(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; struct nfs_page *req; @@ -1293,6 +1372,10 @@ void nfs_commit_done(struct rpc_task *ta dprintk("NFS: %4d nfs_commit_done (status %d)\n", task->tk_pid, task->tk_status); + /* Call the NFS version-specific code */ + if (NFS_PROTO(data->inode)->commit_done(task, data) != 0) + return; + while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); @@ -1326,6 +1409,11 @@ void nfs_commit_done(struct rpc_task *ta } sub_page_state(nr_unstable,res); } + +static const struct rpc_call_ops nfs_commit_ops = { + .rpc_call_done = nfs_commit_done, + .rpc_release = nfs_commit_release, +}; #endif static int nfs_flush_inode(struct inode *inode, unsigned long idx_start, diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index d828662..4f391cb 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -326,6 +326,8 @@ out: .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ .p_bufsiz = MAX(NFS4_##argtype##_sz,NFS4_##restype##_sz) << 2, \ + .p_statidx = NFSPROC4_CB_##call, \ + .p_name = #proc, \ } static struct rpc_procinfo nfs4_cb_procedures[] = { diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1143cfb..f6ab762 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2639,7 +2639,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struc struct nfs4_stateid *lock_stp; struct file *filp; struct file_lock file_lock; - struct file_lock *conflock; + struct file_lock conflock; int status = 0; unsigned int strhashval; @@ -2775,11 
+2775,11 @@ conflicting_lock: /* XXX There is a race here. Future patch needed to provide * an atomic posix_lock_and_test_file */ - if (!(conflock = posix_test_lock(filp, &file_lock))) { + if (!posix_test_lock(filp, &file_lock, &conflock)) { status = nfserr_serverfault; goto out; } - nfs4_set_lock_denied(conflock, &lock->lk_denied); + nfs4_set_lock_denied(&conflock, &lock->lk_denied); out: if (status && lock->lk_is_new && lock_sop) release_stateowner(lock_sop); @@ -2800,7 +2800,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, stru struct inode *inode; struct file file; struct file_lock file_lock; - struct file_lock *conflicting_lock; + struct file_lock conflock; int status; if (nfs4_in_grace()) @@ -2864,10 +2864,9 @@ nfsd4_lockt(struct svc_rqst *rqstp, stru file.f_dentry = current_fh->fh_dentry; status = nfs_ok; - conflicting_lock = posix_test_lock(&file, &file_lock); - if (conflicting_lock) { + if (posix_test_lock(&file, &file_lock, &conflock)) { status = nfserr_denied; - nfs4_set_lock_denied(conflicting_lock, &lockt->lt_denied); + nfs4_set_lock_denied(&conflock, &lockt->lt_denied); } out: nfs4_unlock_state(); diff --git a/fs/proc/base.c b/fs/proc/base.c index 20feb75..8f1f49c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -104,6 +104,7 @@ enum pid_directory_inos { PROC_TGID_MAPS, PROC_TGID_NUMA_MAPS, PROC_TGID_MOUNTS, + PROC_TGID_MOUNTSTATS, PROC_TGID_WCHAN, #ifdef CONFIG_MMU PROC_TGID_SMAPS, @@ -144,6 +145,7 @@ enum pid_directory_inos { PROC_TID_MAPS, PROC_TID_NUMA_MAPS, PROC_TID_MOUNTS, + PROC_TID_MOUNTSTATS, PROC_TID_WCHAN, #ifdef CONFIG_MMU PROC_TID_SMAPS, @@ -201,6 +203,7 @@ static struct pid_entry tgid_base_stuff[ E(PROC_TGID_ROOT, "root", S_IFLNK|S_IRWXUGO), E(PROC_TGID_EXE, "exe", S_IFLNK|S_IRWXUGO), E(PROC_TGID_MOUNTS, "mounts", S_IFREG|S_IRUGO), + E(PROC_TGID_MOUNTSTATS, "mountstats", S_IFREG|S_IRUSR), #ifdef CONFIG_MMU E(PROC_TGID_SMAPS, "smaps", S_IFREG|S_IRUGO), #endif @@ -732,6 +735,38 @@ static struct file_operations proc_mount .poll = mounts_poll, }; 
+extern struct seq_operations mountstats_op; +static int mountstats_open(struct inode *inode, struct file *file) +{ + struct task_struct *task = proc_task(inode); + int ret = seq_open(file, &mountstats_op); + + if (!ret) { + struct seq_file *m = file->private_data; + struct namespace *namespace; + task_lock(task); + namespace = task->namespace; + if (namespace) + get_namespace(namespace); + task_unlock(task); + + if (namespace) + m->private = namespace; + else { + seq_release(inode, file); + ret = -EINVAL; + } + } + return ret; +} + +static struct file_operations proc_mountstats_operations = { + .open = mountstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = mounts_release, +}; + #define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack for overruns */ static ssize_t proc_info_read(struct file * file, char __user * buf, @@ -1730,6 +1765,10 @@ static struct dentry *proc_pident_lookup inode->i_fop = &proc_smaps_operations; break; #endif + case PROC_TID_MOUNTSTATS: + case PROC_TGID_MOUNTSTATS: + inode->i_fop = &proc_mountstats_operations; + break; #ifdef CONFIG_SECURITY case PROC_TID_ATTR: inode->i_nlink = 2; diff --git a/include/linux/fs.h b/include/linux/fs.h index e059da9..4652e42 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -666,7 +666,6 @@ extern spinlock_t files_lock; #define FL_POSIX 1 #define FL_FLOCK 2 #define FL_ACCESS 8 /* not trying to lock, just looking */ -#define FL_LOCKD 16 /* lock held by rpc.lockd */ #define FL_LEASE 32 /* lease held on this file */ #define FL_SLEEP 128 /* A blocking lock */ @@ -730,8 +729,6 @@ struct file_lock { #define OFFT_OFFSET_MAX INT_LIMIT(off_t) #endif -extern struct list_head file_lock_list; - #include extern int fcntl_getlk(struct file *, struct flock __user *); @@ -753,10 +750,9 @@ extern void locks_init_lock(struct file_ extern void locks_copy_lock(struct file_lock *, struct file_lock *); extern void locks_remove_posix(struct file *, fl_owner_t); extern void 
locks_remove_flock(struct file *); -extern struct file_lock *posix_test_lock(struct file *, struct file_lock *); +extern int posix_test_lock(struct file *, struct file_lock *, struct file_lock *); extern int posix_lock_file(struct file *, struct file_lock *); extern int posix_lock_file_wait(struct file *, struct file_lock *); -extern void posix_block_lock(struct file_lock *, struct file_lock *); extern int posix_unblock_lock(struct file *, struct file_lock *); extern int posix_locks_deadlock(struct file_lock *, struct file_lock *); extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl); @@ -1085,6 +1081,7 @@ struct super_operations { void (*umount_begin) (struct super_block *); int (*show_options)(struct seq_file *, struct vfsmount *); + int (*show_stats)(struct seq_file *, struct vfsmount *); ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index ef21ed2..a04137d 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,8 @@ struct nlm_host { unsigned long h_expires; /* eligible for GC */ struct list_head h_lockowners; /* Lockowners for the client */ spinlock_t h_lock; + struct list_head h_granted; /* Locks in GRANTED state */ + struct list_head h_reclaim; /* Locks in RECLAIM state */ }; /* @@ -83,9 +86,9 @@ struct nlm_rqst { struct nlm_host * a_host; /* host handle */ struct nlm_args a_args; /* arguments */ struct nlm_res a_res; /* result */ - struct nlm_wait * a_block; + struct nlm_block * a_block; unsigned int a_retries; /* Retry count */ - char a_owner[NLMCLNT_OHSIZE]; + u8 a_owner[NLMCLNT_OHSIZE]; }; /* @@ -110,16 +113,16 @@ struct nlm_file { */ #define NLM_NEVER (~(unsigned long) 0) struct nlm_block { + struct kref b_count; /* Reference count */ struct 
nlm_block * b_next; /* linked list (all blocks) */ struct nlm_block * b_fnext; /* linked list (per file) */ - struct nlm_rqst b_call; /* RPC args & callback info */ + struct nlm_rqst * b_call; /* RPC args & callback info */ struct svc_serv * b_daemon; /* NLM service */ struct nlm_host * b_host; /* host handle for RPC clnt */ unsigned long b_when; /* next re-xmit */ unsigned int b_id; /* block id */ unsigned char b_queued; /* re-queued */ unsigned char b_granted; /* VFS granted lock */ - unsigned char b_incall; /* doing callback */ unsigned char b_done; /* callback complete */ struct nlm_file * b_file; /* file in question */ }; @@ -145,15 +148,16 @@ extern unsigned long nlmsvc_timeout; /* * Lockd client functions */ -struct nlm_rqst * nlmclnt_alloc_call(void); -int nlmclnt_prepare_block(struct nlm_rqst *req, struct nlm_host *host, struct file_lock *fl); -void nlmclnt_finish_block(struct nlm_rqst *req); -long nlmclnt_block(struct nlm_rqst *req, long timeout); +struct nlm_rqst * nlm_alloc_call(struct nlm_host *host); +void nlm_release_call(struct nlm_rqst *); +int nlm_async_call(struct nlm_rqst *, u32, const struct rpc_call_ops *); +int nlm_async_reply(struct nlm_rqst *, u32, const struct rpc_call_ops *); +struct nlm_wait * nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *fl); +void nlmclnt_finish_block(struct nlm_wait *block); +int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout); u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *); void nlmclnt_recovery(struct nlm_host *, u32); int nlmclnt_reclaim(struct nlm_host *, struct file_lock *); -int nlmclnt_setgrantargs(struct nlm_rqst *, struct nlm_lock *); -void nlmclnt_freegrantargs(struct nlm_rqst *); /* * Host cache @@ -172,7 +176,6 @@ extern struct nlm_host *nlm_find_client( /* * Server-side lock handling */ -int nlmsvc_async_call(struct nlm_rqst *, u32, const struct rpc_call_ops *); u32 nlmsvc_lock(struct svc_rqst *, struct nlm_file *, struct nlm_lock *, 
int, struct nlm_cookie *); u32 nlmsvc_unlock(struct nlm_file *, struct nlm_lock *); diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h index d7a5cc4..bb0a0f1 100644 --- a/include/linux/lockd/xdr.h +++ b/include/linux/lockd/xdr.h @@ -28,6 +28,7 @@ struct nlm_lock { int len; /* length of "caller" */ struct nfs_fh fh; struct xdr_netobj oh; + u32 svid; struct file_lock fl; }; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index b4dc6e2..7f36fe9 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -118,8 +118,7 @@ struct nfs_inode { unsigned long cache_validity; /* bit mask */ /* - * read_cache_jiffies is when we started read-caching this inode, - * and read_cache_mtime is the mtime of the inode at that time. + * read_cache_jiffies is when we started read-caching this inode. * attrtimeo is for how long the cached information is assumed * to be valid. A successful attribute revalidation doubles * attrtimeo (up to acregmax/acdirmax), a failure resets it to @@ -128,11 +127,6 @@ struct nfs_inode { * We need to revalidate the cached attrs for this inode if * * jiffies - read_cache_jiffies > attrtimeo - * - * and invalidate any cached data/flush out any dirty pages if - * we find that - * - * mtime != read_cache_mtime */ unsigned long read_cache_jiffies; unsigned long attrtimeo; @@ -415,12 +409,12 @@ extern int nfs_writepage(struct page *p extern int nfs_writepages(struct address_space *, struct writeback_control *); extern int nfs_flush_incompatible(struct file *file, struct page *page); extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); -extern void nfs_writeback_done(struct rpc_task *task, void *data); -extern void nfs_writedata_release(void *data); +extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); +extern void nfs_writedata_release(void *); #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -extern void nfs_commit_done(struct rpc_task *, void *data); -extern void 
nfs_commit_release(void *data); +struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount); +void nfs_commit_free(struct nfs_write_data *p); #endif /* @@ -430,6 +424,7 @@ extern void nfs_commit_release(void *dat extern int nfs_sync_inode(struct inode *, unsigned long, unsigned int, int); #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) extern int nfs_commit_inode(struct inode *, int); +extern void nfs_commit_release(void *wdata); #else static inline int nfs_commit_inode(struct inode *inode, int how) @@ -469,37 +464,8 @@ static inline int nfs_wb_page(struct ino /* * Allocate and free nfs_write_data structures */ -extern mempool_t *nfs_wdata_mempool; - -static inline struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) -{ - struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, SLAB_NOFS); - - if (p) { - memset(p, 0, sizeof(*p)); - INIT_LIST_HEAD(&p->pages); - if (pagecount < NFS_PAGEVEC_SIZE) - p->pagevec = &p->page_array[0]; - else { - size_t size = ++pagecount * sizeof(struct page *); - p->pagevec = kmalloc(size, GFP_NOFS); - if (p->pagevec) { - memset(p->pagevec, 0, size); - } else { - mempool_free(p, nfs_wdata_mempool); - p = NULL; - } - } - } - return p; -} - -static inline void nfs_writedata_free(struct nfs_write_data *p) -{ - if (p && (p->pagevec != &p->page_array[0])) - kfree(p->pagevec); - mempool_free(p, nfs_wdata_mempool); -} +extern struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount); +extern void nfs_writedata_free(struct nfs_write_data *p); /* * linux/fs/nfs/read.c @@ -507,44 +473,14 @@ static inline void nfs_writedata_free(st extern int nfs_readpage(struct file *, struct page *); extern int nfs_readpages(struct file *, struct address_space *, struct list_head *, unsigned); -extern void nfs_readpage_result(struct rpc_task *, void *); -extern void nfs_readdata_release(void *data); - +extern int nfs_readpage_result(struct rpc_task *, struct nfs_read_data *); +extern void nfs_readdata_release(void *data); /* * 
Allocate and free nfs_read_data structures */ -extern mempool_t *nfs_rdata_mempool; - -static inline struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) -{ - struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, SLAB_NOFS); - - if (p) { - memset(p, 0, sizeof(*p)); - INIT_LIST_HEAD(&p->pages); - if (pagecount < NFS_PAGEVEC_SIZE) - p->pagevec = &p->page_array[0]; - else { - size_t size = ++pagecount * sizeof(struct page *); - p->pagevec = kmalloc(size, GFP_NOFS); - if (p->pagevec) { - memset(p->pagevec, 0, size); - } else { - mempool_free(p, nfs_rdata_mempool); - p = NULL; - } - } - } - return p; -} - -static inline void nfs_readdata_free(struct nfs_read_data *p) -{ - if (p && (p->pagevec != &p->page_array[0])) - kfree(p->pagevec); - mempool_free(p, nfs_rdata_mempool); -} +extern struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount); +extern void nfs_readdata_free(struct nfs_read_data *p); /* * linux/fs/nfs3proc.c diff --git a/include/linux/nfs_fs_i.h b/include/linux/nfs_fs_i.h index e2c18da..8617302 100644 --- a/include/linux/nfs_fs_i.h +++ b/include/linux/nfs_fs_i.h @@ -12,8 +12,8 @@ struct nlm_lockowner; */ struct nfs_lock_info { u32 state; - u32 flags; struct nlm_lockowner *owner; + struct list_head list; }; struct nfs4_lock_state; @@ -21,10 +21,4 @@ struct nfs4_lock_info { struct nfs4_lock_state *owner; }; -/* - * Lock flag values - */ -#define NFS_LCK_GRANTED 0x0001 /* lock has been granted */ -#define NFS_LCK_RECLAIM 0x0002 /* lock marked for reclaiming */ - #endif diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 3d3a305..65dec21 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -4,6 +4,8 @@ #include #include +struct nfs_iostats; + /* * NFS client parameters stored in the superblock. 
*/ @@ -12,6 +14,7 @@ struct nfs_server { struct rpc_clnt * client_sys; /* 2nd handle for FSINFO */ struct rpc_clnt * client_acl; /* ACL RPC client handle */ struct nfs_rpc_ops * rpc_ops; /* NFS protocol vector */ + struct nfs_iostats * io_stats; /* I/O statistics */ struct backing_dev_info backing_dev_info; int flags; /* various flags */ unsigned int caps; /* server capabilities */ @@ -26,10 +29,13 @@ struct nfs_server { unsigned int acregmax; unsigned int acdirmin; unsigned int acdirmax; + unsigned long retrans_timeo; /* retransmit timeout */ + unsigned int retrans_count; /* number of retransmit tries */ unsigned int namelen; char * hostname; /* remote hostname */ struct nfs_fh fh; struct sockaddr_in addr; + unsigned long mount_time; /* when this fs was mounted */ #ifdef CONFIG_NFS_V4 /* Our own IP address, as a null-terminated string. * This is used to generate the clientid, and the callback address. diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 6d6f69e..7fafc4c 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -695,7 +695,6 @@ struct nfs_read_data { #ifdef CONFIG_NFS_V4 unsigned long timestamp; /* For lease renewal */ #endif - void (*complete) (struct nfs_read_data *, int); struct page *page_array[NFS_PAGEVEC_SIZE + 1]; }; @@ -714,7 +713,6 @@ struct nfs_write_data { #ifdef CONFIG_NFS_V4 unsigned long timestamp; /* For lease renewal */ #endif - void (*complete) (struct nfs_write_data *, int); struct page *page_array[NFS_PAGEVEC_SIZE + 1]; }; @@ -769,8 +767,11 @@ struct nfs_rpc_ops { struct nfs_pathconf *); u32 * (*decode_dirent)(u32 *, struct nfs_entry *, int plus); void (*read_setup) (struct nfs_read_data *); + int (*read_done) (struct rpc_task *, struct nfs_read_data *); void (*write_setup) (struct nfs_write_data *, int how); + int (*write_done) (struct rpc_task *, struct nfs_write_data *); void (*commit_setup) (struct nfs_write_data *, int how); + int (*commit_done) (struct rpc_task *, struct nfs_write_data *); int 
(*file_open) (struct inode *, struct file *); int (*file_release) (struct inode *, struct file *); int (*lock)(struct file *, int, struct file_lock *); diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index f147e6b..e37c061 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -45,7 +45,8 @@ struct rpc_clnt { char * cl_server; /* server machine name */ char * cl_protname; /* protocol name */ struct rpc_auth * cl_auth; /* authenticator */ - struct rpc_stat * cl_stats; /* statistics */ + struct rpc_stat * cl_stats; /* per-program statistics */ + struct rpc_iostats * cl_metrics; /* per-client statistics */ unsigned int cl_softrtry : 1,/* soft timeouts */ cl_intr : 1,/* interruptible */ @@ -100,6 +101,8 @@ struct rpc_procinfo { unsigned int p_bufsiz; /* req. buffer size */ unsigned int p_count; /* call count */ unsigned int p_timer; /* Which RTT timer to use */ + u32 p_statidx; /* Which procedure to account */ + char * p_name; /* name of procedure */ }; #define RPC_CONGESTED(clnt) (RPCXPRT_CONGESTED((clnt)->cl_xprt)) @@ -137,20 +140,6 @@ size_t rpc_max_payload(struct rpc_clnt void rpc_force_rebind(struct rpc_clnt *); int rpc_ping(struct rpc_clnt *clnt, int flags); -static __inline__ -int rpc_call(struct rpc_clnt *clnt, u32 proc, void *argp, void *resp, int flags) -{ - struct rpc_message msg = { - .rpc_proc = &clnt->cl_procinfo[proc], - .rpc_argp = argp, - .rpc_resp = resp, - .rpc_cred = NULL - }; - return rpc_call_sync(clnt, &msg, flags); -} - -extern void rpciod_wake_up(void); - /* * Helper function for NFSroot support */ diff --git a/include/linux/sunrpc/metrics.h b/include/linux/sunrpc/metrics.h new file mode 100644 index 0000000..8f96e9d --- /dev/null +++ b/include/linux/sunrpc/metrics.h @@ -0,0 +1,77 @@ +/* + * linux/include/linux/sunrpc/metrics.h + * + * Declarations for RPC client per-operation metrics + * + * Copyright (C) 2005 Chuck Lever + * + * RPC client per-operation statistics provide latency and retry + * 
information about each type of RPC procedure in a given RPC program. + * These statistics are not for detailed problem diagnosis, but simply + * to indicate whether the problem is local or remote. + * + * These counters are not meant to be human-readable, but are meant to be + * integrated into system monitoring tools such as "sar" and "iostat". As + * such, the counters are sampled by the tools over time, and are never + * zeroed after a file system is mounted. Moving averages can be computed + * by the tools by taking the difference between two instantaneous samples + * and dividing that by the time between the samples. + * + * The counters are maintained in a single array per RPC client, indexed + * by procedure number. There is no need to maintain separate counter + * arrays per-CPU because these counters are always modified behind locks. + */ + +#ifndef _LINUX_SUNRPC_METRICS_H +#define _LINUX_SUNRPC_METRICS_H + +#include + +#define RPC_IOSTATS_VERS "1.0" + +struct rpc_iostats { + /* + * These counters give an idea about how many request + * transmissions are required, on average, to complete that + * particular procedure. Some procedures may require more + * than one transmission because the server is unresponsive, + * the client is retransmitting too aggressively, or the + * requests are large and the network is congested. + */ + unsigned long om_ops, /* count of operations */ + om_ntrans, /* count of RPC transmissions */ + om_timeouts; /* count of major timeouts */ + + /* + * These count how many bytes are sent and received for a + * given RPC procedure type. This indicates how much load a + * particular procedure is putting on the network. These + * counts include the RPC and ULP headers, and the request + * payload. 
+ */ + unsigned long long om_bytes_sent, /* count of bytes out */ + om_bytes_recv; /* count of bytes in */ + + /* + * The length of time an RPC request waits in queue before + * transmission, the network + server latency of the request, + * and the total time the request spent from init to release + * are measured. + */ + unsigned long long om_queue, /* jiffies queued for xmit */ + om_rtt, /* jiffies for RPC RTT */ + om_execute; /* jiffies for RPC execution */ +} ____cacheline_aligned; + +struct rpc_task; +struct rpc_clnt; + +/* + * EXPORTed functions for managing rpc_iostats structures + */ +struct rpc_iostats * rpc_alloc_iostats(struct rpc_clnt *); +void rpc_count_iostats(struct rpc_task *); +void rpc_print_iostats(struct seq_file *, struct rpc_clnt *); +void rpc_free_iostats(struct rpc_iostats *); + +#endif /* _LINUX_SUNRPC_METRICS_H */ diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 8b25629..82a91bb 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -86,6 +86,12 @@ struct rpc_task { struct work_struct tk_work; /* Async task work queue */ struct rpc_wait tk_wait; /* RPC wait */ } u; + + unsigned short tk_timeouts; /* maj timeouts */ + size_t tk_bytes_sent; /* total bytes sent */ + unsigned long tk_start; /* RPC task init timestamp */ + long tk_rtt; /* round-trip time (jiffies) */ + #ifdef RPC_DEBUG unsigned short tk_pid; /* debugging aid */ #endif @@ -203,6 +209,7 @@ struct rpc_wait_queue { unsigned char priority; /* current priority */ unsigned char count; /* # task groups remaining serviced so far */ unsigned char nr; /* # tasks remaining for cookie */ + unsigned short qlen; /* total # tasks waiting in queue */ #ifdef RPC_DEBUG const char * name; #endif @@ -269,13 +276,13 @@ void * rpc_malloc(struct rpc_task *, si void rpc_free(struct rpc_task *); int rpciod_up(void); void rpciod_down(void); -void rpciod_wake_up(void); int __rpc_wait_for_completion_task(struct rpc_task *task, int (*)(void *)); #ifdef 
RPC_DEBUG void rpc_show_tasks(void); #endif int rpc_init_mempool(void); void rpc_destroy_mempool(void); +extern struct workqueue_struct *rpciod_workqueue; static inline void rpc_exit(struct rpc_task *task, int status) { diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 6ef99b1..7eebbab 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -114,6 +114,7 @@ struct rpc_xprt_ops { void (*release_request)(struct rpc_task *task); void (*close)(struct rpc_xprt *xprt); void (*destroy)(struct rpc_xprt *xprt); + void (*print_stats)(struct rpc_xprt *xprt, struct seq_file *seq); }; struct rpc_xprt { @@ -187,6 +188,18 @@ struct rpc_xprt { struct list_head recv; + struct { + unsigned long bind_count, /* total number of binds */ + connect_count, /* total number of connects */ + connect_start, /* connect start timestamp */ + connect_time, /* jiffies waiting for connect */ + sends, /* how many requests fully transmitted */ + recvs, /* how many requests completed by a reply */ + bad_xids; /* lookup_rqst didn't find XID */ + + unsigned long long req_u, /* average requests on the wire */ + bklog_u; /* backlog queue utilization */ + } stat; void (*old_data_ready)(struct sock *, int); void (*old_state_change)(struct sock *); diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 8d6f1a1..55163af 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -64,14 +64,26 @@ rpcauth_create(rpc_authflavor_t pseudofl struct rpc_authops *ops; u32 flavor = pseudoflavor_to_flavor(pseudoflavor); - if (flavor >= RPC_AUTH_MAXFLAVOR || !(ops = auth_flavors[flavor])) - return ERR_PTR(-EINVAL); + auth = ERR_PTR(-EINVAL); + if (flavor >= RPC_AUTH_MAXFLAVOR) + goto out; + + /* FIXME - auth_flavors[] really needs an rw lock, + * and module refcounting. 
*/ +#ifdef CONFIG_KMOD + if ((ops = auth_flavors[flavor]) == NULL) + request_module("rpc-auth-%u", flavor); +#endif + if ((ops = auth_flavors[flavor]) == NULL) + goto out; auth = ops->create(clnt, pseudoflavor); if (IS_ERR(auth)) return auth; if (clnt->cl_auth) rpcauth_destroy(clnt->cl_auth); clnt->cl_auth = auth; + +out: return auth; } diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index bb46efd..900ef31 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -721,6 +721,8 @@ gss_destroy(struct rpc_auth *auth) gss_auth = container_of(auth, struct gss_auth, rpc_auth); rpc_unlink(gss_auth->path); + dput(gss_auth->dentry); + gss_auth->dentry = NULL; gss_mech_put(gss_auth->mech); rpcauth_free_credcache(auth); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index d2f0550..9d56fc8 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -28,12 +28,11 @@ #include #include #include +#include #include -#include #include - -#include +#include #define RPC_SLACK_SPACE (1024) /* total overkill */ @@ -147,6 +146,7 @@ rpc_new_client(struct rpc_xprt *xprt, ch clnt->cl_vers = version->number; clnt->cl_prot = xprt->prot; clnt->cl_stats = program->stats; + clnt->cl_metrics = rpc_alloc_iostats(clnt); rpc_init_wait_queue(&clnt->cl_pmap_default.pm_bindwait, "bindwait"); if (!clnt->cl_port) @@ -239,10 +239,12 @@ rpc_clone_client(struct rpc_clnt *clnt) new->cl_autobind = 0; new->cl_oneshot = 0; new->cl_dead = 0; + dget(new->cl_dentry); rpc_init_rtt(&new->cl_rtt_default, clnt->cl_xprt->timeout.to_initval); if (new->cl_auth) atomic_inc(&new->cl_auth->au_count); new->cl_pmap = &new->cl_pmap_default; + new->cl_metrics = rpc_alloc_iostats(clnt); rpc_init_wait_queue(&new->cl_pmap_default.pm_bindwait, "bindwait"); return new; out_no_clnt: @@ -313,6 +315,10 @@ rpc_destroy_client(struct rpc_clnt *clnt if (clnt->cl_server != clnt->cl_inline_name) kfree(clnt->cl_server); out_free: + rpc_free_iostats(clnt->cl_metrics); + clnt->cl_metrics 
= NULL; + if (clnt->cl_dentry) + dput(clnt->cl_dentry); kfree(clnt); return 0; } @@ -992,6 +998,8 @@ call_timeout(struct rpc_task *task) } dprintk("RPC: %4d call_timeout (major)\n", task->tk_pid); + task->tk_timeouts++; + if (RPC_IS_SOFT(task)) { printk(KERN_NOTICE "%s: server %s not responding, timed out\n", clnt->cl_protname, clnt->cl_server); @@ -1193,8 +1201,8 @@ call_verify(struct rpc_task *task) task->tk_action = call_bind; goto out_retry; case RPC_AUTH_TOOWEAK: - printk(KERN_NOTICE "call_verify: server requires stronger " - "authentication.\n"); + printk(KERN_NOTICE "call_verify: server %s requires stronger " + "authentication.\n", task->tk_client->cl_server); break; default: printk(KERN_WARNING "call_verify: unknown auth error: %x\n", n); diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c index 8139ce6..d25b054 100644 --- a/net/sunrpc/pmap_clnt.c +++ b/net/sunrpc/pmap_clnt.c @@ -82,6 +82,7 @@ rpc_getport(struct rpc_task *task, struc rpc_call_setup(child, &msg, 0); /* ... 
and run the child task */ + task->tk_xprt->stat.bind_count++; rpc_run_child(task, child, pmap_getport_done); return; @@ -103,6 +104,11 @@ rpc_getport_external(struct sockaddr_in .pm_prot = prot, .pm_port = 0 }; + struct rpc_message msg = { + .rpc_proc = &pmap_procedures[PMAP_GETPORT], + .rpc_argp = &map, + .rpc_resp = &map.pm_port, + }; struct rpc_clnt *pmap_clnt; char hostname[32]; int status; @@ -116,7 +122,7 @@ rpc_getport_external(struct sockaddr_in return PTR_ERR(pmap_clnt); /* Setup the call info struct */ - status = rpc_call(pmap_clnt, PMAP_GETPORT, &map, &map.pm_port, 0); + status = rpc_call_sync(pmap_clnt, &msg, 0); if (status >= 0) { if (map.pm_port != 0) @@ -161,16 +167,27 @@ pmap_getport_done(struct rpc_task *task) int rpc_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) { - struct sockaddr_in sin; - struct rpc_portmap map; + struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_LOOPBACK), + }; + struct rpc_portmap map = { + .pm_prog = prog, + .pm_vers = vers, + .pm_prot = prot, + .pm_port = port, + }; + struct rpc_message msg = { + .rpc_proc = &pmap_procedures[port ? PMAP_SET : PMAP_UNSET], + .rpc_argp = &map, + .rpc_resp = okay, + }; struct rpc_clnt *pmap_clnt; int error = 0; dprintk("RPC: registering (%d, %d, %d, %d) with portmapper.\n", prog, vers, prot, port); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); pmap_clnt = pmap_create("localhost", &sin, IPPROTO_UDP, 1); if (IS_ERR(pmap_clnt)) { error = PTR_ERR(pmap_clnt); @@ -178,13 +195,7 @@ rpc_register(u32 prog, u32 vers, int pro return error; } - map.pm_prog = prog; - map.pm_vers = vers; - map.pm_prot = prot; - map.pm_port = port; - - error = rpc_call(pmap_clnt, port? 
PMAP_SET : PMAP_UNSET, - &map, okay, 0); + error = rpc_call_sync(pmap_clnt, &msg, 0); if (error < 0) { printk(KERN_WARNING @@ -260,6 +271,8 @@ static struct rpc_procinfo pmap_procedur .p_decode = (kxdrproc_t) xdr_decode_bool, .p_bufsiz = 4, .p_count = 1, + .p_statidx = PMAP_SET, + .p_name = "SET", }, [PMAP_UNSET] = { .p_proc = PMAP_UNSET, @@ -267,6 +280,8 @@ static struct rpc_procinfo pmap_procedur .p_decode = (kxdrproc_t) xdr_decode_bool, .p_bufsiz = 4, .p_count = 1, + .p_statidx = PMAP_UNSET, + .p_name = "UNSET", }, [PMAP_GETPORT] = { .p_proc = PMAP_GETPORT, @@ -274,6 +289,8 @@ static struct rpc_procinfo pmap_procedur .p_decode = (kxdrproc_t) xdr_decode_port, .p_bufsiz = 4, .p_count = 1, + .p_statidx = PMAP_GETPORT, + .p_name = "GETPORT", }, }; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index a5c0c7b..72b2217 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -91,7 +91,8 @@ rpc_queue_upcall(struct inode *inode, st res = 0; } else if (rpci->flags & RPC_PIPE_WAIT_FOR_OPEN) { if (list_empty(&rpci->pipe)) - schedule_delayed_work(&rpci->queue_timeout, + queue_delayed_work(rpciod_workqueue, + &rpci->queue_timeout, RPC_UPCALL_TIMEOUT); list_add_tail(&msg->list, &rpci->pipe); rpci->pipelen += msg->len; @@ -132,7 +133,7 @@ rpc_close_pipes(struct inode *inode) if (ops->release_pipe) ops->release_pipe(inode); cancel_delayed_work(&rpci->queue_timeout); - flush_scheduled_work(); + flush_workqueue(rpciod_workqueue); } rpc_inode_setowner(inode, NULL); mutex_unlock(&inode->i_mutex); @@ -668,7 +669,7 @@ rpc_mkdir(char *path, struct rpc_clnt *r out: mutex_unlock(&dir->i_mutex); rpc_release_path(&nd); - return dentry; + return dget(dentry); err_depopulate: rpc_depopulate(dentry); __rpc_rmdir(dir, dentry); @@ -732,7 +733,7 @@ rpc_mkpipe(char *path, void *private, st out: mutex_unlock(&dir->i_mutex); rpc_release_path(&nd); - return dentry; + return dget(dentry); err_dput: dput(dentry); dentry = ERR_PTR(-ENOMEM); diff --git a/net/sunrpc/sched.c 
b/net/sunrpc/sched.c index 802d4fe..a04cf3b 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -64,7 +64,7 @@ static LIST_HEAD(all_tasks); */ static DECLARE_MUTEX(rpciod_sema); static unsigned int rpciod_users; -static struct workqueue_struct *rpciod_workqueue; +struct workqueue_struct *rpciod_workqueue; /* * Spinlock for other critical sections of code. @@ -181,6 +181,7 @@ static void __rpc_add_wait_queue(struct else list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]); task->u.tk_wait.rpc_waitq = queue; + queue->qlen++; rpc_set_queued(task); dprintk("RPC: %4d added to queue %p \"%s\"\n", @@ -215,6 +216,7 @@ static void __rpc_remove_wait_queue(stru __rpc_remove_wait_queue_priority(task); else list_del(&task->u.tk_wait.list); + queue->qlen--; dprintk("RPC: %4d removed from queue %p \"%s\"\n", task->tk_pid, queue, rpc_qname(queue)); } @@ -818,6 +820,9 @@ void rpc_init_task(struct rpc_task *task BUG_ON(task->tk_ops == NULL); + /* starting timestamp */ + task->tk_start = jiffies; + dprintk("RPC: %4d new task procpid %d\n", task->tk_pid, current->pid); } diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index 4979f22..790941e 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -21,6 +21,7 @@ #include #include #include +#include #define RPCDBG_FACILITY RPCDBG_MISC @@ -106,6 +107,120 @@ void svc_seq_show(struct seq_file *seq, } } +/** + * rpc_alloc_iostats - allocate an rpc_iostats structure + * @clnt: RPC program, version, and xprt + * + */ +struct rpc_iostats *rpc_alloc_iostats(struct rpc_clnt *clnt) +{ + unsigned int ops = clnt->cl_maxproc; + size_t size = ops * sizeof(struct rpc_iostats); + struct rpc_iostats *new; + + new = kmalloc(size, GFP_KERNEL); + if (new) + memset(new, 0 , size); + return new; +} +EXPORT_SYMBOL(rpc_alloc_iostats); + +/** + * rpc_free_iostats - release an rpc_iostats structure + * @stats: doomed rpc_iostats structure + * + */ +void rpc_free_iostats(struct rpc_iostats *stats) +{ + kfree(stats); +} 
+EXPORT_SYMBOL(rpc_free_iostats); + +/** + * rpc_count_iostats - tally up per-task stats + * @task: completed rpc_task + * + * Relies on the caller for serialization. + */ +void rpc_count_iostats(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_iostats *stats = task->tk_client->cl_metrics; + struct rpc_iostats *op_metrics; + long rtt, execute, queue; + + if (!stats || !req) + return; + op_metrics = &stats[task->tk_msg.rpc_proc->p_statidx]; + + op_metrics->om_ops++; + op_metrics->om_ntrans += req->rq_ntrans; + op_metrics->om_timeouts += task->tk_timeouts; + + op_metrics->om_bytes_sent += task->tk_bytes_sent; + op_metrics->om_bytes_recv += req->rq_received; + + queue = (long)req->rq_xtime - task->tk_start; + if (queue < 0) + queue = -queue; + op_metrics->om_queue += queue; + + rtt = task->tk_rtt; + if (rtt < 0) + rtt = -rtt; + op_metrics->om_rtt += rtt; + + execute = (long)jiffies - task->tk_start; + if (execute < 0) + execute = -execute; + op_metrics->om_execute += execute; +} + +void _print_name(struct seq_file *seq, unsigned int op, struct rpc_procinfo *procs) +{ + if (procs[op].p_name) + seq_printf(seq, "\t%12s: ", procs[op].p_name); + else if (op == 0) + seq_printf(seq, "\t NULL: "); + else + seq_printf(seq, "\t%12u: ", op); +} + +#define MILLISECS_PER_JIFFY (1000 / HZ) + +void rpc_print_iostats(struct seq_file *seq, struct rpc_clnt *clnt) +{ + struct rpc_iostats *stats = clnt->cl_metrics; + struct rpc_xprt *xprt = clnt->cl_xprt; + unsigned int op, maxproc = clnt->cl_maxproc; + + if (!stats) + return; + + seq_printf(seq, "\tRPC iostats version: %s ", RPC_IOSTATS_VERS); + seq_printf(seq, "p/v: %u/%u (%s)\n", + clnt->cl_prog, clnt->cl_vers, clnt->cl_protname); + + if (xprt) + xprt->ops->print_stats(xprt, seq); + + seq_printf(seq, "\tper-op statistics\n"); + for (op = 0; op < maxproc; op++) { + struct rpc_iostats *metrics = &stats[op]; + _print_name(seq, op, clnt->cl_procinfo); + seq_printf(seq, "%lu %lu %lu %Lu %Lu %Lu %Lu %Lu\n", + 
metrics->om_ops, + metrics->om_ntrans, + metrics->om_timeouts, + metrics->om_bytes_sent, + metrics->om_bytes_recv, + metrics->om_queue * MILLISECS_PER_JIFFY, + metrics->om_rtt * MILLISECS_PER_JIFFY, + metrics->om_execute * MILLISECS_PER_JIFFY); + } +} +EXPORT_SYMBOL(rpc_print_iostats); + /* * Register/unregister RPC proc files */ diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 8ff2c8a..940dba9 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -44,13 +44,13 @@ #include #include +#include /* * Local variables */ #ifdef RPC_DEBUG -# undef RPC_DEBUG_DATA # define RPCDBG_FACILITY RPCDBG_XPRT #endif @@ -548,6 +548,7 @@ void xprt_connect(struct rpc_task *task) task->tk_timeout = xprt->connect_timeout; rpc_sleep_on(&xprt->pending, task, xprt_connect_status, NULL); + xprt->stat.connect_start = jiffies; xprt->ops->connect(task); } return; @@ -558,6 +559,8 @@ static void xprt_connect_status(struct r struct rpc_xprt *xprt = task->tk_xprt; if (task->tk_status >= 0) { + xprt->stat.connect_count++; + xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; dprintk("RPC: %4d xprt_connect_status: connection established\n", task->tk_pid); return; @@ -601,16 +604,14 @@ static void xprt_connect_status(struct r struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid) { struct list_head *pos; - struct rpc_rqst *req = NULL; list_for_each(pos, &xprt->recv) { struct rpc_rqst *entry = list_entry(pos, struct rpc_rqst, rq_list); - if (entry->rq_xid == xid) { - req = entry; - break; - } + if (entry->rq_xid == xid) + return entry; } - return req; + xprt->stat.bad_xids++; + return NULL; } /** @@ -646,6 +647,9 @@ void xprt_complete_rqst(struct rpc_task dprintk("RPC: %5u xid %08x complete (%d bytes received)\n", task->tk_pid, ntohl(req->rq_xid), copied); + task->tk_xprt->stat.recvs++; + task->tk_rtt = (long)jiffies - req->rq_xtime; + list_del_init(&req->rq_list); req->rq_received = req->rq_private_buf.len = copied; rpc_wake_up_task(task); @@ -744,12 +748,19 @@ 
void xprt_transmit(struct rpc_task *task if (status == 0) { dprintk("RPC: %4d xmit complete\n", task->tk_pid); spin_lock_bh(&xprt->transport_lock); + xprt->ops->set_retrans_timeout(task); + + xprt->stat.sends++; + xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs; + xprt->stat.bklog_u += xprt->backlog.qlen; + /* Don't race with disconnect */ if (!xprt_connected(xprt)) task->tk_status = -ENOTCONN; else if (!req->rq_received) rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer); + xprt->ops->release_xprt(xprt, task); spin_unlock_bh(&xprt->transport_lock); return; @@ -848,6 +859,7 @@ void xprt_release(struct rpc_task *task) if (!(req = task->tk_rqstp)) return; + rpc_count_iostats(task); spin_lock_bh(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); if (xprt->ops->release_request) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c458f8d..4b4e7df 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -382,6 +382,7 @@ static int xs_tcp_send_request(struct rp /* If we've sent the entire packet, immediately * reset the count of bytes sent. */ req->rq_bytes_sent += status; + task->tk_bytes_sent += status; if (likely(req->rq_bytes_sent >= req->rq_slen)) { req->rq_bytes_sent = 0; return 0; @@ -1114,6 +1115,8 @@ static void xs_tcp_connect_worker(void * } /* Tell the socket layer to start connecting... 
*/ + xprt->stat.connect_count++; + xprt->stat.connect_start = jiffies; status = sock->ops->connect(sock, (struct sockaddr *) &xprt->addr, sizeof(xprt->addr), O_NONBLOCK); dprintk("RPC: %p connect status %d connected %d sock state %d\n", @@ -1177,6 +1180,50 @@ static void xs_connect(struct rpc_task * } } +/** + * xs_udp_print_stats - display UDP socket-specific stats + * @xprt: rpc_xprt struct containing statistics + * @seq: output file + * + */ +static void xs_udp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) +{ + seq_printf(seq, "\txprt:\tudp %u %lu %lu %lu %lu %Lu %Lu\n", + xprt->port, + xprt->stat.bind_count, + xprt->stat.sends, + xprt->stat.recvs, + xprt->stat.bad_xids, + xprt->stat.req_u, + xprt->stat.bklog_u); +} + +/** + * xs_tcp_print_stats - display TCP socket-specific stats + * @xprt: rpc_xprt struct containing statistics + * @seq: output file + * + */ +static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) +{ + long idle_time = 0; + + if (xprt_connected(xprt)) + idle_time = (long)(jiffies - xprt->last_used) / HZ; + + seq_printf(seq, "\txprt:\ttcp %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu\n", + xprt->port, + xprt->stat.bind_count, + xprt->stat.connect_count, + xprt->stat.connect_time, + idle_time, + xprt->stat.sends, + xprt->stat.recvs, + xprt->stat.bad_xids, + xprt->stat.req_u, + xprt->stat.bklog_u); +} + static struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_udp_set_buffer_size, .reserve_xprt = xprt_reserve_xprt_cong, @@ -1191,6 +1238,7 @@ static struct rpc_xprt_ops xs_udp_ops = .release_request = xprt_release_rqst_cong, .close = xs_close, .destroy = xs_destroy, + .print_stats = xs_udp_print_stats, }; static struct rpc_xprt_ops xs_tcp_ops = { @@ -1204,6 +1252,7 @@ static struct rpc_xprt_ops xs_tcp_ops = .set_retrans_timeout = xprt_set_retrans_timeout_def, .close = xs_close, .destroy = xs_destroy, + .print_stats = xs_tcp_print_stats, }; /**