From: Andrew Morton The present per-task IO accounting isn't very useful. It simply counts the number of bytes passed into read() and write(). So if a process reads 1MB from an already-cached file, it is accused of having performed 1MB of I/O, which is hopelessly wrong. This patchset adds new accounting which tries to be more accurate. We account for three things: reads: attempt to count the number of bytes which this process really did cause to be fetched from the storage layer writes: attempt to count the number of bytes which this process caused to be sent to the storage layer. The big inaccuracy here is truncate. If a process writes 1MB to a file and then deletes the file, it will in fact perform no writeout. But it will have been accounted as having caused 1MB of write. So... cancelled_writes: account the number of bytes which this process caused to not happen, by truncating pagecache. We _could_ just subtract this from the process's `write' accounting. But that means that some processes would be reported to have done negative amounts of write IO, which is silly. So we just report the raw number and punt this decision up to userspace. This patch: Wire up the kernel-private data structures and the accessor functions to manipulate them. Cc: Jay Lan Cc: Shailabh Nagar Cc: Balbir Singh Cc: Chris Sturtivant Cc: Tony Ernst Cc: Guillaume Thouvenin Signed-off-by: Andrew Morton --- include/linux/sched.h | 2 include/linux/task_io_accounting.h | 37 ++++++++++++++++++ include/linux/task_io_accounting_ops.h | 47 +++++++++++++++++++++++ init/Kconfig | 9 ++++ kernel/fork.c | 2 5 files changed, 97 insertions(+) diff -puN include/linux/sched.h~io-accounting-core-statistics include/linux/sched.h --- a/include/linux/sched.h~io-accounting-core-statistics +++ a/include/linux/sched.h @@ -82,6 +82,7 @@ struct sched_param { #include #include #include +#include #include @@ -1004,6 +1005,7 @@ struct task_struct { wait_queue_t *io_wait; /* i/o counters(bytes read/written, #syscalls */ u64 rchar, wchar, syscr, syscw; + struct task_io_accounting ioac; #if defined(CONFIG_TASK_XACCT) u64 acct_rss_mem1; /* accumulated rss usage */ u64 acct_vm_mem1; /* accumulated virtual memory usage */ diff -puN /dev/null include/linux/task_io_accounting.h --- /dev/null +++ a/include/linux/task_io_accounting.h @@ -0,0 +1,37 @@ +/* + * task_io_accounting: a structure which is used for recording a single task's + * IO statistics. + * + * Don't include this header file directly - it is designed to be dragged in via + * sched.h. + * + * Blame akpm@osdl.org for all this. + */ + +#ifdef CONFIG_TASK_IO_ACCOUNTING +struct task_io_accounting { + /* + * The number of bytes which this task has caused to be read from + * storage. + */ + u64 read_bytes; + + /* + * The number of bytes which this task has caused, or shall cause to be + * written to disk. + */ + u64 write_bytes; + + /* + * A task can cause "negative" IO too. If this task truncates some + * dirty pagecache, some IO which another task has been accounted for + * (in its write_bytes) will not be happening. We _could_ just + * subtract that from the truncating task's write_bytes, but there is + * information loss in doing that. + */ + u64 cancelled_write_bytes; +}; +#else +struct task_io_accounting { +}; +#endif diff -puN /dev/null include/linux/task_io_accounting_ops.h --- /dev/null +++ a/include/linux/task_io_accounting_ops.h @@ -0,0 +1,47 @@ +/* + * Task I/O accounting operations + */ +#ifndef __TASK_IO_ACCOUNTING_OPS_INCLUDED +#define __TASK_IO_ACCOUNTING_OPS_INCLUDED + +#ifdef CONFIG_TASK_IO_ACCOUNTING +static inline void task_io_account_read(u32 bytes) +{ + current->ioac.read_bytes += bytes; +} + +static inline void task_io_account_write(u32 bytes) +{ + current->ioac.write_bytes += bytes; +} + +static inline void task_io_account_cancelled_write(u32 bytes) +{ + current->ioac.cancelled_write_bytes += bytes; +} + +static inline void task_io_accounting_init(struct task_struct *tsk) +{ + memset(&tsk->ioac, 0, sizeof(tsk->ioac)); +} + +#else + +static inline void task_io_account_read(u32 bytes) +{ +} + +static inline void task_io_account_write(u32 bytes) +{ +} + +static inline void task_io_account_cancelled_write(u32 bytes) +{ +} + +static inline void task_io_accounting_init(struct task_struct *tsk) +{ +} + +#endif /* CONFIG_TASK_IO_ACCOUNTING */ +#endif /* __TASK_IO_ACCOUNTING_OPS_INCLUDED */ diff -puN init/Kconfig~io-accounting-core-statistics init/Kconfig --- a/init/Kconfig~io-accounting-core-statistics +++ a/init/Kconfig @@ -304,6 +304,15 @@ config TASK_XACCT Say N if unsure. +config TASK_IO_ACCOUNTING + bool "Enable per-task storage I/O accounting (EXPERIMENTAL)" + depends on TASK_XACCT + help + Collect information on the number of bytes of storage I/O which this + task has caused. + + Say N if unsure. + config SYSCTL bool diff -puN kernel/fork.c~io-accounting-core-statistics kernel/fork.c --- a/kernel/fork.c~io-accounting-core-statistics +++ a/kernel/fork.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -1055,6 +1056,7 @@ static struct task_struct *copy_process( p->wchar = 0; /* I/O counter: bytes written */ p->syscr = 0; /* I/O counter: read syscalls */ p->syscw = 0; /* I/O counter: write syscalls */ + task_io_accounting_init(p); acct_clear_integrals(p); p->it_virt_expires = cputime_zero; _