From: Shailabh Nagar Unlike earlier iterations of the delay accounting patches, delays are now collected only for the actual I/O waits rather than trying to cover the delays seen in I/O submission paths. Account separately for block I/O delays incurred as a result of swapin page faults whose frequency can be affected by the task/process' rss limit. Hence swapin delays can act as feedback for rss limit changes independent of I/O priority changes. Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jes Sorensen Cc: Peter Chubb Cc: Erich Focht Cc: Levent Serinol Cc: Jay Lan DESC Add comments on units for the delay fields EDESC From: Balbir Singh On Mon, May 08, 2006 at 02:19:52PM -0700, Andrew Morton wrote: > Balbir Singh wrote: > > > > @@ -550,6 +550,12 @@ struct task_delay_info { > > * Atomicity of updates to XXX_delay, XXX_count protected by > > * single lock above (split into XXX_lock if contention is an issue). > > */ > > + > > + struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */ > > + u64 blkio_delay; /* wait for sync block io completion */ > > + u64 swapin_delay; /* wait for swapin block io completion */ > > + u32 blkio_count; > > + u32 swapin_count; > > These fields are a bit mystifying. > > In what units are blkio_delay and swapin_delay? > > What is the meaning behind blkio_count and swapin_count? > > Better comments needed, please. Hi, Andrew, Here is an update that adds comments to the fields, as suggested in the review comments. Balbir Singh, Linux Technology Center, IBM Software Labs Changelog 1. Add comments to the task_delay_info structure, documenting the units of delay and the meaning of the count fields in the structure. 
Signed-off-by: Balbir Singh Signed-off-by: Shailabh Nagar Cc: Jes Sorensen Cc: Peter Chubb Cc: Erich Focht Cc: Levent Serinol Cc: Jay Lan Signed-off-by: Andrew Morton --- include/linux/delayacct.h | 25 +++++++++++++++++++++++++ include/linux/sched.h | 13 +++++++++++++ kernel/delayacct.c | 19 +++++++++++++++++++ kernel/sched.c | 5 +++++ mm/memory.c | 4 ++++ 5 files changed, 66 insertions(+) diff -puN include/linux/delayacct.h~per-task-delay-accounting-sync-block-i-o-and-swapin-delay-collection include/linux/delayacct.h --- a/include/linux/delayacct.h~per-task-delay-accounting-sync-block-i-o-and-swapin-delay-collection +++ a/include/linux/delayacct.h @@ -19,6 +19,13 @@ #include +/* + * Per-task flags relevant to delay accounting + * maintained privately to avoid exhausting similar flags in sched.h:PF_* + * Used to set current->delays->flags + */ +#define DELAYACCT_PF_SWAPIN 0x00000001 /* I am doing a swapin */ + #ifdef CONFIG_TASK_DELAY_ACCT extern int delayacct_on; /* Delay accounting turned on/off */ @@ -26,6 +33,8 @@ extern kmem_cache_t *delayacct_cache; extern void delayacct_init(void); extern void __delayacct_tsk_init(struct task_struct *); extern void __delayacct_tsk_exit(struct task_struct *); +extern void __delayacct_blkio_start(void); +extern void __delayacct_blkio_end(void); static inline void delayacct_set_flag(int flag) { @@ -53,6 +62,18 @@ static inline void delayacct_tsk_exit(st __delayacct_tsk_exit(tsk); } +static inline void delayacct_blkio_start(void) +{ + if (current->delays) + __delayacct_blkio_start(); +} + +static inline void delayacct_blkio_end(void) +{ + if (current->delays) + __delayacct_blkio_end(); +} + #else static inline void delayacct_set_flag(int flag) {} @@ -64,6 +85,10 @@ static inline void delayacct_tsk_init(st {} static inline void delayacct_tsk_exit(struct task_struct *tsk) {} +static inline void delayacct_blkio_start(void) +{} +static inline void delayacct_blkio_end(void) +{} #endif /* CONFIG_TASK_DELAY_ACCT */ #endif diff -puN 
include/linux/sched.h~per-task-delay-accounting-sync-block-i-o-and-swapin-delay-collection include/linux/sched.h --- a/include/linux/sched.h~per-task-delay-accounting-sync-block-i-o-and-swapin-delay-collection +++ a/include/linux/sched.h @@ -566,6 +566,19 @@ struct task_delay_info { * Atomicity of updates to XXX_delay, XXX_count protected by * single lock above (split into XXX_lock if contention is an issue). */ + + /* + * XXX_count is incremented on every XXX operation, the delay + * associated with the operation is added to XXX_delay. + * XXX_delay contains the accumulated delay time in nanoseconds. + */ + struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */ + u64 blkio_delay; /* wait for sync block io completion */ + u64 swapin_delay; /* wait for swapin block io completion */ + u32 blkio_count; /* total count of the number of sync block */ + /* io operations performed */ + u32 swapin_count; /* total count of the number of swapin block */ + /* io operations performed */ }; #endif diff -puN kernel/delayacct.c~per-task-delay-accounting-sync-block-i-o-and-swapin-delay-collection kernel/delayacct.c --- a/kernel/delayacct.c~per-task-delay-accounting-sync-block-i-o-and-swapin-delay-collection +++ a/kernel/delayacct.c @@ -85,3 +85,22 @@ static void delayacct_end(struct timespe spin_unlock(&current->delays->lock); } +void __delayacct_blkio_start(void) +{ + delayacct_start(&current->delays->blkio_start); +} + +void __delayacct_blkio_end(void) +{ + if (current->delays->flags & DELAYACCT_PF_SWAPIN) + /* Swapin block I/O */ + delayacct_end(&current->delays->blkio_start, + &current->delays->blkio_end, + &current->delays->swapin_delay, + &current->delays->swapin_count); + else /* Other block I/O */ + delayacct_end(&current->delays->blkio_start, + &current->delays->blkio_end, + &current->delays->blkio_delay, + &current->delays->blkio_count); +} diff -puN kernel/sched.c~per-task-delay-accounting-sync-block-i-o-and-swapin-delay-collection kernel/sched.c --- 
a/kernel/sched.c~per-task-delay-accounting-sync-block-i-o-and-swapin-delay-collection +++ a/kernel/sched.c @@ -51,6 +51,7 @@ #include #include #include +#include <linux/delayacct.h> #include #include @@ -4534,9 +4535,11 @@ void __sched io_schedule(void) { struct rq *rq = &__raw_get_cpu_var(runqueues); + delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); schedule(); atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); } EXPORT_SYMBOL(io_schedule); @@ -4545,9 +4548,11 @@ long __sched io_schedule_timeout(long ti struct rq *rq = &__raw_get_cpu_var(runqueues); long ret; + delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); ret = schedule_timeout(timeout); atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); return ret; } diff -puN mm/memory.c~per-task-delay-accounting-sync-block-i-o-and-swapin-delay-collection mm/memory.c --- a/mm/memory.c~per-task-delay-accounting-sync-block-i-o-and-swapin-delay-collection +++ a/mm/memory.c @@ -47,6 +47,7 @@ #include #include #include +#include <linux/delayacct.h> #include #include @@ -1950,6 +1951,7 @@ static int do_swap_page(struct mm_struct migration_entry_wait(mm, pmd, address); goto out; } + delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { swapin_readahead(entry, address, vma); @@ -1962,6 +1964,7 @@ static int do_swap_page(struct mm_struct page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) ret = VM_FAULT_OOM; + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto unlock; } @@ -1971,6 +1974,7 @@ static int do_swap_page(struct mm_struct grab_swap_token(); } + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); mark_page_accessed(page); lock_page(page); _