diff -r 3a6a0615d33e Documentation/filesystems/proc.txt --- a/Documentation/filesystems/proc.txt Wed Sep 21 23:16:30 2005 +++ b/Documentation/filesystems/proc.txt Sun Oct 2 14:08:56 2005 @@ -1162,6 +1162,15 @@ a process which is generating disk writes will itself start writing out dirty data. +dirty_ratio_centisecs +--------------------- + +Throttle the I/O if the per-task writing bandwidth is high enough for +the dirty_ratio to be reached in less than dirty_ratio_centisecs. This +makes the write throttling per-process and avoids making too much +memory dirty at the same time. Ideally in the future we should add +some feedback from the backing_dev_info to know the max disk bandwidth. + dirty_writeback_centisecs ------------------------- diff -r 3a6a0615d33e Documentation/sysctl/vm.txt --- a/Documentation/sysctl/vm.txt Wed Sep 21 23:16:30 2005 +++ b/Documentation/sysctl/vm.txt Sun Oct 2 14:08:56 2005 @@ -19,6 +19,7 @@ - overcommit_memory - page-cluster - dirty_ratio +- dirty_ratio_centisecs - dirty_background_ratio - dirty_expire_centisecs - dirty_writeback_centisecs @@ -29,9 +30,10 @@ ============================================================== -dirty_ratio, dirty_background_ratio, dirty_expire_centisecs, -dirty_writeback_centisecs, vfs_cache_pressure, laptop_mode, -block_dump, swap_token_timeout: +dirty_ratio, dirty_ratio_centisecs, dirty_background_ratio, +dirty_expire_centisecs, dirty_writeback_centisecs, +vfs_cache_pressure, laptop_mode, block_dump, +swap_token_timeout: See Documentation/filesystems/proc.txt diff -r 3a6a0615d33e fs/proc/base.c --- a/fs/proc/base.c Wed Sep 21 23:16:30 2005 +++ b/fs/proc/base.c Sun Oct 2 14:08:56 2005 @@ -122,6 +122,7 @@ #endif PROC_TGID_OOM_SCORE, PROC_TGID_OOM_ADJUST, + PROC_TGID_FUTURE_DIRTY, PROC_TID_INO, PROC_TID_STATUS, PROC_TID_MEM, @@ -160,6 +161,7 @@ #endif PROC_TID_OOM_SCORE, PROC_TID_OOM_ADJUST, + PROC_TID_FUTURE_DIRTY, /* Add new entries before this */ PROC_TID_FD_DIR = 0x8000, /* 0x8000-0xffff */ @@ -210,6 +212,7 @@ #endif 
E(PROC_TGID_OOM_SCORE, "oom_score",S_IFREG|S_IRUGO), E(PROC_TGID_OOM_ADJUST,"oom_adj", S_IFREG|S_IRUGO|S_IWUSR), + E(PROC_TGID_FUTURE_DIRTY,"future_dirty",S_IFREG|S_IRUGO), #ifdef CONFIG_AUDITSYSCALL E(PROC_TGID_LOGINUID, "loginuid", S_IFREG|S_IWUSR|S_IRUGO), #endif @@ -250,6 +253,7 @@ #endif E(PROC_TID_OOM_SCORE, "oom_score",S_IFREG|S_IRUGO), E(PROC_TID_OOM_ADJUST, "oom_adj", S_IFREG|S_IRUGO|S_IWUSR), + E(PROC_TID_FUTURE_DIRTY,"future_dirty",S_IFREG|S_IRUGO), #ifdef CONFIG_AUDITSYSCALL E(PROC_TID_LOGINUID, "loginuid", S_IFREG|S_IWUSR|S_IRUGO), #endif @@ -509,6 +513,12 @@ do_posix_clock_monotonic_gettime(&uptime); points = badness(task, uptime.tv_sec); return sprintf(buffer, "%lu\n", points); +} + +static int proc_future_dirty(struct task_struct *task, char *buffer) +{ + return sprintf(buffer, "%lu\n", + task->balance_dirty_state.future_dirty); } /************************************************************************/ @@ -1734,6 +1744,11 @@ case PROC_TGID_OOM_ADJUST: inode->i_fop = &proc_oom_adjust_operations; break; + case PROC_TID_FUTURE_DIRTY: + case PROC_TGID_FUTURE_DIRTY: + inode->i_fop = &proc_info_file_operations; + ei->op.proc_read = proc_future_dirty; + break; #ifdef CONFIG_AUDITSYSCALL case PROC_TID_LOGINUID: case PROC_TGID_LOGINUID: diff -r 3a6a0615d33e include/linux/sched.h --- a/include/linux/sched.h Wed Sep 21 23:16:30 2005 +++ b/include/linux/sched.h Sun Oct 2 14:08:56 2005 @@ -584,6 +584,12 @@ struct io_context; /* See blkdev.h */ void exit_io_context(void); struct cpuset; + +struct balance_dirty_state { + unsigned long future_dirty; + unsigned long last_sync; /* jiffies */ + int rate_limit; +}; #define NGROUPS_SMALL 32 #define NGROUPS_PER_BLOCK ((int)(PAGE_SIZE / sizeof(gid_t))) @@ -777,6 +783,8 @@ unsigned long ptrace_message; siginfo_t *last_siginfo; /* For ptrace use. 
*/ + + struct balance_dirty_state balance_dirty_state; /* * current io wait handle: wait queue entry to use for io waits * If this thread is processing aio, this points at the waitqueue diff -r 3a6a0615d33e include/linux/sysctl.h --- a/include/linux/sysctl.h Wed Sep 21 23:16:30 2005 +++ b/include/linux/sysctl.h Sun Oct 2 14:08:56 2005 @@ -180,6 +180,7 @@ VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */ VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */ VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */ + VM_DIRTY_RATIO_CS=29, /* dirty_ratio_centisecs */ }; diff -r 3a6a0615d33e include/linux/writeback.h --- a/include/linux/writeback.h Wed Sep 21 23:16:30 2005 +++ b/include/linux/writeback.h Sun Oct 2 14:08:56 2005 @@ -93,6 +93,7 @@ /* These are exported to sysctl. */ extern int dirty_background_ratio; extern int vm_dirty_ratio; +extern int vm_dirty_ratio_centisecs; extern int dirty_writeback_centisecs; extern int dirty_expire_centisecs; extern int block_dump; diff -r 3a6a0615d33e kernel/fork.c --- a/kernel/fork.c Wed Sep 21 23:16:30 2005 +++ b/kernel/fork.c Sun Oct 2 14:08:56 2005 @@ -962,6 +962,10 @@ p->syscw = 0; /* I/O counter: write syscalls */ acct_clear_integrals(p); + p->balance_dirty_state.future_dirty = 0; + p->balance_dirty_state.last_sync = 0; + p->balance_dirty_state.rate_limit = 0; + p->it_virt_expires = cputime_zero; p->it_prof_expires = cputime_zero; p->it_sched_expires = 0; diff -r 3a6a0615d33e kernel/sysctl.c --- a/kernel/sysctl.c Wed Sep 21 23:16:30 2005 +++ b/kernel/sysctl.c Sun Oct 2 14:08:56 2005 @@ -725,6 +725,14 @@ .procname = "dirty_expire_centisecs", .data = &dirty_expire_centisecs, .maxlen = sizeof(dirty_expire_centisecs), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = VM_DIRTY_RATIO_CS, + .procname = "dirty_ratio_centisecs", + .data = &vm_dirty_ratio_centisecs, + .maxlen = sizeof(vm_dirty_ratio_centisecs), .mode = 0644, .proc_handler = &proc_dointvec, }, diff 
-r 3a6a0615d33e mm/filemap.c --- a/mm/filemap.c Wed Sep 21 23:16:30 2005 +++ b/mm/filemap.c Sun Oct 2 14:08:56 2005 @@ -1907,6 +1907,7 @@ unsigned long offset; unsigned long maxlen; size_t copied; + int was_dirty; offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ index = pos >> PAGE_CACHE_SHIFT; @@ -1930,6 +1931,7 @@ status = -ENOMEM; break; } + was_dirty = PageDirty(page); status = a_ops->prepare_write(file, page, offset, offset+bytes); if (unlikely(status)) { @@ -1980,7 +1982,8 @@ page_cache_release(page); if (status < 0) break; - balance_dirty_pages_ratelimited(mapping); + if (!was_dirty) + balance_dirty_pages_ratelimited(mapping); cond_resched(); } while (count); *ppos = pos; diff -r 3a6a0615d33e mm/page-writeback.c --- a/mm/page-writeback.c Wed Sep 21 23:16:30 2005 +++ b/mm/page-writeback.c Sun Oct 2 14:08:56 2005 @@ -64,12 +64,17 @@ /* * Start background writeback (via pdflush) at this percentage */ -int dirty_background_ratio = 10; +int dirty_background_ratio = 30; /* * The generator of dirty data starts writeback at this percentage */ int vm_dirty_ratio = 40; + +/* + * Throttle 5 sec before reaching the dirty_ratio. + */ +int vm_dirty_ratio_centisecs = 5 * 100; /* * The interval between `kupdate'-style writebacks, in centiseconds @@ -187,7 +192,8 @@ * If we're over `background_thresh' then pdflush is woken to perform some * writeout. 
*/ -static void balance_dirty_pages(struct address_space *mapping) +static void balance_dirty_pages(struct address_space *mapping, + unsigned long future_dirty) { struct writeback_state wbs; long nr_reclaimable; @@ -209,10 +215,12 @@ get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, mapping); nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable; - if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) + if (nr_reclaimable + wbs.nr_writeback + future_dirty <= + dirty_thresh || !nr_reclaimable) break; - dirty_exceeded = 1; + if (nr_reclaimable + wbs.nr_writeback > dirty_thresh) + dirty_exceeded = 1; /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. * Unstable writes are a feature of certain networked @@ -225,7 +233,8 @@ get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, mapping); nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable; - if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) + if (nr_reclaimable + wbs.nr_writeback + future_dirty <= + dirty_thresh || !nr_reclaimable) break; pages_written += write_chunk - wbc.nr_to_write; if (pages_written >= write_chunk) @@ -253,6 +262,45 @@ pdflush_operation(background_writeout, 0); } +static int task_balance_dirty_pages(struct address_space *mapping) +{ + struct balance_dirty_state *state; + int need_balance_dirty; + int rate_limit; + + need_balance_dirty = -1; + if (unlikely(!vm_dirty_ratio_centisecs)) + goto out; + need_balance_dirty = 0; + + state = &current->balance_dirty_state; + if (likely(state->future_dirty < (~0UL>>1))) + state->future_dirty++; + + rate_limit = ratelimit_pages; + if (dirty_exceeded) + rate_limit = 8; + + if (state->rate_limit++ > rate_limit) { + state->rate_limit = 0; + need_balance_dirty = 1; + } + + if (time_after(jiffies, + state->last_sync+vm_dirty_ratio_centisecs*HZ/100)) { + state->future_dirty >>= 1; + state->last_sync = jiffies; + state->rate_limit = 0; + need_balance_dirty = 1; + } + + if (need_balance_dirty) + balance_dirty_pages(mapping, state->future_dirty); + +out: + return 
need_balance_dirty; +} + /** * balance_dirty_pages_ratelimited - balance dirty memory state * @mapping: address_space which was dirtied @@ -270,6 +318,9 @@ { static DEFINE_PER_CPU(int, ratelimits) = 0; long ratelimit; + + if (task_balance_dirty_pages(mapping) >= 0) + return; ratelimit = ratelimit_pages; if (dirty_exceeded) @@ -282,7 +333,7 @@ if (get_cpu_var(ratelimits)++ >= ratelimit) { __get_cpu_var(ratelimits) = 0; put_cpu_var(ratelimits); - balance_dirty_pages(mapping); + balance_dirty_pages(mapping, 0); return; } put_cpu_var(ratelimits);