Make dirty throttling obey cpusets

Currently dirty throttling does not work properly in a cpuset. If, for
example, a cpuset contains only 1/10th of available memory, then all of the
memory in that cpuset can be dirtied without any writes being triggered. If
we are writing to a device that is mounted via NFS then the write operation
may be terminated with OOM since NFS is not allowed to allocate more pages
for writeout. If all of the cpuset's memory is dirty then only 10% of total
memory is dirty. The background writeback threshold is usually set at 40%
and the synchronous threshold at 60%. So we are still below the global
limits while the dirty ratio in the cpuset is 100%! NFS pages, for example,
can fill up the complete cpuset.

This patch makes dirty writeout cpuset aware. When determining the dirty
limits in get_dirty_limits() we calculate values based on the nodes that are
reachable from the current process (which has been dirtying the page). Then
we can trigger writeout based on the dirty ratio of the memory in the
cpuset.

Determining the dirty state is expensive since we have to scan over all the
nodes and sum up the relevant values.

We trigger writeout in a cpuset-specific way. We go through the dirty inodes
and search for inodes that have dirty pages on the nodes of the active
cpuset. If an inode fulfills that requirement then we begin writeout of the
dirty pages of that inode. (A standalone sketch of this per-cpuset
accounting follows the patch below.)

Christoph Lameter

Index: linux-2.6.20-rc4/mm/page-writeback.c
===================================================================
--- linux-2.6.20-rc4.orig/mm/page-writeback.c	2007-01-08 17:36:51.217724377 -0600
+++ linux-2.6.20-rc4/mm/page-writeback.c	2007-01-08 17:38:49.813455891 -0600
@@ -101,7 +101,15 @@ EXPORT_SYMBOL(laptop_mode);
 
 /* End of sysctl-exported parameters */
 
-static void background_writeout(unsigned long _min_pages);
+static void background_writeout(unsigned long _min_pages, nodemask_t *nodes);
+
+struct dirty_limits {
+        long thresh_background;
+        long thresh_dirty;
+        unsigned long nr_dirty;
+        unsigned long nr_unstable;
+        unsigned long nr_writeback;
+};
 
 /*
  * Work out the current dirty-memory clamping and background writeout
@@ -121,30 +129,73 @@ static void background_writeout(unsigned
  * clamping level.
  */
 static void
-get_dirty_limits(long *pbackground, long *pdirty,
-                        struct address_space *mapping)
+get_dirty_limits(struct dirty_limits *dl, struct address_space *mapping,
+                        struct writeback_control *wbc)
 {
         int background_ratio;           /* Percentages */
         int dirty_ratio;
         int unmapped_ratio;
         long background;
         long dirty;
-        unsigned long available_memory = vm_total_pages;
+        unsigned long available_memory;
+        unsigned long high_memory;
+        unsigned long nr_mapped;
         struct task_struct *tsk;
 
+#ifdef CONFIG_CPUSETS
+        /*
+         * Calculate the limits relative to the current cpuset if necessary.
+         */
+        if (unlikely(wbc && wbc->nodes &&
+                        !nodes_subset(node_online_map, *wbc->nodes))) {
+                int node;
+
+                memset(dl, 0, sizeof(struct dirty_limits));
+                available_memory = 0;
+                high_memory = 0;
+                nr_mapped = 0;
+                for_each_node_mask(node, *wbc->nodes) {
+                        if (!node_online(node))
+                                continue;
+                        dl->nr_dirty += node_page_state(node, NR_FILE_DIRTY);
+                        dl->nr_unstable +=
+                                node_page_state(node, NR_UNSTABLE_NFS);
+                        dl->nr_writeback +=
+                                node_page_state(node, NR_WRITEBACK);
+                        available_memory +=
+                                NODE_DATA(node)->node_present_pages;
+#ifdef CONFIG_HIGHMEM
+                        high_memory += NODE_DATA(node)
+                                ->node_zones[ZONE_HIGHMEM].present_pages;
+#endif
+                        nr_mapped += node_page_state(node, NR_FILE_MAPPED) +
+                                node_page_state(node, NR_ANON_PAGES);
+                }
+        } else
+#endif
+        {
+                /* Global limits */
+                if (wbc)
+                        wbc->nodes = NULL;
+                dl->nr_dirty = global_page_state(NR_FILE_DIRTY);
+                dl->nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+                dl->nr_writeback = global_page_state(NR_WRITEBACK);
+                available_memory = vm_total_pages;
+                high_memory = totalhigh_pages;
+                nr_mapped = global_page_state(NR_FILE_MAPPED) +
+                        global_page_state(NR_ANON_PAGES);
+        }
 #ifdef CONFIG_HIGHMEM
         /*
          * If this mapping can only allocate from low memory,
          * we exclude high memory from our count.
          */
         if (mapping && !(mapping_gfp_mask(mapping) & __GFP_HIGHMEM))
-                available_memory -= totalhigh_pages;
+                available_memory -= high_memory;
 #endif
 
-        unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) +
-                        global_page_state(NR_ANON_PAGES)) * 100) /
-                                vm_total_pages;
+        unmapped_ratio = 100 - (nr_mapped * 100) / available_memory;
 
         dirty_ratio = vm_dirty_ratio;
         if (dirty_ratio > unmapped_ratio / 2)
@@ -164,8 +215,8 @@ get_dirty_limits(long *pbackground, long
                 background += background / 4;
                 dirty += dirty / 4;
         }
-        *pbackground = background;
-        *pdirty = dirty;
+        dl->thresh_background = background;
+        dl->thresh_dirty = dirty;
 }
 
 /*
@@ -178,8 +229,7 @@ get_dirty_limits(long *pbackground, long
 static void balance_dirty_pages(struct address_space *mapping)
 {
         long nr_reclaimable;
-        long background_thresh;
-        long dirty_thresh;
+        struct dirty_limits dl;
         unsigned long pages_written = 0;
         unsigned long write_chunk = sync_writeback_pages();
 
@@ -192,13 +242,13 @@ static void balance_dirty_pages(struct a
                         .older_than_this = NULL,
                         .nr_to_write    = write_chunk,
                         .range_cyclic   = 1,
+                        .nodes          = &cpuset_current_mems_allowed
                 };
 
-                get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
-                nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-                                        global_page_state(NR_UNSTABLE_NFS);
-                if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
-                        dirty_thresh)
+                get_dirty_limits(&dl, mapping, &wbc);
+                nr_reclaimable = dl.nr_dirty + dl.nr_unstable;
+                if (nr_reclaimable + dl.nr_writeback <=
+                        dl.thresh_dirty)
                                 break;
 
                 if (!dirty_exceeded)
@@ -212,13 +262,9 @@ static void balance_dirty_pages(struct a
                  */
                 if (nr_reclaimable) {
                         writeback_inodes(&wbc);
-                        get_dirty_limits(&background_thresh,
-                                        &dirty_thresh, mapping);
-                        nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-                                        global_page_state(NR_UNSTABLE_NFS);
-                        if (nr_reclaimable +
-                                global_page_state(NR_WRITEBACK)
-                                        <= dirty_thresh)
+                        get_dirty_limits(&dl, mapping, &wbc);
+                        nr_reclaimable = dl.nr_dirty + dl.nr_unstable;
+                        if (nr_reclaimable + dl.nr_writeback <= dl.thresh_dirty)
                                 break;
                         pages_written += write_chunk - wbc.nr_to_write;
                         if (pages_written >= write_chunk)
@@ -227,8 +273,8 @@ static void balance_dirty_pages(struct a
                 congestion_wait(WRITE, HZ/10);
         }
 
-        if (nr_reclaimable + global_page_state(NR_WRITEBACK)
-                <= dirty_thresh && dirty_exceeded)
+        if (nr_reclaimable + dl.nr_writeback
+                <= dl.thresh_dirty && dirty_exceeded)
                         dirty_exceeded = 0;
 
         if (writeback_in_progress(bdi))
@@ -243,8 +289,9 @@ static void balance_dirty_pages(struct a
          * background_thresh, to keep the amount of dirty memory low.
          */
         if ((laptop_mode && pages_written) ||
-            (!laptop_mode && (nr_reclaimable > background_thresh)))
-                pdflush_operation(background_writeout, 0);
+            (!laptop_mode && (nr_reclaimable > dl.thresh_background)))
+                pdflush_operation(background_writeout, 0,
+                                &cpuset_current_mems_allowed);
 }
 
 void set_page_dirty_balance(struct page *page)
@@ -301,21 +348,19 @@ EXPORT_SYMBOL(balance_dirty_pages_rateli
 
 void throttle_vm_writeout(void)
 {
-        long background_thresh;
-        long dirty_thresh;
+        struct dirty_limits dl;
 
         for ( ; ; ) {
-                get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+                get_dirty_limits(&dl, NULL, NULL);
 
                 /*
                  * Boost the allowable dirty threshold a bit for page
                  * allocators so they don't get DoS'ed by heavy writers
                  */
-                dirty_thresh += dirty_thresh / 10;      /* wheeee... */
+                dl.thresh_dirty += dl.thresh_dirty / 10;        /* wheeee... */
 
-                if (global_page_state(NR_UNSTABLE_NFS) +
-                        global_page_state(NR_WRITEBACK) <= dirty_thresh)
-                                break;
+                if (dl.nr_unstable + dl.nr_writeback <= dl.thresh_dirty)
+                        break;
                 congestion_wait(WRITE, HZ/10);
         }
 }
@@ -325,7 +370,7 @@ void throttle_vm_writeout(void)
  * writeback at least _min_pages, and keep writing until the amount of dirty
  * memory is less than the background threshold, or until we're all clean.
  */
-static void background_writeout(unsigned long _min_pages)
+static void background_writeout(unsigned long _min_pages, nodemask_t *nodes)
 {
         long min_pages = _min_pages;
         struct writeback_control wbc = {
@@ -335,15 +380,14 @@ static void background_writeout(unsigned
                 .nr_to_write    = 0,
                 .nonblocking    = 1,
                 .range_cyclic   = 1,
+                .nodes          = nodes
         };
 
         for ( ; ; ) {
-                long background_thresh;
-                long dirty_thresh;
+                struct dirty_limits dl;
 
-                get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
-                if (global_page_state(NR_FILE_DIRTY) +
-                        global_page_state(NR_UNSTABLE_NFS) < background_thresh
+                get_dirty_limits(&dl, NULL, &wbc);
+                if (dl.nr_dirty + dl.nr_unstable < dl.thresh_background
                                 && min_pages <= 0)
                         break;
                 wbc.encountered_congestion = 0;
@@ -365,12 +409,12 @@ static void background_writeout(unsigned
  * the whole world. Returns 0 if a pdflush thread was dispatched. Returns
  * -1 if all pdflush threads were busy.
  */
-int wakeup_pdflush(long nr_pages)
+int wakeup_pdflush(long nr_pages, nodemask_t *nodes)
 {
         if (nr_pages == 0)
                 nr_pages = global_page_state(NR_FILE_DIRTY) +
                                 global_page_state(NR_UNSTABLE_NFS);
-        return pdflush_operation(background_writeout, nr_pages);
+        return pdflush_operation(background_writeout, nr_pages, nodes);
 }
 
 static void wb_timer_fn(unsigned long unused);
@@ -394,7 +438,7 @@ static DEFINE_TIMER(laptop_mode_wb_timer
  * older_than_this takes precedence over nr_to_write. So we'll only write back
  * all dirty pages if they are all attached to "old" mappings.
  */
-static void wb_kupdate(unsigned long arg)
+static void wb_kupdate(unsigned long arg, nodemask_t *unused)
 {
         unsigned long oldest_jif;
         unsigned long start_jif;
@@ -454,18 +498,18 @@ int dirty_writeback_centisecs_handler(ct
 
 static void wb_timer_fn(unsigned long unused)
 {
-        if (pdflush_operation(wb_kupdate, 0) < 0)
+        if (pdflush_operation(wb_kupdate, 0, &node_online_map) < 0)
                 mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
 }
 
-static void laptop_flush(unsigned long unused)
+static void laptop_flush(unsigned long unused, nodemask_t *unused2)
 {
         sys_sync();
 }
 
 static void laptop_timer_fn(unsigned long unused)
 {
-        pdflush_operation(laptop_flush, 0);
+        pdflush_operation(laptop_flush, 0, NULL);
 }
 
 /*
Index: linux-2.6.20-rc4/include/linux/writeback.h
===================================================================
--- linux-2.6.20-rc4.orig/include/linux/writeback.h	2007-01-08 17:36:51.251907171 -0600
+++ linux-2.6.20-rc4/include/linux/writeback.h	2007-01-08 17:38:49.831035612 -0600
@@ -59,11 +59,12 @@ struct writeback_control {
         unsigned for_reclaim:1;         /* Invoked from the page allocator */
         unsigned for_writepages:1;      /* This is a writepages() call */
         unsigned range_cyclic:1;        /* range_start is cyclic */
+        nodemask_t *nodes;              /* Set of nodes of interest */
 };
 
 /*
  * fs/fs-writeback.c
- */
+ */
 void writeback_inodes(struct writeback_control *wbc);
 void wake_up_inode(struct inode *inode);
 int inode_wait(void *);
@@ -81,7 +82,7 @@ static inline void wait_on_inode(struct
 /*
  * mm/page-writeback.c
  */
-int wakeup_pdflush(long nr_pages);
+int wakeup_pdflush(long nr_pages, nodemask_t *nodes);
 void laptop_io_completion(void);
 void laptop_sync_completion(void);
 void throttle_vm_writeout(void);
@@ -109,7 +110,8 @@ balance_dirty_pages_ratelimited(struct a
         balance_dirty_pages_ratelimited_nr(mapping, 1);
 }
 
-int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
+int pdflush_operation(void (*fn)(unsigned long, nodemask_t *nodes),
+                                unsigned long arg0, nodemask_t *nodes);
 extern int generic_writepages(struct address_space *mapping,
                               struct writeback_control *wbc);
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
Index: linux-2.6.20-rc4/fs/fs-writeback.c
===================================================================
--- linux-2.6.20-rc4.orig/fs/fs-writeback.c	2007-01-08 17:36:51.260697032 -0600
+++ linux-2.6.20-rc4/fs/fs-writeback.c	2007-01-08 17:38:49.844708728 -0600
@@ -365,6 +365,11 @@ sync_sb_inodes(struct super_block *sb, s
                 if (current_is_pdflush() && !writeback_acquire(bdi))
                         break;
 
+#ifdef CONFIG_CPUSETS
+                if (wbc->nodes && !nodes_intersects(mapping->dirty_nodes,
+                                                *wbc->nodes))
+                        break;
+#endif
                 BUG_ON(inode->i_state & I_FREEING);
                 __iget(inode);
                 pages_skipped = wbc->pages_skipped;
Index: linux-2.6.20-rc4/fs/buffer.c
===================================================================
--- linux-2.6.20-rc4.orig/fs/buffer.c	2007-01-08 17:36:51.268510242 -0600
+++ linux-2.6.20-rc4/fs/buffer.c	2007-01-08 17:38:49.862288448 -0600
@@ -357,7 +357,7 @@ static void free_more_memory(void)
         struct zone **zones;
         pg_data_t *pgdat;
 
-        wakeup_pdflush(1024);
+        wakeup_pdflush(1024, NULL);
         yield();
 
         for_each_online_pgdat(pgdat) {
Index: linux-2.6.20-rc4/fs/super.c
===================================================================
--- linux-2.6.20-rc4.orig/fs/super.c	2007-01-08 17:36:51.277300103 -0600
+++ linux-2.6.20-rc4/fs/super.c	2007-01-08 17:38:49.874984913 -0600
@@ -618,7 +618,7 @@ int do_remount_sb(struct super_block *sb
         return 0;
 }
 
-static void do_emergency_remount(unsigned long foo)
+static void do_emergency_remount(unsigned long foo, nodemask_t *bar)
 {
         struct super_block *sb;
 
@@ -646,7 +646,7 @@ static void do_emergency_remount(unsigne
 
 void emergency_remount(void)
 {
-        pdflush_operation(do_emergency_remount, 0);
+        pdflush_operation(do_emergency_remount, 0, NULL);
 }
 
 /*
Index: linux-2.6.20-rc4/fs/sync.c
===================================================================
--- linux-2.6.20-rc4.orig/fs/sync.c	2007-01-08 17:36:51.289019918 -0600
+++ linux-2.6.20-rc4/fs/sync.c	2007-01-08 17:38:49.884751425 -0600
@@ -21,9 +21,9 @@
  * sync everything. Start out by waking pdflush, because that writes back
  * all queues in parallel.
  */
-static void do_sync(unsigned long wait)
+static void do_sync(unsigned long wait, nodemask_t *unused)
 {
-        wakeup_pdflush(0);
+        wakeup_pdflush(0, NULL);
         sync_inodes(0);         /* All mappings, inodes and their blockdevs */
         DQUOT_SYNC(NULL);
         sync_supers();          /* Write the superblocks */
@@ -38,13 +38,13 @@ static void do_sync(unsigned long wait)
 
 asmlinkage long sys_sync(void)
 {
-        do_sync(1);
+        do_sync(1, NULL);
         return 0;
 }
 
 void emergency_sync(void)
 {
-        pdflush_operation(do_sync, 0);
+        pdflush_operation(do_sync, 0, NULL);
 }
 
 /*
Index: linux-2.6.20-rc4/mm/pdflush.c
===================================================================
--- linux-2.6.20-rc4.orig/mm/pdflush.c	2007-01-08 17:36:51.226514239 -0600
+++ linux-2.6.20-rc4/mm/pdflush.c	2007-01-08 17:38:49.892564634 -0600
@@ -83,10 +83,12 @@ static unsigned long last_empty_jifs;
  */
 struct pdflush_work {
         struct task_struct *who;        /* The thread */
-        void (*fn)(unsigned long);      /* A callback function */
+        void (*fn)(unsigned long, nodemask_t *); /* A callback function */
         unsigned long arg0;             /* An argument to the callback */
         struct list_head list;          /* On pdflush_list, when idle */
         unsigned long when_i_went_to_sleep;
+        int have_nodes;                 /* Nodes were specified */
+        nodemask_t nodes;               /* Nodes of interest */
 };
 
 static int __pdflush(struct pdflush_work *my_work)
@@ -123,7 +125,8 @@ static int __pdflush(struct pdflush_work
                 }
                 spin_unlock_irq(&pdflush_lock);
 
-                (*my_work->fn)(my_work->arg0);
+                (*my_work->fn)(my_work->arg0,
+                        my_work->have_nodes ? &my_work->nodes : NULL);
 
                 /*
                  * Thread creation: For how long have there been zero
@@ -197,8 +200,8 @@ static int pdflush(void *dummy)
  * Returns zero if it indeed managed to find a worker thread, and passed your
  * payload to it.
  */
-int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0,
-                                        nodemask_t nodes)
+int pdflush_operation(void (*fn)(unsigned long, nodemask_t *),
+                                unsigned long arg0, nodemask_t *nodes)
 {
         unsigned long flags;
         int ret = 0;
@@ -218,6 +221,11 @@ int pdflush_operation(void (*fn)(unsigne
                         last_empty_jifs = jiffies;
                 pdf->fn = fn;
                 pdf->arg0 = arg0;
+                if (nodes) {
+                        pdf->nodes = *nodes;
+                        pdf->have_nodes = 1;
+                } else
+                        pdf->have_nodes = 0;
                 wake_up_process(pdf->who);
                 spin_unlock_irqrestore(&pdflush_lock, flags);
         }
Index: linux-2.6.20-rc4/mm/vmscan.c
===================================================================
--- linux-2.6.20-rc4.orig/mm/vmscan.c	2007-01-08 17:36:51.240187356 -0600
+++ linux-2.6.20-rc4/mm/vmscan.c	2007-01-08 17:41:12.321494854 -0600
@@ -1065,7 +1065,8 @@ unsigned long try_to_free_pages(struct z
                  */
                 if (total_scanned > sc.swap_cluster_max +
                                         sc.swap_cluster_max / 2) {
-                        wakeup_pdflush(laptop_mode ? 0 : total_scanned);
+                        wakeup_pdflush(laptop_mode ? 0 : total_scanned,
+                                                &cpuset_current_mems_allowed);
                         sc.may_writepage = 1;
                 }
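
For illustration only, not part of the patch: a minimal standalone userspace
sketch of the per-cpuset accounting described in the changelog. The struct,
function names, node sizes, dirty counts, 60%/40% ratios and the plain
bitmask standing in for a nodemask_t are all made up for the example; the
kernel code above sums the real per-node vmstat counters (NR_FILE_DIRTY,
NR_UNSTABLE_NFS, NR_WRITEBACK) and applies the tunable vm ratios instead.

#include <stdio.h>

#define MAX_NODES 4

struct node_stats {
        unsigned long present_pages;    /* pages of memory on this node */
        unsigned long dirty;            /* stand-in for NR_FILE_DIRTY */
        unsigned long unstable;         /* stand-in for NR_UNSTABLE_NFS */
        unsigned long writeback;        /* stand-in for NR_WRITEBACK */
};

/* Sum the counters of the nodes allowed to the "cpuset" (bit i = node i). */
static void cpuset_dirty_state(const struct node_stats *nodes,
                               unsigned long allowed_mask,
                               unsigned long *available,
                               unsigned long *dirtyable)
{
        int i;

        *available = 0;
        *dirtyable = 0;
        for (i = 0; i < MAX_NODES; i++) {
                if (!(allowed_mask & (1UL << i)))
                        continue;
                *available += nodes[i].present_pages;
                *dirtyable += nodes[i].dirty + nodes[i].unstable +
                              nodes[i].writeback;
        }
}

int main(void)
{
        /* Four nodes of 1000 pages each; the cpuset may only use node 0. */
        struct node_stats nodes[MAX_NODES] = {
                { 1000, 900, 0, 50 },   /* node 0: almost completely dirty */
                { 1000,  10, 0,  0 },
                { 1000,   0, 0,  0 },
                { 1000,   0, 0,  0 },
        };
        unsigned long allowed = 1UL << 0;       /* cpuset = { node 0 } */
        unsigned long available, dirtyable, thresh_dirty, thresh_background;
        int dirty_ratio = 60, background_ratio = 40;

        cpuset_dirty_state(nodes, allowed, &available, &dirtyable);

        /*
         * Same idea as get_dirty_limits(), minus the unmapped-ratio and
         * task adjustments: apply the ratios to the cpuset's memory only.
         */
        thresh_dirty = dirty_ratio * available / 100;
        thresh_background = background_ratio * available / 100;

        printf("cpuset: %lu of %lu pages dirty/writeback, limit %lu\n",
               dirtyable, available, thresh_dirty);
        printf("%s\n", dirtyable > thresh_dirty ?
               "throttle the writer and start writeback" :
               "below the cpuset dirty limit");
        printf("background writeout %s (threshold %lu)\n",
               dirtyable > thresh_background ? "triggered" : "not needed",
               thresh_background);

        /*
         * The writeout side uses the same mask: in the patch an inode is
         * only picked by sync_sb_inodes() if the nodes holding its dirty
         * pages intersect the cpuset's nodes (nodes_intersects()).
         */
        unsigned long inode_dirty_nodes = 1UL << 1;  /* dirty pages on node 1 */
        printf("inode with pages on node 1: %s\n",
               (inode_dirty_nodes & allowed) ?
               "eligible for cpuset writeback" : "skipped");
        return 0;
}

Compiled and run, this reports that the cpuset is over both thresholds even
though globally only 960 of 4000 pages (24%) are dirty, which is exactly the
situation the changelog describes, and that an inode whose dirty pages live
entirely outside the cpuset would be skipped by the cpuset-aware writeout.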