Make dirty throttling obey cpusets

Currently dirty throttling does not work properly in a cpuset.  If, for
example, a cpuset contains only 1/10th of available memory, then all of the
cpuset's memory can be dirtied without any writes being triggered.  Even if
all of the cpuset's memory is dirty, only 10% of total memory is dirty.  The
background writeback threshold is usually set at 40% and the synchronous
threshold at 60%, so we are still below the global limits while the dirty
ratio in the cpuset is 100%!  NFS pages, for example, can fill up the
complete cpuset.

This patch makes dirty writeout cpuset aware.  When determining the dirty
limits in get_dirty_limits() we calculate values based on the nodes that are
reachable from the current process (the process that has been dirtying the
page).  We can then trigger writeout based on the dirty ratio of the memory
in the cpuset.

Determining the dirty state becomes more expensive, since we have to scan
over all the nodes in the cpuset and sum up the relevant counters.

Writeout is triggered in a cpuset-specific way: we go through the dirty
inodes and search for pages that satisfy the node requirements.  For that
purpose the writeback_control structure is extended to carry a nodemask.

Christoph Lameter
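For reference, the accounting that get_dirty_limits() performs in the cpuset
case amounts to summing the per-node VM counters over the nodes the task may
allocate from.  A standalone sketch (not part of the patch; the helper name
is made up for illustration):

/* Sketch: sum the dirty file pages on the nodes in a nodemask. */
static unsigned long nr_dirty_in_nodes(nodemask_t nodes)
{
	unsigned long dirty = 0;
	int node;

	for_each_node_mask(node, nodes)
		dirty += node_page_state(node, NR_FILE_DIRTY);
	return dirty;
}

The patch performs the same summation for NR_UNSTABLE_NFS and NR_WRITEBACK
and sizes the thresholds from the nodes' node_present_pages instead of
vm_total_pages.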
Index: linux-2.6.19-rc6-mm2/mm/page-writeback.c
===================================================================
--- linux-2.6.19-rc6-mm2.orig/mm/page-writeback.c	2006-11-30 22:15:11.000000000 -0800
+++ linux-2.6.19-rc6-mm2/mm/page-writeback.c	2006-11-30 22:21:10.000000000 -0800
@@ -33,6 +33,7 @@
 #include
 #include
 #include
+#include <linux/cpuset.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -102,6 +103,15 @@ EXPORT_SYMBOL(laptop_mode);
 
 static void background_writeout(unsigned long _min_pages);
 
+struct dirty_limits {
+	long thresh_background;
+	long thresh_dirty;
+	unsigned long nr_dirty;
+	unsigned long nr_unstable;
+	unsigned long nr_writeback;
+	nodemask_t nodes;
+};
+
 /*
  * Work out the current dirty-memory clamping and background writeout
  * thresholds.
@@ -120,30 +130,63 @@ static void background_writeout(unsigned
  * clamping level.
  */
 static void
-get_dirty_limits(long *pbackground, long *pdirty,
-		struct address_space *mapping)
+get_dirty_limits(struct dirty_limits *dl, struct address_space *mapping)
 {
 	int background_ratio;		/* Percentages */
 	int dirty_ratio;
 	int unmapped_ratio;
 	long background;
 	long dirty;
-	unsigned long available_memory = vm_total_pages;
+	unsigned long available_memory;
+	unsigned long high_memory;
+	unsigned long nr_mapped;
 	struct task_struct *tsk;
 
+	dl->nodes = cpuset_current_mems_allowed;
+	/*
+	 * Respect the boundaries of the current cpuset otherwise dirty
+	 * writeout will not work properly in a cpuset.
+	 */
+	if (likely(nodes_equal(node_online_map, cpuset_current_mems_allowed))) {
+		dl->nr_dirty = global_page_state(NR_FILE_DIRTY);
+		dl->nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+		dl->nr_writeback = global_page_state(NR_WRITEBACK);
+		available_memory = vm_total_pages;
+		high_memory = totalhigh_pages;
+		nr_mapped = global_page_state(NR_FILE_MAPPED) +
+				global_page_state(NR_ANON_PAGES);
+	} else {
+		int node;
+
+		dl->nr_dirty = dl->nr_unstable = dl->nr_writeback = 0;
+		available_memory = 0;
+		high_memory = 0;
+		nr_mapped = 0;
+
+		for_each_node_mask(node, cpuset_current_mems_allowed) {
+			dl->nr_dirty += node_page_state(node, NR_FILE_DIRTY);
+			dl->nr_unstable += node_page_state(node, NR_UNSTABLE_NFS);
+			dl->nr_writeback += node_page_state(node, NR_WRITEBACK);
+			available_memory += NODE_DATA(node)->node_present_pages;
+#ifdef CONFIG_HIGHMEM
+			high_memory += NODE_DATA(node)->node_zones[ZONE_HIGHMEM].present_pages;
+#endif
+			nr_mapped += node_page_state(node, NR_FILE_MAPPED) +
+					node_page_state(node, NR_ANON_PAGES);
+		}
+	}
+
 #ifdef CONFIG_HIGHMEM
 	/*
 	 * If this mapping can only allocate from low memory,
 	 * we exclude high memory from our count.
 	 */
 	if (mapping && !(mapping_gfp_mask(mapping) & __GFP_HIGHMEM))
-		available_memory -= totalhigh_pages;
+		available_memory -= high_memory;
 #endif
 
-	unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) +
-				global_page_state(NR_ANON_PAGES)) * 100) /
-					vm_total_pages;
+	unmapped_ratio = 100 - (nr_mapped * 100) / vm_total_pages;
 
 	dirty_ratio = vm_dirty_ratio;
 	if (dirty_ratio > unmapped_ratio / 2)
@@ -163,8 +206,8 @@ get_dirty_limits(long
 		background += background / 4;
 		dirty += dirty / 4;
 	}
-	*pbackground = background;
-	*pdirty = dirty;
+	dl->thresh_background = background;
+	dl->thresh_dirty = dirty;
 }
 
 /*
@@ -177,8 +220,7 @@ get_dirty_limits(long
 static void balance_dirty_pages(struct address_space *mapping)
 {
 	long nr_reclaimable;
-	long background_thresh;
-	long dirty_thresh;
+	struct dirty_limits dl;
 	unsigned long pages_written = 0;
 	unsigned long write_chunk = sync_writeback_pages();
 
@@ -193,11 +235,11 @@ static void balance_dirty_pages(struct a
 			.range_cyclic	= 1,
 		};
 
-		get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
-		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-					global_page_state(NR_UNSTABLE_NFS);
-		if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
-			dirty_thresh)
+		get_dirty_limits(&dl, mapping);
+		wbc.nodes = dl.nodes;
+		nr_reclaimable = dl.nr_dirty + dl.nr_unstable;
+		if (nr_reclaimable + dl.nr_writeback <=
+			dl.thresh_dirty)
 				break;
 
 		if (!dirty_exceeded)
@@ -211,13 +253,9 @@ static void balance_dirty_pages(struct a
 		 */
 		if (nr_reclaimable) {
 			writeback_inodes(&wbc);
-			get_dirty_limits(&background_thresh,
-					&dirty_thresh, mapping);
-			nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-					global_page_state(NR_UNSTABLE_NFS);
-			if (nr_reclaimable +
-				global_page_state(NR_WRITEBACK)
-					<= dirty_thresh)
+			get_dirty_limits(&dl, mapping);
+			nr_reclaimable = dl.nr_dirty + dl.nr_unstable;
+			if (nr_reclaimable + dl.nr_writeback <= dl.thresh_dirty)
 				break;
 			pages_written += write_chunk - wbc.nr_to_write;
 			if (pages_written >= write_chunk)
@@ -226,8 +264,8 @@ static void balance_dirty_pages(struct a
 		congestion_wait(WRITE, HZ/10);
 	}
 
-	if (nr_reclaimable + global_page_state(NR_WRITEBACK)
-		<= dirty_thresh && dirty_exceeded)
+	if (nr_reclaimable + dl.nr_writeback
+		<= dl.thresh_dirty && dirty_exceeded)
 		dirty_exceeded = 0;
 
 	if (writeback_in_progress(bdi))
@@ -242,7 +280,7 @@ static void balance_dirty_pages(struct a
 	 * In normal mode, we start background writeout at the lower
 	 * background_thresh, to keep the amount of dirty memory low.
 	 */
 	if ((laptop_mode && pages_written) ||
-	    (!laptop_mode && (nr_reclaimable > background_thresh)))
+	    (!laptop_mode && (nr_reclaimable > dl.thresh_background)))
 		pdflush_operation(background_writeout, 0);
 }
 
@@ -300,21 +338,19 @@ EXPORT_SYMBOL(balance_dirty_pages_rateli
 
 void throttle_vm_writeout(void)
 {
-	long background_thresh;
-	long dirty_thresh;
+	struct dirty_limits dl;
 
 	for ( ; ; ) {
-		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+		get_dirty_limits(&dl, NULL);
 
 		/*
 		 * Boost the allowable dirty threshold a bit for page
 		 * allocators so they don't get DoS'ed by heavy writers
 		 */
-		dirty_thresh += dirty_thresh / 10;	/* wheeee... */
+		dl.thresh_dirty += dl.thresh_dirty / 10;	/* wheeee... */
 
-		if (global_page_state(NR_UNSTABLE_NFS) +
-			global_page_state(NR_WRITEBACK) <= dirty_thresh)
-				break;
+		if (dl.nr_unstable + dl.nr_writeback <= dl.thresh_dirty)
+			break;
 		congestion_wait(WRITE, HZ/10);
 	}
 }
@@ -337,12 +373,11 @@ static void background_writeout(unsigned
 	};
 
 	for ( ; ; ) {
-		long background_thresh;
-		long dirty_thresh;
+		struct dirty_limits dl;
 
-		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
-		if (global_page_state(NR_FILE_DIRTY) +
-			global_page_state(NR_UNSTABLE_NFS) < background_thresh
+		get_dirty_limits(&dl, NULL);
+		wbc.nodes = dl.nodes;
+		if (dl.nr_dirty + dl.nr_unstable < dl.thresh_background
 				&& min_pages <= 0)
 			break;
 		wbc.encountered_congestion = 0;
@@ -407,6 +442,7 @@ static void wb_kupdate(unsigned long arg
 		.nonblocking	= 1,
 		.for_kupdate	= 1,
 		.range_cyclic	= 1,
+		.nodes		= cpuset_current_mems_allowed,
 	};
 
 	sync_supers();
@@ -621,6 +657,9 @@ retry:
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 
+			if (NUMA_BUILD && !node_isset(page_to_nid(page),
+							wbc->nodes))
+				continue;
 			/*
 			 * At this point we hold neither mapping->tree_lock nor
 			 * lock on the page itself: the page may be truncated or
@@ -718,6 +757,7 @@ int write_one_page(struct page *page, in
 	struct writeback_control wbc = {
 		.sync_mode = WB_SYNC_ALL,
 		.nr_to_write = 1,
+		.nodes = NODE_MASK_ALL,
 	};
 
 	BUG_ON(!PageLocked(page));
Index: linux-2.6.19-rc6-mm2/include/linux/writeback.h
===================================================================
--- linux-2.6.19-rc6-mm2.orig/include/linux/writeback.h	2006-11-30 22:15:11.000000000 -0800
+++ linux-2.6.19-rc6-mm2/include/linux/writeback.h	2006-11-30 22:15:37.000000000 -0800
@@ -41,6 +41,7 @@ struct writeback_control {
 	enum writeback_sync_modes sync_mode;
 	unsigned long *older_than_this;	/* If !NULL, only write back inodes
 					   older than this */
+	nodemask_t nodes;		/* Restrict writeback to these nodes */
 	long nr_to_write;		/* Write this many pages, and decrement
 					   this for each page written */
 	long pages_skipped;		/* Pages which were not written */
Index: linux-2.6.19-rc6-mm2/fs/ext2/inode.c
===================================================================
--- linux-2.6.19-rc6-mm2.orig/fs/ext2/inode.c	2006-11-30 22:31:58.000000000 -0800
+++ linux-2.6.19-rc6-mm2/fs/ext2/inode.c	2006-11-30 22:32:10.000000000 -0800
@@ -1374,6 +1374,7 @@ int ext2_sync_inode(struct inode *inode)
 	struct writeback_control wbc = {
 		.sync_mode = WB_SYNC_ALL,
 		.nr_to_write = 0,	/* sys_fsync did this */
+		.nodes = NODE_MASK_ALL,
 	};
 	return sync_inode(inode, &wbc);
 }
Index: linux-2.6.19-rc6-mm2/fs/ext3/fsync.c
===================================================================
--- linux-2.6.19-rc6-mm2.orig/fs/ext3/fsync.c	2006-11-30 22:32:28.000000000 -0800
+++ linux-2.6.19-rc6-mm2/fs/ext3/fsync.c	2006-11-30 22:32:45.000000000 -0800
@@ -80,6 +80,7 @@ int ext3_sync_file(struct file * file, s
 		struct writeback_control wbc = {
 			.sync_mode = WB_SYNC_ALL,
 			.nr_to_write = 0, /* sys_fsync did this */
+			.nodes = NODE_MASK_ALL,
 		};
 		ret = sync_inode(inode, &wbc);
 	}
Index: linux-2.6.19-rc6-mm2/fs/ext4/fsync.c
===================================================================
--- linux-2.6.19-rc6-mm2.orig/fs/ext4/fsync.c	2006-11-30 22:32:57.000000000 -0800
+++ linux-2.6.19-rc6-mm2/fs/ext4/fsync.c	2006-11-30 22:33:08.000000000 -0800
@@ -80,6 +80,7 @@ int ext4_sync_file(struct file * file, s
 		struct writeback_control wbc = {
 			.sync_mode = WB_SYNC_ALL,
 			.nr_to_write = 0, /* sys_fsync did this */
+			.nodes = NODE_MASK_ALL,
 		};
 		ret = sync_inode(inode, &wbc);
 	}
Index: linux-2.6.19-rc6-mm2/fs/fat/inode.c
===================================================================
--- linux-2.6.19-rc6-mm2.orig/fs/fat/inode.c	2006-11-30 22:30:25.000000000 -0800
+++ linux-2.6.19-rc6-mm2/fs/fat/inode.c	2006-11-30 22:30:39.000000000 -0800
@@ -1444,6 +1444,7 @@ static int writeback_inode(struct inode
 	struct writeback_control wbc = {
 	       .sync_mode = WB_SYNC_NONE,
 	       .nr_to_write = 0,
+	       .nodes = NODE_MASK_ALL,
 	};
 	/* if we used WB_SYNC_ALL, sync_inode waits for the io for the
 	* inode to finish. So WB_SYNC_NONE is sent down to sync_inode
Index: linux-2.6.19-rc6-mm2/fs/fs-writeback.c
===================================================================
--- linux-2.6.19-rc6-mm2.orig/fs/fs-writeback.c	2006-11-30 22:26:20.000000000 -0800
+++ linux-2.6.19-rc6-mm2/fs/fs-writeback.c	2006-11-30 22:27:06.000000000 -0800
@@ -470,6 +470,7 @@ void sync_inodes_sb(struct super_block *
 		.sync_mode	= wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
 		.range_start	= 0,
 		.range_end	= LLONG_MAX,
+		.nodes		= NODE_MASK_ALL,
 	};
 	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
 	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
@@ -568,6 +569,7 @@ int write_inode_now(struct inode *inode,
 		.sync_mode = WB_SYNC_ALL,
 		.range_start = 0,
 		.range_end = LLONG_MAX,
+		.nodes = NODE_MASK_ALL,
 	};
 
 	if (!mapping_cap_writeback_dirty(inode->i_mapping))
Index: linux-2.6.19-rc6-mm2/fs/udf/inode.c
===================================================================
--- linux-2.6.19-rc6-mm2.orig/fs/udf/inode.c	2006-11-30 22:30:58.000000000 -0800
+++ linux-2.6.19-rc6-mm2/fs/udf/inode.c	2006-11-30 22:31:10.000000000 -0800
@@ -148,6 +148,7 @@ void udf_expand_file_adinicb(struct inod
 	struct writeback_control udf_wbc = {
 		.sync_mode = WB_SYNC_NONE,
 		.nr_to_write = 1,
+		.nodes = NODE_MASK_ALL,
 	};
 
 	/* from now on we have normal address_space methods */
Index: linux-2.6.19-rc6-mm2/fs/xfs/linux-2.6/xfs_aops.c
===================================================================
--- linux-2.6.19-rc6-mm2.orig/fs/xfs/linux-2.6/xfs_aops.c	2006-11-30 22:31:31.000000000 -0800
+++ linux-2.6.19-rc6-mm2/fs/xfs/linux-2.6/xfs_aops.c	2006-11-30 22:31:47.000000000 -0800
@@ -1195,6 +1195,7 @@ xfs_vm_releasepage(
 		struct writeback_control wbc = {
 			.sync_mode = WB_SYNC_ALL,
 			.nr_to_write = 1,
+			.nodes = NODE_MASK_ALL,
 		};
 
 		xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, 0);
Index: linux-2.6.19-rc6-mm2/mm/filemap.c
===================================================================
--- linux-2.6.19-rc6-mm2.orig/mm/filemap.c	2006-11-30 22:25:33.000000000 -0800
+++ linux-2.6.19-rc6-mm2/mm/filemap.c	2006-11-30 22:25:59.000000000 -0800
@@ -193,6 +193,7 @@ int __filemap_fdatawrite_range(struct ad
 		.nr_to_write = mapping->nrpages * 2,
 		.range_start = start,
 		.range_end = end,
+		.nodes = NODE_MASK_ALL,
 	};
 
 	if (!mapping_cap_writeback_dirty(mapping))
Index: linux-2.6.19-rc6-mm2/mm/swapfile.c
===================================================================
--- linux-2.6.19-rc6-mm2.orig/mm/swapfile.c	2006-11-30 22:23:32.000000000 -0800
+++ linux-2.6.19-rc6-mm2/mm/swapfile.c	2006-11-30 22:23:53.000000000 -0800
@@ -863,6 +863,7 @@ static int try_to_unuse(unsigned int typ
 		if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
 			struct writeback_control wbc = {
 				.sync_mode = WB_SYNC_NONE,
+				.nodes = NODE_MASK_ALL,
 			};
 
 			swap_writepage(page, &wbc);
Index: linux-2.6.19-rc6-mm2/mm/vmscan.c
===================================================================
--- linux-2.6.19-rc6-mm2.orig/mm/vmscan.c	2006-11-30 22:24:04.000000000 -0800
+++ linux-2.6.19-rc6-mm2/mm/vmscan.c	2006-11-30 22:24:50.000000000 -0800
@@ -358,6 +358,7 @@ static pageout_t pageout(struct page *pa
 			.range_end = LLONG_MAX,
 			.nonblocking = 1,
 			.for_reclaim = 1,
+			.nodes = NODE_MASK_ALL,
 		};
 
 		SetPageReclaim(page);
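
The three lines added in the page scanning loop of mm/page-writeback.c above
(the "@@ -621,6 +657,9 @@" hunk) are easier to read with some surrounding
context.  The sketch below is illustrative only and not part of the patch;
the function name is invented, and the locking, tagging and error handling
of the real writeback path are heavily simplified:

/*
 * Illustrative sketch: a stripped-down writeback scan of one mapping,
 * showing where the wbc->nodes filter applies.
 */
static void example_write_mapping(struct address_space *mapping,
				  struct writeback_control *wbc)
{
	struct pagevec pvec;
	pgoff_t index = 0;
	unsigned i, nr;

	pagevec_init(&pvec, 0);
	while ((nr = pagevec_lookup_tag(&pvec, mapping, &index,
					PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) {
		for (i = 0; i < nr; i++) {
			struct page *page = pvec.pages[i];

			/* Leave pages outside the allowed nodes alone. */
			if (NUMA_BUILD &&
			    !node_isset(page_to_nid(page), wbc->nodes))
				continue;

			lock_page(page);
			if (clear_page_dirty_for_io(page))
				mapping->a_ops->writepage(page, wbc);
			else
				unlock_page(page);
		}
		pagevec_release(&pvec);
	}
}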