Add a dirty map to the inode. In a NUMA system it is helpful to know where the dirty pages of a mapping are located. That way we will be able to implement writeout for applications that are constrained to a portion of the memory of the system, as required by cpusets. Two functions are introduced to manage the dirty node map: cpuset_clear_dirty_nodes() and cpuset_update_dirty_nodes(). Both are defined using macros since the definition of struct inode may not be available in cpuset.h. The dirty map is cleared when the inode is initialized and whenever writeback leaves the inode clean. There is no synchronization (except for the atomic nature of node_set()) for dirty_nodes. The only problem that could arise is that we do not write out an inode because a node bit is not set. That is rare and becomes vanishingly rare if multiple pages are involved. There is therefore a slight chance that we miss a dirty node if it contains just a single page, which is likely tolerable. This patch increases the size of struct inode for the NUMA case (when CONFIG_CPUSETS is enabled). For most arches that only support up to 64 nodes this is simply adding one unsigned long. However, the default Itanium configuration allows for up to 1024 nodes. On Itanium we add 128 bytes per inode. A later patch will make the size of the per-node bit array dynamic so that the inode slab caches are properly sized. 
Signed-off-by: Christoph Lameter Index: linux-2.6.20-rc4-mm1/fs/fs-writeback.c =================================================================== --- linux-2.6.20-rc4-mm1.orig/fs/fs-writeback.c 2007-01-12 12:58:25.000000000 -0800 +++ linux-2.6.20-rc4-mm1/fs/fs-writeback.c 2007-01-12 12:59:55.000000000 -0800 @@ -22,6 +22,7 @@ #include #include #include +#include #include "internal.h" /** @@ -223,11 +224,13 @@ /* * The inode is clean, inuse */ + cpuset_clear_dirty_nodes(inode); list_move(&inode->i_list, &inode_in_use); } else { /* * The inode is clean, unused */ + cpuset_clear_dirty_nodes(inode); list_move(&inode->i_list, &inode_unused); } } Index: linux-2.6.20-rc4-mm1/fs/inode.c =================================================================== --- linux-2.6.20-rc4-mm1.orig/fs/inode.c 2007-01-12 12:58:25.000000000 -0800 +++ linux-2.6.20-rc4-mm1/fs/inode.c 2007-01-12 12:59:55.000000000 -0800 @@ -22,6 +22,7 @@ #include #include #include +#include /* * This is needed for the following functions: @@ -134,6 +135,7 @@ inode->i_cdev = NULL; inode->i_rdev = 0; inode->dirtied_when = 0; + cpuset_clear_dirty_nodes(inode); if (security_inode_alloc(inode)) { if (inode->i_sb->s_op->destroy_inode) inode->i_sb->s_op->destroy_inode(inode); Index: linux-2.6.20-rc4-mm1/include/linux/fs.h =================================================================== --- linux-2.6.20-rc4-mm1.orig/include/linux/fs.h 2007-01-12 12:58:26.000000000 -0800 +++ linux-2.6.20-rc4-mm1/include/linux/fs.h 2007-01-12 12:59:55.000000000 -0800 @@ -591,6 +591,9 @@ void *i_security; #endif void *i_private; /* fs or device private pointer */ +#ifdef CONFIG_CPUSETS + nodemask_t dirty_nodes; /* Map of nodes with dirty pages */ +#endif }; /* Index: linux-2.6.20-rc4-mm1/mm/page-writeback.c =================================================================== --- linux-2.6.20-rc4-mm1.orig/mm/page-writeback.c 2007-01-12 12:58:26.000000000 -0800 +++ linux-2.6.20-rc4-mm1/mm/page-writeback.c 2007-01-12 12:59:55.000000000 
-0800 @@ -33,6 +33,7 @@ #include #include #include +#include /* * The maximum number of pages to writeout in a single bdflush/kupdate @@ -780,6 +781,7 @@ if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + cpuset_update_dirty_nodes(mapping->host, page); } return 1; } Index: linux-2.6.20-rc4-mm1/fs/buffer.c =================================================================== --- linux-2.6.20-rc4-mm1.orig/fs/buffer.c 2007-01-12 12:58:25.000000000 -0800 +++ linux-2.6.20-rc4-mm1/fs/buffer.c 2007-01-12 12:59:55.000000000 -0800 @@ -42,6 +42,7 @@ #include #include #include +#include static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static void invalidate_bh_lrus(void); @@ -740,6 +741,7 @@ } write_unlock_irq(&mapping->tree_lock); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + cpuset_update_dirty_nodes(mapping->host, page); return 1; } EXPORT_SYMBOL(__set_page_dirty_buffers); Index: linux-2.6.20-rc4-mm1/include/linux/cpuset.h =================================================================== --- linux-2.6.20-rc4-mm1.orig/include/linux/cpuset.h 2007-01-06 21:45:51.000000000 -0800 +++ linux-2.6.20-rc4-mm1/include/linux/cpuset.h 2007-01-12 12:59:55.000000000 -0800 @@ -75,6 +75,15 @@ extern void cpuset_track_online_nodes(void); +/* + * We need macros since struct inode is not defined yet + */ +#define cpuset_update_dirty_nodes(__inode, __page) \ + node_set(page_to_nid(__page), (__inode)->dirty_nodes) + +#define cpuset_clear_dirty_nodes(__inode) \ + (__inode)->dirty_nodes = NODE_MASK_NONE + #else /* !CONFIG_CPUSETS */ static inline int cpuset_init_early(void) { return 0; } @@ -146,6 +155,11 @@ static inline void cpuset_track_online_nodes(void) {} +static inline void cpuset_update_dirty_nodes(struct address_space *mapping, + struct page *page) {} + +static inline void cpuset_clear_dirty_nodes(struct address_space *mapping) {} + #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */