From clameter@sgi.com Mon Jun 18 11:53:38 2007
Message-Id: <20070618185326.593525493@sgi.com>
User-Agent: quilt/0.46-1
Date: Mon, 18 Jun 2007 11:53:26 -0700
From: clameter@sgi.com
Subject: [patch 00/10] Memoryless Node support

--

From clameter@sgi.com Mon Jun 18 11:53:40 2007
Message-Id: <20070618185339.844731179@sgi.com>
References: <20070618185326.593525493@sgi.com>
User-Agent: quilt/0.46-1
Date: Mon, 18 Jun 2007 11:53:27 -0700
From: clameter@sgi.com
Cc: Nishanth Aravamudan
Subject: [patch 01/10] Memoryless nodes: Fix GFP_THISNODE behavior
Content-Disposition: inline; filename=memless_thisnode_fix

GFP_THISNODE checks that the zone selected is within the pgdat (node) of
the first zone of a nodelist. That only works if the node has memory. A
memoryless node will have its first zone on another pgdat (node).

GFP_THISNODE currently simply returns memory from that first pgdat. Thus
it returns memory from other nodes. GFP_THISNODE should fail if there is
no local memory on the node.

Add a new set of zonelists for each node that contain only the zones
belonging to the node itself, so that no fallback is possible. Then
modify gfp_zone() to pick up the right zone based on the presence of
__GFP_THISNODE. Drop the existing GFP_THISNODE checks from the page
allocator's hot path.

Signed-off-by: Christoph Lameter
Acked-by: Nishanth Aravamudan

Index: linux-2.6.22-rc4-mm2/include/linux/gfp.h
===================================================================
--- linux-2.6.22-rc4-mm2.orig/include/linux/gfp.h	2007-06-18 11:46:24.000000000 -0700
+++ linux-2.6.22-rc4-mm2/include/linux/gfp.h	2007-06-18 11:47:11.000000000 -0700
@@ -116,22 +116,28 @@ static inline int allocflags_to_migratet
 
 static inline enum zone_type gfp_zone(gfp_t flags)
 {
+	int base = 0;
+
+#ifdef CONFIG_NUMA
+	if (flags & __GFP_THISNODE)
+		base = MAX_NR_ZONES;
+#endif
 #ifdef CONFIG_ZONE_DMA
 	if (flags & __GFP_DMA)
-		return ZONE_DMA;
+		return base + ZONE_DMA;
 #endif
 #ifdef CONFIG_ZONE_DMA32
 	if (flags & __GFP_DMA32)
-		return ZONE_DMA32;
+		return base + ZONE_DMA32;
 #endif
 	if ((flags & (__GFP_HIGHMEM | __GFP_MOVABLE)) ==
 			(__GFP_HIGHMEM | __GFP_MOVABLE))
-		return ZONE_MOVABLE;
+		return base + ZONE_MOVABLE;
 #ifdef CONFIG_HIGHMEM
 	if (flags & __GFP_HIGHMEM)
-		return ZONE_HIGHMEM;
+		return base + ZONE_HIGHMEM;
 #endif
-	return ZONE_NORMAL;
+	return base + ZONE_NORMAL;
 }
 
 static inline gfp_t set_migrateflags(gfp_t gfp, gfp_t migrate_flags)

Index: linux-2.6.22-rc4-mm2/mm/page_alloc.c
===================================================================
--- linux-2.6.22-rc4-mm2.orig/mm/page_alloc.c	2007-06-18 11:46:26.000000000 -0700
+++ linux-2.6.22-rc4-mm2/mm/page_alloc.c	2007-06-18 11:48:32.000000000 -0700
@@ -1430,9 +1430,6 @@ zonelist_scan:
 			!zlc_zone_worth_trying(zonelist, z, allowednodes))
 				continue;
 		zone = *z;
-		if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
-			zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
-				break;
 		if ((alloc_flags & ALLOC_CPUSET) &&
 			!cpuset_zone_allowed_softwall(zone, gfp_mask))
 				goto try_next_zone;
@@ -1553,7 +1550,10 @@ restart:
 	z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
 
 	if (unlikely(*z == NULL)) {
-		/* Should this ever happen?? */
+		/*
+		 * Happens if we have an empty zonelist as a result of
+		 * GFP_THISNODE being used on a memoryless node
+		 */
 		return NULL;
 	}
 
@@ -2151,6 +2151,22 @@ static void build_zonelists_in_node_orde
 }
 
 /*
+ * Build gfp_thisnode zonelists
+ */
+static void build_thisnode_zonelists(pg_data_t *pgdat)
+{
+	enum zone_type i;
+	int j;
+	struct zonelist *zonelist;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i;
+		j = build_zonelists_node(pgdat, zonelist, 0, i);
+		zonelist->zones[j] = NULL;
+	}
+}
+
+/*
  * Build zonelists ordered by zone and nodes within zones.
  * This results in conserving DMA zone[s] until all Normal memory is
  * exhausted, but results in overflowing to remote node while memory
@@ -2254,7 +2270,7 @@ static void build_zonelists(pg_data_t *p
 	int order = current_zonelist_order;
 
 	/* initialize zonelists */
-	for (i = 0; i < MAX_NR_ZONES; i++) {
+	for (i = 0; i < 2 * MAX_NR_ZONES; i++) {
 		zonelist = pgdat->node_zonelists + i;
 		zonelist->zones[0] = NULL;
 	}
@@ -2299,6 +2315,8 @@ static void build_zonelists(pg_data_t *p
 		/* calculate node order -- i.e., DMA last! */
 		build_zonelists_in_zone_order(pgdat, j);
 	}
+
+	build_thisnode_zonelists(pgdat);
 }
 
 /* Construct the zonelist performance cache - see further mmzone.h */

Index: linux-2.6.22-rc4-mm2/include/linux/mmzone.h
===================================================================
--- linux-2.6.22-rc4-mm2.orig/include/linux/mmzone.h	2007-06-18 11:46:24.000000000 -0700
+++ linux-2.6.22-rc4-mm2/include/linux/mmzone.h	2007-06-18 11:47:11.000000000 -0700
@@ -356,6 +356,7 @@ struct zone {
 #define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)
 
 #ifdef CONFIG_NUMA
+#define MAX_ZONELISTS (2 * MAX_NR_ZONES)
 /*
  * We cache key information from each zonelist for smaller cache
  * footprint when scanning for free pages in get_page_from_freelist().
@@ -421,6 +422,7 @@ struct zonelist_cache {
 	unsigned long last_full_zap;	/* when last zap'd (jiffies) */
 };
 #else
+#define MAX_ZONELISTS MAX_NR_ZONES
 struct zonelist_cache;
 #endif
 
@@ -469,7 +471,7 @@ extern struct page *mem_map;
 struct bootmem_data;
 typedef struct pglist_data {
 	struct zone node_zones[MAX_NR_ZONES];
-	struct zonelist node_zonelists[MAX_NR_ZONES];
+	struct zonelist node_zonelists[MAX_ZONELISTS];
 	int nr_zones;
 #ifdef CONFIG_FLAT_NODE_MEM_MAP
 	struct page	*node_mem_map;

--

From clameter@sgi.com Mon Jun 18 11:53:41 2007
Message-Id: <20070618185340.882920672@sgi.com>
References: <20070618185326.593525493@sgi.com>
User-Agent: quilt/0.46-1
Date: Mon, 18 Jun 2007 11:53:28 -0700
From: clameter@sgi.com
Cc: Lee Schermerhorn , Nishanth Aravamudan
Subject: [patch 02/10] NUMA: Introduce node_memory_map
Content-Disposition: inline; filename=memless_memory_map

It is necessary to know if nodes have memory since we have recently
begun to add support for memoryless nodes. For that purpose we
introduce a new bitmap called node_memory_map. A node has its bit in
node_memory_map set if it has memory. If a node has memory then it has
at least one zone defined in its pgdat structure that is located in
the pgdat itself.

The node_memory_map can then be used in various places to ensure that
we do the right thing when we encounter a memoryless node.
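For illustration, a minimal sketch of how the new map is meant to be
consumed (not part of this patch; node_table and node_table_init are
made-up names for the example):

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/nodemask.h>
#include <linux/slab.h>

/*
 * Hypothetical example: allocate a per-node table only for nodes that
 * actually have local memory.  Memoryless nodes are skipped entirely,
 * so we never rely on allocation fallback for control structures.
 */
static void *node_table[MAX_NUMNODES];

static int __init node_table_init(void)
{
	int nid;

	for_each_memory_node(nid) {
		node_table[nid] = kmalloc_node(256, GFP_KERNEL, nid);
		if (!node_table[nid])
			return -ENOMEM;
	}
	return 0;
}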
Signed-off-by: Lee Schermerhorn Signed-off-by: Nishanth Aravamudan Signed-off-by: Christoph Lameter Index: linux-2.6.22-rc4-mm2/include/linux/nodemask.h =================================================================== --- linux-2.6.22-rc4-mm2.orig/include/linux/nodemask.h 2007-06-18 11:46:26.000000000 -0700 +++ linux-2.6.22-rc4-mm2/include/linux/nodemask.h 2007-06-18 11:48:42.000000000 -0700 @@ -64,12 +64,16 @@ * * int node_online(node) Is some node online? * int node_possible(node) Is some node possible? + * int node_memory(node) Does a node have memory? * * int any_online_node(mask) First online node in mask * * node_set_online(node) set bit 'node' in node_online_map * node_set_offline(node) clear bit 'node' in node_online_map * + * node_set_has_memory(node) set bit 'node' in node_memory_map + * node_set_no_memory(node) clear bit 'node' in node_memory_map + * * for_each_node(node) for-loop node over node_possible_map * for_each_online_node(node) for-loop node over node_online_map * @@ -344,12 +348,14 @@ static inline void __nodes_remap(nodemas extern nodemask_t node_online_map; extern nodemask_t node_possible_map; +extern nodemask_t node_memory_map; #if MAX_NUMNODES > 1 #define num_online_nodes() nodes_weight(node_online_map) #define num_possible_nodes() nodes_weight(node_possible_map) #define node_online(node) node_isset((node), node_online_map) #define node_possible(node) node_isset((node), node_possible_map) +#define node_memory(node) node_isset((node), node_memory_map) #define first_online_node first_node(node_online_map) #define next_online_node(nid) next_node((nid), node_online_map) extern int nr_node_ids; @@ -358,6 +364,8 @@ extern int nr_node_ids; #define num_possible_nodes() 1 #define node_online(node) ((node) == 0) #define node_possible(node) ((node) == 0) +#define node_memory(node) ((node) == 0) +#define node_populated(node) ((node) == 0) #define first_online_node 0 #define next_online_node(nid) (MAX_NUMNODES) #define nr_node_ids 1 @@ -375,7 +383,11 @@ extern int nr_node_ids; #define node_set_online(node) set_bit((node), node_online_map.bits) #define node_set_offline(node) clear_bit((node), node_online_map.bits) +#define node_set_has_memory(node) set_bit((node), node_memory_map.bits) +#define node_set_no_memory(node) clear_bit((node), node_memory_map.bits) + #define for_each_node(node) for_each_node_mask((node), node_possible_map) #define for_each_online_node(node) for_each_node_mask((node), node_online_map) +#define for_each_memory_node(node) for_each_node_mask((node), node_memory_map) #endif /* __LINUX_NODEMASK_H */ Index: linux-2.6.22-rc4-mm2/mm/page_alloc.c =================================================================== --- linux-2.6.22-rc4-mm2.orig/mm/page_alloc.c 2007-06-18 11:48:32.000000000 -0700 +++ linux-2.6.22-rc4-mm2/mm/page_alloc.c 2007-06-18 11:49:34.000000000 -0700 @@ -54,6 +54,9 @@ nodemask_t node_online_map __read_mostly EXPORT_SYMBOL(node_online_map); nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; EXPORT_SYMBOL(node_possible_map); +nodemask_t node_memory_map __read_mostly = NODE_MASK_NONE; +EXPORT_SYMBOL(node_memory_map); + unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; long nr_swap_pages; @@ -2317,6 +2320,9 @@ static void build_zonelists(pg_data_t *p } build_thisnode_zonelists(pgdat); + + if (pgdat->node_present_pages) + node_set_has_memory(local_node); } /* Construct the zonelist performance cache - see further mmzone.h */ -- From clameter@sgi.com Mon Jun 18 11:53:42 2007 Message-Id: 
<20070618185341.507010532@sgi.com>
References: <20070618185326.593525493@sgi.com>
User-Agent: quilt/0.46-1
Date: Mon, 18 Jun 2007 11:53:29 -0700
From: clameter@sgi.com
Cc: Nishanth Aravamudan
Subject: [patch 03/10] Fix MPOL_INTERLEAVE behavior for memoryless nodes
Content-Disposition: inline; filename=memless_fix_interleave

MPOL_INTERLEAVE currently simply loops over all nodes. Allocations on
memoryless nodes will be redirected to nodes with memory. This results
in an imbalance because the neighboring nodes of memoryless nodes will
get significantly more interleave hits than the rest of the nodes on
the system.

We can avoid this imbalance by clearing the nodes in the interleave
node set that have no memory.

Signed-off-by: Christoph Lameter
Signed-off-by: Nishanth Aravamudan

Index: linux-2.6.22-rc4-mm2/mm/mempolicy.c
===================================================================
--- linux-2.6.22-rc4-mm2.orig/mm/mempolicy.c	2007-06-13 23:06:14.000000000 -0700
+++ linux-2.6.22-rc4-mm2/mm/mempolicy.c	2007-06-14 00:49:43.000000000 -0700
@@ -185,7 +185,8 @@ static struct mempolicy *mpol_new(int mo
 	switch (mode) {
 	case MPOL_INTERLEAVE:
 		policy->v.nodes = *nodes;
-		if (nodes_weight(*nodes) == 0) {
+		nodes_and(policy->v.nodes, policy->v.nodes, node_memory_map);
+		if (nodes_weight(policy->v.nodes) == 0) {
 			kmem_cache_free(policy_cache, policy);
 			return ERR_PTR(-EINVAL);
 		}

--

From clameter@sgi.com Mon Jun 18 11:53:42 2007
Message-Id: <20070618185342.355249020@sgi.com>
References: <20070618185326.593525493@sgi.com>
User-Agent: quilt/0.46-1
Date: Mon, 18 Jun 2007 11:53:30 -0700
From: clameter@sgi.com
Cc: Nishanth Aravamudan
Subject: [patch 04/10] OOM: use the node_memory_map instead of constructing one on the fly
Content-Disposition: inline; filename=memless_oom_kill

constrained_alloc() builds its own memory map for nodes with memory. We
have that available in node_memory_map now. So simplify the code.

Signed-off-by: Christoph Lameter
Acked-by: Nishanth Aravamudan

Index: linux-2.6.22-rc4-mm2/mm/oom_kill.c
===================================================================
--- linux-2.6.22-rc4-mm2.orig/mm/oom_kill.c	2007-06-13 23:11:32.000000000 -0700
+++ linux-2.6.22-rc4-mm2/mm/oom_kill.c	2007-06-13 23:12:39.000000000 -0700
@@ -176,14 +176,7 @@ static inline int constrained_alloc(stru
 {
 #ifdef CONFIG_NUMA
 	struct zone **z;
-	nodemask_t nodes;
-	int node;
-
-	nodes_clear(nodes);
-	/* node has memory ? */
-	for_each_online_node(node)
-		if (NODE_DATA(node)->node_present_pages)
-			node_set(node, nodes);
+	nodemask_t nodes = node_memory_map;
 
 	for (z = zonelist->zones; *z; z++)
 		if (cpuset_zone_allowed_softwall(*z, gfp_mask))

--

From clameter@sgi.com Mon Jun 18 11:53:43 2007
Message-Id: <20070618185343.030964065@sgi.com>
References: <20070618185326.593525493@sgi.com>
User-Agent: quilt/0.46-1
Date: Mon, 18 Jun 2007 11:53:31 -0700
From: clameter@sgi.com
Cc: Nishanth Aravamudan
Subject: [patch 05/10] Memoryless Nodes: No need for kswapd
Content-Disposition: inline; filename=memless_no_kswapd

A node without memory does not need a kswapd. So use the memory map
instead of the online map when starting kswapd.

Signed-off-by: Christoph Lameter
Acked-by: Nishanth Aravamudan

Index: linux-2.6.22-rc4-mm2/mm/vmscan.c
===================================================================
--- linux-2.6.22-rc4-mm2.orig/mm/vmscan.c	2007-06-18 11:46:25.000000000 -0700
+++ linux-2.6.22-rc4-mm2/mm/vmscan.c	2007-06-18 11:49:47.000000000 -0700
@@ -1735,7 +1735,7 @@ static int __init kswapd_init(void)
 	int nid;
 
 	swap_setup();
-	for_each_online_node(nid)
+	for_each_memory_node(nid)
 		kswapd_run(nid);
 	hotcpu_notifier(cpu_callback, 0);
 	return 0;

--

From clameter@sgi.com Mon Jun 18 11:53:44 2007
Message-Id: <20070618185343.559364733@sgi.com>
References: <20070618185326.593525493@sgi.com>
User-Agent: quilt/0.46-1
Date: Mon, 18 Jun 2007 11:53:32 -0700
From: clameter@sgi.com
Cc: Nishanth Aravamudan
Subject: [patch 06/10] Memoryless Node: Slab support
Content-Disposition: inline; filename=memless_slab

Slab should not allocate control structures for nodes without memory.
This may seem to work right now but it is unreliable since not all
allocations can fall back due to the use of GFP_THISNODE.

Switching a few for_each_online_node loops to for_each_memory_node
will allow us to only allocate for nodes that actually have memory.

Signed-off-by: Christoph Lameter
Acked-by: Nishanth Aravamudan

Index: linux-2.6.22-rc4-mm2/mm/slab.c
===================================================================
--- linux-2.6.22-rc4-mm2.orig/mm/slab.c	2007-06-18 11:46:25.000000000 -0700
+++ linux-2.6.22-rc4-mm2/mm/slab.c	2007-06-18 11:49:53.000000000 -0700
@@ -1564,7 +1564,7 @@ void __init kmem_cache_init(void)
 		/* Replace the static kmem_list3 structures for the boot cpu */
 		init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node);
 
-		for_each_online_node(nid) {
+		for_each_memory_node(nid) {
 			init_list(malloc_sizes[INDEX_AC].cs_cachep,
 				  &initkmem_list3[SIZE_AC + nid], nid);
 
@@ -1942,7 +1942,7 @@ static void __init set_up_list3s(struct
 {
 	int node;
 
-	for_each_online_node(node) {
+	for_each_memory_node(node) {
 		cachep->nodelists[node] = &initkmem_list3[index + node];
 		cachep->nodelists[node]->next_reap = jiffies +
 		    REAPTIMEOUT_LIST3 +
@@ -2073,7 +2073,7 @@ static int __init_refok setup_cpu_cache(
 			g_cpucache_up = PARTIAL_L3;
 		} else {
 			int node;
-			for_each_online_node(node) {
+			for_each_memory_node(node) {
 				cachep->nodelists[node] =
 				    kmalloc_node(sizeof(struct kmem_list3),
 						GFP_KERNEL, node);
@@ -3787,7 +3787,7 @@ static int alloc_kmemlist(struct kmem_ca
 	struct array_cache *new_shared;
 	struct array_cache **new_alien = NULL;
 
-	for_each_online_node(node) {
+	for_each_memory_node(node) {
 
 		if (use_alien_caches) {
 			new_alien = alloc_alien_cache(node, cachep->limit);

--

From clameter@sgi.com Mon Jun 18 11:53:45 2007
Message-Id: <20070618185344.626942398@sgi.com>
References: <20070618185326.593525493@sgi.com>
User-Agent: quilt/0.46-1
Date: Mon, 18 Jun 2007 11:53:33 -0700
From: clameter@sgi.com
Subject: [patch 07/10] Memoryless nodes: SLUB support
Content-Disposition: inline; filename=memless_slub

Simply switch all for_each_online_node to for_each_memory_node. That
way SLUB only operates on nodes with memory. Any allocation attempt on
a memoryless node will fall back, whereupon SLUB will fetch memory from
a nearby node (depending on how memory policies and cpusets describe
the fallback).
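For illustration, a caller-side sketch of the fallback just described
(alloc_for_node is a hypothetical helper, not part of this patch):

#include <linux/slab.h>

/*
 * Hypothetical example: a request "for" a memoryless node is simply
 * satisfied from whatever node the zonelists fall back to; SLUB keeps
 * no kmem_cache_node for the empty node, and the caller never notices.
 */
static void *alloc_for_node(int nid)
{
	return kmalloc_node(128, GFP_KERNEL, nid);
}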
Signed-off-by: Christoph Lameter Index: linux-2.6.22-rc4-mm2/mm/slub.c =================================================================== --- linux-2.6.22-rc4-mm2.orig/mm/slub.c 2007-06-18 11:16:15.000000000 -0700 +++ linux-2.6.22-rc4-mm2/mm/slub.c 2007-06-18 11:28:50.000000000 -0700 @@ -2086,7 +2086,7 @@ static void free_kmem_cache_nodes(struct { int node; - for_each_online_node(node) { + for_each_memory_node(node) { struct kmem_cache_node *n = s->node[node]; if (n && n != &s->local_node) kmem_cache_free(kmalloc_caches, n); @@ -2104,7 +2104,7 @@ static int init_kmem_cache_nodes(struct else local_node = 0; - for_each_online_node(node) { + for_each_memory_node(node) { struct kmem_cache_node *n; if (local_node == node) @@ -2366,7 +2366,7 @@ static inline int kmem_cache_close(struc /* Attempt to free all objects */ free_kmem_cache_cpus(s); - for_each_online_node(node) { + for_each_memory_node(node) { struct kmem_cache_node *n = get_node(s, node); n->nr_partial -= free_list(s, n, &n->partial); @@ -2937,7 +2937,7 @@ int kmem_cache_shrink(struct kmem_cache if (!scratch) return -ENOMEM; - for_each_online_node(node) + for_each_memory_node(node) __kmem_cache_shrink(s, get_node(s, node), scratch); kfree(scratch); @@ -3008,7 +3008,7 @@ int kmem_cache_defrag(int percent, int n scratch = kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL); if (node == -1) { - for_each_online_node(node) + for_each_memory_node(node) pages += __kmem_cache_defrag(s, percent, node, scratch); } else @@ -3392,7 +3392,7 @@ static unsigned long validate_slab_cache unsigned long count = 0; flush_all(s); - for_each_online_node(node) { + for_each_memory_node(node) { struct kmem_cache_node *n = get_node(s, node); count += validate_slab_node(s, n); @@ -3611,7 +3611,7 @@ static int list_locations(struct kmem_ca /* Push back cpu slabs */ flush_all(s); - for_each_online_node(node) { + for_each_memory_node(node) { struct kmem_cache_node *n = get_node(s, node); unsigned long flags; struct page *page; @@ -3723,7 +3723,7 @@ static unsigned long slab_objects(struct } } - for_each_online_node(node) { + for_each_memory_node(node) { struct kmem_cache_node *n = get_node(s, node); if (flags & SO_PARTIAL) { @@ -3751,7 +3751,7 @@ static unsigned long slab_objects(struct x = sprintf(buf, "%lu", total); #ifdef CONFIG_NUMA - for_each_online_node(node) + for_each_memory_node(node) if (nodes[node]) x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); @@ -3772,7 +3772,7 @@ static int any_slab_objects(struct kmem_ return 1; } - for_each_online_node(node) { + for_each_memory_node(node) { struct kmem_cache_node *n = get_node(s, node); if (n && (n->nr_partial || atomic_read(&n->nr_slabs))) -- From clameter@sgi.com Mon Jun 18 11:53:46 2007 Message-Id: <20070618185345.388646330@sgi.com> References: <20070618185326.593525493@sgi.com> User-Agent: quilt/0.46-1 Date: Mon, 18 Jun 2007 11:53:34 -0700 From: clameter@sgi.com Cc: jes@sgi.com Subject: [patch 08/10] Uncached allocator: Handle memoryless nodes Content-Disposition: inline; filename=memless_mspec The checks for node_online in the uncached allocator are made to make sure that memory is available on these nodes. Thus switch all the checks to use the node_memory and for_each_memory_node functions. 
Cc: jes@sgi.com Signed-off-by: Christoph Lameter Index: linux-2.6.22-rc4-mm2/arch/ia64/kernel/uncached.c =================================================================== --- linux-2.6.22-rc4-mm2.orig/arch/ia64/kernel/uncached.c 2007-06-13 23:29:58.000000000 -0700 +++ linux-2.6.22-rc4-mm2/arch/ia64/kernel/uncached.c 2007-06-13 23:32:35.000000000 -0700 @@ -196,7 +196,7 @@ unsigned long uncached_alloc_page(int st nid = starting_nid; do { - if (!node_online(nid)) + if (!node_memory(nid)) continue; uc_pool = &uncached_pools[nid]; if (uc_pool->pool == NULL) @@ -268,7 +268,7 @@ static int __init uncached_init(void) { int nid; - for_each_online_node(nid) { + for_each_memory_node(nid) { uncached_pools[nid].pool = gen_pool_create(PAGE_SHIFT, nid); mutex_init(&uncached_pools[nid].add_chunk_mutex); } Index: linux-2.6.22-rc4-mm2/drivers/char/mspec.c =================================================================== --- linux-2.6.22-rc4-mm2.orig/drivers/char/mspec.c 2007-06-13 23:28:15.000000000 -0700 +++ linux-2.6.22-rc4-mm2/drivers/char/mspec.c 2007-06-13 23:29:35.000000000 -0700 @@ -353,7 +353,7 @@ mspec_init(void) is_sn2 = 1; if (is_shub2()) { ret = -ENOMEM; - for_each_online_node(nid) { + for_each_memory_node(nid) { int actual_nid; int nasid; unsigned long phys; -- From clameter@sgi.com Mon Jun 18 11:53:47 2007 Message-Id: <20070618185346.513959832@sgi.com> References: <20070618185326.593525493@sgi.com> User-Agent: quilt/0.46-1 Date: Mon, 18 Jun 2007 11:53:35 -0700 From: clameter@sgi.com Cc: Nishanth Aravamudan Subject: [patch 09/10] Memoryless node: Allow profiling data to fall back to other nodes Content-Disposition: inline; filename=memless_profile Processors on memoryless nodes must be able to fall back to remote nodes in order to get a profiling buffer. This may lead to excessive NUMA traffic but I think we should allow this rather than failing. Signed-off-by: Christoph Lameter Acked-by: Nishanth Aravamudan Index: linux-2.6.22-rc4-mm2/kernel/profile.c =================================================================== --- linux-2.6.22-rc4-mm2.orig/kernel/profile.c 2007-06-13 23:36:42.000000000 -0700 +++ linux-2.6.22-rc4-mm2/kernel/profile.c 2007-06-13 23:36:55.000000000 -0700 @@ -346,7 +346,7 @@ static int __devinit profile_cpu_callbac per_cpu(cpu_profile_flip, cpu) = 0; if (!per_cpu(cpu_profile_hits, cpu)[1]) { page = alloc_pages_node(node, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + GFP_KERNEL | __GFP_ZERO, 0); if (!page) return NOTIFY_BAD; @@ -354,7 +354,7 @@ static int __devinit profile_cpu_callbac } if (!per_cpu(cpu_profile_hits, cpu)[0]) { page = alloc_pages_node(node, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + GFP_KERNEL | __GFP_ZERO, 0); if (!page) goto out_free; -- From clameter@sgi.com Mon Jun 18 11:53:47 2007 Message-Id: <20070618185347.299689839@sgi.com> References: <20070618185326.593525493@sgi.com> User-Agent: quilt/0.46-1 Date: Mon, 18 Jun 2007 11:53:36 -0700 From: clameter@sgi.com Cc: Nishanth Aravamudan Subject: [patch 10/10] Memoryless nodes: Update memory policy and page migration Content-Disposition: inline; filename=memless_migrate Online nodes now may have no memory. The checks and initialization must therefore be changed to no longer use the online functions. This will correctly initialize the interleave on bootup to only target nodes with memory and will make sys_move_pages return an error when a page is to be moved to a memoryless node. Similarly we will get an error if MPOL_BIND and MPOL_INTERLEAVE is used on a memoryless node. These are somewhat new semantics. 
So far one could specify memoryless nodes and we would maybe do the right thing and just ignore the node (or we'd do something strange like with MPOL_INTERLEAVE). If we want to allow the specification of memoryless nodes via memory policies then we need to keep checking for online nodes. Signed-off-by: Christoph Lameter Acked-by: Nishanth Aravamudan Index: linux-2.6.22-rc4-mm2/mm/migrate.c =================================================================== --- linux-2.6.22-rc4-mm2.orig/mm/migrate.c 2007-06-14 00:49:43.000000000 -0700 +++ linux-2.6.22-rc4-mm2/mm/migrate.c 2007-06-18 11:30:53.000000000 -0700 @@ -963,7 +963,7 @@ asmlinkage long sys_move_pages(pid_t pid goto out; err = -ENODEV; - if (!node_online(node)) + if (!node_memory(node)) goto out; err = -EACCES; Index: linux-2.6.22-rc4-mm2/mm/mempolicy.c =================================================================== --- linux-2.6.22-rc4-mm2.orig/mm/mempolicy.c 2007-06-18 11:24:51.000000000 -0700 +++ linux-2.6.22-rc4-mm2/mm/mempolicy.c 2007-06-18 11:30:53.000000000 -0700 @@ -130,7 +130,7 @@ static int mpol_check_policy(int mode, n return -EINVAL; break; } - return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; + return nodes_subset(*nodes, node_memory_map) ? 0 : -EINVAL; } /* Generate a custom zonelist for the BIND policy. */ @@ -495,9 +495,9 @@ static void get_zonemask(struct mempolic *nodes = p->v.nodes; break; case MPOL_PREFERRED: - /* or use current node instead of online map? */ + /* or use current node instead of memory_map? */ if (p->v.preferred_node < 0) - *nodes = node_online_map; + *nodes = node_memory_map; else node_set(p->v.preferred_node, *nodes); break; @@ -1606,7 +1606,7 @@ int mpol_parse_options(char *value, int *nodelist++ = '\0'; if (nodelist_parse(nodelist, *policy_nodes)) goto out; - if (!nodes_subset(*policy_nodes, node_online_map)) + if (!nodes_subset(*policy_nodes, node_memory_map)) goto out; } if (!strcmp(value, "default")) { @@ -1631,9 +1631,9 @@ int mpol_parse_options(char *value, int err = 0; } else if (!strcmp(value, "interleave")) { *policy = MPOL_INTERLEAVE; - /* Default to nodes online if no nodelist */ + /* Default to nodes memory map if no nodelist */ if (!nodelist) - *policy_nodes = node_online_map; + *policy_nodes = node_memory_map; err = 0; } out: @@ -1674,14 +1674,14 @@ void __init numa_policy_init(void) /* * Use the specified nodemask for init, or fall back to - * node_online_map. + * node_memory_map. */ if (policy_sysinit == MPOL_DEFAULT) nmask = NULL; else if (!nodes_empty(nmask_sysinit)) nmask = &nmask_sysinit; else - nmask = &node_online_map; + nmask = &node_memory_map; if (do_set_mempolicy(policy_sysinit, nmask)) printk("numa_policy_init: setting init policy failed\n"); @@ -1945,7 +1945,7 @@ int show_numa_map(struct seq_file *m, vo seq_printf(m, " huge"); } else { check_pgd_range(vma, vma->vm_start, vma->vm_end, - &node_online_map, MPOL_MF_STATS, md); + &node_memory_map, MPOL_MF_STATS, md); } if (!md->pages) @@ -1972,7 +1972,7 @@ int show_numa_map(struct seq_file *m, vo if (md->writeback) seq_printf(m," writeback=%lu", md->writeback); - for_each_online_node(n) + for_each_memory_node(n) if (md->node[n]) seq_printf(m, " N%d=%lu", n, md->node[n]); out: --
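For illustration of the user-visible side of the new semantics, a
minimal user-space sketch (assumes node 2 is memoryless on the test
machine and libnuma headers are available; not part of this series):

#include <numaif.h>
#include <stdio.h>

int main(void)
{
	/* Node 2 is assumed to have CPUs but no memory. */
	unsigned long mask = 1UL << 2;

	/* With this series applied, binding to a memoryless node fails. */
	if (set_mempolicy(MPOL_BIND, &mask, sizeof(mask) * 8))
		perror("set_mempolicy");	/* expect EINVAL */
	return 0;
}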