Index: linux-2.6.19-rc5-mm1/mm/slab.c
===================================================================
--- linux-2.6.19-rc5-mm1.orig/mm/slab.c	2006-11-10 21:50:32.244951490 -0600
+++ linux-2.6.19-rc5-mm1/mm/slab.c	2006-11-22 15:58:15.720370063 -0600
@@ -1609,12 +1609,7 @@ static void *kmem_getpages(struct kmem_c
 	flags |= __GFP_COMP;
 #endif
 
-	/*
-	 * Under NUMA we want memory on the indicated node. We will handle
-	 * the needed fallback ourselves since we want to serve from our
-	 * per node object lists first for other nodes.
-	 */
-	flags |= cachep->gfpflags | GFP_THISNODE;
+	flags |= cachep->gfpflags;
 
 	page = alloc_pages_node(nodeid, flags, cachep->gfporder);
 	if (!page)
@@ -2711,10 +2706,10 @@ static void slab_map_pages(struct kmem_c
  * Grow (by 1) the number of slabs within a cache. This is called by
  * kmem_cache_alloc() when there are no active objs left in a cache.
  */
-static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static int cache_grow(struct kmem_cache *cachep,
+		gfp_t flags, int nodeid, void *objp)
 {
 	struct slab *slabp;
-	void *objp;
 	size_t offset;
 	gfp_t local_flags;
 	unsigned long ctor_flags;
@@ -2766,7 +2761,8 @@ static int cache_grow(struct kmem_cache
 	 * Get mem for the objs. Attempt to allocate a physical page from
 	 * 'nodeid'.
 	 */
-	objp = kmem_getpages(cachep, flags, nodeid);
+	if (!objp)
+		objp = kmem_getpages(cachep, flags, nodeid);
 	if (!objp)
 		goto failed;
 
@@ -3009,7 +3005,7 @@ alloc_done:
 
 	if (unlikely(!ac->avail)) {
 		int x;
-		x = cache_grow(cachep, flags, node);
+		x = cache_grow(cachep, flags | __GFP_THISNODE, node, NULL);
 
 		/* cache_grow can reenable interrupts, then ac could change. */
 		ac = cpu_cache_get(cachep);
@@ -3169,9 +3165,11 @@ static void *alternate_node_alloc(struct
 
 /*
  * Fallback function if there was no memory available and no objects on a
- * certain node and we are allowed to fall back. We mimick the behavior of
- * the page allocator. We fall back according to a zonelist determined by
- * the policy layer while obeying cpuset constraints.
+ * certain node and fallback is permitted. First we scan all the
+ * available nodelists for available objects. If that fails then we
+ * perform an allocation without specifying a node. This allows the page
+ * allocator to do its reclaim / fallback magic. We then insert the
+ * slab into the proper nodelist and then allocate from it.
  */
 void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 {
@@ -3179,16 +3177,52 @@ void *fallback_alloc(struct kmem_cache *
 			->node_zonelists[gfp_zone(flags)];
 	struct zone **z;
 	void *obj = NULL;
+	int nid;
 
+retry:
+	/*
+	 * Look through allowed nodes for objects available
+	 * from existing queues.
+	 */
 	for (z = zonelist->zones; *z && !obj; z++) {
-		int nid = zone_to_nid(*z);
+		nid = zone_to_nid(*z);
 
-		if (zone_idx(*z) <= ZONE_NORMAL &&
-				cpuset_zone_allowed(*z, flags) &&
-				cache->nodelists[nid])
-			obj = __cache_alloc_node(cache,
+		if (cpuset_zone_allowed(*z, flags) &&
+			cache->nodelists[nid] &&
+			cache->nodelists[nid]->free_objects)
+				obj = __cache_alloc_node(cache,
 					flags | __GFP_THISNODE, nid);
 	}
+
+	if (!obj) {
+		/*
+		 * This allocation will be performed within the constraints
+		 * of the current cpuset / memory policy requirements.
+		 * We may trigger various forms of reclaim on the allowed
+		 * set and go into memory reserves if necessary.
+		 */
+		obj = kmem_getpages(cache, flags, -1);
+		if (obj) {
+			/*
+			 * Insert into the appropriate per node queues
+			 */
+			nid = page_to_nid(virt_to_page(obj));
+			if (cache_grow(cache, flags, nid, obj)) {
+				obj = __cache_alloc_node(cache,
+					flags | __GFP_THISNODE, nid);
+				if (!obj)
+					/*
+					 * Another processor may allocate the
+					 * objects in the slab since we are
+					 * not holding any locks.
+					 */
+					goto retry;
+			} else {
+				kmem_freepages(cache, obj);
+				obj = NULL;
+			}
+		}
+	}
 	return obj;
 }
 
@@ -3244,7 +3278,7 @@ retry:
 
 must_grow:
 	spin_unlock(&l3->list_lock);
-	x = cache_grow(cachep, flags, nodeid);
+	x = cache_grow(cachep, flags | __GFP_THISNODE, nodeid, NULL);
 	if (x)
 		goto retry;
 
Index: linux-2.6.19-rc5-mm1/mm/page_alloc.c
===================================================================
--- linux-2.6.19-rc5-mm1.orig/mm/page_alloc.c	2006-11-21 13:53:07.492305932 -0600
+++ linux-2.6.19-rc5-mm1/mm/page_alloc.c	2006-11-22 15:58:15.755529347 -0600
@@ -1268,9 +1268,18 @@ nofail_alloc:
 			goto nopage;
 	}
 
-	/* Atomic allocations - we can't balance anything */
-	if (!wait)
-		goto nopage;
+	/*
+	 * Atomic allocations - we can't balance anything.
+	 *
+	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
+	 * __GFP_NOWARN set) should not cause reclaim since the subsystem
+	 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
+	 * using a larger set of nodes after it has established that the
+	 * allowed per node queues are empty.
+	 */
+	if (!wait ||
+		(NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE))
+		goto nopage;
 
 rebalance:
 	cond_resched();
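
Note for reviewers: the cache_grow() signature change above follows a simple
"NULL or preallocated" convention. fallback_alloc() hands in a page it already
obtained from the page allocator, while every other caller passes NULL and lets
cache_grow() allocate as before. A minimal userspace sketch of that convention
(names below are invented for illustration, not kernel code):

/*
 * Generic sketch of the "NULL or preallocated buffer" convention that the
 * new cache_grow(..., void *objp) parameter follows: objp == NULL means
 * "allocate the backing memory yourself", non-NULL means "use the memory
 * the caller already obtained". Illustration only, not kernel code.
 */
#include <stdio.h>
#include <stdlib.h>

/* Add one buffer to a pool; allocate it only if the caller did not. */
static void *pool_grow(size_t size, void *buf)
{
	if (!buf)			/* mirrors: if (!objp)                    */
		buf = malloc(size);	/*          objp = kmem_getpages(...)    */
	if (!buf)
		return NULL;		/* mirrors: goto failed                  */

	printf("added %zu-byte buffer %p to the pool\n", size, buf);
	return buf;
}

int main(void)
{
	void *pre = malloc(128);	/* caller-supplied, like fallback_alloc's page */
	void *a = pool_grow(128, NULL);	/* normal path: pool_grow() allocates          */
	void *b = pool_grow(128, pre);	/* preallocated path: reuse 'pre'              */

	free(a);
	free(b);			/* b == pre */
	return 0;
}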
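
And a minimal userspace sketch of the mask test added to __alloc_pages(). The
bit values are placeholders rather than the real gfp.h definitions, and
NUMA_BUILD is assumed true; the point is that only callers setting the full
GFP_THISNODE combination (__GFP_THISNODE, __GFP_NORETRY and __GFP_NOWARN), such
as slab's opportunistic per-node refills, skip reclaim, while fallback_alloc's
second attempt, which drops __GFP_THISNODE, still goes through reclaim:

/*
 * Standalone sketch of the reclaim bail-out test in the page_alloc.c hunk.
 * The flag values are illustrative placeholders, NOT the kernel's real
 * gfp.h definitions, and NUMA_BUILD is assumed true.
 */
#include <stdio.h>

typedef unsigned int gfp_t;

#define __GFP_WAIT	0x01u	/* placeholder value */
#define __GFP_THISNODE	0x02u	/* placeholder value */
#define __GFP_NORETRY	0x04u	/* placeholder value */
#define __GFP_NOWARN	0x08u	/* placeholder value */

/* As described in the added comment: all three bits together. */
#define GFP_THISNODE	(__GFP_THISNODE | __GFP_NORETRY | __GFP_NOWARN)

static int skips_reclaim(gfp_t gfp_mask)
{
	int wait = gfp_mask & __GFP_WAIT;

	/* Mirrors: if (!wait || (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)) */
	return !wait || (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
}

int main(void)
{
	/* Slab's per-node refill: full GFP_THISNODE combo -> no reclaim here. */
	printf("%d\n", skips_reclaim(__GFP_WAIT | GFP_THISNODE));	/* prints 1 */

	/* __GFP_THISNODE alone (without NORETRY/NOWARN) still reclaims. */
	printf("%d\n", skips_reclaim(__GFP_WAIT | __GFP_THISNODE));	/* prints 0 */

	/* fallback_alloc's second attempt drops THISNODE -> reclaim allowed. */
	printf("%d\n", skips_reclaim(__GFP_WAIT));			/* prints 0 */
	return 0;
}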