Index: linux-2.6.14-mm2/mm/mempolicy.c
===================================================================
--- linux-2.6.14-mm2.orig/mm/mempolicy.c	2005-11-16 16:10:51.000000000 -0800
+++ linux-2.6.14-mm2/mm/mempolicy.c	2005-11-17 13:51:01.000000000 -0800
@@ -21,9 +21,6 @@
  *
  * bind           Only allocate memory on a specific set of nodes,
  *                no fallback.
- *                FIXME: memory is allocated starting with the first node
- *                to the last. It would be better if bind would truly restrict
- *                the allocation to memory nodes instead
  *
  * preferred      Try a specific node first before normal fallback.
  *                As a special case node -1 here means do the allocation
@@ -125,28 +122,37 @@ static int mpol_check_policy(int mode, n
 	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 }
 
 /* Generate a custom zonelist for the BIND policy. */
-static struct zonelist *bind_zonelist(nodemask_t *nodes)
+static struct zonelist **bind_zonelist(nodemask_t *nodes)
 {
-	struct zonelist *zl;
-	int num, max, nd;
+	struct zonelist *zz;
+	struct zonelist **zl;
+	struct zone **zp;
+	int num, max, node;
 
-	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
+	max = num_possible_nodes() * (2 + MAX_NR_ZONES * nodes_weight(*nodes));
 	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
 	if (!zl)
 		return NULL;
-	num = 0;
-	for_each_node_mask(nd, *nodes) {
-		int k;
-		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
-			struct zone *z = &NODE_DATA(nd)->node_zones[k];
-			if (!z->present_pages)
-				continue;
-			zl->zones[num++] = z;
-			if (k > policy_zone)
-				policy_zone = k;
+	/* Per-node zonelists are packed after the node pointer table. */
+	zz = (struct zonelist *)(zl + num_possible_nodes());
+	for_each_node(node) {
+		if (!node_isset(node, *nodes)) {
+			zl[node] = no_zones;
+			continue;
+		}
+		num = 0;
+		for (zp = NODE_DATA(node)->node_zonelists->zones; *zp; zp++) {
+			struct zone *z = *zp;
+			if (node_isset(z->zone_pgdat->node_id, *nodes) && z->present_pages) {
+				zz->zones[num++] = z;
+				if (zone_idx(z) > policy_zone)
+					policy_zone = zone_idx(z);
+			}
 		}
+		zz->zones[num] = NULL;
+		zl[node] = zz;
+		zz = (struct zonelist *)&zz->zones[num + 1];
 	}
-	zl->zones[num] = NULL;
 	return zl;
 }
@@ -526,7 +528,7 @@ long do_set_mempolicy(int mode, nodemask
 	mpol_free(current->mempolicy);
 	current->mempolicy = new;
 	if (new && new->policy == MPOL_INTERLEAVE)
-		current->il_next = first_node(new->v.nodes);
+		current->il_next = new->zonelist[numa_node_id()]->zones;
 	return 0;
 }
 
@@ -536,27 +538,9 @@ static void get_zonemask(struct mempolic
 	int i;
 
 	nodes_clear(*nodes);
-	switch (p->policy) {
-	case MPOL_BIND:
-		for (i = 0; p->v.zonelist->zones[i]; i++)
-			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
-				 *nodes);
-		break;
-	case MPOL_DEFAULT:
-		break;
-	case MPOL_INTERLEAVE:
-		*nodes = p->v.nodes;
-		break;
-	case MPOL_PREFERRED:
-		/* or use current node instead of online map? */
-		if (p->v.preferred_node < 0)
-			*nodes = node_online_map;
-		else
-			node_set(p->v.preferred_node, *nodes);
-		break;
-	default:
-		BUG();
-	}
+	for (i = 0; p->zonelist[numa_node_id()]->zones[i]; i++)
+		node_set(p->zonelist[numa_node_id()]->zones[i]->zone_pgdat->node_id,
+			 *nodes);
 }
 
 static int lookup_node(struct mm_struct *mm, unsigned long addr)
@@ -943,45 +927,29 @@ get_vma_policy(struct task_struct *task,
 }
 
 /* Return a zonelist representing a mempolicy */
-static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
+static inline struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 {
-	int nd;
+	int nd = numa_node_id();
+
+	/* Lower zones don't get a policy applied */
+	/* Careful: current->mems_allowed might have moved */
+	if (gfp_zone(gfp) >= policy_zone)
+		return policy->zonelist[nd];
 
-	switch (policy->policy) {
-	case MPOL_PREFERRED:
-		nd = policy->v.preferred_node;
-		if (nd < 0)
-			nd = numa_node_id();
-		break;
-	case MPOL_BIND:
-		/* Lower zones don't get a policy applied */
-		/* Careful: current->mems_allowed might have moved */
-		if (gfp_zone(gfp) >= policy_zone)
-			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
-				return policy->v.zonelist;
-		/*FALL THROUGH*/
-	case MPOL_INTERLEAVE: /* should not happen */
-	case MPOL_DEFAULT:
-		nd = numa_node_id();
-		break;
-	default:
-		nd = 0;
-		BUG();
-	}
 	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
 }
 
 /* Do
 dynamic interleaving for a process */
 static unsigned interleave_nodes(struct mempolicy *policy)
 {
-	unsigned nid, next;
-	struct task_struct *me = current;
+	unsigned nid;
+	struct zone **next = current->il_next;
 
-	nid = me->il_next;
-	next = next_node(nid, policy->v.nodes);
-	if (next >= MAX_NUMNODES)
-		next = first_node(policy->v.nodes);
-	me->il_next = next;
+	nid = (*next)->zone_pgdat->node_id;
+	next++;
+	if (!*next)
+		next = policy->zonelist[numa_node_id()]->zones;
+	current->il_next = next;
 	return nid;
 }
 
@@ -1378,8 +1346,9 @@ void mpol_free_shared_policy(struct shar
 
 void __init numa_policy_init(void)
 {
 	policy_cache = kmem_cache_create("numa_policy",
-					 sizeof(struct mempolicy),
-					 0, SLAB_PANIC, NULL, NULL);
+					 sizeof(struct mempolicy) +
+					 num_possible_nodes() * sizeof(void *),
+					 0, SLAB_PANIC, NULL, NULL);
 
 	sn_cache = kmem_cache_create("shared_policy_node",
 				     sizeof(struct sp_node),
Index: linux-2.6.14-mm2/include/linux/mempolicy.h
===================================================================
--- linux-2.6.14-mm2.orig/include/linux/mempolicy.h	2005-11-16 10:43:41.000000000 -0800
+++ linux-2.6.14-mm2/include/linux/mempolicy.h	2005-11-17 12:00:06.000000000 -0800
@@ -60,12 +60,7 @@ struct vm_area_struct;
 struct mempolicy {
 	atomic_t refcnt;
 	short policy; 	/* See MPOL_* above */
-	union {
-		struct zonelist  *zonelist;	/* bind */
-		short 		 preferred_node; /* preferred */
-		nodemask_t	 nodes;		/* interleave */
-		/* undefined for default */
-	} v;
+	struct zonelist *zonelist[0];	/* One per-node zonelist for each possible node */
 };
 
 /*
Index: linux-2.6.14-mm2/include/linux/sched.h
===================================================================
--- linux-2.6.14-mm2.orig/include/linux/sched.h	2005-11-15 10:29:52.000000000 -0800
+++ linux-2.6.14-mm2/include/linux/sched.h	2005-11-17 13:49:02.000000000 -0800
@@ -887,7 +887,7 @@ struct task_struct {
 #endif
 #ifdef CONFIG_NUMA
 	struct mempolicy *mempolicy;
-	short il_next;
+	struct zone **il_next;
 #endif
 #ifdef CONFIG_CPUSETS
 	struct cpuset *cpuset;