[RFC] MPOL_BIND fix and general simplification of memory policy layer

This proposal is based on ideas first proposed by Paul Jackson last year and
in face-to-face meetings last week. See his post at
http://marc.theaimsgroup.com/?l=linux-kernel&m=109149014310228&w=2

The handling of policies in the policy layer is complex right now because
there is a lot of special casing for each policy variation. The idea here is
to replace the special casing with an array of pointers to zonelists
generated for a policy. This is similar to the current MPOL_BIND approach.
For example, with this approach MPOL_PREFERRED will have zonelists created
that always begin with the preferred zone, MPOL_DEFAULT will simply have the
default zonelists for each node, and MPOL_INTERLEAVE is simply a restricted
number of zonelists that are assigned to one node after another in sequence.
Interleaving can then be done simply by keeping a pointer to the next
zonelist to be used; that pointer is advanced after each use (an illustrative
sketch follows the patch).

The advantages of this approach are:

1. Less overhead when processing policies.

2. Less code to maintain.

3. The MPOL_BIND policy works as documented. We will have zonelists for each
   node instead of a single one. The current MPOL_BIND implementation results
   in allocations being attempted on the first node first, which may put
   unnecessary memory pressure on that node since its memory must be
   exhausted before memory on other nodes is used.

4. The cpuset restrictions can be worked into the zonelist arrays maintained
   by the policy layer. This means that a number of cpuset hooks become
   unnecessary, in particular the cpuset check in alloc_pages().

5. The distance tables allow a more accurate fallback. The zonelists
   generated for MPOL_PREFERRED allow fallback to the node local to the
   executing processor instead of the next node relative to the preferred
   node. MPOL_BIND redirects allocations to the nearest nodes in the BIND
   set.

The amount of memory needed to store policies increases, and may increase
significantly on systems with many nodes, since a zonelist is generated for
each node in each policy. For 256 nodes we would have 256 zonelists in each
policy, each of which could point to the zones of all 256 nodes. In the
worst case (a policy that allows all nodes to be used) that is 256*256 =
65536 pointers of 16 bytes each, i.e. 1 Megabyte per policy. For a simple
slice of 8 nodes we would need 8*256*16 bytes = 32 KB. This could be reduced
by storing the lowest and highest allowed node number in the policy, but
that would introduce additional complexity into the patch.

Signed-off-by: Christoph Lameter

Index: linux-2.6.15-rc1-mm2/mm/mempolicy.c
===================================================================
--- linux-2.6.15-rc1-mm2.orig/mm/mempolicy.c	2005-11-18 12:35:59.000000000 -0800
+++ linux-2.6.15-rc1-mm2/mm/mempolicy.c	2005-11-18 18:10:56.000000000 -0800
@@ -21,9 +21,6 @@
  *
  * bind           Only allocate memory on a specific set of nodes,
  *                no fallback.
- *                FIXME: memory is allocated starting with the first node
- *                to the last. It would be better if bind would truly restrict
- *                the allocation to memory nodes instead
  *
  * preferred      Try a specific node first before normal fallback.
  *                As a special case node -1 here means do the allocation
@@ -125,62 +122,108 @@ static int mpol_check_policy(int mode, n
 	}
 	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 }
-/* Generate a custom zonelist for the BIND policy. */
-static struct zonelist *bind_zonelist(nodemask_t *nodes)
+
+/*
+ * Generate a custom zonelist
+ * node = node from which the allocation is to be performed
+ * preferred = node to be preferred
+ * nodes = allowed set of nodes to be used.
+ */
+static struct zonelist *build_zonelist(int node, int preferred, nodemask_t *nodes)
 {
 	struct zonelist *zl;
-	int num, max, nd;
+	struct zone **z;
+	struct zone **p;
+	int max;
 
-	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
+	max = 1 + (preferred >= 0) + MAX_NR_ZONES * nodes_weight(*nodes);
 	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
 	if (!zl)
 		return NULL;
-	num = 0;
-	for_each_node_mask(nd, *nodes) {
-		int k;
-		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
-			struct zone *z = &NODE_DATA(nd)->node_zones[k];
-			if (!populated_zone(z))
-				continue;
-			zl->zones[num++] = z;
-			if (k > policy_zone)
-				policy_zone = k;
-		}
+
+	z = zl->zones;
+	if (preferred >=0)
+		*z++ = NODE_DATA(preferred)->node_zonelists[policy_zone].zones[0];
+
+	for (p = NODE_DATA(node)->node_zonelists[policy_zone].zones; *p; *p++) {
+		int n = (*p)->zone_pgdat->node_id;
+
+		if (n != preferred && node_isset(n, *nodes))
+			*z++ = *p;
 	}
-	zl->zones[num] = NULL;
+
+	*z = NULL;
 	return zl;
 }
 
+static void zap_zonelists(struct mempolicy *pol)
+{
+	int node;
+
+	for_each_node(node) {
+		kfree(pol->zonelist[node]);
+		pol->zonelist[node] = NULL;
+	}
+}
+
+static int mpol_populate(struct mempolicy *pol, nodemask_t *nodes)
+{
+	int mode = pol->policy;
+	int node;
+	int preferred_node;
+
+	if (mode == MPOL_PREFERRED || mode == MPOL_INTERLEAVE)
+		preferred_node = first_node(*nodes);
+	else
+		preferred_node = -1;
+
+	memset(pol->zonelist, 0, sizeof(void *) * num_possible_nodes());
+
+	for_each_node(node) {
+		struct zonelist *z;
+
+		if (mode != MPOL_INTERLEAVE)
+			z = build_zonelist(node, preferred_node, nodes);
+		else
+			z = build_zonelist(preferred_node, -1, nodes);
+
+		if (!z) {
+			zap_zonelists(pol);
+			kmem_cache_free(policy_cache, pol);
+			return -ENOMEM;
+		}
+		pol->zonelist[node] = z;
+
+		if (mode == MPOL_INTERLEAVE) {
+			preferred_node = next_node(preferred_node, *nodes);
+			if (preferred_node == MAX_NUMNODES)
+				preferred_node = first_node(*nodes);
+		}
+	}
+
+	return 0;
+}
+
 /* Create a new policy */
 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 {
 	struct mempolicy *policy;
+	int err;
 
-	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 	if (mode == MPOL_DEFAULT)
 		return NULL;
+
 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 	if (!policy)
 		return ERR_PTR(-ENOMEM);
-	atomic_set(&policy->refcnt, 1);
-	switch (mode) {
-	case MPOL_INTERLEAVE:
-		policy->v.nodes = *nodes;
-		break;
-	case MPOL_PREFERRED:
-		policy->v.preferred_node = first_node(*nodes);
-		if (policy->v.preferred_node >= MAX_NUMNODES)
-			policy->v.preferred_node = -1;
-		break;
-	case MPOL_BIND:
-		policy->v.zonelist = bind_zonelist(nodes);
-		if (policy->v.zonelist == NULL) {
-			kmem_cache_free(policy_cache, policy);
-			return ERR_PTR(-ENOMEM);
-		}
-		break;
-	}
+	policy->policy = mode;
+	atomic_set(&policy->refcnt, 1);
+	err = mpol_populate(policy, nodes);
+
+	if (err)
+		return ERR_PTR(err);
+
 	return policy;
 }
 
@@ -527,13 +570,17 @@ long do_set_mempolicy(int mode, nodemask
 	if (contextualize_policy(mode, nodes))
 		return -EINVAL;
+
 	new = mpol_new(mode, nodes);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
+
 	mpol_free(current->mempolicy);
 	current->mempolicy = new;
+
 	if (new && new->policy == MPOL_INTERLEAVE)
-		current->il_next = first_node(new->v.nodes);
+		current->il_next = new->zonelist;
+
 	return 0;
 }
 
@@ -543,27 +590,9 @@ static void get_zonemask(struct mempolic
 	int i;
 
 	nodes_clear(*nodes);
-	switch (p->policy) {
-	case MPOL_BIND:
-		for (i = 0; p->v.zonelist->zones[i]; i++)
-			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
-				*nodes);
-		break;
-	case MPOL_DEFAULT:
-		break;
-	case MPOL_INTERLEAVE:
-		*nodes = p->v.nodes;
-		break;
-	case MPOL_PREFERRED:
-		/* or use current node instead of online map? */
-		if (p->v.preferred_node < 0)
-			*nodes = node_online_map;
-		else
-			node_set(p->v.preferred_node, *nodes);
-		break;
-	default:
-		BUG();
-	}
+	for (i = 0; i < num_possible_nodes(); i++)
+		node_set(p->zonelist[i]->zones[policy_zone]->zone_pgdat->node_id,
+			*nodes);
 }
 
 static int lookup_node(struct mm_struct *mm, unsigned long addr)
@@ -616,7 +645,7 @@ long do_get_mempolicy(int *policy, nodem
 			*policy = err;
 		} else if (pol == current->mempolicy &&
 				pol->policy == MPOL_INTERLEAVE) {
-			*policy = current->il_next;
+			*policy = (current->il_next - pol->zonelist);
 		} else {
 			err = -EINVAL;
 			goto out;
@@ -950,46 +979,25 @@ get_vma_policy(struct task_struct *task,
 }
 
 /* Return a zonelist representing a mempolicy */
-static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
+inline static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 {
-	int nd;
+	int nd = numa_node_id();
 
-	switch (policy->policy) {
-	case MPOL_PREFERRED:
-		nd = policy->v.preferred_node;
-		if (nd < 0)
-			nd = numa_node_id();
-		break;
-	case MPOL_BIND:
-		/* Lower zones don't get a policy applied */
-		/* Careful: current->mems_allowed might have moved */
-		if (gfp_zone(gfp) >= policy_zone)
-			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
-				return policy->v.zonelist;
-		/*FALL THROUGH*/
-	case MPOL_INTERLEAVE: /* should not happen */
-	case MPOL_DEFAULT:
-		nd = numa_node_id();
-		break;
-	default:
-		nd = 0;
-		BUG();
-	}
-	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
-}
+	/* Lower zones don't get a policy applied */
+	/* Careful: current->mems_allowed might have moved */
+	if (gfp_zone(gfp) >= policy_zone && policy->policy != MPOL_DEFAULT) {
 
-/* Do dynamic interleaving for a process */
-static unsigned interleave_nodes(struct mempolicy *policy)
-{
-	unsigned nid, next;
-	struct task_struct *me = current;
+		if (policy->policy != MPOL_INTERLEAVE)
+			return policy->zonelist[nd];
+
+		if (current->il_next < policy->zonelist + num_possible_nodes() - 1)
+			current->il_next++;
+		else
+			current->il_next = policy->zonelist;
 
-	nid = me->il_next;
-	next = next_node(nid, policy->v.nodes);
-	if (next >= MAX_NUMNODES)
-		next = first_node(policy->v.nodes);
-	me->il_next = next;
-	return nid;
+		return *current->il_next;
+	}
+	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
 }
 
 /*
@@ -998,59 +1006,10 @@ static unsigned interleave_nodes(struct
  */
 unsigned slab_node(struct mempolicy *policy)
 {
-	if (in_interrupt())
-		return numa_node_id();
-
-	switch (policy->policy) {
-	case MPOL_INTERLEAVE:
-		return interleave_nodes(policy);
-
-	case MPOL_BIND:
-		/*
-		 * Follow bind policy behavior and start allocation at the
-		 * first node.
-		 */
-		return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
-
-	case MPOL_PREFERRED:
-		if (policy->v.preferred_node >= 0)
-			return policy->v.preferred_node;
-		/* Fall through */
-
-	default:
+	if (in_interrupt() || policy->policy == MPOL_DEFAULT)
 		return numa_node_id();
-	}
-}
 
-/* Do static interleaving for a VMA with known offset. */
-static unsigned offset_il_node(struct mempolicy *pol,
-		struct vm_area_struct *vma, unsigned long off)
-{
-	unsigned nnodes = nodes_weight(pol->v.nodes);
-	unsigned target = (unsigned)off % nnodes;
-	int c;
-	int nid = -1;
-
-	c = 0;
-	do {
-		nid = next_node(nid, pol->v.nodes);
-		c++;
-	} while (c <= target);
-	return nid;
-}
-
-/* Determine a node number for interleave */
-static inline unsigned interleave_nid(struct mempolicy *pol,
-		struct vm_area_struct *vma, unsigned long addr, int shift)
-{
-	if (vma) {
-		unsigned long off;
-
-		off = vma->vm_pgoff;
-		off += (addr - vma->vm_start) >> shift;
-		return offset_il_node(pol, vma, off);
-	} else
-		return interleave_nodes(pol);
+	return zonelist_policy(GFP_HIGHUSER, policy)->zones[0]->zone_pgdat->node_id;
 }
 
 /* Return a zonelist suitable for a huge page allocation. */
@@ -1058,32 +1017,12 @@ struct zonelist *huge_zonelist(struct vm
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 
-	if (pol->policy == MPOL_INTERLEAVE) {
-		unsigned nid;
+	if (unlikely(pol->policy == MPOL_INTERLEAVE))
+		return pol->zonelist[(addr << HPAGE_SHIFT) % num_possible_nodes()];
 
-		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
-		return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
-	}
 	return zonelist_policy(GFP_HIGHUSER, pol);
 }
 
-/* Allocate a page in interleaved policy.
-   Own path because it needs to do special accounting. */
-static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
-					unsigned nid)
-{
-	struct zonelist *zl;
-	struct page *page;
-
-	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
-	page = __alloc_pages(gfp, order, zl);
-	if (page && page_zone(page) == zl->zones[0]) {
-		zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
-		put_cpu();
-	}
-	return page;
-}
-
 /**
  * alloc_page_vma - Allocate a page for a VMA.
  *
@@ -1110,16 +1049,23 @@ struct page *
 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
+	struct page *page;
+	struct zonelist *zl;
 
 	cpuset_update_current_mems_allowed();
-	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
-		unsigned nid;
+	if (unlikely(pol->policy == MPOL_INTERLEAVE && vma && gfp_zone(gfp) >= policy_zone))
+		zl = pol->zonelist[(addr << PAGE_SHIFT) % num_possible_nodes()];
+	else
+		zl = zonelist_policy(gfp, pol);
+
+	page = __alloc_pages(gfp, 0, zl);
 
-		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
-		return alloc_page_interleave(gfp, 0, nid);
+	if (pol->policy == MPOL_INTERLEAVE && page_zone(page) == zl->zones[0]) {
+		zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
+		put_cpu();
 	}
-	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
+	return page;
 }
 
 /**
@@ -1149,8 +1095,6 @@ struct page *alloc_pages_current(gfp_t g
 	cpuset_update_current_mems_allowed();
 	if (!pol || in_interrupt())
 		pol = &default_policy;
-	if (pol->policy == MPOL_INTERLEAVE)
-		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
 	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
 }
 EXPORT_SYMBOL(alloc_pages_current);
@@ -1159,48 +1103,55 @@ EXPORT_SYMBOL(alloc_pages_current);
 struct mempolicy *__mpol_copy(struct mempolicy *old)
 {
 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
+	int node;
 
 	if (!new)
 		return ERR_PTR(-ENOMEM);
 	*new = *old;
 	atomic_set(&new->refcnt, 1);
-	if (new->policy == MPOL_BIND) {
-		int sz = ksize(old->v.zonelist);
-		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
-		if (!new->v.zonelist) {
+
+	memset(new->zonelist, 0, sizeof(void *) * num_possible_nodes());
+
+	for_each_node(node) {
+		int sz = ksize(old->zonelist[node]);
+		struct zonelist *zl = kmalloc(sz, SLAB_KERNEL);
+
+		if (!zl) {
+			zap_zonelists(new);
 			kmem_cache_free(policy_cache, new);
 			return ERR_PTR(-ENOMEM);
 		}
-		memcpy(new->v.zonelist, old->v.zonelist, sz);
+		new->zonelist[node] = zl;
 	}
+
 	return new;
 }
 
 /* Slow path of a mempolicy comparison */
 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 {
+	int node;
+
 	if (!a || !b)
 		return 0;
 	if (a->policy != b->policy)
 		return 0;
-	switch (a->policy) {
-	case MPOL_DEFAULT:
+
+	if (a->policy == MPOL_DEFAULT)
 		return 1;
-	case MPOL_INTERLEAVE:
-		return nodes_equal(a->v.nodes, b->v.nodes);
-	case MPOL_PREFERRED:
-		return a->v.preferred_node == b->v.preferred_node;
-	case MPOL_BIND: {
-		int i;
-		for (i = 0; a->v.zonelist->zones[i]; i++)
-			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
+
+	for_each_node(node) {
+		struct zone **x, **y;
+
+		for (x = a->zonelist[node]->zones,
+		     y = b->zonelist[node]->zones; *x && *y; x++, y++)
+			if (*x != *y)
+				return 0;
+
+		if (*x != NULL || *y != NULL)
 				return 0;
-		return b->v.zonelist->zones[i] == NULL;
-	}
-	default:
-		BUG();
-		return 0;
 	}
+	return 1;
 }
 
 /* Slow path of a mpol destructor. */
@@ -1208,8 +1159,8 @@ void __mpol_free(struct mempolicy *p)
 {
 	if (!atomic_dec_and_test(&p->refcnt))
 		return;
-	if (p->policy == MPOL_BIND)
-		kfree(p->v.zonelist);
+
+	zap_zonelists(p);
 	p->policy = MPOL_DEFAULT;
 	kmem_cache_free(policy_cache, p);
 }
@@ -1375,11 +1326,6 @@ int mpol_set_shared_policy(struct shared
 	struct sp_node *new = NULL;
 	unsigned long sz = vma_pages(vma);
 
-	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
-		 vma->vm_pgoff,
-		 sz, npol? npol->policy : -1,
-		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
-
 	if (npol) {
 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
 		if (!new)
@@ -1415,8 +1361,9 @@ void mpol_free_shared_policy(struct shar
 void __init numa_policy_init(void)
 {
 	policy_cache = kmem_cache_create("numa_policy",
-					 sizeof(struct mempolicy),
-					 0, SLAB_PANIC, NULL, NULL);
+					 sizeof(struct mempolicy) +
+					 num_possible_nodes() * sizeof(void *),
+					 0, SLAB_PANIC, NULL, NULL);
 
 	sn_cache = kmem_cache_create("shared_policy_node",
 				     sizeof(struct sp_node),
@@ -1440,51 +1387,25 @@ static void rebind_policy(struct mempoli
			  const nodemask_t *new)
 {
 	nodemask_t tmp;
+	nodemask_t nodes;
+	struct zone **z;
+	int node;
 
 	if (!pol)
 		return;
-	switch (pol->policy) {
-	case MPOL_DEFAULT:
-		break;
-	case MPOL_INTERLEAVE:
-		nodes_remap(tmp, pol->v.nodes, *old, *new);
-		pol->v.nodes = tmp;
-		current->il_next = node_remap(current->il_next, *old, *new);
-		break;
-	case MPOL_PREFERRED:
-		pol->v.preferred_node = node_remap(pol->v.preferred_node,
-						*old, *new);
-		break;
-	case MPOL_BIND: {
-		nodemask_t nodes;
-		struct zone **z;
-		struct zonelist *zonelist;
-
-		nodes_clear(nodes);
-		for (z = pol->v.zonelist->zones; *z; z++)
-			node_set((*z)->zone_pgdat->node_id, nodes);
-		nodes_remap(tmp, nodes, *old, *new);
-		nodes = tmp;
-
-		zonelist = bind_zonelist(&nodes);
-
-		/* If no mem, then zonelist is NULL and we keep old zonelist.
-		 * If that old zonelist has no remaining mems_allowed nodes,
-		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
-		 */
-
-		if (zonelist) {
-			/* Good - got mem - substitute new zonelist */
-			kfree(pol->v.zonelist);
-			pol->v.zonelist = zonelist;
-		}
-		break;
-	}
-	default:
-		BUG();
-		break;
-	}
+	nodes_clear(nodes);
+	if (pol->policy != MPOL_PREFERRED) {
+		for_each_node(node)
+			for (z = pol->zonelist[node]->zones; *z; z++)
+				node_set((*z)->zone_pgdat->node_id, nodes);
+	} else
+		/* Preferred just needs one node switched on */
+		node_set(pol->zonelist[0]->zones[0]->zone_pgdat->node_id, nodes);
+
+	zap_zonelists(pol);
+	nodes_remap(tmp, nodes, *old, *new);
+	mpol_populate(pol, &tmp);
 }
 
 /*
@@ -1497,3 +1418,4 @@ void numa_policy_rebind(const nodemask_t
 {
 	rebind_policy(current->mempolicy, old, new);
 }
+
Index: linux-2.6.15-rc1-mm2/include/linux/mempolicy.h
===================================================================
--- linux-2.6.15-rc1-mm2.orig/include/linux/mempolicy.h	2005-11-18 12:35:59.000000000 -0800
+++ linux-2.6.15-rc1-mm2/include/linux/mempolicy.h	2005-11-18 14:51:00.000000000 -0800
@@ -62,12 +62,7 @@ struct vm_area_struct;
 struct mempolicy {
 	atomic_t refcnt;
 	short policy;	/* See MPOL_* above */
-	union {
-		struct zonelist  *zonelist;	/* bind */
-		short		 preferred_node; /* preferred */
-		nodemask_t	 nodes;		/* interleave */
-		/* undefined for default */
-	} v;
+	struct zonelist *zonelist[0];	/* One for each active node */
 };
 
 /*
Index: linux-2.6.15-rc1-mm2/include/linux/sched.h
===================================================================
--- linux-2.6.15-rc1-mm2.orig/include/linux/sched.h	2005-11-18 12:35:59.000000000 -0800
+++ linux-2.6.15-rc1-mm2/include/linux/sched.h	2005-11-18 14:30:42.000000000 -0800
@@ -887,7 +887,7 @@ struct task_struct {
 #endif
 #ifdef CONFIG_NUMA
 	struct mempolicy *mempolicy;
-	short il_next;
+	struct zonelist **il_next;
 #endif
 #ifdef CONFIG_CPUSETS
 	struct cpuset *cpuset;
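---

For readers who want to picture the data-structure change, here is a small,
self-contained user-space sketch of the per-node zonelist array and of the
interleave cursor described above. It is NOT part of the patch, and the names
used here (toy_policy, toy_zonelist, toy_build_zonelist, NNODES) are
hypothetical simplifications standing in for struct mempolicy, struct
zonelist, build_zonelist() and MAX_NUMNODES; the kernel code in the patch is
the authoritative version.

/*
 * Illustrative sketch only: models "one fallback list per node" plus an
 * interleave cursor that is advanced after each use and wraps around.
 */
#include <stdio.h>
#include <stdlib.h>

#define NNODES 4

/* A "zonelist" here is just an ordered fallback list of node ids. */
struct toy_zonelist {
	int nodes[NNODES + 1];		/* terminated by -1 */
};

/* One zonelist per node, as the patch proposes for struct mempolicy. */
struct toy_policy {
	struct toy_zonelist *zonelist[NNODES];
	struct toy_zonelist **il_next;	/* interleave cursor, like task->il_next */
};

/* Build a fallback list that starts at 'preferred' and then visits the
 * other allowed nodes in order, mimicking build_zonelist() in spirit. */
static struct toy_zonelist *toy_build_zonelist(int preferred, unsigned allowed_mask)
{
	struct toy_zonelist *zl = malloc(sizeof(*zl));
	int i = 0, n;

	if (!zl)
		return NULL;
	zl->nodes[i++] = preferred;
	for (n = 0; n < NNODES; n++)
		if (n != preferred && (allowed_mask & (1u << n)))
			zl->nodes[i++] = n;
	zl->nodes[i] = -1;
	return zl;
}

/* Interleave: take the zonelist the cursor points at, then advance the
 * cursor, wrapping around -- the "pointer advanced after use" above. */
static struct toy_zonelist *toy_interleave_next(struct toy_policy *pol)
{
	struct toy_zonelist *zl = *pol->il_next;

	if (pol->il_next < pol->zonelist + NNODES - 1)
		pol->il_next++;
	else
		pol->il_next = pol->zonelist;
	return zl;
}

int main(void)
{
	struct toy_policy pol;
	unsigned allowed = 0xf;		/* all four toy nodes allowed */
	int node, i;

	for (node = 0; node < NNODES; node++) {
		pol.zonelist[node] = toy_build_zonelist(node, allowed);
		if (!pol.zonelist[node])
			return 1;
	}
	pol.il_next = pol.zonelist;

	for (i = 0; i < 6; i++) {
		struct toy_zonelist *zl = toy_interleave_next(&pol);

		printf("allocation %d starts at node %d\n", i, zl->nodes[0]);
	}

	for (node = 0; node < NNODES; node++)
		free(pol.zonelist[node]);
	return 0;
}

With all nodes allowed, the cursor simply cycles through the per-node
zonelists, so successive allocations start on node 0, 1, 2, 3, 0, 1, ... --
the behavior the patch implements in zonelist_policy() via current->il_next.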