alloc_pages_range: Allocate memory from a specified range of addresses

The current ZONE_DMA scheme is limited to a single boundary: one can only
allocate memory below 16M or anywhere above it.  alloc_pages_range() lets
the caller specify which memory range is allowed.

alloc_pages_range() first checks the system for zones that lie entirely
within the requested range and, if it finds any, performs the fastest
allocation possible through the regular allocator.  If there is no suitable
zone it searches the remaining zones for pages that fit the allocation
criteria.  This search is not fast, but it is likely sufficient for
supporting legacy devices and devices with addressing issues.

This is interesting because the DMA subsystem already has the ability to
communicate which addresses are allowable; it is only the page allocator
that cannot satisfy a request for memory from a specific address range.
With this patch the arch-specific dma_alloc_coherent() can be modified to
call alloc_pages_range(), and the DMA subsystem will then be able to
exploit all available memory in that range (a conversion sketch follows
the patch below).

Once this mechanism is in place, and once all relevant GFP_DMA references
for an arch have been dealt with (all current uses must be changed to call
alloc_pages_range()!), ZONE_DMA can be disabled.  One then gets the
benefits of a single zone while still being able to use the old floppy
driver should the need arise.

- Only i386 supported.
- Reclaim when not falling back to regular allocs may not be that efficient.
- It boots on my system.

Signed-off-by: Christoph Lameter

Index: linux-2.6.19-rc5-mm1/include/linux/gfp.h
===================================================================
--- linux-2.6.19-rc5-mm1.orig/include/linux/gfp.h	2006-11-10 21:50:25.097850107 -0600
+++ linux-2.6.19-rc5-mm1/include/linux/gfp.h	2006-11-20 17:10:44.904470263 -0600
@@ -139,6 +139,17 @@ static inline struct page *alloc_pages_n
 		NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_mask));
 }
 
+#define ARCH_NEEDS_ALLOC_PAGES_RANGE
+
+#ifdef ARCH_NEEDS_ALLOC_PAGES_RANGE
+extern struct page *__alloc_pages_range(unsigned long low, unsigned long high,
+		gfp_t gfp_mask, unsigned int order, struct zonelist *zl);
+
+#else
+#define __alloc_pages_range(__low,__high,__mask,__order, __zl) \
+		__alloc_pages((__mask),(__order),(__zl))
+#endif
+
 #ifdef CONFIG_NUMA
 extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order);
 
@@ -150,6 +161,29 @@ alloc_pages(gfp_t gfp_mask, unsigned int
 	return alloc_pages_current(gfp_mask, order);
 }
 
+
+static inline struct page *alloc_pages_range_node(int nid,
+		unsigned long low, unsigned long high,
+		gfp_t gfp_mask, unsigned int order)
+{
+	if (unlikely(order >= MAX_ORDER))
+		return NULL;
+
+	/* Unknown node is current node */
+	if (nid < 0)
+		nid = numa_node_id();
+
+	return __alloc_pages_range(low, high, gfp_mask, order,
+		NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_mask));
+}
+
+static inline struct page *
+alloc_pages_range(unsigned long low, unsigned long high, gfp_t gfp_mask,
+		unsigned int order)
+{
+	return alloc_pages_range_node(-1, low, high, gfp_mask, order);
+}
+
 extern struct page *alloc_page_vma(gfp_t gfp_mask,
 		struct vm_area_struct *vma, unsigned long addr);
 #else
Index: linux-2.6.19-rc5-mm1/mm/page_alloc.c
===================================================================
--- linux-2.6.19-rc5-mm1.orig/mm/page_alloc.c	2006-11-10 21:50:32.027159250 -0600
+++ linux-2.6.19-rc5-mm1/mm/page_alloc.c	2006-11-14 23:10:39.524968672 -0600
@@ -1201,6 +1201,22 @@ __alloc_pages(gfp_t gfp_mask, unsigned i
 	int alloc_flags;
 	int did_some_progress;
 
+#ifdef ARCH_NEEDS_ALLOC_PAGES_RANGE
+	/*
+	 * If we have alloc_pages_range then we can emulate GFP_DMAxx flags
+	 * by using __alloc_pages_range.
+	 */
+#ifndef CONFIG_ZONE_DMA32
+	if (gfp_mask & __GFP_DMA32)
+		return __alloc_pages_range(0, MAX_DMA32_ADDRESS,
+			gfp_mask & ~(__GFP_DMA32|__GFP_DMA), order, zonelist);
+#endif
+#ifndef CONFIG_ZONE_DMA
+	if (gfp_mask & __GFP_DMA)
+		return __alloc_pages_range(0, MAX_DMA_ADDRESS,
+			gfp_mask & ~__GFP_DMA, order, zonelist);
+#endif
+#endif
 	might_sleep_if(wait);
 
 restart:
@@ -1343,9 +1359,227 @@ got_pg:
 #endif
 	return page;
 }
-
 EXPORT_SYMBOL(__alloc_pages);
 
+#ifdef ARCH_NEEDS_ALLOC_PAGES_RANGE
+/*
+ * Find a page that is located in a specific physical address range.
+ * This is similar to what rmqueue() does but it is less efficient
+ * since we have to scan through each item on the freelist while looking
+ * for a page fitting the address requirements.
+ */
+static struct page *rmqueue_range(unsigned long low, unsigned long high,
+			struct zone *zone, unsigned int order)
+{
+	struct free_area * area;
+	unsigned int current_order;
+	struct page *page;
+
+	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
+		area = zone->free_area + current_order;
+		if (list_empty(&area->free_list))
+			continue;
+
+		list_for_each_entry(page, &area->free_list, lru) {
+			unsigned long addr = (unsigned long)page_address(page);
+
+			if (addr >= low &&
+				addr < high - (PAGE_SIZE << order))
+				goto found_match;
+		}
+		continue;
+found_match:
+		list_del(&page->lru);
+		rmv_page_order(page);
+		area->nr_free--;
+		zone->free_pages -= 1UL << order;
+		expand(zone, page, order, current_order, area);
+		return page;
+	}
+	return NULL;
+}
+
+/*
+ * Special allocation function to get memory in a specified area of memory,
+ * as is necessary for allocations for DMA devices that are unable to
+ * address all of memory.
+ *
+ * This function attempts to fall back as much as possible to the regular
+ * allocator calls in order to avoid the more expensive processing that
+ * comes with having to look through each page on the freelists to check
+ * if it is suitable for the allocation.
+ */
+struct page *__alloc_pages_range(unsigned long low, unsigned long high,
+		gfp_t gfp_flags, unsigned int order,
+		struct zonelist *zl)
+{
+	const gfp_t wait = gfp_flags & __GFP_WAIT;
+	struct reclaim_state reclaim_state;
+	struct task_struct *p = current;
+	int do_retry;
+	int did_some_progress;
+	struct zonelist *z2 = NULL;
+	struct zone *stragglers[3];
+	int nr_stragglers = 0;
+	struct zone **z;
+	struct zone **zp = NULL;
+	struct page *page;
+
+	BUG_ON(gfp_flags & (__GFP_HIGHMEM|__GFP_DMA32|__GFP_DMA));
+	might_sleep_if(wait);
+
+	/*
+	 * Attempt to create a new zonelist with zones that contain memory
+	 * that satisfies the restrictions.
+	 */
+	for (z = zl->zones; *z; z++) {
+		unsigned long start_pfn = (*z)->zone_start_pfn;
+		unsigned long end_pfn = start_pfn + (*z)->spanned_pages;
+
+		if ((void *)low >= pfn_to_kaddr(end_pfn) ||
+			(void *)high <= pfn_to_kaddr(start_pfn))
+			/* Zone outside of boundaries */
+			continue;
+
+		if ((void *)low <= pfn_to_kaddr(start_pfn) &&
+			(void *)high >= pfn_to_kaddr(end_pfn)) {
+			/* Zone completely within boundaries */
+			if (!z2) {
+				z2 = kmalloc(sizeof(struct zonelist), GFP_KERNEL);
+				zp = z2->zones;
+			}
+			*zp++ = *z;
+		} else
+			/* Zone straddles the boundaries */
+			stragglers[nr_stragglers++] = *z;
+	}
+
+	if (z2) {
+		/*
+		 * We got some zones that fall within the range.
+		 *
+		 * Typically one of the following is true:
+		 * A) We have DMA zones that match.
+		 * B) NUMA: Some nodes are within the DMA range
+		 * C) Installed memory does not go beyond the DMA range
+		 *    (f.e. x86_64 with 2G installed and an alloc below 2G
+		 *    for a device with limited DMA capability).
+		 */
+		*zp = NULL;
+		page = __alloc_pages(gfp_flags, order, z2);
+		kfree(z2);
+		return page;
+	}
+
+	if (!nr_stragglers)
+		/*
+		 * There is no zone that can provide pages that satisfy the
+		 * address range.
+		 */
+		goto nopage;
+
+	/*
+	 * We are left with zones with pages which may or may not match our
+	 * address requirements.  We need to inspect the freelists to see
+	 * if fitting memory is available and do our own reclaim.
+	 *
+	 * We do this the hard way.  Ignore caches as well as zone limits.
+	 * Just find any page that would satisfy the allocation.
+	 */
+	BUG_ON(!nr_stragglers);
+	stragglers[nr_stragglers] = NULL;
+
+retry:
+	/*
+	 * Go through the list of zones where we may find pages and check
+	 * each for pages that satisfy our requirements.
+	 */
+	for (z = stragglers; *z; z++) {
+		unsigned long flags;
+		struct zone *zone = *z;
+
+		spin_lock_irqsave(&zone->lock, flags);
+		page = rmqueue_range(low, high, zone, order);
+		spin_unlock(&zone->lock);
+		if (!page) {
+			local_irq_restore(flags);
+			wakeup_kswapd(zone, order);
+			continue;
+		}
+		__count_zone_vm_events(PGALLOC, zone, 1 << order);
+		zone_statistics(zl, zone);
+		local_irq_restore(flags);
+
+		VM_BUG_ON(bad_range(zone, page));
+		if (!prep_new_page(page, order, gfp_flags)) {
+#ifdef CONFIG_PAGE_OWNER
+			if (page)
+				set_page_owner(page, order, gfp_flags);
+#endif
+			return page;
+		}
+	};
+
+	if (!wait)
+		goto nopage;
+
+	/*
+	 * Synchronous reclaim.  This is a broad shot at the straggler zones.
+	 * Hopefully some pages will be reclaimed that fit our address
+	 * requirements.
+	 *
+	 * Potential enhancement: Do targeted reclaim of only the address
+	 * ranges in the zone that are of interest now.
+	 * However, doing so would lead to a lot of additional code.  Maybe
+	 * someone else has a bright idea on how to do so in a simple way.
+	 */
+	cond_resched();
+
+	p->flags |= PF_MEMALLOC;
+	reclaim_state.reclaimed_slab = 0;
+	p->reclaim_state = &reclaim_state;
+
+	did_some_progress = try_to_free_pages(stragglers, gfp_flags);
+
+	p->reclaim_state = NULL;
+	p->flags &= ~PF_MEMALLOC;
+
+	cond_resched();
+
+	if (!did_some_progress)
+		goto nopage;
+
+	/*
+	 * Don't let big-order allocations loop unless the caller explicitly
+	 * requests that.  Wait for some write requests to complete then
+	 * retry.
+	 *
+	 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
+	 * <= 3, but that may not be true in other implementations.
+	 */
+	do_retry = 0;
+	if (!(gfp_flags & __GFP_NORETRY)) {
+		if ((order <= 3) || (gfp_flags & __GFP_REPEAT))
+			do_retry = 1;
+		if (gfp_flags & __GFP_NOFAIL)
+			do_retry = 1;
+	}
+	if (do_retry) {
+		congestion_wait(WRITE, HZ/50);
+		goto retry;
+	}
+
+nopage:
+	if (!(gfp_flags & __GFP_NOWARN) && printk_ratelimit()) {
+		printk(KERN_WARNING "%s: page range (%lx - %lx) allocation failure."
+			" order:%d, mode:0x%x\n",
+			p->comm, low, high, order, gfp_flags);
+		dump_stack();
+		show_mem();
+	}
+	return NULL;
+}
+#endif
+
 /*
  * Common helper functions.
  */
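
For illustration only, not part of the patch: a minimal sketch, assuming the
interface added above, of how a GFP_DMA user would be converted (this is the
conversion sketch referred to in the description).  The helper name
example_alloc_below_16mb is made up; alloc_pages_range() comes from this
patch and MAX_DMA_ADDRESS from the existing i386 headers.  Note that, as in
the __GFP_DMA emulation added to __alloc_pages() above, the low/high
boundaries are kernel virtual addresses of the direct mapping (the code
compares page_address() values), so MAX_DMA_ADDRESS marks the 16MB physical
boundary on i386.

#include <linux/gfp.h>
#include <asm/dma.h>		/* MAX_DMA_ADDRESS */

/*
 * Hypothetical example: instead of alloc_pages(GFP_KERNEL | GFP_DMA, order),
 * a driver that needs memory below 16MB, such as the old floppy driver,
 * would ask for the address range explicitly.
 */
static struct page *example_alloc_below_16mb(unsigned int order)
{
	return alloc_pages_range(0, MAX_DMA_ADDRESS, GFP_KERNEL, order);
}

An arch-specific dma_alloc_coherent() would follow the same pattern, except
that it would derive the upper boundary from the device's coherent_dma_mask
(converted to a direct-mapped virtual address) instead of hard-coding
MAX_DMA_ADDRESS; that is what would let the DMA subsystem use all memory
below a device's limit rather than only what ZONE_DMA covers.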