alloc_pages_range: Allocate memory from a specified range of addresses The current ZONE_DMA scheme is limited to only a single boundary. I.e. one can only allocate memory under 16M or above. alloc_pages_range() allows one to specify what the allowed memory range is. alloc_pages_range() will check the system for available zones and then perform the fastest allocation possible. If there is no suitable zone then it will perform a search through the possible zones for pages that fit the allocation criteria. This search is not fast but it is likely sufficient for supporting legacy devices and devices with issues. It is interesting to do this since the DMA subsystem has the ability to communicate which addresses are allowable. Only the page allocator cannot satisfy a request for memory from a specific memory range. With this patch the arch specific dma_alloc_coherent() function can be modified to call alloc_pages_range() and then the DMA subsystem will be able to exploit all available memory in that range. Once this mechanism is in place and if one has dealt with all relevant GFP_DMA references (all current uses must be changed to call alloc_pages_range()!) for an arch then one can disable ZONE_DMA and enjoy the benefits of a single zone while still being able to use the old floppy driver should the need arise. - Only i386 supported. - Reclaim when not falling back to regular allocs may not be that efficient. - It boots on my system. 
Signed-off-by: Christoph Lameter Index: linux-2.6.19-rc5-mm2/include/linux/gfp.h =================================================================== --- linux-2.6.19-rc5-mm2.orig/include/linux/gfp.h 2006-11-15 16:48:08.718689861 -0600 +++ linux-2.6.19-rc5-mm2/include/linux/gfp.h 2006-11-15 18:47:42.453711019 -0600 @@ -139,6 +139,17 @@ static inline struct page *alloc_pages_n NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_mask)); } +#define ARCH_NEEDS_ALLOC_PAGES_RANGE +/* NOTE(review): the define above is unconditional in this header, so the #else fallback below is never compiled - confirm this is only meant for the RFC. */ +#ifdef ARCH_NEEDS_ALLOC_PAGES_RANGE +extern struct page *__alloc_pages_range(unsigned long low, unsigned long high, + gfp_t gfp_mask, unsigned int order, struct zonelist *zl); +/* Fallback: without ARCH_NEEDS_ALLOC_PAGES_RANGE the low/high arguments are dropped and the call becomes a plain __alloc_pages(). */ +#else +#define __alloc_pages_range(__low,__high,__mask,__order, __zl) \ + __alloc_pages((__mask),(__order),(__zl)) +#endif + #ifdef CONFIG_NUMA extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order); @@ -150,6 +161,29 @@ alloc_pages(gfp_t gfp_mask, unsigned int return alloc_pages_current(gfp_mask, order); } +/* Range-restricted node allocation: returns NULL if order >= MAX_ORDER, otherwise passes low/high on to __alloc_pages_range() with the zonelist of node nid that gfp_zone(gfp_mask) selects. */ +static inline struct page *alloc_pages_range_node(int nid, + unsigned long low, unsigned long high, + gfp_t gfp_mask, unsigned int order) +{ + if (unlikely(order >= MAX_ORDER)) + return NULL; + + /* Unknown node is current node */ + if (nid < 0) + nid = numa_node_id(); + + return __alloc_pages_range(low, high, gfp_mask, order, + NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_mask)); +} +/* Same as alloc_pages_range_node() called with nid -1, which that function maps to numa_node_id(). */ +static inline struct page * +alloc_pages_range(unsigned long low, unsigned long high, gfp_t gfp_mask, + unsigned int order) +{ + return alloc_pages_range_node(-1, low, high, gfp_mask, order); +} + extern struct page *alloc_page_vma(gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr); #else Index: linux-2.6.19-rc5-mm2/mm/page_alloc.c =================================================================== --- linux-2.6.19-rc5-mm2.orig/mm/page_alloc.c 2006-11-15 18:38:41.431344621 -0600 +++ linux-2.6.19-rc5-mm2/mm/page_alloc.c 2006-11-15 18:47:42.476174533 -0600 @@ -1285,6 +1285,22 @@ __alloc_pages(gfp_t 
gfp_mask, unsigned i
 	int alloc_flags;
 	int did_some_progress;
 
+#ifdef ARCH_NEEDS_ALLOC_PAGES_RANGE
+/*
+ * If we have alloc_pages_range then we can emulate GFP_DMAxx flags
+ * by using __alloc_pages_range.
+ */
+#ifndef CONFIG_ZONE_DMA32
+	if (gfp_mask & __GFP_DMA32)
+		return __alloc_pages_range(0, MAX_DMA32_ADDRESS,
+			gfp_mask & ~(__GFP_DMA32|__GFP_DMA), order, zonelist);
+#endif
+#ifndef CONFIG_ZONE_DMA
+	if (gfp_mask & __GFP_DMA)
+		return __alloc_pages_range(0, MAX_DMA_ADDRESS,
+			gfp_mask & ~__GFP_DMA, order, zonelist);
+#endif
+#endif
 	might_sleep_if(wait);
 
 	if (should_fail_alloc_page(gfp_mask, order))
@@ -1430,9 +1446,262 @@ got_pg:
 #endif
 	return page;
 }
-
 EXPORT_SYMBOL(__alloc_pages);
 
+#ifdef ARCH_NEEDS_ALLOC_PAGES_RANGE
+/*
+ * Find a free block of 2^order pages in the zone that lies completely
+ * inside the address range [low, high). The bounds are compared against
+ * page_address() of the candidate pages and high is exclusive.
+ *
+ * This is similar to what rmqueue() does but it is less efficient
+ * since we have to scan through each item on the freelist while looking for
+ * a page fitting the address requirements.
+ *
+ * Returns the page after taking it off the freelist or NULL if nothing
+ * fits. __alloc_pages_range() calls this with zone->lock held.
+ */
+static struct page *rmqueue_range(unsigned long low, unsigned long high,
+				struct zone *zone, unsigned int order)
+{
+	struct free_area * area;
+	unsigned int current_order;
+	unsigned long size = PAGE_SIZE << order;
+	struct page *page;
+
+	/* Keeps high - size below from wrapping around */
+	if (high < size)
+		return NULL;
+
+	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
+		area = zone->free_area + current_order;
+		if (list_empty(&area->free_list))
+			continue;
+
+		list_for_each_entry(page, &area->free_list, lru) {
+			unsigned long addr = (unsigned long)page_address(page);
+
+			/* A block ending exactly at high still fits */
+			if (addr >= low && addr <= high - size)
+				goto found_match;
+		}
+		continue;
+found_match:
+		list_del(&page->lru);
+		rmv_page_order(page);
+		area->nr_free--;
+		zone->free_pages -= 1UL << order;
+		expand(zone, page, order, current_order, area);
+		return page;
+	}
+	return NULL;
+}
+
+/*
+ * Special allocation functions to get memory in a specified area of memory
+ * like necessary for allocations for DMA devices that are unable to address
+ * all of memory.
+ *
+ * This function attempts to fall back as much as possible to the regular allocator
+ * calls in order to avoid more expensive processing that comes with having to
+ * look through each page on the freelists to check if its suitable for the
+ * allocation.
+ *
+ * low and high are compared against pfn_to_kaddr()/page_address() values;
+ * high is exclusive. gfp_flags must not contain __GFP_HIGHMEM, __GFP_DMA32
+ * or __GFP_DMA. Returns the first page of the allocation or NULL on failure.
+ */
+struct page *__alloc_pages_range(unsigned long low, unsigned long high,
+				gfp_t gfp_flags, unsigned int order,
+				struct zonelist *zl)
+{
+	const gfp_t wait = gfp_flags & __GFP_WAIT;
+	struct reclaim_state reclaim_state;
+	struct task_struct *p = current;
+	int do_retry;
+	int did_some_progress;
+	struct zonelist *z2 = NULL;
+	struct zone *stragglers[3];	/* incl. NULL terminator */
+	int nr_stragglers = 0;
+	struct zone **z;
+	struct zone **zp = NULL;
+	struct page *page;
+
+	BUG_ON(gfp_flags & (__GFP_HIGHMEM|__GFP_DMA32|__GFP_DMA));
+
+	/*
+	 * If the address range specified includes all of available RAM
+	 * then we can fall back to the regular allocator. This may occur
+	 * f.e. if we only have 2 GB RAM on x86_64. Then 32 bit DMA
+	 * will work on all of memory. Even controllers that only support
+	 * 31 bits will be fine.
+	 *
+	 * This check assumes that increasing node numbers go
+	 * along with increasing addresses!
+	 *
+	 * Use the zonelist we were given so that the node selected by the
+	 * caller is honored.
+	 */
+	if ((void *)high >= pfn_to_kaddr(max_low_pfn) &&
+		(void *)low <= pfn_to_kaddr(NODE_DATA(0)->node_start_pfn))
+		return __alloc_pages(gfp_flags, order, zl);
+
+	might_sleep_if(wait);
+
+	for (z = zl->zones; *z; z++) {
+		unsigned long start_pfn = (*z)->zone_start_pfn;
+		unsigned long end_pfn = start_pfn + (*z)->spanned_pages;
+
+		if ((void *)low >= pfn_to_kaddr(end_pfn) ||
+			(void *)high <= pfn_to_kaddr(start_pfn))
+			/* Zone outside of boundaries */
+			continue;
+
+		if ((void *)low <= pfn_to_kaddr(start_pfn) &&
+			(void *)high >= pfn_to_kaddr(end_pfn)) {
+			/*
+			 * Zone completely within boundaries. The private
+			 * zonelist is zeroed so that nothing in it is left
+			 * uninitialized, and it must not sleep for atomic
+			 * callers.
+			 */
+			if (!z2) {
+				z2 = kzalloc(sizeof(*z2),
+					wait ? GFP_KERNEL : GFP_ATOMIC);
+				if (z2)
+					zp = z2->zones;
+			}
+			if (z2) {
+				*zp++ = *z;
+				continue;
+			}
+			/* No zonelist: scan the freelists of this zone */
+		}
+
+		/*
+		 * Zone straddles the boundaries. Never write past the
+		 * slot reserved for the NULL terminator.
+		 */
+		if (nr_stragglers < ARRAY_SIZE(stragglers) - 1)
+			stragglers[nr_stragglers++] = *z;
+	}
+
+	if (z2) {
+		/*
+		 * We got some zones that fall within the range.
+		 *
+		 * Typically either of the following is true:
+		 * A) We have DMA zones that match.
+		 * B) NUMA: Some nodes are within the DMA range
+		 * C) Installed memory does not go beyond DMA range
+		 *    (f.e. x86_64 with 2G installed and an alloc below 2G
+		 *    for a device with limited DMA capability).
+		 */
+		*zp = NULL;
+		page = __alloc_pages(gfp_flags, order, z2);
+		kfree(z2);
+		return page;
+	}
+
+	/* No zone overlaps the range: fail the allocation, do not BUG */
+	if (!nr_stragglers)
+		goto nopage;
+
+	/* Make the stragglers into a zonelist */
+	stragglers[nr_stragglers] = NULL;
+
+	/*
+	 * We are left with zones whose pages do not necessarily match our
+	 * address requirements. We need to inspect the freelists to see
+	 * if fitting memory is available and do our own reclaim.
+	 */
+retry:
+	for (z = stragglers; *z; z++) {
+		unsigned long flags;
+		struct zone *zone = *z;
+
+		spin_lock_irqsave(&zone->lock, flags);
+		page = rmqueue_range(low, high, zone, order);
+		spin_unlock(&zone->lock);
+		if (!page) {
+			local_irq_restore(flags);
+			wakeup_kswapd(zone, order);
+			continue;
+		}
+		__count_zone_vm_events(PGALLOC, zone, 1 << order);
+		zone_statistics(zl, zone);
+		local_irq_restore(flags);
+
+		VM_BUG_ON(bad_range(zone, page));
+		if (!prep_new_page(page, order, gfp_flags)) {
+#ifdef CONFIG_PAGE_OWNER
+			if (page)
+				set_page_owner(page, order, gfp_flags);
+#endif
+			return page;
+		}
+	}
+
+	if (!wait)
+		goto nopage;
+
+	/*
+	 * Synchronous reclaim. This is a broad shot at the straggler zones.
+	 * Hopefully some pages will be reclaimed that are fitting our address
+	 * requirements.
+	 *
+	 * Potential enhancement: Do targeted reclaim to only the address
+	 * ranges in the zone that are of interest now.
+	 * However, doing so would lead to a lot of additional code. Maybe
+	 * someone else has a bright idea on how to do so in a simple way.
+	 */
+	cond_resched();
+
+	p->flags |= PF_MEMALLOC;
+	reclaim_state.reclaimed_slab = 0;
+	p->reclaim_state = &reclaim_state;
+
+	did_some_progress = try_to_free_pages(stragglers, gfp_flags);
+
+	p->reclaim_state = NULL;
+	p->flags &= ~PF_MEMALLOC;
+
+	cond_resched();
+
+	if (!did_some_progress)
+		goto nopage;
+
+	/*
+	 * Don't let big-order allocations loop unless the caller explicitly
+	 * requests that. Wait for some write requests to complete then retry.
+	 *
+	 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
+	 * <= 3, but that may not be true in other implementations.
+	 */
+	do_retry = 0;
+	if (!(gfp_flags & __GFP_NORETRY)) {
+		if ((order <= 3) || (gfp_flags & __GFP_REPEAT))
+			do_retry = 1;
+		if (gfp_flags & __GFP_NOFAIL)
+			do_retry = 1;
+	}
+	if (do_retry) {
+		congestion_wait(WRITE, HZ/50);
+		goto retry;
+	}
+
+nopage:
+	if (!(gfp_flags & __GFP_NOWARN) && printk_ratelimit()) {
+		printk(KERN_WARNING "%s: page range (%lx - %lx) allocation failure."
+			" order:%d, mode:0x%x\n",
+			p->comm, low, high, order, gfp_flags);
+		dump_stack();
+		show_mem();
+	}
+	return NULL;
+}
+#endif
+
 /*
  * Common helper functions.
  */