Index: linux-2.6.20/mm/page_alloc.c
===================================================================
--- linux-2.6.20.orig/mm/page_alloc.c	2007-02-12 21:34:12.000000000 -0800
+++ linux-2.6.20/mm/page_alloc.c	2007-02-12 21:49:41.000000000 -0800
@@ -1377,14 +1377,68 @@
 EXPORT_SYMBOL(__alloc_pages);
 
 /*
+ * Find a page that is located in a specific physical address range.
+ * This is similar to what __rmqueue() does, but it is less efficient
+ * since we have to scan through each item on the freelist while looking
+ * for a page that fits the address requirements.
+ */
+static struct page *rmqueue_range(unsigned long low, unsigned long high,
+		struct zone *zone, unsigned int order)
+{
+	struct free_area *area;
+	unsigned int current_order;
+	struct page *page;
+
+	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
+		area = zone->free_area + current_order;
+		if (list_empty(&area->free_list))
+			continue;
+
+		list_for_each_entry(page, &area->free_list, lru) {
+			unsigned long addr = (unsigned long)page_address(page);
+
+			if (addr >= low &&
+			    addr + (PAGE_SIZE << order) <= high)
+				goto found_match;
+		}
+		continue;
+found_match:
+		list_del(&page->lru);
+		rmv_page_order(page);
+		area->nr_free--;
+		zone->free_pages -= 1UL << order;
+		expand(zone, page, order, current_order, area);
+		return page;
+	}
+	return NULL;
+}
+
+/*
  * Special allocation functions to get memory in a specified area of memory
  * like necessary for allocations for DMA devices that are unable to address
  * all of memory.
+ *
+ * This function falls back to the regular allocator calls whenever possible
+ * in order to avoid the more expensive processing that comes with having to
+ * look at each page on the freelists to check whether it is suitable for
+ * the allocation.
  */
 struct page *__alloc_pages_range(unsigned long low, unsigned long high,
 		gfp_t gfp_flags, unsigned int order, struct zonelist *zl)
 {
+	const gfp_t wait = gfp_flags & __GFP_WAIT;
+	struct reclaim_state reclaim_state;
+	struct task_struct *p = current;
+	int do_retry;
+	int did_some_progress;
+	struct zonelist *z2 = NULL;
+	struct zone *stragglers[3];
+	int nr_stragglers = 0;
+	struct zone **z;
+	struct zone **zp = NULL;
+	struct page *page;
+
 	BUG_ON(gfp_flags & (__GFP_HIGHMEM|__GFP_DMA32|__GFP_DMA));
 
 	/*
@@ -1410,7 +1464,145 @@
 	if (high <= MAX_DMA32_ADDRESS && !low)
 		return __alloc_pages(gfp_flags | GFP_DMA32, order, zl);
 #endif
-	/* We have no means of satisfying the allocation constraints */
+
+	might_sleep_if(wait);
+
+	for (z = zl->zones; *z; z++) {
+		unsigned long start_pfn = (*z)->zone_start_pfn;
+		unsigned long end_pfn = start_pfn + (*z)->spanned_pages;
+
+		if ((void *)low >= pfn_to_kaddr(end_pfn) ||
+		    (void *)high <= pfn_to_kaddr(start_pfn))
+			/* Zone entirely outside of the boundaries */
+			continue;
+
+		if ((void *)low <= pfn_to_kaddr(start_pfn) &&
+		    (void *)high >= pfn_to_kaddr(end_pfn)) {
+			/* Zone entirely within the boundaries */
+			if (!z2) {
+				z2 = kzalloc(sizeof(struct zonelist), GFP_KERNEL);
+				if (!z2)
+					return NULL;
+				zp = z2->zones;
+			}
+			*zp++ = *z;
+			continue;
+		}
+
+		/* Zone straddles the boundaries */
+		stragglers[nr_stragglers++] = *z;
+	}
+
+	if (z2) {
+		/*
+		 * We got some zones that fall entirely within the range.
+		 * Using those gives us the fastest result (however, we will
+		 * not allocate from the zones that straddle the boundaries!).
+		 *
+		 * Typically one of the following is true:
+		 * A) We have DMA zones that match.
+		 * B) NUMA: Some nodes are within the DMA range.
+		 * C) Installed memory does not go beyond the DMA range
+		 *    (e.g. x86_64 with 2G installed and an allocation below
+		 *    2G for a device with limited DMA capability).
+		 */
+		*zp = NULL;
+		page = __alloc_pages(gfp_flags, order, z2);
+		kfree(z2);
+		return page;
+	}
+
+	/* Make the stragglers into a zonelist */
+	BUG_ON(!nr_stragglers);
+	stragglers[nr_stragglers] = NULL;
+
+	/*
+	 * We are left with zones in which only some pages match our
+	 * address requirements. We need to inspect the freelists to see
+	 * if fitting memory is available and do our own reclaim.
+	 */
+retry:
+	for (z = stragglers; *z; z++) {
+		unsigned long flags;
+		struct zone *zone = *z;
+
+		spin_lock_irqsave(&zone->lock, flags);
+		page = rmqueue_range(low, high, zone, order);
+		spin_unlock(&zone->lock);
+		if (!page) {
+			local_irq_restore(flags);
+			wakeup_kswapd(zone, order);
+			continue;
+		}
+		__count_zone_vm_events(PGALLOC, zone, 1 << order);
+		zone_statistics(zl, zone);
+		local_irq_restore(flags);
+
+		VM_BUG_ON(bad_range(zone, page));
+		if (!prep_new_page(page, order, gfp_flags)) {
+#ifdef CONFIG_PAGE_OWNER
+			set_page_owner(page, order, gfp_flags);
+#endif
+			return page;
+		}
+	}
+
+	if (!wait)
+		goto nopage;
+
+	/*
+	 * Synchronous reclaim. This is a broad shot at the straggler zones.
+	 * Hopefully some pages that fit our address requirements will be
+	 * reclaimed.
+	 *
+	 * Potential enhancement: Do targeted reclaim of only the address
+	 * ranges in the zone that are of interest here.
+	 * However, doing so would require a lot of additional code. Maybe
+	 * someone else has a bright idea on how to do that in a simple way.
+	 */
+	cond_resched();
+
+	p->flags |= PF_MEMALLOC;
+	reclaim_state.reclaimed_slab = 0;
+	p->reclaim_state = &reclaim_state;
+
+	did_some_progress = try_to_free_pages(stragglers, gfp_flags);
+
+	p->reclaim_state = NULL;
+	p->flags &= ~PF_MEMALLOC;
+
+	cond_resched();
+
+	if (!did_some_progress)
+		goto nopage;
+
+	/*
+	 * Don't let big-order allocations loop unless the caller explicitly
+	 * requests that. Wait for some write requests to complete then retry.
+	 *
+	 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
+	 * <= 3, but that may not be true in other implementations.
+	 */
+	do_retry = 0;
+	if (!(gfp_flags & __GFP_NORETRY)) {
+		if ((order <= 3) || (gfp_flags & __GFP_REPEAT))
+			do_retry = 1;
+		if (gfp_flags & __GFP_NOFAIL)
+			do_retry = 1;
+	}
+	if (do_retry) {
+		congestion_wait(WRITE, HZ/50);
+		goto retry;
+	}
+
+nopage:
+	if (!(gfp_flags & __GFP_NOWARN) && printk_ratelimit()) {
+		printk(KERN_WARNING "%s: page range (%lx - %lx) allocation failure."
+			" order:%d, mode:0x%x\n",
+			p->comm, low, high, order, gfp_flags);
+		dump_stack();
+		show_mem();
+	}
 	return NULL;
 }
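
For illustration, here is a minimal sketch of a caller; it is not part of the
patch, and the function name and the 16MB-2GB window are made-up assumptions.
Note that low and high are kernel virtual addresses of the direct mapping
(matching the page_address()/pfn_to_kaddr() comparisons above), so physical
boundaries have to be converted with __va():

	#include <linux/gfp.h>
	#include <linux/mm.h>
	#include <linux/mmzone.h>

	/*
	 * Hypothetical caller: allocate four contiguous pages (order 2)
	 * whose physical memory lies between 16MB and 2GB, the way a driver
	 * for a device with limited DMA addressing might. GFP_KERNEL
	 * carries none of the zone modifiers (__GFP_DMA/__GFP_DMA32/
	 * __GFP_HIGHMEM) that the BUG_ON() in __alloc_pages_range() rejects.
	 */
	static struct page *example_alloc_below_2g(void)
	{
		unsigned long low = (unsigned long)__va(16UL << 20);
		unsigned long high = (unsigned long)__va(2UL << 30);
		struct zonelist *zl = NODE_DATA(numa_node_id())->node_zonelists
					+ gfp_zone(GFP_KERNEL);

		return __alloc_pages_range(low, high, GFP_KERNEL, 2, zl);
	}

The zonelist here is simply the current node's GFP_KERNEL zonelist, as in
alloc_pages_node(); real callers would pass whatever zonelist suits them.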