From: Christoph Lameter
Subject: Mmap support using pte PAGE_SIZE mappings

This is realized by mmapping the potentially larger page in base-page-size
(4k on x86) pieces.  Mmap semantics are not changed, so userspace can handle
the large buffers as if files still consisted of 4k pages (as they do now).
The use of large buffer sizes can therefore be fully transparent to user
space.

Details:

- Modify the rmap functions (try_to_unmap, page_referenced and page_mkclean)
  to iterate over all base pages of a large buffer, so that every pte that
  may point into a larger buffer is found.

- Change the vm_fault logic in __do_fault() to convert the 4k pte offset
  into a pointer to the page of the large buffer plus an index of the base
  page within that large buffer.  A sketch of the index arithmetic follows
  the diffstat below.

- Fix up the memory policy address scan to skip tail pages of large buffers.

- Fix up page migration to allow the moving of large buffers.

- Create a new function compound_index() to determine the base page index
  within the large page.

Tested by formatting a 32k ext2 filesystem, booting off it and building a
kernel.

Simplifications and fixes by Hugh Dickins.

Signed-off-by: Hugh Dickins
Signed-off-by: Christoph Lameter

---
 fs/proc/task_mmu.c |    4 +-
 include/linux/mm.h |   12 +++++++
 mm/filemap.c       |    6 ---
 mm/fremap.c        |    3 +
 mm/memory.c        |   82 ++++++++++++++++++++++++++++++++++++++---------------
 mm/mempolicy.c     |   28 ++++++++++++------
 mm/migrate.c       |   19 +++++++++---
 mm/rmap.c          |   81 ++++++++++++++++++++++++++++++++++++----------------
 8 files changed, 166 insertions(+), 69 deletions(-)
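Note (not part of the patch): a minimal sketch of the index arithmetic the
fault path relies on, assuming a file mapping of order N (one large page =
2^N base pages).  The helper name split_fault_offset() is made up purely for
illustration; only mapping_order() and compound_index() come from the patch
set itself.

	/*
	 * Illustration only: split a linear fault offset (given in 4k base
	 * page units) into the page cache index of the compound head page
	 * and the index of the base page within that compound page.
	 */
	static void split_fault_offset(unsigned long pgoff, int order,
					unsigned long *cache_index,
					int *page_index)
	{
		*cache_index = pgoff >> order;	/* head page in the page cache */
		*page_index = pgoff & ((1 << order) - 1); /* base page within it */
	}

__do_fault() below performs essentially this split before calling the
filesystem's fault handler, and compound_index() recovers the same base page
index from a (head, tail) page pair when the rmap code walks ptes.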
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c	2007-09-25 10:22:47.000000000 -0700
+++ linux-2.6/mm/filemap.c	2007-09-25 10:22:59.000000000 -0700
@@ -1479,12 +1479,6 @@ int generic_file_mmap(struct file * file
 {
 	struct address_space *mapping = file->f_mapping;
 
-	/*
-	 * Forbid mmap access to higher order mappings.
-	 */
-	if (mapping_order(mapping))
-		return -ENOSYS;
-
 	if (!mapping->a_ops->readpage)
 		return -ENOEXEC;
 	file_accessed(file);
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2007-09-25 10:21:38.000000000 -0700
+++ linux-2.6/mm/memory.c	2007-09-25 11:28:59.000000000 -0700
@@ -383,16 +383,18 @@ static inline int is_cow_mapping(unsigne
  * _does_ have a "struct page" associated with it even if it is in a
  * VM_PFNMAP range).
  */
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+						pte_t pte, int *page_index)
 {
 	unsigned long pfn = pte_pfn(pte);
+	struct page *page, *head;
 
 	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
 		unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
 		if (pfn == vma->vm_pgoff + off)
-			return NULL;
+			goto fail;
 		if (!is_cow_mapping(vma->vm_flags))
-			return NULL;
+			goto fail;
 	}
 
 	/*
@@ -403,7 +405,7 @@ struct page *vm_normal_page(struct vm_ar
 	 */
 	if (unlikely(!pfn_valid(pfn))) {
 		print_bad_pte(vma, pte, addr);
-		return NULL;
+		goto fail;
 	}
 
 	/*
@@ -413,7 +415,15 @@ struct page *vm_normal_page(struct vm_ar
 	 * The PAGE_ZERO() pages and various VDSO mappings can
 	 * cause them to exist.
 	 */
-	return pfn_to_page(pfn);
+	page = pfn_to_page(pfn);
+	head = page_cache_head(page);
+	if (page_index)
+		*page_index = compound_index(head, page, addr);
+	return head;
+fail:
+	if (page_index)
+		*page_index = 0;
+	return NULL;
 }
 
 /*
@@ -476,7 +486,7 @@ copy_one_pte(struct mm_struct *dst_mm, s
 		pte = pte_mkclean(pte);
 	pte = pte_mkold(pte);
 
-	page = vm_normal_page(vma, addr, pte);
+	page = vm_normal_page(vma, addr, pte, NULL);
 	if (page) {
 		get_page(page);
 		page_dup_rmap(page, vma, addr);
@@ -639,9 +649,17 @@ static unsigned long zap_pte_range(struc
 		(*zap_work) -= PAGE_SIZE;
 
 		if (pte_present(ptent)) {
+			int page_index;
+			int index = 0;
 			struct page *page;
 
-			page = vm_normal_page(vma, addr, ptent);
+			page = vm_normal_page(vma, addr, ptent, &page_index);
+
+			if (page)
+				index = (page->index <<
+					page_cache_page_order(page)) +
+					page_index;
+
 			if (unlikely(details) && page) {
 				/*
 				 * unmap_shared_mapping_pages() wants to
@@ -656,8 +674,8 @@ static unsigned long zap_pte_range(struc
 				 * invalidating or truncating nonlinear.
 				 */
 				if (details->nonlinear_vma &&
-				    (page->index < details->first_index ||
-				     page->index > details->last_index))
+				    (index < details->first_index ||
+				     index > details->last_index))
 					continue;
 			}
 			ptent = ptep_get_and_clear_full(mm, addr, pte,
@@ -667,9 +685,9 @@ static unsigned long zap_pte_range(struc
 				continue;
 			if (unlikely(details) && details->nonlinear_vma
 			    && linear_page_index(details->nonlinear_vma,
-						addr) != page->index)
+						addr) != index)
 				set_pte_at(mm, addr, pte,
-					   pgoff_to_pte(page->index));
+					   pgoff_to_pte(index));
 			if (PageAnon(page))
 				anon_rss--;
 			else {
@@ -680,7 +698,7 @@ static unsigned long zap_pte_range(struc
 					file_rss--;
 			}
 			page_remove_rmap(page, vma);
-			tlb_remove_page(tlb, page);
+			tlb_remove_page(tlb, compound_nth_page(page, page_index));
 			continue;
 		}
 		/*
@@ -908,6 +926,7 @@ struct page *follow_page(struct vm_area_
 	spinlock_t *ptl;
 	struct page *page;
 	struct mm_struct *mm = vma->vm_mm;
+	int page_index;
 
 	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
 	if (!IS_ERR(page)) {
@@ -943,7 +962,7 @@ struct page *follow_page(struct vm_area_
 		goto unlock;
 	if ((flags & FOLL_WRITE) && !pte_write(pte))
 		goto unlock;
-	page = vm_normal_page(vma, address, pte);
+	page = vm_normal_page(vma, address, pte, &page_index);
 	if (unlikely(!page))
 		goto unlock;
 
@@ -958,7 +977,7 @@ struct page *follow_page(struct vm_area_
 unlock:
 	pte_unmap_unlock(ptep, ptl);
 out:
-	return page;
+	return page + page_index;
 
 no_page_table:
 	/*
@@ -1019,8 +1038,12 @@ int get_user_pages(struct task_struct *t
 				return i ? : -EFAULT;
 			}
 			if (pages) {
-				struct page *page = vm_normal_page(gate_vma, start, *pte);
-				pages[i] = page;
+				int page_index;
+				struct page *page;
+
+				page = vm_normal_page(gate_vma, start, *pte,
+							&page_index);
+				pages[i] = page + page_index;
 				if (page)
 					get_page(page);
 			}
@@ -1639,9 +1662,10 @@ static int do_wp_page(struct mm_struct *
 	struct page *old_page, *new_page;
 	pte_t entry;
 	int reuse = 0, ret = 0;
+	int page_index;
 	struct page *dirty_page = NULL;
 
-	old_page = vm_normal_page(vma, address, orig_pte);
+	old_page = vm_normal_page(vma, address, orig_pte, &page_index);
 	if (!old_page)
 		goto gotten;
 
@@ -1722,7 +1746,8 @@ gotten:
 		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 		if (!new_page)
 			goto oom;
-		cow_user_page(new_page, old_page, address, vma);
+		cow_user_page(new_page, compound_nth_page(old_page, page_index),
+							address, vma);
 	}
 
 	/*
@@ -2321,9 +2346,17 @@ static int __do_fault(struct mm_struct *
 	struct page *dirty_page = NULL;
 	struct vm_fault vmf;
 	int ret;
+	int page_index;
+	int order;
+
+	if (vma->vm_file)
+		order = mapping_order(vma->vm_file->f_mapping);
+	else
+		order = 0;
+	page_index = pgoff % (1 << order);
 
 	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
-	vmf.pgoff = pgoff;
+	vmf.pgoff = pgoff >> order;
 	vmf.flags = flags;
 	vmf.page = NULL;
 
@@ -2371,7 +2404,10 @@ static int __do_fault(struct mm_struct *
 				ret = VM_FAULT_OOM;
 				goto out;
 			}
-			copy_user_highpage(page, vmf.page, address, vma);
+			copy_user_highpage(page,
+				compound_nth_page(vmf.page, page_index), address, vma);
+			/* The newly created anonymous page is of order 0 */
+			page_index = 0;
 		} else {
 			/*
 			 * If the page will be shareable, see if the backing
@@ -2417,8 +2453,10 @@ static int __do_fault(struct mm_struct *
 	 */
 	/* Only go through if we didn't race with anybody else... */
 	if (likely(pte_same(*page_table, orig_pte))) {
-		flush_icache_page(vma, page);
-		entry = mk_pte(page, vma->vm_page_prot);
+		struct page *nth = compound_nth_page(page, page_index);
+
+		flush_icache_page(vma, nth);
+		entry = mk_pte(nth, vma->vm_page_prot);
 		if (flags & FAULT_FLAG_WRITE)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		set_pte_at(mm, address, page_table, entry);
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c	2007-09-25 10:22:46.000000000 -0700
+++ linux-2.6/mm/rmap.c	2007-09-25 10:34:09.000000000 -0700
@@ -191,17 +191,10 @@ vma_address(struct page *page, struct vm
 	pgoff_t pgoff;
 	unsigned long address;
 
-	if (PageAnon(page))
-		pgoff = page->index;
-	else
-		pgoff = page->index << mapping_order(page->mapping);
-
+	pgoff = page->index << page_cache_page_order(page_cache_head(page));
 	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
-		/* page should be within any vma from prio_tree_next */
-		BUG_ON(!PageAnon(page));
+	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
 		return -EFAULT;
-	}
 	return address;
 }
 
@@ -271,7 +264,7 @@ pte_t *page_check_address(struct page *p
  * Subfunctions of page_referenced: page_referenced_one called
  * repeatedly from either page_referenced_anon or page_referenced_file.
  */
-static int page_referenced_one(struct page *page,
+static int __page_referenced_one(struct page *page,
 	struct vm_area_struct *vma, unsigned int *mapcount)
 {
 	struct mm_struct *mm = vma->vm_mm;
@@ -303,6 +296,18 @@ out:
 	return referenced;
 }
 
+static int page_referenced_one(struct page *page,
+	struct vm_area_struct *vma, unsigned int *mapcount)
+{
+	int i;
+	int referenced = 0;
+
+	for (i = 0; i < page_cache_base_pages(page); i++)
+		referenced += __page_referenced_one(compound_nth_page(page, i), vma, mapcount);
+
+	return referenced;
+}
+
 static int page_referenced_anon(struct page *page)
 {
 	unsigned int mapcount;
@@ -340,7 +345,7 @@ static int page_referenced_file(struct p
 {
 	unsigned int mapcount;
 	struct address_space *mapping = page->mapping;
-	pgoff_t pgoff = page->index << (page_cache_shift(mapping) - PAGE_SHIFT);
+	pgoff_t pgoff = page->index << page_cache_page_order(page);
 	struct vm_area_struct *vma;
 	struct prio_tree_iter iter;
 	int referenced = 0;
@@ -417,7 +422,7 @@ int page_referenced(struct page *page, i
 	return referenced;
 }
 
-static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
+static int __page_mkclean_one(struct page *page, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
@@ -450,9 +455,20 @@ out:
 	return ret;
 }
 
+static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
+{
+	int i;
+	int ret = 0;
+
+	for (i = 0; i < page_cache_base_pages(page); i++)
+		ret += __page_mkclean_one(compound_nth_page(page, i), vma);
+
+	return ret;
+}
+
 static int page_mkclean_file(struct address_space *mapping, struct page *page)
 {
-	pgoff_t pgoff = page->index << (page_cache_shift(mapping) - PAGE_SHIFT);
+	pgoff_t pgoff = page->index << page_cache_page_order(page);
 	struct vm_area_struct *vma;
 	struct prio_tree_iter iter;
 	int ret = 0;
@@ -657,8 +673,8 @@ void page_remove_rmap(struct page *page,
  * Subfunctions of try_to_unmap: try_to_unmap_one called
  * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
  */
-static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
-				int migration)
+static int __try_to_unmap_one(struct page *head, struct page *page,
+				struct vm_area_struct *vma, int migration)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
@@ -692,15 +708,15 @@ static int try_to_unmap_one(struct page
 
 	/* Move the dirty bit to the physical page now the pte is gone. */
 	if (pte_dirty(pteval))
-		set_page_dirty(page);
+		set_page_dirty(head);
 
 	/* Update high watermark before we lower rss */
 	update_hiwater_rss(mm);
 
-	if (PageAnon(page)) {
-		swp_entry_t entry = { .val = page_private(page) };
+	if (PageAnon(head)) {
+		swp_entry_t entry = { .val = page_private(head) };
 
-		if (PageSwapCache(page)) {
+		if (PageSwapCache(head)) {
 			/*
 			 * Store the swap location in the pte.
 			 * See handle_pte_fault() ...
@@ -738,8 +754,8 @@ static int try_to_unmap_one(struct page
 		dec_mm_counter(mm, file_rss);
 
 
-	page_remove_rmap(page, vma);
-	page_cache_release(page);
+	page_remove_rmap(head, vma);
+	page_cache_release(head);
 
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
@@ -747,6 +763,21 @@ out:
 	return ret;
 }
 
+static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
+				int migration)
+{
+	int ret = SWAP_AGAIN;
+	int i;
+
+	for (i = 0; i < page_cache_base_pages(page); i++) {
+		ret = __try_to_unmap_one(page, compound_nth_page(page, i),
+							vma, migration);
+		if (ret == SWAP_FAIL || !page_mapped(page))
+			return ret;
+	}
+	return ret;
+}
+
 /*
  * objrmap doesn't work for nonlinear VMAs because the assumption that
  * offset-into-file correlates with offset-into-virtual-addresses does not hold.
@@ -782,6 +813,7 @@ static void try_to_unmap_cluster(unsigne
 	struct page *page;
 	unsigned long address;
 	unsigned long end;
+	int page_index;
 
 	address = (vma->vm_start + cursor) & CLUSTER_MASK;
 	end = address + CLUSTER_SIZE;
@@ -810,7 +842,7 @@ static void try_to_unmap_cluster(unsigne
 	for (; address < end; pte++, address += PAGE_SIZE) {
 		if (!pte_present(*pte))
 			continue;
-		page = vm_normal_page(vma, address, *pte);
+		page = vm_normal_page(vma, address, *pte, &page_index);
 		BUG_ON(!page || PageAnon(page));
 
 		if (ptep_clear_flush_young(vma, address, pte))
@@ -822,7 +854,8 @@ static void try_to_unmap_cluster(unsigne
 
 		/* If nonlinear, store the file page offset in the pte. */
 		if (page->index != linear_page_index(vma, address))
-			set_pte_at(mm, address, pte, pgoff_to_pte(page->index));
+			set_pte_at(mm, address, pte,
+				pgoff_to_pte(page->index + page_index));
 
 		/* Move the dirty bit to the physical page now the pte is gone. */
 		if (pte_dirty(pteval))
@@ -868,7 +901,7 @@ static int try_to_unmap_anon(struct page
 static int try_to_unmap_file(struct page *page, int migration)
 {
 	struct address_space *mapping = page->mapping;
-	pgoff_t pgoff = page->index << (page_cache_shift(mapping) - PAGE_SHIFT);
+	pgoff_t pgoff = page->index << page_cache_page_order(page);
 	struct vm_area_struct *vma;
 	struct prio_tree_iter iter;
 	int ret = SWAP_AGAIN;
Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c	2007-09-25 10:21:38.000000000 -0700
+++ linux-2.6/mm/fremap.c	2007-09-25 10:22:59.000000000 -0700
@@ -27,10 +27,11 @@ static void zap_pte(struct mm_struct *mm
 
 	if (pte_present(pte)) {
 		struct page *page;
+		int page_index;
 
 		flush_cache_page(vma, addr, pte_pfn(pte));
 		pte = ptep_clear_flush(vma, addr, ptep);
-		page = vm_normal_page(vma, addr, pte);
+		page = vm_normal_page(vma, addr, pte, &page_index);
 		if (page) {
 			if (pte_dirty(pte))
 				set_page_dirty(page);
Index: linux-2.6/mm/mempolicy.c
===================================================================
--- linux-2.6.orig/mm/mempolicy.c	2007-09-25 10:21:38.000000000 -0700
+++ linux-2.6/mm/mempolicy.c	2007-09-25 10:22:59.000000000 -0700
@@ -227,12 +227,17 @@ static int check_pte_range(struct vm_are
 	do {
 		struct page *page;
 		int nid;
+		int pages = 1;
+		int page_index;
 
 		if (!pte_present(*pte))
-			continue;
-		page = vm_normal_page(vma, addr, *pte);
+			goto next;
+		page = vm_normal_page(vma, addr, *pte, &page_index);
 		if (!page)
-			continue;
+			goto next;
+
+		pages = page_cache_base_pages(page);
+
 		/*
 		 * The check for PageReserved here is important to avoid
 		 * handling zero pages and other pages that may have been
@@ -245,10 +250,10 @@ static int check_pte_range(struct vm_are
 		 * to put zero pages on the migration list.
 		 */
 		if (PageReserved(page))
-			continue;
+			goto next;
 		nid = page_to_nid(page);
 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
-			continue;
+			goto next;
 
 		if (flags & MPOL_MF_STATS)
 			gather_stats(page, private, pte_dirty(*pte));
@@ -256,7 +261,11 @@ static int check_pte_range(struct vm_are
 			migrate_page_add(page, private, flags);
 		else
 			break;
-	} while (pte++, addr += PAGE_SIZE, addr != end);
+next:
+		pte += pages;
+		addr += pages << PAGE_SHIFT;
+	} while (addr != end);
+
 	pte_unmap_unlock(orig_pte, ptl);
 	return addr != end;
 }
@@ -592,9 +601,12 @@ static void migrate_page_add(struct page
 		isolate_lru_page(page, pagelist);
 }
 
-static struct page *new_node_page(struct page *page, unsigned long node, int **x)
+static struct page *new_node_page(struct page *page,
+			unsigned long node, int **x)
 {
-	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
+	return alloc_pages_node(node,
+			GFP_HIGHUSER_MOVABLE | __GFP_COMP,
+			page_cache_page_order(page));
 }
 
 /*
Index: linux-2.6/mm/migrate.c
===================================================================
--- linux-2.6.orig/mm/migrate.c	2007-09-25 10:22:46.000000000 -0700
+++ linux-2.6/mm/migrate.c	2007-09-25 10:22:59.000000000 -0700
@@ -196,15 +196,18 @@ static void remove_file_migration_ptes(s
 	struct address_space *mapping = page_mapping(new);
 	struct prio_tree_iter iter;
 	pgoff_t pgoff = new->index << mapping_order(mapping);
+	int i;
 
 	if (!mapping)
 		return;
 
 	spin_lock(&mapping->i_mmap_lock);
 
-	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
-		remove_migration_pte(vma, old, new);
-
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		for (i = 0; i < page_cache_base_pages(old); i++)
+			remove_migration_pte(vma, compound_nth_page(old, i),
+						compound_nth_page(new, i));
+	}
 	spin_unlock(&mapping->i_mmap_lock);
 }
 
@@ -355,7 +358,11 @@ static int migrate_page_move_mapping(str
  */
 static void migrate_page_copy(struct page *newpage, struct page *page)
 {
-	copy_highpage(newpage, page);
+	int i;
+
+	for (i = 0; i < page_cache_base_pages(page); i++)
+		copy_highpage(compound_nth_page(newpage, i),
+				compound_nth_page(page, i));
 
 	if (PageError(page))
 		SetPageError(newpage);
@@ -785,7 +792,8 @@ static struct page *new_page_node(struct
 	*result = &pm->status;
 
 	return alloc_pages_node(pm->node,
-				GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
+			GFP_HIGHUSER_MOVABLE | GFP_THISNODE | __GFP_COMP,
+			page_cache_page_order(p));
 }
 
 /*
@@ -829,6 +837,7 @@ static int do_move_pages(struct mm_struc
 		if (PageReserved(page))		/* Check for zero page */
 			goto put_and_set;
 
+		page = page_cache_head(page);
 		pp->page = page;
 		err = page_to_nid(page);
 
Index: linux-2.6/fs/proc/task_mmu.c
===================================================================
--- linux-2.6.orig/fs/proc/task_mmu.c	2007-09-25 10:22:50.000000000 -0700
+++ linux-2.6/fs/proc/task_mmu.c	2007-09-25 10:22:59.000000000 -0700
@@ -235,7 +235,7 @@ static void smaps_pte_range(struct vm_ar
 
 		mss->resident += PAGE_SIZE;
 
-		page = vm_normal_page(vma, addr, ptent);
+		page = vm_normal_page(vma, addr, ptent, NULL);
 		if (!page)
 			continue;
 
@@ -272,7 +272,7 @@ static void clear_refs_pte_range(struct
 		if (!pte_present(ptent))
 			continue;
 
-		page = vm_normal_page(vma, addr, ptent);
+		page = vm_normal_page(vma, addr, ptent, NULL);
 		if (!page)
 			continue;
 
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h	2007-09-25 10:22:50.000000000 -0700
+++ linux-2.6/include/linux/mm.h	2007-09-25 10:22:59.000000000 -0700
@@ -368,6 +368,15 @@ static inline struct page *compound_nth_
 	return vmalloc_nth_page(page, n);
 }
 
+static inline int compound_index(struct page*head, struct page *tail,
+						unsigned long tail_addr)
+{
+	if (likely(!PageVcompound(head)))
+		return tail - head;
+	return (tail_addr - (unsigned long)vmalloc_address(head))
+							>> PAGE_SHIFT;
+}
+
 static inline void set_compound_page_dtor(struct page *page,
 						compound_page_dtor *dtor)
 {
@@ -816,7 +825,8 @@ struct zap_details {
 	unsigned long truncate_count;		/* Compare vm_truncate_count */
 };
 
-struct page *vm_normal_page(struct vm_area_struct *, unsigned long, pte_t);
+struct page *vm_normal_page(struct vm_area_struct *, unsigned long, pte_t,
+								int *);
 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
 		unsigned long size, struct zap_details *);
 unsigned long unmap_vmas(struct mmu_gather **tlb,