Mmap support for higher order page cache

The higher order page cache can contain compound pages that span multiple
regular pages. The head page is special in that it holds the state of the
compound page; the tail pages of such a compound page carry no state at all.

mmap maps files in PAGE_SIZE chunks, since hardware support via the page
tables exists only for regular pages and not for compound pages. So we need
a way to allow mapping PAGE_SIZE chunks of compound pages.

This can be done by splitting the role of the page struct as the source of
the mapping address from its role as a state keeper. The state keeper for
the whole compound page is the head page. The address can be determined from
whichever page within the compound page a struct page * pointer happens to
point to. So for memory management purposes we keep returning pointers to
arbitrary page structs; however, these may only be used to determine the
address. For state manipulation we must first determine the head page and
then operate on the state in the head page struct.

We add a couple of new operations to pagemap.h (an illustrative usage sketch
follows the patch below):

/*
 * Determine the head page given an arbitrary page within
 * a compound page.
 */
struct page *page_cache_head(struct page *sub_page);
	Fallback: return sub_page;

/*
 * Determine the offset of a subpage within a compound page.
 */
int page_cache_subpage_index(struct page *sub_page);
	Fallback: return 0;

/*
 * Determine a subpage of a compound page.
 */
struct page *page_cache_subpage(struct page *head_page, int index);
	Fallback: return head_page;

/*
 * Convert an mmap (base page) index into a page cache pgoff and into the
 * subpage index within the compound page.
 */
pgoff_t page_cache_mmap_index_to_pgoff(struct address_space *mapping, pgoff_t index);
int page_cache_mmap_index_to_subpage_index(struct address_space *mapping, pgoff_t index);

---

 arch/x86_64/kernel/traps.c |    1 
 fs/buffer.c                |    4 -
 include/linux/pagemap.h    |   55 ++++++++++++++
 mm/filemap.c               |   43 +++++++----
 mm/memory.c                |   56 ++++++++++----
 mm/rmap.c                  |  169 +++++++++++++++++++++++++--------------------
 6 files changed, 224 insertions(+), 104 deletions(-)

Index: linux-2.6.22-rc4-mm2/mm/filemap.c
===================================================================
--- linux-2.6.22-rc4-mm2.orig/mm/filemap.c	2007-06-21 21:46:30.000000000 -0700
+++ linux-2.6.22-rc4-mm2/mm/filemap.c	2007-06-21 21:46:45.000000000 -0700
@@ -1359,6 +1359,18 @@ static int fastcall page_cache_read(stru
  * The goto's are kind of ugly, but this streamlines the normal case of having
  * it in the page cache, and handles the special cases reasonably without
  * having a lot of duplicated code.
+ *
+ * The code has an intermediate conversion step for the higher order page cache.
+ * The data coming in via fault_data is in terms of base pages. However, the
+ * page cache must be accessed in terms of the potentially higher order mapping.
+ *
+ * We convert the base page reference into a subpage index and a page
+ * cache conformant pgoff. The page returned is a pointer to the desired
+ * base page within the compound page of the page cache.
+ *
+ * However, the caller may not keep status information in that page. Status
+ * operations (such as locking) must be performed on the head
+ * of the compound page.
*/ struct page *filemap_fault(struct vm_area_struct *vma, struct fault_data *fdata) { @@ -1370,13 +1382,18 @@ struct page *filemap_fault(struct vm_are struct page *page; unsigned long size; int did_readaround = 0; + pgoff_t pgoff = page_cache_mmap_index_to_pgoff(mapping, fdata->pgoff); + int subpage_index = page_cache_mmap_index_to_subpage_index(mapping, fdata->pgoff); + if (mapping_order(mapping)) + printk("filemap_fault: pgoff=%ld subpage_index=%d\n", + pgoff, subpage_index); fdata->type = VM_FAULT_MINOR; BUG_ON(!(vma->vm_flags & VM_CAN_INVALIDATE)); size = page_cache_next(mapping, i_size_read(inode)); - if (fdata->pgoff >= size) + if (pgoff >= size) goto outside_data_content; /* If we don't want any read-ahead, don't bother */ @@ -1387,21 +1404,21 @@ struct page *filemap_fault(struct vm_are * Do we have something in the page cache already? */ retry_find: - page = find_lock_page(mapping, fdata->pgoff); + page = find_lock_page(mapping, pgoff); /* * For sequential accesses, we use the generic readahead logic. */ if (VM_SequentialReadHint(vma)) { if (!page) { page_cache_readahead_ondemand(mapping, ra, file, page, - fdata->pgoff, 1); - page = find_lock_page(mapping, fdata->pgoff); + pgoff, 1); + page = find_lock_page(mapping, pgoff); if (!page) goto no_cached_page; } if (PageReadahead(page)) { page_cache_readahead_ondemand(mapping, ra, file, page, - fdata->pgoff, 1); + pgoff, 1); } } @@ -1430,11 +1447,11 @@ retry_find: if (ra_pages) { pgoff_t start = 0; - if (fdata->pgoff > ra_pages / 2) - start = fdata->pgoff - ra_pages / 2; + if (pgoff > ra_pages / 2) + start = pgoff - ra_pages / 2; do_page_cache_readahead(mapping, file, start, ra_pages); } - page = find_lock_page(mapping, fdata->pgoff); + page = find_lock_page(mapping, pgoff); if (!page) goto no_cached_page; } @@ -1451,7 +1468,7 @@ retry_find: /* Must recheck i_size under page lock */ size = page_cache_next(mapping, i_size_read(inode)); - if (unlikely(fdata->pgoff >= size)) { + if (unlikely(pgoff >= size)) { unlock_page(page); goto outside_data_content; } @@ -1461,7 +1478,7 @@ retry_find: */ mark_page_accessed(page); ra->prev_index = page->index; - return page; + return page + subpage_index; outside_data_content: /* @@ -1478,7 +1495,7 @@ no_cached_page: * We're only likely to ever get here if MADV_RANDOM is in * effect. */ - error = page_cache_read(file, fdata->pgoff); + error = page_cache_read(file, pgoff); /* * The page we want has now been added to the page cache. @@ -1539,8 +1556,8 @@ int generic_file_mmap(struct file * file /* * Forbid mmap access to higher order mappings. */ - if (mapping_order(mapping)) - return -ENOSYS; +// if (mapping_order(mapping)) +// return -ENOSYS; if (!mapping->a_ops->readpage) return -ENOEXEC; Index: linux-2.6.22-rc4-mm2/mm/memory.c =================================================================== --- linux-2.6.22-rc4-mm2.orig/mm/memory.c 2007-06-21 21:42:34.000000000 -0700 +++ linux-2.6.22-rc4-mm2/mm/memory.c 2007-06-21 21:46:45.000000000 -0700 @@ -384,6 +384,9 @@ static inline int is_cow_mapping(unsigne * and if that isn't true, the page has been COW'ed (in which case it * _does_ have a "struct page" associated with it even if it is in a * VM_PFNMAP range). + * + * In case that there is a file backing the vma then vm_normal_page may + * return a pointer to a subpage. 
*/ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { @@ -641,9 +644,15 @@ static unsigned long zap_pte_range(struc (*zap_work) -= PAGE_SIZE; if (pte_present(ptent)) { + struct page *sub_page; struct page *page; - page = vm_normal_page(vma, addr, ptent); + sub_page = vm_normal_page(vma, addr, ptent); + if (sub_page) + page = page_cache_head(sub_page); + else + page = NULL; + if (unlikely(details) && page) { /* * unmap_shared_mapping_pages() wants to @@ -671,7 +680,7 @@ static unsigned long zap_pte_range(struc && linear_page_index(details->nonlinear_vma, addr) != page->index) set_pte_at(mm, addr, pte, - pgoff_to_pte(page->index)); + pgoff_to_pte(page->index + page_cache_subpage_index(sub_page))); if (PageAnon(page)) anon_rss--; else { @@ -682,7 +691,7 @@ static unsigned long zap_pte_range(struc file_rss--; } page_remove_rmap(page, vma); - tlb_remove_page(tlb, page); + tlb_remove_page(tlb, sub_page); continue; } /* @@ -1636,15 +1645,18 @@ static int do_wp_page(struct mm_struct * unsigned long address, pte_t *page_table, pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte) { - struct page *old_page, *new_page; + struct page *old_page, *old_sub_page, *new_page; pte_t entry; int reuse = 0, ret = VM_FAULT_MINOR; struct page *dirty_page = NULL; - old_page = vm_normal_page(vma, address, orig_pte); - if (!old_page) + old_page = NULL; + old_sub_page = vm_normal_page(vma, address, orig_pte); + if (!old_sub_page) goto gotten; + old_page = page_cache_head(old_sub_page); + /* * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. @@ -1722,7 +1734,7 @@ gotten: new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (!new_page) goto oom; - cow_user_page(new_page, old_page, address, vma); + cow_user_page(new_page, old_sub_page, address, vma); } /* @@ -2314,12 +2326,17 @@ static int __do_fault(struct mm_struct * pgoff_t pgoff, unsigned int flags, pte_t orig_pte) { spinlock_t *ptl; - struct page *page, *faulted_page; + struct page *page, *faulted_sub_page, *faulted_page; pte_t entry; int anon = 0; + int subpage_index = 0; struct page *dirty_page = NULL; struct fault_data fdata; + if (vma->vm_file && vma->vm_file->f_mapping && + mapping_order(vma->vm_file->f_mapping)) + printk("__do_fault: pmd=%p pgoff=%ld flags=%lx pte=%lx\n", + pmd, pgoff, flags, orig_pte); fdata.address = address & PAGE_MASK; fdata.pgoff = pgoff; fdata.flags = flags; @@ -2329,22 +2346,29 @@ static int __do_fault(struct mm_struct * if (likely(vma->vm_ops->fault)) { fdata.type = -1; - faulted_page = vma->vm_ops->fault(vma, &fdata); + faulted_sub_page = vma->vm_ops->fault(vma, &fdata); WARN_ON(fdata.type == -1); - if (unlikely(!faulted_page)) + if (unlikely(!faulted_sub_page)) return fdata.type; } else { /* Legacy ->nopage path */ fdata.type = VM_FAULT_MINOR; - faulted_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, + faulted_sub_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &fdata.type); /* no page was available -- either SIGBUS or OOM */ - if (unlikely(faulted_page == NOPAGE_SIGBUS)) + if (unlikely(faulted_sub_page == NOPAGE_SIGBUS)) return VM_FAULT_SIGBUS; - else if (unlikely(faulted_page == NOPAGE_OOM)) + else if (unlikely(faulted_sub_page == NOPAGE_OOM)) return VM_FAULT_OOM; } + faulted_page = page_cache_head(faulted_sub_page); + subpage_index = page_cache_subpage_index(faulted_sub_page); + if (vma->vm_file && vma->vm_file->f_mapping && + mapping_order(vma->vm_file->f_mapping)) + printk("----- Higher page order=%d pmd=%p pgoff=%ld " + "flags=%lx pte=%lx\n", 
+			compound_order(faulted_page),
+			pmd, pgoff, flags, orig_pte);
 	/*
 	 * For consistency in subsequent calls, make the faulted_page always
 	 * locked.
@@ -2371,7 +2395,7 @@ static int __do_fault(struct mm_struct *
 				fdata.type = VM_FAULT_OOM;
 				goto out;
 			}
-			copy_user_highpage(page, faulted_page, address, vma);
+			copy_user_highpage(page, faulted_sub_page, address, vma);
 		} else {
 			/*
 			 * If the page will be shareable, see if the backing
@@ -2405,8 +2429,8 @@ static int __do_fault(struct mm_struct *
 	 */
 	/* Only go through if we didn't race with anybody else... */
 	if (likely(pte_same(*page_table, orig_pte))) {
-		flush_icache_page(vma, page);
-		entry = mk_pte(page, vma->vm_page_prot);
+		flush_icache_page(vma, page + subpage_index);
+		entry = mk_pte(page + subpage_index, vma->vm_page_prot);
 		if (flags & FAULT_FLAG_WRITE)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		set_pte_at(mm, address, page_table, entry);
Index: linux-2.6.22-rc4-mm2/include/linux/pagemap.h
===================================================================
--- linux-2.6.22-rc4-mm2.orig/include/linux/pagemap.h	2007-06-21 21:46:29.000000000 -0700
+++ linux-2.6.22-rc4-mm2/include/linux/pagemap.h	2007-06-21 21:46:45.000000000 -0700
@@ -3,6 +3,10 @@
 /*
  * Copyright 1995 Linus Torvalds
+ *
+ * (C) 2007 SGI Support for higher order pages
+ * Christoph Lameter
+ *
  */
 #include
 #include
@@ -132,6 +136,31 @@ static inline unsigned int page_cache_of
 {
 	return pos & a->offset_mask;
 }
+
+static inline struct page *page_cache_head(struct page *sub_page)
+{
+	return compound_head(sub_page);
+}
+
+static inline int page_cache_subpage_index(struct page *sub_page)
+{
+	return sub_page - compound_head(sub_page);
+}
+
+static inline struct page *page_cache_subpage(struct page *page, int index)
+{
+	return page + index;
+}
+
+static inline pgoff_t page_cache_mmap_index_to_pgoff(struct address_space *a, pgoff_t index)
+{
+	return index >> mapping_order(a);
+}
+
+static inline int page_cache_mmap_index_to_subpage_index(struct address_space *a, pgoff_t index)
+{
+	return index & ((1 << mapping_order(a)) - 1);
+}
 #else
 /*
  * Kernel configured for a fixed PAGE_SIZEd page cache
@@ -182,6 +211,32 @@ static inline unsigned int page_cache_of
 {
 	return pos & ~PAGE_MASK;
 }
+
+static inline struct page *page_cache_head(struct page *sub_page)
+{
+	return sub_page;
+}
+
+static inline int page_cache_subpage_index(struct page *sub_page)
+{
+	return 0;
+}
+
+static inline struct page *page_cache_subpage(struct page *page, int index)
+{
+	BUG_ON(index);
+	return page;
+}
+
+static inline pgoff_t page_cache_mmap_index_to_pgoff(struct address_space *a, pgoff_t index)
+{
+	return index;
+}
+
+static inline int page_cache_mmap_index_to_subpage_index(struct address_space *a, pgoff_t index)
+{
+	return 0;
+}
 #endif

 static inline pgoff_t page_cache_index(struct address_space *a,
Index: linux-2.6.22-rc4-mm2/arch/x86_64/kernel/traps.c
===================================================================
--- linux-2.6.22-rc4-mm2.orig/arch/x86_64/kernel/traps.c	2007-06-21 21:42:34.000000000 -0700
+++ linux-2.6.22-rc4-mm2/arch/x86_64/kernel/traps.c	2007-06-21 21:46:45.000000000 -0700
@@ -753,6 +753,7 @@ asmlinkage void __kprobes do_general_pro
 	tsk->thread.error_code = error_code;
 	tsk->thread.trap_no = 13;
+	dump_stack();
 	if (exception_trace && unhandled_signal(tsk, SIGSEGV))
 		printk(KERN_INFO
 		       "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
Index: linux-2.6.22-rc4-mm2/fs/buffer.c
===================================================================
---
linux-2.6.22-rc4-mm2.orig/fs/buffer.c 2007-06-21 21:46:29.000000000 -0700 +++ linux-2.6.22-rc4-mm2/fs/buffer.c 2007-06-21 21:46:45.000000000 -0700 @@ -698,7 +698,9 @@ static int __set_page_dirty(struct page if (TestSetPageDirty(page)) return 0; - + BUG_ON(PageTail(page)); + if (PageTail(page)) + return 0; write_lock_irq(&mapping->tree_lock); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); Index: linux-2.6.22-rc4-mm2/mm/rmap.c =================================================================== --- linux-2.6.22-rc4-mm2.orig/mm/rmap.c 2007-06-21 21:46:13.000000000 -0700 +++ linux-2.6.22-rc4-mm2/mm/rmap.c 2007-06-21 21:46:48.000000000 -0700 @@ -250,6 +250,10 @@ unsigned long page_address_in_vma(struct * Check that @page is mapped at @address into @mm. * * On success returns with pte mapped and locked. + * + * Checking is only possible on a page not on a compound page. + * For compound page checking this must be called for each page + * that is a part of the compound page. */ pte_t *page_check_address(struct page *page, struct mm_struct *mm, unsigned long address, spinlock_t **ptlp) @@ -301,17 +305,21 @@ static int page_referenced_one(struct pa pte_t *pte; spinlock_t *ptl; int referenced = 0; + int i; + BUG_ON(PageTail(page)); address = vma_address(page, vma); if (address == -EFAULT) goto out; - pte = page_check_address(page, mm, address, &ptl); - if (!pte) - goto out; + for (i = 0; i < compound_pages(page); i++, address += PAGE_SIZE) { + pte = page_check_address(page, mm, address, &ptl); + if (!pte) + continue; - if (ptep_clear_flush_young(vma, address, pte)) - referenced++; + if (ptep_clear_flush_young(vma, address, pte)) + referenced++; + } /* Pretend the page is referenced if the task has the swap token and is in the middle of a page fault. */ @@ -372,6 +380,7 @@ static int page_referenced_file(struct p * sure that this is a file page: the check for page->mapping * excludes the case just before it gets set on an anon page. 
*/ + BUG_ON(PageTail(page)); BUG_ON(PageAnon(page)); /* @@ -417,6 +426,7 @@ int page_referenced(struct page *page, i { int referenced = 0; + BUG_ON(PageTail(page)); if (page_test_and_clear_young(page)) referenced++; @@ -446,28 +456,31 @@ static int page_mkclean_one(struct page pte_t *pte; spinlock_t *ptl; int ret = 0; + int i; + BUG_ON(PageTail(page)); address = vma_address(page, vma); if (address == -EFAULT) goto out; - pte = page_check_address(page, mm, address, &ptl); - if (!pte) - goto out; + for (i = 0; i < compound_pages(page); i++, address += PAGE_SIZE) { + pte = page_check_address(page, mm, address, &ptl); + if (!pte) + continue; - if (pte_dirty(*pte) || pte_write(*pte)) { - pte_t entry; + if (pte_dirty(*pte) || pte_write(*pte)) { + pte_t entry; - flush_cache_page(vma, address, pte_pfn(*pte)); - entry = ptep_clear_flush(vma, address, pte); - entry = pte_wrprotect(entry); - entry = pte_mkclean(entry); - set_pte_at(mm, address, pte, entry); - lazy_mmu_prot_update(entry); - ret = 1; + flush_cache_page(vma, address, pte_pfn(*pte)); + entry = ptep_clear_flush(vma, address, pte); + entry = pte_wrprotect(entry); + entry = pte_mkclean(entry); + set_pte_at(mm, address, pte, entry); + lazy_mmu_prot_update(entry); + ret = 1; + } + pte_unmap_unlock(pte, ptl); } - - pte_unmap_unlock(pte, ptl); out: return ret; } @@ -479,6 +492,7 @@ static int page_mkclean_file(struct addr struct prio_tree_iter iter; int ret = 0; + BUG_ON(PageTail(page)); BUG_ON(PageAnon(page)); spin_lock(&mapping->i_mmap_lock); @@ -494,6 +508,7 @@ int page_mkclean(struct page *page) { int ret = 0; + BUG_ON(PageTail(page)); BUG_ON(!PageLocked(page)); if (page_mapped(page)) { @@ -595,6 +610,7 @@ void page_add_anon_rmap(struct page *pag void page_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { + BUG_ON(PageCompound(page)); BUG_ON(address < vma->vm_start || address >= vma->vm_end); atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ __page_set_anon_rmap(page, vma, address); @@ -640,6 +656,7 @@ void page_dup_rmap(struct page *page, st */ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) { + BUG_ON(PageTail(page)); if (atomic_add_negative(-1, &page->_mapcount)) { if (unlikely(page_mapcount(page) < 0)) { printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); @@ -688,76 +705,80 @@ static int try_to_unmap_one(struct page pte_t pteval; spinlock_t *ptl; int ret = SWAP_AGAIN; + int i; + BUG_ON(PageTail(page)); address = vma_address(page, vma); if (address == -EFAULT) goto out; - pte = page_check_address(page, mm, address, &ptl); - if (!pte) - goto out; + for (i = 0; i < compound_pages(page); i++, address += PAGE_SIZE) { + pte = page_check_address(page, mm, address, &ptl); + if (!pte) + continue; - /* - * If the page is mlock()d, we cannot swap it out. - * If it's recently referenced (perhaps page_referenced - * skipped over this mm) then we should reactivate it. - */ - if (!migration && ((vma->vm_flags & VM_LOCKED) || - (ptep_clear_flush_young(vma, address, pte)))) { - ret = SWAP_FAIL; - goto out_unmap; - } + /* + * If the page is mlock()d, we cannot swap it out. + * If it's recently referenced (perhaps page_referenced + * skipped over this mm) then we should reactivate it. + */ + if (!migration && ((vma->vm_flags & VM_LOCKED) || + (ptep_clear_flush_young(vma, address, pte)))) { + ret = SWAP_FAIL; + goto out_unmap; + } - /* Nuke the page table entry. 
*/ - flush_cache_page(vma, address, page_to_pfn(page)); - pteval = ptep_clear_flush(vma, address, pte); + /* Nuke the page table entry. */ + flush_cache_page(vma, address, page_to_pfn(page)); + pteval = ptep_clear_flush(vma, address, pte); - /* Move the dirty bit to the physical page now the pte is gone. */ - if (pte_dirty(pteval)) - set_page_dirty(page); + /* Move the dirty bit to the physical page now the pte is gone. */ + if (pte_dirty(pteval)) + set_page_dirty(page); - /* Update high watermark before we lower rss */ - update_hiwater_rss(mm); + /* Update high watermark before we lower rss */ + update_hiwater_rss(mm); - if (PageAnon(page)) { - swp_entry_t entry = { .val = page_private(page) }; + if (PageAnon(page)) { + swp_entry_t entry = { .val = page_private(page) }; - if (PageSwapCache(page)) { - /* - * Store the swap location in the pte. - * See handle_pte_fault() ... - */ - swap_duplicate(entry); - if (list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - if (list_empty(&mm->mmlist)) - list_add(&mm->mmlist, &init_mm.mmlist); - spin_unlock(&mmlist_lock); - } - dec_mm_counter(mm, anon_rss); + if (PageSwapCache(page)) { + /* + * Store the swap location in the pte. + * See handle_pte_fault() ... + */ + swap_duplicate(entry); + if (list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + if (list_empty(&mm->mmlist)) + list_add(&mm->mmlist, &init_mm.mmlist); + spin_unlock(&mmlist_lock); + } + dec_mm_counter(mm, anon_rss); #ifdef CONFIG_MIGRATION - } else { - /* - * Store the pfn of the page in a special migration - * pte. do_swap_page() will wait until the migration - * pte is removed and then restart fault handling. - */ - BUG_ON(!migration); - entry = make_migration_entry(page, pte_write(pteval)); + } else { + /* + * Store the pfn of the page in a special migration + * pte. do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. + */ + BUG_ON(!migration); + entry = make_migration_entry(page, pte_write(pteval)); #endif - } - set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); - BUG_ON(pte_file(*pte)); - } else + } + set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); + BUG_ON(pte_file(*pte)); + } else #ifdef CONFIG_MIGRATION - if (migration) { - /* Establish migration entry for a file page */ - swp_entry_t entry; - entry = make_migration_entry(page, pte_write(pteval)); - set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); - } else + if (migration) { + /* Establish migration entry for a file page */ + swp_entry_t entry; + entry = make_migration_entry(page, pte_write(pteval)); + set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); + } else #endif - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, file_rss); + } page_remove_rmap(page, vma);
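Usage sketch (referenced from the changelog above): the following is a minimal,
userspace-only model of how the fault path is expected to use the new
conversions — split the mmap index (in base pages) into a page cache pgoff
plus a subpage index, keep state on the head page found at that pgoff, and
build the pte from head + subpage_index. DEMO_ORDER, struct demo_page and the
demo_* helpers are stand-ins invented for this example; only the arithmetic
mirrors the patch.

/*
 * Illustrative model only, not kernel code. Mirrors the arithmetic of
 * page_cache_mmap_index_to_pgoff(), page_cache_mmap_index_to_subpage_index()
 * and page_cache_subpage() from the patch.
 */
#include <assert.h>
#include <stdio.h>

#define DEMO_ORDER 2			/* stand-in for mapping_order(): 4 base pages per compound page */

struct demo_page { int id; };		/* stand-in for struct page */

/* pgoff of the compound page in the page cache, given an mmap (base page) index */
static unsigned long demo_mmap_index_to_pgoff(unsigned long index)
{
	return index >> DEMO_ORDER;
}

/* offset of the base page inside its compound page */
static int demo_mmap_index_to_subpage_index(unsigned long index)
{
	return index & ((1 << DEMO_ORDER) - 1);
}

/* struct page of the base page, given the compound head */
static struct demo_page *demo_subpage(struct demo_page *head, int subpage_index)
{
	return head + subpage_index;
}

int main(void)
{
	/* A fake compound page: element 0 plays the role of the head page. */
	struct demo_page compound[1 << DEMO_ORDER] = { {0}, {1}, {2}, {3} };
	unsigned long mmap_index = 6;	/* seventh base page of the mapping */

	unsigned long pgoff = demo_mmap_index_to_pgoff(mmap_index);
	int sub = demo_mmap_index_to_subpage_index(mmap_index);

	/* State (locking, dirty, ...) stays on compound[0], the head page;
	 * the address for the pte comes from head + sub. */
	struct demo_page *base = demo_subpage(compound, sub);

	assert(pgoff == 1 && sub == 2 && base == &compound[2]);
	printf("mmap index %lu -> page cache pgoff %lu, subpage %d\n",
	       mmap_index, pgoff, sub);
	return 0;
}

In the patch itself the same pattern appears in __do_fault():
faulted_page = page_cache_head(faulted_sub_page), subpage_index =
page_cache_subpage_index(faulted_sub_page), and finally
mk_pte(page + subpage_index, vma->vm_page_prot).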