Add NR_MLOCK: track mlocked pages via a ZVC

Mlocked pages are taken off the LRU and are only put back once there
is no VM_LOCKED vma left that references the page.

Signed-off-by: Christoph Lameter

Index: linux-2.6.20-rc7/include/linux/mmzone.h
===================================================================
--- linux-2.6.20-rc7.orig/include/linux/mmzone.h	2007-01-30 19:42:57.000000000 -0800
+++ linux-2.6.20-rc7/include/linux/mmzone.h	2007-01-31 17:41:24.000000000 -0800
@@ -57,6 +57,7 @@ enum zone_stat_item {
 	NR_FILE_DIRTY,
 	NR_WRITEBACK,
 	NR_UNSTABLE_NFS,	/* NFS unstable pages */
+	NR_MLOCK,		/* Mlocked pages */
 	NR_BOUNCE,
 	NR_VMSCAN_WRITE,
 #ifdef CONFIG_NUMA
Index: linux-2.6.20-rc7/mm/memory.c
===================================================================
--- linux-2.6.20-rc7.orig/mm/memory.c	2007-01-30 19:42:57.000000000 -0800
+++ linux-2.6.20-rc7/mm/memory.c	2007-04-02 22:30:24.000000000 -0700
@@ -682,6 +682,31 @@ static unsigned long zap_pte_range(struc
 				file_rss--;
 			}
 			page_remove_rmap(page, vma);
+			if (vma->vm_flags & VM_LOCKED) {
+				/*
+				 * Now for the expensive part ... Must take
+				 * the page lock to hold off new vma references.
+				 */
+				lock_page(page);
+				if (!mlock_vma_count(page) &&
+						TestClearPageMlocked(page)) {
+					/* The mlock state of the page changed */
+					dec_zone_page_state(page, NR_MLOCK);
+					/*
+					 * The page count may be temporarily
+					 * elevated, but that just means a
+					 * useless add to the LRU.
+					 */
+					if (page_count(page) > 1)
+						/*
+						 * The page will survive the
+						 * freeing, so add it to
+						 * the LRU.
+						 */
+						lru_cache_add_active(page);
+				}
+				unlock_page(page);
+			}
 			tlb_remove_page(tlb, page);
 			continue;
 		}
@@ -898,6 +923,41 @@ unsigned long zap_page_range(struct vm_a
 }
 
 /*
+ * Add a newly allocated page to the mlocked pages
+ */
+static void add_mlocked(struct page *page)
+{
+	SetPageMlocked(page);
+	inc_zone_page_state(page, NR_MLOCK);
+}
+
+/*
+ * Make an existing page an mlocked page. The page must either already be
+ * PageMlocked, or we mark it mlocked and remove it from its LRU.
+ */
+static void check_mlock(struct vm_area_struct *vma, struct page *page)
+{
+	if ((vma->vm_flags & VM_LOCKED) && !TestSetPageMlocked(page)) {
+		lru_release(page);
+		inc_zone_page_state(page, NR_MLOCK);
+	}
+}
+
+/*
+ * Add an anonymous page
+ */
+void anon_add(struct vm_area_struct *vma, struct page *page,
+		unsigned long address)
+{
+	inc_mm_counter(vma->vm_mm, anon_rss);
+	if (vma->vm_flags & VM_LOCKED)
+		add_mlocked(page);
+	else
+		lru_cache_add_active(page);
+	page_add_new_anon_rmap(page, vma, address);
+}
+
+/*
  * Do a quick page-table lookup for a single page.
  */
 struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
@@ -949,6 +1008,7 @@ struct page *follow_page(struct vm_area_
 	if (unlikely(!page))
 		goto unlock;
+	check_mlock(vma, page);
 	if (flags & FOLL_GET)
 		get_page(page);
 	if (flags & FOLL_TOUCH) {
@@ -1023,8 +1083,9 @@ int get_user_pages(struct task_struct *t
 			if (pages) {
 				struct page *page = vm_normal_page(gate_vma, start, *pte);
 				pages[i] = page;
-				if (page)
+				if (page) {
 					get_page(page);
+				}
 			}
 			pte_unmap(pte);
 			if (vmas)
@@ -2104,9 +2165,7 @@ static int do_anonymous_page(struct mm_s
 		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 		if (!pte_none(*page_table))
 			goto release;
-		inc_mm_counter(mm, anon_rss);
-		lru_cache_add_active(page);
-		page_add_new_anon_rmap(page, vma, address);
+		anon_add(vma, page, address);
 	} else {
 		/* Map the ZERO_PAGE - vm_page_prot is readonly */
 		page = ZERO_PAGE(address);
@@ -2250,12 +2309,11 @@ retry:
 	if (write_access)
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 	set_pte_at(mm, address, page_table, entry);
-	if (anon) {
-		inc_mm_counter(mm, anon_rss);
-		lru_cache_add_active(new_page);
-		page_add_new_anon_rmap(new_page, vma, address);
-	} else {
+	if (anon) {
+		anon_add(vma, new_page, address);
+	} else {
 		inc_mm_counter(mm, file_rss);
+		check_mlock(vma, new_page);
 		page_add_file_rmap(new_page);
 		if (write_access) {
 			dirty_page = new_page;
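For illustration only, not part of the patch: the case the zap path above must
get right is a page that is mlocked in more than one address space. The page
may only return to the LRU once the last VM_LOCKED mapping goes away. A
minimal userspace reproducer, assuming a kernel with this patch applied (a
MAP_SHARED anonymous page is shmem-backed, so the first mlock() populates it
via the do_no_page()/check_mlock() path; 4096 assumes the usual page size, and
memory locks are not inherited across fork(), so the child locks the page
itself):

#include <stdlib.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	/* One shared page that two mms will map and mlock */
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_SHARED | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED || mlock(p, 4096))
		exit(1);	/* mlock may need RLIMIT_MEMLOCK or CAP_IPC_LOCK */

	switch (fork()) {
	case 0:
		mlock(p, 4096);	/* lock the same page from a second mm */
		exit(0);	/* exit zaps the child's pte; the parent's
				   VM_LOCKED vma must keep the page mlocked */
	case -1:
		exit(1);
	}
	wait(NULL);
	munmap(p, 4096);	/* last locked mapping goes away */
	return 0;
}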
Index: linux-2.6.20-rc7/drivers/base/node.c
===================================================================
--- linux-2.6.20-rc7.orig/drivers/base/node.c	2007-01-30 19:42:57.000000000 -0800
+++ linux-2.6.20-rc7/drivers/base/node.c	2007-01-31 17:40:35.000000000 -0800
@@ -65,6 +65,7 @@ static ssize_t node_read_meminfo(struct
 		       "Node %d FilePages:    %8lu kB\n"
 		       "Node %d Mapped:       %8lu kB\n"
 		       "Node %d AnonPages:    %8lu kB\n"
+		       "Node %d Mlock:        %8lu kB\n"
 		       "Node %d PageTables:   %8lu kB\n"
 		       "Node %d NFS_Unstable: %8lu kB\n"
 		       "Node %d Bounce:       %8lu kB\n"
@@ -87,6 +88,7 @@ static ssize_t node_read_meminfo(struct
 		       nid, K(node_page_state(nid, NR_FILE_PAGES)),
 		       nid, K(node_page_state(nid, NR_FILE_MAPPED)),
 		       nid, K(node_page_state(nid, NR_ANON_PAGES)),
+		       nid, K(node_page_state(nid, NR_MLOCK)),
 		       nid, K(node_page_state(nid, NR_PAGETABLE)),
 		       nid, K(node_page_state(nid, NR_UNSTABLE_NFS)),
 		       nid, K(node_page_state(nid, NR_BOUNCE)),
Index: linux-2.6.20-rc7/fs/proc/proc_misc.c
===================================================================
--- linux-2.6.20-rc7.orig/fs/proc/proc_misc.c	2007-01-30 19:42:57.000000000 -0800
+++ linux-2.6.20-rc7/fs/proc/proc_misc.c	2007-01-31 17:40:35.000000000 -0800
@@ -171,6 +171,7 @@ static int meminfo_read_proc(char *page,
 		"Writeback:    %8lu kB\n"
 		"AnonPages:    %8lu kB\n"
 		"Mapped:       %8lu kB\n"
+		"Mlock:        %8lu kB\n"
 		"Slab:         %8lu kB\n"
 		"SReclaimable: %8lu kB\n"
 		"SUnreclaim:   %8lu kB\n"
@@ -201,6 +202,7 @@ static int meminfo_read_proc(char *page,
 		K(global_page_state(NR_WRITEBACK)),
 		K(global_page_state(NR_ANON_PAGES)),
 		K(global_page_state(NR_FILE_MAPPED)),
+		K(global_page_state(NR_MLOCK)),
 		K(global_page_state(NR_SLAB_RECLAIMABLE) +
 				global_page_state(NR_SLAB_UNRECLAIMABLE)),
 		K(global_page_state(NR_SLAB_RECLAIMABLE)),
Index: linux-2.6.20-rc7/include/linux/page-flags.h
===================================================================
--- linux-2.6.20-rc7.orig/include/linux/page-flags.h	2007-01-30 19:42:57.000000000 -0800
+++ linux-2.6.20-rc7/include/linux/page-flags.h	2007-01-31 17:40:35.000000000 -0800
@@ -91,6 +91,7 @@
 #define PG_nosave_free		18	/* Used for system suspend/resume */
 #define PG_buddy		19	/* Page is free, on buddy lists */
 
+#define PG_mlocked		21	/* Page is mlocked */
 
 #if (BITS_PER_LONG > 32)
 /*
@@ -251,6 +252,17 @@ static inline void SetPageUptodate(struc
 #define SetPageUncached(page)	set_bit(PG_uncached, &(page)->flags)
 #define ClearPageUncached(page)	clear_bit(PG_uncached, &(page)->flags)
 
+#define PageMlocked(page) \
+		test_bit(PG_mlocked, &(page)->flags)
+#define SetPageMlocked(page) \
+		set_bit(PG_mlocked, &(page)->flags)
+#define ClearPageMlocked(page) \
+		clear_bit(PG_mlocked, &(page)->flags)
+#define TestSetPageMlocked(page) \
+		test_and_set_bit(PG_mlocked, &(page)->flags)
+#define TestClearPageMlocked(page) \
+		test_and_clear_bit(PG_mlocked, &(page)->flags)
+
 struct page;	/* forward declaration */
 
 extern void cancel_dirty_page(struct page *page, unsigned int account_size);
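With the reporting hooks above, the global count appears as the Mlock line in
/proc/meminfo and the per-node count in the sysfs node meminfo. For
illustration, a trivial reader for the node 0 value (the path assumes node 0
exists; further nodes appear as node1, node2, ...):

#include <stdio.h>
#include <string.h>

/* Print the "Node 0 Mlock:" line added to the node meminfo above */
int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/devices/system/node/node0/meminfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (strstr(line, "Mlock:"))
			fputs(line, stdout);
	fclose(f);
	return 0;
}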
Index: linux-2.6.20-rc7/include/linux/swap.h
===================================================================
--- linux-2.6.20-rc7.orig/include/linux/swap.h	2007-01-30 19:42:57.000000000 -0800
+++ linux-2.6.20-rc7/include/linux/swap.h	2007-01-31 17:40:35.000000000 -0800
@@ -184,6 +184,8 @@ extern void lru_add_drain(void);
 extern int lru_add_drain_all(void);
 extern int rotate_reclaimable_page(struct page *page);
 extern void swap_setup(void);
+extern void lru_release(struct page *page);
+extern int mlock_vma_count(struct page *page);
 
 /* linux/mm/vmscan.c */
 extern unsigned long try_to_free_pages(struct zone **, gfp_t);
Index: linux-2.6.20-rc7/mm/swap.c
===================================================================
--- linux-2.6.20-rc7.orig/mm/swap.c	2007-01-30 19:42:57.000000000 -0800
+++ linux-2.6.20-rc7/mm/swap.c	2007-01-31 17:40:35.000000000 -0800
@@ -35,10 +35,9 @@ int page_cluster;
 
 /*
- * This path almost never happens for VM activity - pages are normally
- * freed via pagevecs. But it gets used by networking.
+ * Release a page from the LRU. Needed by mlock.
  */
-static void fastcall __page_cache_release(struct page *page)
+void lru_release(struct page *page)
 {
 	if (PageLRU(page)) {
 		unsigned long flags;
@@ -50,6 +49,15 @@ static void fastcall __page_cache_releas
 		del_page_from_lru(zone, page);
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 	}
+}
+
+/*
+ * This path almost never happens for VM activity - pages are normally
+ * freed via pagevecs. But it gets used by networking.
+ */
+static void fastcall __page_cache_release(struct page *page)
+{
+	lru_release(page);
 	free_hot_page(page);
 }
Index: linux-2.6.20-rc7/mm/mlock.c
===================================================================
--- linux-2.6.20-rc7.orig/mm/mlock.c	2007-01-30 19:42:57.000000000 -0800
+++ linux-2.6.20-rc7/mm/mlock.c	2007-01-31 19:00:21.000000000 -0800
@@ -10,6 +10,116 @@
 #include <linux/mm.h>
 #include <linux/mempolicy.h>
 #include <linux/syscalls.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+
+/*
+ * Check if the page is mapped by the vma
+ */
+static int page_in_vma(struct vm_area_struct *vma, struct page *page)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+	unsigned long addr = page_address_in_vma(page, vma);
+	int rc = 0;
+
+	if (addr == -EFAULT || PageReserved(page))
+		goto out;
+
+	pgd = pgd_offset(mm, addr);
+	if (!pgd_present(*pgd))
+		goto out;
+
+	pud = pud_offset(pgd, addr);
+	if (!pud_present(*pud))
+		goto out;
+
+	pmd = pmd_offset(pud, addr);
+	if (!pmd_present(*pmd))
+		goto out;
+
+	ptep = pte_offset_map(pmd, addr);
+
+	if (is_swap_pte(*ptep)) {
+		pte_unmap(ptep);
+		goto out;
+	}
+
+	ptl = pte_lockptr(mm, pmd);
+	spin_lock(ptl);
+	pte = *ptep;
+	if (!is_swap_pte(pte))
+		rc = page == vm_normal_page(vma, addr, pte);
+	pte_unmap_unlock(ptep, ptl);
+out:
+	return rc;
+}
+
+static int mlock_vma_count_file(struct page *page)
+{
+	struct vm_area_struct *vma;
+	struct address_space *mapping = page_mapping(page);
+	struct prio_tree_iter iter;
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	int count = 0;
+
+	if (!mapping)
+		return 0;
+
+	spin_lock(&mapping->i_mmap_lock);
+
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
+		if ((vma->vm_flags & VM_LOCKED) && page_in_vma(vma, page))
+			count++;
+
+	spin_unlock(&mapping->i_mmap_lock);
+	return count;
+}
+
+/*
+ * Must hold the mmap_sem for at least one of the vmas containing
+ * the page so that the anon_vma cannot vanish.
+ */
+static int mlock_vma_count_anon(struct page *page)
+{
+	struct anon_vma *anon_vma;
+	struct vm_area_struct *vma;
+	unsigned long mapping;
+	int count = 0;
+
+	mapping = (unsigned long)page->mapping;
+
+	if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
+		return 0;
+
+	/*
+	 * We hold the mmap_sem, so there is no need to call
+	 * page_lock_anon_vma.
+	 */
+	anon_vma = (struct anon_vma *)(mapping - PAGE_MAPPING_ANON);
+	spin_lock(&anon_vma->lock);
+
+	list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
+		if ((vma->vm_flags & VM_LOCKED) && page_in_vma(vma, page))
+			count++;
+
+	spin_unlock(&anon_vma->lock);
+	return count;
+}
+
+/*
+ * Count the vmas that still map the page and have VM_LOCKED set
+ */
+int mlock_vma_count(struct page *page)
+{
+	if (PageAnon(page))
+		return mlock_vma_count_anon(page);
+	else
+		return mlock_vma_count_file(page);
+}
 
 static int mlock_fixup(struct vm_area_struct *vma,
 		struct vm_area_struct **prev,
@@ -63,6 +173,10 @@ success:
 		pages = -pages;
 		if (!(newflags & VM_IO))
 			ret = make_pages_present(start, end);
+	} else {
+		/* We are clearing VM_LOCKED. Scan the range for
+		 * mlocked pages and clear their mlocked state.
+		 */
 	}
 
 	mm->locked_vm -= pages;
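The else branch in mlock_fixup above is still a stub. For illustration, a
sketch of what the munlock scan could look like, built only from the helpers
this patch introduces (munlock_scan is a hypothetical name and not part of the
patch). mlock_fixup runs under mmap_sem, which the anon walk in
mlock_vma_count relies on, and VM_LOCKED has already been cleared on the vma
at this point, so follow_page()'s check_mlock() will not re-mark the pages:

/*
 * Hypothetical sketch as it could sit in mm/mlock.c: walk a range whose
 * vma just lost VM_LOCKED and give pages back to the LRU when no other
 * locked vma still maps them. Mirrors the zap_pte_range() logic above.
 */
static void munlock_scan(struct vm_area_struct *vma,
		unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		/* FOLL_GET pins the page while we inspect it */
		struct page *page = follow_page(vma, addr, FOLL_GET);

		if (!page)
			continue;
		lock_page(page);
		if (!mlock_vma_count(page) && TestClearPageMlocked(page)) {
			dec_zone_page_state(page, NR_MLOCK);
			lru_cache_add_active(page);
		}
		unlock_page(page);
		put_page(page);
	}
}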
Index: linux-2.6.20-rc7/include/linux/rmap.h
===================================================================
--- linux-2.6.20-rc7.orig/include/linux/rmap.h	2007-01-30 19:42:57.000000000 -0800
+++ linux-2.6.20-rc7/include/linux/rmap.h	2007-01-31 17:40:35.000000000 -0800
@@ -128,6 +128,11 @@ static inline int page_mkclean(struct pa
 
 #endif	/* CONFIG_MMU */
 
+static inline int is_swap_pte(pte_t pte)
+{
+	return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
+}
+
 /*
  * Return values of try_to_unmap
  */
Index: linux-2.6.20-rc7/mm/migrate.c
===================================================================
--- linux-2.6.20-rc7.orig/mm/migrate.c	2007-01-30 19:42:57.000000000 -0800
+++ linux-2.6.20-rc7/mm/migrate.c	2007-01-31 17:40:35.000000000 -0800
@@ -115,11 +115,6 @@ int putback_lru_pages(struct list_head *
 	return count;
 }
 
-static inline int is_swap_pte(pte_t pte)
-{
-	return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
-}
-
 /*
  * Restore a potential migration pte to a working pte entry
  */
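Finally, a quick end-to-end check (an illustrative test program, not part of
the patch): after mlock() of 32 MB the Mlock value should rise by roughly
32768 kB. Since the munlock side is still a stub in this RFC, the counter may
not drop back after munlock():

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

/* Read the Mlock: value (in kB) from /proc/meminfo */
static long mlock_kb(void)
{
	char line[256];
	long kb = -1;
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f))
		if (sscanf(line, "Mlock: %ld kB", &kb) == 1)
			break;
	fclose(f);
	return kb;
}

int main(void)
{
	size_t len = 32 << 20;	/* 32 MB */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;

	printf("Mlock before: %ld kB\n", mlock_kb());
	if (mlock(buf, len))	/* may need ulimit -l or CAP_IPC_LOCK */
		return 1;
	printf("Mlock locked: %ld kB\n", mlock_kb());
	munlock(buf, len);
	printf("Mlock after : %ld kB\n", mlock_kb());
	return 0;
}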