Replace atomic ops with counter deltas

This patch applies on top of the counter delta patches and the page fault
scalability patchset in 2.6.13-rc6-mm1. It switches the code paths that may
run without the page table lock to use inc_mm_delta instead of inc_mm_counter
(which requires the ptl or atomic operations). We can then remove the
definitions that made the mm_struct counters atomic.

As a consequence page_add_anon_rmap no longer requires the page_table_lock.
It now always increases the rss delta of the currently executing process
instead of increasing the rss of the mm belonging to the vma. Most of the
time this is fine; the exception is when unuse_mm() uses this function. In
that case the deferred counters need to be charged to the right mm_structs
as they are processed, similar to what was done for get_user_pages().

The use of deltas could be taken further and other places could be switched.
This would obviously be possible for places like unuse_pte() that now use a
mix of mm_counter and mm_delta operations.

If CONFIG_ATOMIC_TABLE_OPS is not defined for an arch we will still be using
the deltas. This helps somewhat in avoiding bouncing cachelines, but the
page_table_lock is still taken, which is the major scalability bottleneck.
Maybe we need to fall back to not using deltas there?
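To make the call sites in the diff easier to follow, here is a rough sketch
of how the helpers from the prerequisite counter delta patches are assumed to
behave. The per-task delta fields and the macro/function bodies below are
illustrative guesses, not the real definitions; in particular the meaning of
the third mm_counter_catchup() argument (page_table_lock already held by the
caller) is an assumption.

/*
 * Illustrative sketch only -- not the actual code from the counter
 * delta patches.  Assumes task_struct carries signed per-counter
 * delta fields (rss_delta, anon_rss_delta, nr_ptes_delta).
 */
#define inc_mm_delta(member)	(current->member##_delta++)

#define mm_counter_updates_pending(tsk) \
	((tsk)->rss_delta || (tsk)->anon_rss_delta || (tsk)->nr_ptes_delta)

/*
 * Fold the deltas accumulated in tsk into mm.  ptl_held is assumed to
 * mean that the caller already holds mm->page_table_lock.
 */
static void mm_counter_catchup(struct task_struct *tsk,
				struct mm_struct *mm, int ptl_held)
{
	if (!ptl_held)
		spin_lock(&mm->page_table_lock);
	add_mm_counter(mm, rss, tsk->rss_delta);
	add_mm_counter(mm, anon_rss, tsk->anon_rss_delta);
	add_mm_counter(mm, nr_ptes, tsk->nr_ptes_delta);
	tsk->rss_delta = tsk->anon_rss_delta = tsk->nr_ptes_delta = 0;
	if (!ptl_held)
		spin_unlock(&mm->page_table_lock);
}

With PF_NOMMCOUNTER set, the fault paths are assumed to keep accumulating
into these per-task fields instead of charging current->mm, so that
unuse_mm() can later charge the batch to the mm it is actually working on.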
Signed-off-by: Christoph Lameter

Index: linux-2.6.13-rc6-mm1/include/linux/sched.h
===================================================================
--- linux-2.6.13-rc6-mm1.orig/include/linux/sched.h	2005-08-22 12:34:51.000000000 -0700
+++ linux-2.6.13-rc6-mm1/include/linux/sched.h	2005-08-22 12:44:51.000000000 -0700
@@ -227,35 +227,8 @@ arch_get_unmapped_area_topdown(struct fi
 extern void arch_unmap_area(struct mm_struct *, unsigned long);
 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 
-#ifdef CONFIG_ATOMIC_TABLE_OPS
 /*
- * No spinlock is held during atomic page table operations. The
- * counters are not protected anymore and must also be
- * incremented atomically.
-*/
-#ifdef ATOMIC64_INIT
-#define set_mm_counter(mm, member, value) atomic64_set(&(mm)->_##member, value)
-#define get_mm_counter(mm, member) ((unsigned long)atomic64_read(&(mm)->_##member))
-#define add_mm_counter(mm, member, value) atomic64_add(value, &(mm)->_##member)
-#define inc_mm_counter(mm, member) atomic64_inc(&(mm)->_##member)
-#define dec_mm_counter(mm, member) atomic64_dec(&(mm)->_##member)
-typedef atomic64_t mm_counter_t;
-#else
-/*
- * This may limit process memory to 2^31 * PAGE_SIZE which may be around 8TB
- * if using 4KB page size
- */
-#define set_mm_counter(mm, member, value) atomic_set(&(mm)->_##member, value)
-#define get_mm_counter(mm, member) ((unsigned long)atomic_read(&(mm)->_##member))
-#define add_mm_counter(mm, member, value) atomic_add(value, &(mm)->_##member)
-#define inc_mm_counter(mm, member) atomic_inc(&(mm)->_##member)
-#define dec_mm_counter(mm, member) atomic_dec(&(mm)->_##member)
-typedef atomic_t mm_counter_t;
-#endif
-#else
-/*
- * No atomic page table operations. Counters are protected by
- * the page table lock
+ * Operations for mm_struct counters protected by the page table lock
  */
 #define set_mm_counter(mm, member, value) (mm)->_##member = (value)
 #define get_mm_counter(mm, member) ((mm)->_##member)
@@ -263,7 +236,6 @@ typedef atomic_t mm_counter_t;
 #define inc_mm_counter(mm, member) (mm)->_##member++
 #define dec_mm_counter(mm, member) (mm)->_##member--
 typedef unsigned long mm_counter_t;
-#endif
 
 /*
  * mm_counter operations through the deltas in task_struct
Index: linux-2.6.13-rc6-mm1/mm/memory.c
===================================================================
--- linux-2.6.13-rc6-mm1.orig/mm/memory.c	2005-08-22 12:34:51.000000000 -0700
+++ linux-2.6.13-rc6-mm1/mm/memory.c	2005-08-22 12:44:51.000000000 -0700
@@ -1849,7 +1849,7 @@ do_anonymous_page(struct mm_struct *mm,
 	 */
 	page_add_anon_rmap(page, vma, addr);
 	lru_cache_add_active(page);
-	inc_mm_counter(mm, rss);
+	inc_mm_delta(rss);
 
 	update_mmu_cache(vma, addr, entry);
 	lazy_mmu_prot_update(entry);
@@ -2199,7 +2199,7 @@ int __handle_mm_fault(struct mm_struct *
 		pte_free(new);
 	else {
 		inc_page_state(nr_page_table_pages);
-		inc_mm_counter(mm, nr_ptes);
+		inc_mm_delta(nr_ptes);
 	}
 }
 
Index: linux-2.6.13-rc6-mm1/mm/rmap.c
===================================================================
--- linux-2.6.13-rc6-mm1.orig/mm/rmap.c	2005-08-19 11:45:27.000000000 -0700
+++ linux-2.6.13-rc6-mm1/mm/rmap.c	2005-08-22 12:44:51.000000000 -0700
@@ -437,15 +437,13 @@ int page_referenced(struct page *page, i
  * @page:	the page to add the mapping to
  * @vma:	the vm area in which the mapping is added
  * @address:	the user virtual address mapped
- *
- * The caller needs to hold the mm->page_table_lock.
  */
 void page_add_anon_rmap(struct page *page,
 	struct vm_area_struct *vma, unsigned long address)
 {
 	BUG_ON(PageReserved(page));
 
-	inc_mm_counter(vma->vm_mm, anon_rss);
+	inc_mm_delta(anon_rss);
 
 	if (atomic_inc_and_test(&page->_mapcount)) {
 		struct anon_vma *anon_vma = vma->anon_vma;
Index: linux-2.6.13-rc6-mm1/mm/swapfile.c
===================================================================
--- linux-2.6.13-rc6-mm1.orig/mm/swapfile.c	2005-08-19 11:47:49.000000000 -0700
+++ linux-2.6.13-rc6-mm1/mm/swapfile.c	2005-08-22 12:44:51.000000000 -0700
@@ -508,6 +508,17 @@ static int unuse_mm(struct mm_struct *mm
 {
 	struct vm_area_struct *vma;
 
+	/*
+	 * Ensure that existing deltas are charged to the current mm since
+	 * we will charge the next batch manually to the target mm
+	 */
+	if (current->mm && mm_counter_updates_pending(current)) {
+		XXX
+		mm_counter_catchup(current, current->mm, 1);
+		XXX
+	}
+	current->flags |= PF_NOMMCOUNTER;
+
 	if (!down_read_trylock(&mm->mmap_sem)) {
 		/*
 		 * Activate page so shrink_cache is unlikely to unmap its
@@ -523,6 +534,14 @@ static int unuse_mm(struct mm_struct *mm
 		if (vma->anon_vma && unuse_vma(vma, entry, page))
 			break;
 	}
+
+	/*
+	 * Make sure all the deferred counters get charged
+	 * to the right mm_struct.
+	 */
+	mm_counter_catchup(current, mm, 1); XXXX
+	current->flags &= ~PF_NOMMCOUNTER;
+
 	spin_unlock(&mm->page_table_lock);
 	up_read(&mm->mmap_sem);
 	/*