mm_struct counter deltas in task_struct

Introduce counter deltas in the task_struct. Instead of updating the
counters in the mm_struct via inc_mm_counter() etc., one may now use
inc_mm_delta(). inc_mm_delta() increments a delta in the task_struct.
The delta is later folded into the corresponding mm_struct counter
during schedule(). The advantage is that operations on the deltas do
not require any locks.

The delta counters may be used for a variety of purposes outside of
the page fault scalability patchset (e.g. the existing tlb "freed"
handling may be switched to use this method).

The method of folding the counters in schedule() may require some
scrutiny. We only take the page_table_lock if it is available;
otherwise the counter updates are deferred until the next schedule().
If the page_table_lock is busy for extended periods of time then a
large number of deltas may accumulate, and the RSS reported through
/proc may lag a bit as a result. One may want to add other points
where the mm counters are updated.

The main problem in the past with using current to store mm_struct
counter information was primarily get_user_pages(). The approach here
solves that issue in the following way: get_user_pages() first folds
any pending deltas into current->mm. Then it performs the
handle_mm_fault() processing, which may accumulate new deltas in
current. PF_NOMMCOUNTER is set to disable counter consolidation in
schedule(), which would otherwise add the deltas to the wrong mm. The
resulting deltas are folded into the target mm after the
page_table_lock has been acquired for the last time in
get_user_pages(), and PF_NOMMCOUNTER is cleared again.

This patch only introduces the counter deltas and is independent of
the page fault scalability patches. It does not make the page fault
scalability patchset use the deltas.
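
As an illustration (not part of this patch), a fault path that today
does its counter updates under the page_table_lock could be switched
over like this. do_anonymous_page() is merely an assumed call site;
the before/after fragments are a sketch:

	/*
	 * Before: inc_mm_counter() requires the page_table_lock
	 * (or atomic counters with CONFIG_SPLIT_PTLOCK).
	 */
	inc_mm_counter(mm, rss);
	inc_mm_counter(mm, anon_rss);

	/*
	 * After: lock-free. The increments accumulate in
	 * current->delta_rss / current->delta_anon_rss and are
	 * folded into the mm_struct counters at the next schedule().
	 */
	inc_mm_delta(rss);
	inc_mm_delta(anon_rss);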
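
Readers of the counters may likewise force a catch-up first so that
the reported values lag less. A sketch, assuming the reader runs in
the context of the task that owns the deltas and that the
get_mm_counter() accessor is available:

	/*
	 * Fold any deltas this task still carries before reading.
	 * force == 0 makes mm_counter_catchup() use spin_trylock()
	 * and give up if the page_table_lock is contended, deferring
	 * the fold to a later schedule().
	 */
	if (mm_counter_updates_pending(current))
		mm_counter_catchup(current, current->mm, 0);
	rss = get_mm_counter(current->mm, rss);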

Signed-off-by: Christoph Lameter

Index: linux-2.6.13-rc6-mm1/include/linux/sched.h
===================================================================
--- linux-2.6.13-rc6-mm1.orig/include/linux/sched.h	2005-09-01 10:24:58.000000000 -0700
+++ linux-2.6.13-rc6-mm1/include/linux/sched.h	2005-09-01 10:36:08.000000000 -0700
@@ -264,6 +264,16 @@ typedef atomic_t mm_counter_t;
 typedef unsigned long mm_counter_t;
 #endif /* !CONFIG_SPLIT_PTLOCK */
 
+/*
+ * mm_counter operations through the deltas in task_struct
+ * that do not require holding the page_table_lock.
+ */
+#define inc_mm_delta(member) current->delta_##member++
+#define dec_mm_delta(member) current->delta_##member--
+
+#define mm_counter_updates_pending(__p) \
+	((__p)->delta_nr_ptes | (__p)->delta_rss | (__p)->delta_anon_rss)
+
 struct mm_struct {
 	struct vm_area_struct * mmap;		/* list of VMAs */
 	struct rb_root mm_rb;
@@ -699,6 +709,15 @@ struct task_struct {
 
 	struct mm_struct *mm, *active_mm;
 
+	/*
+	 * Deltas for corresponding counters in mm_struct which require
+	 * the page_table_lock. The deltas may be updated and are later
+	 * folded into the corresponding mm_struct counters.
+	 */
+	long delta_rss;
+	long delta_anon_rss;
+	long delta_nr_ptes;
+
 /* task state */
 	struct linux_binfmt *binfmt;
 	long exit_state;
@@ -888,6 +907,7 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define PF_SYNCWRITE	0x00200000	/* I am doing a sync write */
 #define PF_BORROWED_MM	0x00400000	/* I am a kthread doing use_mm */
 #define PF_RANDOMIZE	0x00800000	/* randomize virtual address space */
+#define PF_NOMMCOUNTER	0x01000000	/* No delta processing for mm_struct */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
@@ -1416,6 +1436,9 @@ static inline void thaw_processes(void)
 static inline int try_to_freeze(void) { return 0; }
 #endif /* CONFIG_PM */
 
+extern void mm_counter_catchup(struct task_struct *t,
+				struct mm_struct *mm, int force);
+
 #endif /* __KERNEL__ */
 #endif
Index: linux-2.6.13-rc6-mm1/kernel/fork.c
===================================================================
--- linux-2.6.13-rc6-mm1.orig/kernel/fork.c	2005-09-01 10:24:58.000000000 -0700
+++ linux-2.6.13-rc6-mm1/kernel/fork.c	2005-09-01 10:32:36.000000000 -0700
@@ -173,6 +173,9 @@ static struct task_struct *dup_task_stru
 	*ti = *orig->thread_info;
 	*tsk = *orig;
 	tsk->thread_info = ti;
+	tsk->delta_rss = 0;
+	tsk->delta_anon_rss = 0;
+	tsk->delta_nr_ptes = 0;
 	ti->task = tsk;
 
 	/* One for us, one for whoever does the "release_task()" (usually parent) */
@@ -424,6 +427,10 @@ void mm_release(struct task_struct *tsk,
 {
 	struct completion *vfork_done = tsk->vfork_done;
 
+	/* If we are still carrying deltas then apply them now */
+	if (mm && mm_counter_updates_pending(tsk))
+		mm_counter_catchup(tsk, mm, 1);
+
 	/* Get rid of any cached register state */
 	deactivate_mm(tsk, mm);
Index: linux-2.6.13-rc6-mm1/mm/memory.c
===================================================================
--- linux-2.6.13-rc6-mm1.orig/mm/memory.c	2005-09-01 10:24:58.000000000 -0700
+++ linux-2.6.13-rc6-mm1/mm/memory.c	2005-09-01 10:35:52.000000000 -0700
@@ -857,6 +857,27 @@ untouched_anonymous_page(struct mm_struc
 	return 0;
 }
 
+/*
+ * Update the mm_struct counters protected by the page_table_lock
+ * using the deltas in the task_struct.
+ */
+void mm_counter_catchup(struct task_struct *t, struct mm_struct *mm, int force)
+{
+	if (force)
+		spin_lock(&mm->page_table_lock);
+	else
+		if (!spin_trylock(&mm->page_table_lock))
+			return;
+
+	add_mm_counter(mm, rss, t->delta_rss);
+	add_mm_counter(mm, anon_rss, t->delta_anon_rss);
+	add_mm_counter(mm, nr_ptes, t->delta_nr_ptes);
+	t->delta_rss = 0;
+	t->delta_anon_rss = 0;
+	t->delta_nr_ptes = 0;
+	spin_unlock(&mm->page_table_lock);
+}
+
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long start, int len, int write, int force,
 		struct page **pages, struct vm_area_struct **vmas)
@@ -872,6 +893,15 @@ int get_user_pages(struct task_struct *t
 	flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
 	i = 0;
 
+	if (mm != current->mm && mm_counter_updates_pending(current))
+		/*
+		 * Access to a foreign mm requires that we first fold
+		 * our pending deltas into our own mm so that we do
+		 * not carry any deltas into the fault handling below.
+		 */
+		mm_counter_catchup(current, current->mm, 1);
+
+
 	do {
 		struct vm_area_struct * vma;
@@ -922,6 +952,7 @@ int get_user_pages(struct task_struct *t
 						&start, &len, i);
 			continue;
 		}
+		current->flags |= PF_NOMMCOUNTER;
 		do {
 			int write_access = write;
 			struct page *page;
@@ -960,8 +991,10 @@ int get_user_pages(struct task_struct *t
 					tsk->maj_flt++;
 					break;
 				case VM_FAULT_SIGBUS:
+					current->flags &= ~PF_NOMMCOUNTER;
 					return i ? i : -EFAULT;
 				case VM_FAULT_OOM:
+					current->flags &= ~PF_NOMMCOUNTER;
 					return i ? i : -ENOMEM;
 				default:
 					BUG();
@@ -977,6 +1010,15 @@ int get_user_pages(struct task_struct *t
 			start += PAGE_SIZE;
 			len--;
 		} while (len && start < vma->vm_end);
+		if (mm != current->mm && mm_counter_updates_pending(current))
+			/*
+			 * Foreign mm: fold the accumulated deltas into
+			 * the foreign mm now, otherwise they would later
+			 * be added to the mm_struct of this process.
+			 */
+			mm_counter_catchup(current, mm, 1);
+		current->flags &= ~PF_NOMMCOUNTER;
+
 	} while (len);
 	return i;
 }
Index: linux-2.6.13-rc6-mm1/kernel/sched.c
===================================================================
--- linux-2.6.13-rc6-mm1.orig/kernel/sched.c	2005-08-19 11:47:49.000000000 -0700
+++ linux-2.6.13-rc6-mm1/kernel/sched.c	2005-09-01 10:34:10.000000000 -0700
@@ -2917,6 +2917,12 @@ asmlinkage void __sched schedule(void)
 	}
 	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
 
+	/* If we have the opportunity then update the mm counters */
+	if (unlikely(current->mm
+			&& mm_counter_updates_pending(current)
+			&& !(current->flags & PF_NOMMCOUNTER)))
+		mm_counter_catchup(current, current->mm, 0);
+
 need_resched:
 	preempt_disable();
 	prev = current;