From: Balbir Singh This patch adds support for accounting and control of virtual address space limits. The accounting is done via the memrlimit_cgroup_(un)charge_as functions. The core of the accounting takes place during fork time in copy_process(), may_expand_vm(), remove_vma_list() and exit_mmap(). Signed-off-by: Balbir Singh Cc: Sudhir Kumar Cc: YAMAMOTO Takashi Cc: Paul Menage Cc: Li Zefan Cc: Pavel Emelianov Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: David Rientjes Cc: Vivek Goyal Signed-off-by: Andrew Morton --- arch/x86/kernel/ptrace.c | 18 ++++- include/linux/memrlimitcgroup.h | 21 ++++++ kernel/fork.c | 8 ++ mm/memrlimitcgroup.c | 92 ++++++++++++++++++++++++++++++ mm/mmap.c | 17 ++++- 5 files changed, 149 insertions(+), 7 deletions(-) diff -puN arch/x86/kernel/ptrace.c~memrlimit-add-memrlimit-controller-accounting-and-control arch/x86/kernel/ptrace.c --- a/arch/x86/kernel/ptrace.c~memrlimit-add-memrlimit-controller-accounting-and-control +++ a/arch/x86/kernel/ptrace.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -782,21 +783,25 @@ static int ptrace_bts_realloc(struct tas current->mm->total_vm -= old_size; current->mm->locked_vm -= old_size; + memrlimit_cgroup_uncharge_as(mm, old_size); if (size == 0) goto out; + if (memrlimit_cgroup_charge_as(current->mm, size)) + goto out; + rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; vm = current->mm->total_vm + size; if (rlim < vm) { ret = -ENOMEM; if (!reduce_size) - goto out; + goto out_uncharge; size = rlim - current->mm->total_vm; if (size <= 0) - goto out; + goto out_uncharge; } rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; @@ -805,21 +810,24 @@ static int ptrace_bts_realloc(struct tas ret = -ENOMEM; if (!reduce_size) - goto out; + goto out_uncharge; size = rlim - current->mm->locked_vm; if (size <= 0) - goto out; + goto out_uncharge; } ret = ds_allocate((void **)&child->thread.ds_area_msr, size << PAGE_SHIFT); if (ret < 0) - goto out; + goto 
out_uncharge; current->mm->total_vm += size; current->mm->locked_vm += size; +out_uncharge: + if (ret < 0) + memrlimit_cgroup_uncharge_as(mm, size); out: if (child->thread.ds_area_msr) set_tsk_thread_flag(child, TIF_DS_AREA_MSR); diff -puN include/linux/memrlimitcgroup.h~memrlimit-add-memrlimit-controller-accounting-and-control include/linux/memrlimitcgroup.h --- a/include/linux/memrlimitcgroup.h~memrlimit-add-memrlimit-controller-accounting-and-control +++ a/include/linux/memrlimitcgroup.h @@ -16,4 +16,25 @@ #ifndef LINUX_MEMRLIMITCGROUP_H #define LINUX_MEMRLIMITCGROUP_H +#ifdef CONFIG_CGROUP_MEMRLIMIT_CTLR + +int memrlimit_cgroup_charge_as(struct mm_struct *mm, unsigned long nr_pages); +void memrlimit_cgroup_uncharge_as(struct mm_struct *mm, unsigned long nr_pages); + +#else /* !CONFIG_CGROUP_MEMRLIMIT_CTLR */ + +static inline int +memrlimit_cgroup_charge_as(struct mm_struct *mm, unsigned long nr_pages) +{ + return 0; +} + +static inline void +memrlimit_cgroup_uncharge_as(struct mm_struct *mm, unsigned long nr_pages) +{ +} + +#endif /* CONFIG_CGROUP_MEMRLIMIT_CTLR */ + + #endif /* LINUX_MEMRLIMITCGROUP_H */ diff -puN kernel/fork.c~memrlimit-add-memrlimit-controller-accounting-and-control kernel/fork.c --- a/kernel/fork.c~memrlimit-add-memrlimit-controller-accounting-and-control +++ a/kernel/fork.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -273,6 +274,7 @@ static int dup_mmap(struct mm_struct *mm mm->total_vm -= pages; vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, -pages); + memrlimit_cgroup_uncharge_as(mm, pages); continue; } charge = 0; @@ -610,6 +612,12 @@ static int copy_mm(unsigned long clone_f atomic_inc(&oldmm->mm_users); mm = oldmm; goto good_mm; + } else { + down_write(&oldmm->mmap_sem); + retval = memrlimit_cgroup_charge_as(oldmm, oldmm->total_vm); + up_write(&oldmm->mmap_sem); + if (retval) + goto fail_nomem; } retval = -ENOMEM; diff -puN 
mm/memrlimitcgroup.c~memrlimit-add-memrlimit-controller-accounting-and-control mm/memrlimitcgroup.c --- a/mm/memrlimitcgroup.c~memrlimit-add-memrlimit-controller-accounting-and-control +++ a/mm/memrlimitcgroup.c @@ -45,6 +45,38 @@ static struct memrlimit_cgroup *memrlimi struct memrlimit_cgroup, css); } +static struct memrlimit_cgroup * +memrlimit_cgroup_from_task(struct task_struct *p) +{ + return container_of(task_subsys_state(p, memrlimit_cgroup_subsys_id), + struct memrlimit_cgroup, css); +} + +/* + * Charge the cgroup for address space usage - mmap(), malloc() (through + * brk(), sbrk()), stack expansion, mremap(), etc - called with + * mmap_sem held. + */ +int memrlimit_cgroup_charge_as(struct mm_struct *mm, unsigned long nr_pages) +{ + struct memrlimit_cgroup *memrcg; + + memrcg = memrlimit_cgroup_from_task(mm->owner); + return res_counter_charge(&memrcg->as_res, (nr_pages << PAGE_SHIFT)); +} + +/* + * Uncharge the cgroup, as the address space of one of the tasks is + * decreasing - called with mmap_sem held. 
+ */ +void memrlimit_cgroup_uncharge_as(struct mm_struct *mm, unsigned long nr_pages) +{ + struct memrlimit_cgroup *memrcg; + + memrcg = memrlimit_cgroup_from_task(mm->owner); + res_counter_uncharge(&memrcg->as_res, (nr_pages << PAGE_SHIFT)); +} + static struct cgroup_subsys_state * memrlimit_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) { @@ -134,11 +166,71 @@ static int memrlimit_cgroup_populate(str ARRAY_SIZE(memrlimit_cgroup_files)); } +static void memrlimit_cgroup_move_task(struct cgroup_subsys *ss, + struct cgroup *cgrp, + struct cgroup *old_cgrp, + struct task_struct *p) +{ + struct mm_struct *mm; + struct memrlimit_cgroup *memrcg, *old_memrcg; + + mm = get_task_mm(p); + if (mm == NULL) + return; + + /* + * Hold mmap_sem, so that total_vm does not change underneath us + */ + down_read(&mm->mmap_sem); + + rcu_read_lock(); + if (p != rcu_dereference(mm->owner)) + goto out; + + memrcg = memrlimit_cgroup_from_cgrp(cgrp); + old_memrcg = memrlimit_cgroup_from_cgrp(old_cgrp); + + if (memrcg == old_memrcg) + goto out; + + if (res_counter_charge(&memrcg->as_res, (mm->total_vm << PAGE_SHIFT))) + goto out; + res_counter_uncharge(&old_memrcg->as_res, (mm->total_vm << PAGE_SHIFT)); +out: + rcu_read_unlock(); + up_read(&mm->mmap_sem); + mmput(mm); +} + +/* + * This callback is called with mmap_sem held + */ +static void memrlimit_cgroup_mm_owner_changed(struct cgroup_subsys *ss, + struct cgroup *cgrp, + struct cgroup *old_cgrp, + struct task_struct *p) +{ + struct memrlimit_cgroup *memrcg, *old_memrcg; + struct mm_struct *mm = get_task_mm(p); + + BUG_ON(!mm); + memrcg = memrlimit_cgroup_from_cgrp(cgrp); + old_memrcg = memrlimit_cgroup_from_cgrp(old_cgrp); + + if (res_counter_charge(&memrcg->as_res, (mm->total_vm << PAGE_SHIFT))) + goto out; + res_counter_uncharge(&old_memrcg->as_res, (mm->total_vm << PAGE_SHIFT)); +out: + mmput(mm); +} + struct cgroup_subsys memrlimit_cgroup_subsys = { .name = "memrlimit", .subsys_id = memrlimit_cgroup_subsys_id, .create = 
memrlimit_cgroup_create, .destroy = memrlimit_cgroup_destroy, .populate = memrlimit_cgroup_populate, + .attach = memrlimit_cgroup_move_task, + .mm_owner_changed = memrlimit_cgroup_mm_owner_changed, .early_init = 0, }; diff -puN mm/mmap.c~memrlimit-add-memrlimit-controller-accounting-and-control mm/mmap.c --- a/mm/mmap.c~memrlimit-add-memrlimit-controller-accounting-and-control +++ a/mm/mmap.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -1741,6 +1742,7 @@ static void remove_vma_list(struct mm_st long nrpages = vma_pages(vma); mm->total_vm -= nrpages; + memrlimit_cgroup_uncharge_as(mm, nrpages); if (vma->vm_flags & VM_LOCKED) mm->locked_vm -= nrpages; vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); @@ -1767,6 +1769,7 @@ static void unmap_region(struct mm_struc update_hiwater_rss(mm); unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); + memrlimit_cgroup_uncharge_as(mm, mm->total_vm); free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, next? 
next->vm_start: 0); tlb_finish_mmu(tlb, start, end); @@ -2089,6 +2092,9 @@ int insert_vm_struct(struct mm_struct * struct vm_area_struct * __vma, * prev; struct rb_node ** rb_link, * rb_parent; + if (memrlimit_cgroup_charge_as(mm, vma_pages(vma))) + return -ENOMEM; + /* * The vm_pgoff of a purely anonymous vma should be irrelevant * until its first write fault, when page's anon_vma and index @@ -2107,12 +2113,15 @@ int insert_vm_struct(struct mm_struct * } __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent); if (__vma && __vma->vm_start < vma->vm_end) - return -ENOMEM; + goto err; if ((vma->vm_flags & VM_ACCOUNT) && security_vm_enough_memory_mm(mm, vma_pages(vma))) - return -ENOMEM; + goto err; vma_link(mm, vma, prev, rb_link, rb_parent); return 0; +err: + memrlimit_cgroup_uncharge_as(mm, vma_pages(vma)); + return -ENOMEM; } /* @@ -2185,6 +2194,10 @@ int may_expand_vm(struct mm_struct *mm, if (cur + npages > lim) return 0; + + if (memrlimit_cgroup_charge_as(mm, npages)) + return 0; + return 1; } _