From: Balbir Singh

This patch adds support for accounting and control of virtual address space
limits.  The accounting is done via the memrlimit_cgroup_(un)charge_as
functions.  The core of the accounting takes place at fork time in
copy_process() (via dup_mmap()) and in may_expand_vm(), remove_vma_list()
and exit_mmap().

Signed-off-by: Balbir Singh
Cc: Sudhir Kumar
Cc: YAMAMOTO Takashi
Cc: Paul Menage
Cc: Li Zefan
Cc: Pavel Emelianov
Cc: Balbir Singh
Cc: KAMEZAWA Hiroyuki
Cc: David Rientjes
Cc: Vivek Goyal
Cc: Hugh Dickins
Signed-off-by: Andrew Morton
---
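
[Editor's note] As a quick illustration of the charging discipline the diff
below follows (charge before an operation that may grow the address space,
uncharge on failure or at teardown), here is a minimal userspace model.
struct res_counter, charge_as(), uncharge_as() and insert_vma_pattern() are
toy stand-ins for illustration only, not the kernel API:

#include <stdio.h>

#define PAGE_SHIFT 12

struct res_counter {
	unsigned long usage;	/* current charge, in bytes */
	unsigned long limit;	/* hard limit, in bytes */
};

/* Models memrlimit_cgroup_charge_as(): nonzero means the limit was hit. */
static int charge_as(struct res_counter *rc, unsigned long nr_pages)
{
	unsigned long bytes = nr_pages << PAGE_SHIFT;

	if (rc->usage + bytes > rc->limit)
		return -1;
	rc->usage += bytes;
	return 0;
}

/* Models memrlimit_cgroup_uncharge_as(). */
static void uncharge_as(struct res_counter *rc, unsigned long nr_pages)
{
	rc->usage -= nr_pages << PAGE_SHIFT;
}

/*
 * The insert_vm_struct() pattern from the patch: charge up front, then
 * roll the charge back on any later failure so the counter never leaks.
 */
static int insert_vma_pattern(struct res_counter *rc, unsigned long pages,
			      int later_step_fails)
{
	if (charge_as(rc, pages))
		return -1;		/* -ENOMEM in the kernel */
	if (later_step_fails) {
		uncharge_as(rc, pages);	/* roll back before returning */
		return -1;
	}
	return 0;	/* charge persists until remove_vma_list()/exit_mmap() */
}

int main(void)
{
	struct res_counter rc = { .usage = 0, .limit = 64 << PAGE_SHIFT };

	printf("success path: %d, usage %lu pages\n",
	       insert_vma_pattern(&rc, 16, 0), rc.usage >> PAGE_SHIFT);
	printf("failure path: %d, usage %lu pages\n",
	       insert_vma_pattern(&rc, 16, 1), rc.usage >> PAGE_SHIFT);
	printf("over limit:   %d, usage %lu pages\n",
	       insert_vma_pattern(&rc, 64, 0), rc.usage >> PAGE_SHIFT);
	return 0;
}
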
 arch/x86/kernel/ptrace.c        |   18 ++++-
 include/linux/memrlimitcgroup.h |   21 ++++++
 kernel/fork.c                   |   14 ++++
 mm/memrlimitcgroup.c            |   92 ++++++++++++++++++++++++++++++
 mm/mmap.c                       |   17 ++++-
 5 files changed, 154 insertions(+), 8 deletions(-)

diff -puN arch/x86/kernel/ptrace.c~memrlimit-add-memrlimit-controller-accounting-and-control arch/x86/kernel/ptrace.c
--- a/arch/x86/kernel/ptrace.c~memrlimit-add-memrlimit-controller-accounting-and-control
+++ a/arch/x86/kernel/ptrace.c
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include <linux/memrlimitcgroup.h>
 #include
 #include
@@ -808,21 +809,25 @@ static int ptrace_bts_realloc(struct tas
 
 	current->mm->total_vm  -= old_size;
 	current->mm->locked_vm -= old_size;
+	memrlimit_cgroup_uncharge_as(current->mm, old_size);
 
 	if (size == 0)
 		goto out;
 
+	if (memrlimit_cgroup_charge_as(current->mm, size))
+		goto out;
+
 	rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
 	vm = current->mm->total_vm + size;
 	if (rlim < vm) {
 		ret = -ENOMEM;
 
 		if (!reduce_size)
-			goto out;
+			goto out_uncharge;
 
 		size = rlim - current->mm->total_vm;
 		if (size <= 0)
-			goto out;
+			goto out_uncharge;
 	}
 
 	rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
@@ -831,21 +836,24 @@ static int ptrace_bts_realloc(struct tas
 		ret = -ENOMEM;
 
 		if (!reduce_size)
-			goto out;
+			goto out_uncharge;
 
 		size = rlim - current->mm->locked_vm;
 		if (size <= 0)
-			goto out;
+			goto out_uncharge;
 	}
 
 	ret = ds_allocate((void **)&child->thread.ds_area_msr,
 			  size << PAGE_SHIFT);
 	if (ret < 0)
-		goto out;
+		goto out_uncharge;
 
 	current->mm->total_vm  += size;
 	current->mm->locked_vm += size;
 
+out_uncharge:
+	if (ret < 0)
+		memrlimit_cgroup_uncharge_as(current->mm, size);
 out:
 	if (child->thread.ds_area_msr)
 		set_tsk_thread_flag(child, TIF_DS_AREA_MSR);
diff -puN include/linux/memrlimitcgroup.h~memrlimit-add-memrlimit-controller-accounting-and-control include/linux/memrlimitcgroup.h
--- a/include/linux/memrlimitcgroup.h~memrlimit-add-memrlimit-controller-accounting-and-control
+++ a/include/linux/memrlimitcgroup.h
@@ -16,4 +16,25 @@
 #ifndef LINUX_MEMRLIMITCGROUP_H
 #define LINUX_MEMRLIMITCGROUP_H
 
+#ifdef CONFIG_CGROUP_MEMRLIMIT_CTLR
+
+int memrlimit_cgroup_charge_as(struct mm_struct *mm, unsigned long nr_pages);
+void memrlimit_cgroup_uncharge_as(struct mm_struct *mm, unsigned long nr_pages);
+
+#else /* !CONFIG_CGROUP_MEMRLIMIT_CTLR */
+
+static inline int
+memrlimit_cgroup_charge_as(struct mm_struct *mm, unsigned long nr_pages)
+{
+	return 0;
+}
+
+static inline void
+memrlimit_cgroup_uncharge_as(struct mm_struct *mm, unsigned long nr_pages)
+{
+}
+
+#endif /* CONFIG_CGROUP_MEMRLIMIT_CTLR */
+
+
 #endif /* LINUX_MEMRLIMITCGROUP_H */
diff -puN kernel/fork.c~memrlimit-add-memrlimit-controller-accounting-and-control kernel/fork.c
--- a/kernel/fork.c~memrlimit-add-memrlimit-controller-accounting-and-control
+++ a/kernel/fork.c
@@ -51,6 +51,7 @@
 #include
 #include
 #include
+#include <linux/memrlimitcgroup.h>
 #include
 #include
 #include
@@ -263,7 +264,7 @@ static int dup_mmap(struct mm_struct *mm
 	struct vm_area_struct *mpnt, *tmp, **pprev;
 	struct rb_node **rb_link, *rb_parent;
 	int retval;
-	unsigned long charge;
+	unsigned long charge, uncharged = 0;
 	struct mempolicy *pol;
 
 	down_write(&oldmm->mmap_sem);
@@ -273,6 +274,15 @@ static int dup_mmap(struct mm_struct *mm
 	 */
 	down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
 
+	/*
+	 * Uncharging as a result of failure is done by mmput()
+	 * in dup_mm()
+	 */
+	if (memrlimit_cgroup_charge_as(oldmm, oldmm->total_vm)) {
+		retval = -ENOMEM;
+		goto out;
+	}
+
 	mm->locked_vm = 0;
 	mm->mmap = NULL;
 	mm->mmap_cache = NULL;
@@ -293,6 +303,8 @@ static int dup_mmap(struct mm_struct *mm
 			mm->total_vm -= pages;
 			vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
 								-pages);
+			memrlimit_cgroup_uncharge_as(mm, pages);
+			uncharged += pages;
 			continue;
 		}
 		charge = 0;
diff -puN mm/memrlimitcgroup.c~memrlimit-add-memrlimit-controller-accounting-and-control mm/memrlimitcgroup.c
--- a/mm/memrlimitcgroup.c~memrlimit-add-memrlimit-controller-accounting-and-control
+++ a/mm/memrlimitcgroup.c
@@ -45,6 +45,38 @@ static struct memrlimit_cgroup *memrlimi
 			    struct memrlimit_cgroup, css);
 }
 
+static struct memrlimit_cgroup *
+memrlimit_cgroup_from_task(struct task_struct *p)
+{
+	return container_of(task_subsys_state(p, memrlimit_cgroup_subsys_id),
+				struct memrlimit_cgroup, css);
+}
+
+/*
+ * Charge the cgroup for address space usage - mmap(), malloc() (through
+ * brk(), sbrk()), stack expansion, mremap(), etc - called with
+ * mmap_sem held.
+ */
+int memrlimit_cgroup_charge_as(struct mm_struct *mm, unsigned long nr_pages)
+{
+	struct memrlimit_cgroup *memrcg;
+
+	memrcg = memrlimit_cgroup_from_task(mm->owner);
+	return res_counter_charge(&memrcg->as_res, (nr_pages << PAGE_SHIFT));
+}
+
+/*
+ * Uncharge the cgroup, as the address space of one of the tasks is
+ * decreasing - called with mmap_sem held.
+ */
+void memrlimit_cgroup_uncharge_as(struct mm_struct *mm, unsigned long nr_pages)
+{
+	struct memrlimit_cgroup *memrcg;
+
+	memrcg = memrlimit_cgroup_from_task(mm->owner);
+	res_counter_uncharge(&memrcg->as_res, (nr_pages << PAGE_SHIFT));
+}
+
 static struct cgroup_subsys_state *
 memrlimit_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
@@ -121,11 +153,71 @@ static int memrlimit_cgroup_populate(str
 					ARRAY_SIZE(memrlimit_cgroup_files));
 }
 
+static void memrlimit_cgroup_move_task(struct cgroup_subsys *ss,
+					struct cgroup *cgrp,
+					struct cgroup *old_cgrp,
+					struct task_struct *p)
+{
+	struct mm_struct *mm;
+	struct memrlimit_cgroup *memrcg, *old_memrcg;
+
+	mm = get_task_mm(p);
+	if (mm == NULL)
+		return;
+
+	/*
+	 * Hold mmap_sem, so that total_vm does not change underneath us
+	 */
+	down_read(&mm->mmap_sem);
+
+	rcu_read_lock();
+	if (p != rcu_dereference(mm->owner))
+		goto out;
+
+	memrcg = memrlimit_cgroup_from_cgrp(cgrp);
+	old_memrcg = memrlimit_cgroup_from_cgrp(old_cgrp);
+
+	if (memrcg == old_memrcg)
+		goto out;
+
+	if (res_counter_charge(&memrcg->as_res, (mm->total_vm << PAGE_SHIFT)))
+		goto out;
+	res_counter_uncharge(&old_memrcg->as_res, (mm->total_vm << PAGE_SHIFT));
+out:
+	rcu_read_unlock();
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+}
+
+/*
+ * This callback is called with mmap_sem held
+ */
+static void memrlimit_cgroup_mm_owner_changed(struct cgroup_subsys *ss,
+						struct cgroup *cgrp,
+						struct cgroup *old_cgrp,
+						struct task_struct *p)
+{
+	struct memrlimit_cgroup *memrcg, *old_memrcg;
+	struct mm_struct *mm = get_task_mm(p);
+
+	BUG_ON(!mm);
+	memrcg = memrlimit_cgroup_from_cgrp(cgrp);
+	old_memrcg = memrlimit_cgroup_from_cgrp(old_cgrp);
+
+	if (res_counter_charge(&memrcg->as_res, (mm->total_vm << PAGE_SHIFT)))
+		goto out;
+	res_counter_uncharge(&old_memrcg->as_res, (mm->total_vm << PAGE_SHIFT));
+out:
+	mmput(mm);
+}
+
 struct cgroup_subsys memrlimit_cgroup_subsys = {
 	.name = "memrlimit",
 	.subsys_id = memrlimit_cgroup_subsys_id,
 	.create = memrlimit_cgroup_create,
 	.destroy = memrlimit_cgroup_destroy,
 	.populate = memrlimit_cgroup_populate,
+	.attach = memrlimit_cgroup_move_task,
+	.mm_owner_changed = memrlimit_cgroup_mm_owner_changed,
 	.early_init = 0,
 };
diff -puN mm/mmap.c~memrlimit-add-memrlimit-controller-accounting-and-control mm/mmap.c
--- a/mm/mmap.c~memrlimit-add-memrlimit-controller-accounting-and-control
+++ a/mm/mmap.c
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include <linux/memrlimitcgroup.h>
 #include
 #include
 #include
@@ -1752,6 +1753,7 @@ static void remove_vma_list(struct mm_st
 		long nrpages = vma_pages(vma);
 
 		mm->total_vm -= nrpages;
+		memrlimit_cgroup_uncharge_as(mm, nrpages);
 		vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
 		vma = remove_vma(vma);
 	} while (vma);
@@ -2102,6 +2104,7 @@ void exit_mmap(struct mm_struct *mm)
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
 	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
+	memrlimit_cgroup_uncharge_as(mm, mm->total_vm);
 	free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(tlb, 0, end);
 
@@ -2124,6 +2127,9 @@ int insert_vm_struct(struct mm_struct *
 	struct vm_area_struct * __vma, * prev;
 	struct rb_node ** rb_link, * rb_parent;
 
+	if (memrlimit_cgroup_charge_as(mm, vma_pages(vma)))
+		return -ENOMEM;
+
 	/*
 	 * The vm_pgoff of a purely anonymous vma should be irrelevant
 	 * until its first write fault, when page's anon_vma and index
@@ -2142,12 +2148,15 @@ int insert_vm_struct(struct mm_struct *
 	}
 	__vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
 	if (__vma && __vma->vm_start < vma->vm_end)
-		return -ENOMEM;
+		goto err;
 	if ((vma->vm_flags & VM_ACCOUNT) &&
 	     security_vm_enough_memory_mm(mm, vma_pages(vma)))
-		return -ENOMEM;
+		goto err;
 	vma_link(mm, vma, prev, rb_link, rb_parent);
 	return 0;
+err:
+	memrlimit_cgroup_uncharge_as(mm, vma_pages(vma));
+	return -ENOMEM;
 }
 
 /*
@@ -2220,6 +2229,10 @@ int may_expand_vm(struct mm_struct *mm,
 
 	if (cur + npages > lim)
 		return 0;
+
+	if (memrlimit_cgroup_charge_as(mm, npages))
+		return 0;
+
 	return 1;
 }
_
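
[Editor's note] For completeness, the attach-time move in
memrlimit_cgroup_move_task() charges the destination group first and only
then uncharges the source, so that a failed charge leaves the old group's
accounting intact.  A sketch of that ordering, reusing the toy charge_as()
and uncharge_as() stand-ins from the model above (hypothetical helper name,
illustration only, not kernel code):

/*
 * Charge the destination counter before uncharging the source; if the
 * destination is over its limit, the task stays fully accounted to the
 * old group and the move is simply skipped.
 */
static int move_task_pattern(struct res_counter *to, struct res_counter *from,
			     unsigned long total_vm_pages)
{
	if (charge_as(to, total_vm_pages))
		return -1;	/* old group's charge left untouched */
	uncharge_as(from, total_vm_pages);
	return 0;
}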