diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -61,16 +61,20 @@ nfs_create_request(struct nfs_open_conte
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs_page		*req;
 
-	for (;;) {
-		/* try to allocate the request struct */
-		req = nfs_page_alloc();
-		if (req != NULL)
-			break;
-
-		if (signalled() && (server->flags & NFS_MOUNT_INTR))
-			return ERR_PTR(-ERESTARTSYS);
-		yield();
-	}
+	/* try to allocate the request struct */
+	req = nfs_page_alloc();
+	if (unlikely(!req)) {
+		/*
+		 * -ENOMEM will be returned only when TIF_MEMDIE is set,
+		 * so userland shouldn't risk getting confused by a new
+		 * unhandled ENOMEM errno.
+		 */
+		WARN_ON(!test_thread_flag(TIF_MEMDIE));
+		return ERR_PTR(-ENOMEM);
+	}
+
+	if (signalled() && (server->flags & NFS_MOUNT_INTR))
+		return ERR_PTR(-ERESTARTSYS);
 
 	/* Initialize the request struct. Initially, we assume a
 	 * long write-back delay. This will be adjusted in
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -220,13 +220,13 @@ struct zone {
 	spinlock_t		lru_lock;
 	struct list_head	active_list;
 	struct list_head	inactive_list;
-	unsigned long		nr_scan_active;
-	unsigned long		nr_scan_inactive;
 	unsigned long		pages_scanned;	   /* since last reclaim */
 	int			all_unreclaimable; /* All pages pinned */
 
+#ifdef CONFIG_NUMA
 	/* A count of how many reclaimers are scanning this zone */
 	atomic_t		reclaim_in_progress;
+#endif
 
 	/* Zone statistics */
 	atomic_long_t		vm_stat[NR_VM_ZONE_STAT_ITEMS];
diff --git a/include/linux/swap.h b/include/linux/swap.h
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -159,6 +159,8 @@ struct swap_list_t {
 #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages)
 
 /* linux/mm/oom_kill.c */
+extern unsigned long VM_is_OOM;
+#define is_VM_OOM() unlikely(test_bit(0, &VM_is_OOM))
 extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order);
 extern int register_oom_notifier(struct notifier_block *nb);
 extern int unregister_oom_notifier(struct notifier_block *nb);
diff --git a/kernel/exit.c b/kernel/exit.c
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -845,6 +845,15 @@ static void exit_notify(struct task_stru
 	    unlikely(tsk->parent->signal->flags & SIGNAL_GROUP_EXIT)))
 		state = EXIT_DEAD;
 	tsk->exit_state = state;
+
+	/*
+	 * Read TIF_MEMDIE and set VM_is_OOM to 0 atomically inside
+	 * the tasklist_lock.
+	 */
+	if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE))) {
+		extern unsigned long VM_is_OOM;
+		clear_bit(0, &VM_is_OOM);
+	}
 
 	write_unlock_irq(&tasklist_lock);
diff --git a/mm/filemap.c b/mm/filemap.c
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -894,6 +894,13 @@ void do_generic_mapping_read(struct addr
 		struct page *page;
 		unsigned long nr, ret;
 
+		if (unlikely(sigismember(&current->pending.signal, SIGKILL)))
+			/*
+			 * Must not hang almost forever in D state in the presence of sigkill
+			 * and lots of ram/swap (think during OOM).
+			 */
+			break;
+
 		/* nr is the maximum number of bytes to copy from this page */
 		nr = PAGE_CACHE_SIZE;
 		if (index >= end_index) {
@@ -2105,6 +2112,13 @@ generic_file_buffered_write(struct kiocb
 		unsigned long index;
 		unsigned long offset;
 		size_t copied;
+
+		if (unlikely(sigismember(&current->pending.signal, SIGKILL)))
+			/*
+			 * Must not hang almost forever in D state in the presence of sigkill
+			 * and lots of ram/swap (think during OOM).
+			 */
+			break;
 
 		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
 		index = pos >> PAGE_CACHE_SHIFT;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -28,6 +28,9 @@ int sysctl_panic_on_oom;
 int sysctl_panic_on_oom;
 /* #define DEBUG */
 
+unsigned long VM_is_OOM __cacheline_aligned_in_smp;
+static unsigned long last_tif_memdie_jiffies;
+
 /**
  * badness - calculate a numeric value for how bad this task has been
  * @p: task struct of which task we should calculate
@@ -49,7 +52,7 @@ int sysctl_panic_on_oom;
 
 unsigned long badness(struct task_struct *p, unsigned long uptime)
 {
-	unsigned long points, cpu_time, run_time, s;
+	unsigned long points;
 	struct mm_struct *mm;
 	struct task_struct *child;
 
@@ -63,7 +66,7 @@ unsigned long badness(struct task_struct
 	/*
 	 * The memory size of the process is the basis for the badness.
 	 */
-	points = mm->total_vm;
+	points = get_mm_rss(mm);
 
 	/*
 	 * After this unlock we can no longer dereference local variable `mm'
@@ -87,29 +90,9 @@ unsigned long badness(struct task_struct
 	list_for_each_entry(child, &p->children, sibling) {
 		task_lock(child);
 		if (child->mm != mm && child->mm)
-			points += child->mm->total_vm/2 + 1;
+			points += get_mm_rss(child->mm)/2 + 1;
 		task_unlock(child);
 	}
-
-	/*
-	 * CPU time is in tens of seconds and run time is in thousands
-	 * of seconds. There is no particular reason for this other than
-	 * that it turned out to work very well in practice.
-	 */
-	cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime))
-		>> (SHIFT_HZ + 3);
-
-	if (uptime >= p->start_time.tv_sec)
-		run_time = (uptime - p->start_time.tv_sec) >> 10;
-	else
-		run_time = 0;
-
-	s = int_sqrt(cpu_time);
-	if (s)
-		points /= s;
-	s = int_sqrt(int_sqrt(run_time));
-	if (s)
-		points /= s;
 
 	/*
 	 * Niced processes are most likely less important, so double
@@ -225,34 +208,13 @@ static struct task_struct *select_bad_pr
 		if (is_init(p))
 			continue;
 
-		/*
-		 * This task already has access to memory reserves and is
-		 * being killed. Don't allow any other task access to the
-		 * memory reserve.
-		 *
-		 * Note: this may have a chance of deadlock if it gets
-		 * blocked waiting for another task which itself is waiting
-		 * for memory. Is there a better alternative?
-		 */
-		if (test_tsk_thread_flag(p, TIF_MEMDIE))
-			return ERR_PTR(-1UL);
-
-		/*
-		 * This is in the process of releasing memory so wait for it
-		 * to finish before killing some other task by mistake.
-		 *
-		 * However, if p is the current task, we allow the 'kill' to
-		 * go ahead if it is exiting: this will simply set TIF_MEMDIE,
-		 * which will allow it to gain access to memory reserves in
-		 * the process of exiting and releasing its resources.
-		 * Otherwise we could get an easy OOM deadlock.
-		 */
-		if (p->flags & PF_EXITING) {
-			if (p != current)
-				return ERR_PTR(-1UL);
-
-			chosen = p;
-			*ppoints = ULONG_MAX;
+		if (unlikely(test_tsk_thread_flag(p, TIF_MEMDIE))) {
+			/*
+			 * Either we already waited long enough,
+			 * or exit_mm has already run, so we must
+			 * try to kill another task.
+			 */
+			continue;
 		}
 
 		if (p->oomkilladj == OOM_DISABLE)
@@ -290,18 +252,21 @@ static void __oom_kill_task(struct task_
 	if (verbose)
 		printk(KERN_ERR "Killed process %d (%s)\n", p->pid, p->comm);
 
+	if (!test_and_set_tsk_thread_flag(p, TIF_MEMDIE)) {
+		last_tif_memdie_jiffies = jiffies;
+		set_bit(0, &VM_is_OOM);
+	}
 	/*
 	 * We give our sacrificial lamb high priority and access to
 	 * all the memory it needs. That way it should be able to
 	 * exit() and clear out its resources quickly...
 	 */
 	p->time_slice = HZ;
-	set_tsk_thread_flag(p, TIF_MEMDIE);
 
 	force_sig(SIGKILL, p);
 }
 
-static int oom_kill_task(struct task_struct *p)
+static int oom_kill_task(struct task_struct *p, gfp_t gfp_mask, int order)
 {
 	struct mm_struct *mm;
 	struct task_struct *g, *q;
@@ -328,84 +293,6 @@ static int oom_kill_task(struct task_str
 			return 1;
 	} while_each_thread(g, q);
 
-	__oom_kill_task(p, 1);
-
-	/*
-	 * kill all processes that share the ->mm (i.e. all threads),
-	 * but are in a different thread group. Don't let them have access
-	 * to memory reserves though, otherwise we might deplete all memory.
-	 */
-	do_each_thread(g, q) {
-		if (q->mm == mm && q->tgid != p->tgid)
-			force_sig(SIGKILL, q);
-	} while_each_thread(g, q);
-
-	return 0;
-}
-
-static int oom_kill_process(struct task_struct *p, unsigned long points,
-		const char *message)
-{
-	struct task_struct *c;
-	struct list_head *tsk;
-
-	/*
-	 * If the task is already exiting, don't alarm the sysadmin or kill
-	 * its children or threads, just set TIF_MEMDIE so it can die quickly
-	 */
-	if (p->flags & PF_EXITING) {
-		__oom_kill_task(p, 0);
-		return 0;
-	}
-
-	printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
-					message, p->pid, p->comm, points);
-
-	/* Try to kill a child first */
-	list_for_each(tsk, &p->children) {
-		c = list_entry(tsk, struct task_struct, sibling);
-		if (c->mm == p->mm)
-			continue;
-		if (!oom_kill_task(c))
-			return 0;
-	}
-	return oom_kill_task(p);
-}
-
-static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
-
-int register_oom_notifier(struct notifier_block *nb)
-{
-	return blocking_notifier_chain_register(&oom_notify_list, nb);
-}
-EXPORT_SYMBOL_GPL(register_oom_notifier);
-
-int unregister_oom_notifier(struct notifier_block *nb)
-{
-	return blocking_notifier_chain_unregister(&oom_notify_list, nb);
-}
-EXPORT_SYMBOL_GPL(unregister_oom_notifier);
-
-/**
- * out_of_memory - kill the "best" process when we run out of memory
- *
- * If we run out of memory, we have the choice between either
- * killing a random task (bad), letting the system crash (worse)
- * OR try to be smart about which process to kill. Note that we
- * don't have to be perfect here, we just have to be good.
- */
-void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
-{
-	struct task_struct *p;
-	unsigned long points = 0;
-	unsigned long freed = 0;
-	int constraint;
-
-	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
-	if (freed > 0)
-		/* Got some memory back in the last second. */
-		return;
-
 	if (printk_ratelimit()) {
 		printk(KERN_WARNING "%s invoked oom-killer: "
 			"gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
@@ -414,6 +301,91 @@ void out_of_memory(struct zone
 		show_mem();
 	}
 
+	__oom_kill_task(p, 1);
+
+	/*
+	 * kill all processes that share the ->mm (i.e. all threads),
+	 * but are in a different thread group. Don't let them have access
+	 * to memory reserves though, otherwise we might deplete all memory.
+	 */
+	do_each_thread(g, q) {
+		if (q->mm == mm && q->tgid != p->tgid)
+			force_sig(SIGKILL, q);
+	} while_each_thread(g, q);
+
+	return 0;
+}
+
+static int oom_kill_process(struct task_struct *p, unsigned long points,
+		const char *message, gfp_t gfp_mask, int order)
+{
+	struct task_struct *c;
+	struct list_head *tsk;
+
+	/*
+	 * If the task is already exiting, don't alarm the sysadmin or kill
+	 * its children or threads, just set TIF_MEMDIE so it can die quickly
+	 */
+	if (p->flags & PF_EXITING) {
+		__oom_kill_task(p, 0);
+		return 0;
+	}
+
+	printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
+					message, p->pid, p->comm, points);
+
+	/* Try to kill a child first */
+	list_for_each(tsk, &p->children) {
+		c = list_entry(tsk, struct task_struct, sibling);
+		if (c->mm == p->mm)
+			continue;
+		/*
+		 * We cannot select tasks with TIF_MEMDIE already set
+		 * or we'll hard deadlock.
+		 */
+		if (unlikely(test_tsk_thread_flag(c, TIF_MEMDIE)))
+			continue;
+		if (!oom_kill_task(c, gfp_mask, order))
+			return 0;
+	}
+	return oom_kill_task(p, gfp_mask, order);
+}
+
+static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
+
+int register_oom_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&oom_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(register_oom_notifier);
+
+int unregister_oom_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&oom_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_oom_notifier);
+
+/**
+ * out_of_memory - kill the "best" process when we run out of memory
+ *
+ * If we run out of memory, we have the choice between either
+ * killing a random task (bad), letting the system crash (worse)
+ * OR try to be smart about which process to kill. Note that we
+ * don't have to be perfect here, we just have to be good.
+ */
+void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
+{
+	struct task_struct *p;
+	unsigned long points = 0;
+	unsigned long freed = 0;
+	int constraint;
+	static DECLARE_MUTEX(OOM_lock);
+
+	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
+	if (freed > 0)
+		/* Got some memory back in the last second. */
+		return;
+
 	if (sysctl_panic_on_oom == 2)
 		panic("out of memory. Compulsory panic_on_oom is selected.\n");
 
@@ -423,53 +395,65 @@ void out_of_memory(struct zone
 	 */
 	constraint = constrained_alloc(zonelist, gfp_mask);
 	cpuset_lock();
-	read_lock(&tasklist_lock);
 
 	switch (constraint) {
 	case CONSTRAINT_MEMORY_POLICY:
+		read_lock(&tasklist_lock);
 		oom_kill_process(current, points,
-				"No available memory (MPOL_BIND)");
+				"No available memory (MPOL_BIND)", gfp_mask, order);
+		read_unlock(&tasklist_lock);
 		break;
 
 	case CONSTRAINT_CPUSET:
+		read_lock(&tasklist_lock);
 		oom_kill_process(current, points,
-				"No available memory in cpuset");
+				"No available memory in cpuset", gfp_mask, order);
+		read_unlock(&tasklist_lock);
 		break;
 
 	case CONSTRAINT_NONE:
-		if (sysctl_panic_on_oom)
+		if (down_trylock(&OOM_lock))
+			break;
+		read_lock(&tasklist_lock);
+
+		/*
+		 * This holds the down(OOM_lock)+read_lock(tasklist_lock),
+		 * so it's equivalent to write_lock_irq(tasklist_lock) as
+		 * far as VM_is_OOM is concerned.
+		 */
+		if (unlikely(test_bit(0, &VM_is_OOM))) {
+			if (time_before(jiffies, last_tif_memdie_jiffies + 10*HZ))
+				goto out;
+			printk("detected probable OOM deadlock, so killing another task\n");
+			last_tif_memdie_jiffies = jiffies;
+		}
+
+		if (sysctl_panic_on_oom) {
+			read_unlock(&tasklist_lock);
+			cpuset_unlock();
 			panic("out of memory. panic_on_oom is selected\n");
+		}
retry:
 		/*
 		 * Rambo mode: Shoot down a process and hope it solves whatever
 		 * issues we may have.
 		 */
 		p = select_bad_process(&points);
-
-		if (PTR_ERR(p) == -1UL)
-			goto out;
-
 		/* Found nothing?!?! Either we hang forever, or we panic. */
-		if (!p) {
+		if (unlikely(!p)) {
 			read_unlock(&tasklist_lock);
 			cpuset_unlock();
 			panic("Out of memory and no killable processes...\n");
 		}
 
-		if (oom_kill_process(p, points, "Out of memory"))
+		if (oom_kill_process(p, points, "Out of memory", gfp_mask, order))
 			goto retry;
+	out:
+		read_unlock(&tasklist_lock);
+		up(&OOM_lock);
 
 		break;
 	}
 
-out:
-	read_unlock(&tasklist_lock);
 	cpuset_unlock();
-
-	/*
-	 * Give "p" a good chance of killing itself before we
-	 * retry to allocate memory unless "p" is current
-	 */
-	if (!test_thread_flag(TIF_MEMDIE))
-		schedule_timeout_uninterruptible(1);
-}
+}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2649,10 +2649,10 @@ static void __meminit free_area_init_cor
 		zone_pcp_init(zone);
 		INIT_LIST_HEAD(&zone->active_list);
 		INIT_LIST_HEAD(&zone->inactive_list);
-		zone->nr_scan_active = 0;
-		zone->nr_scan_inactive = 0;
 		zap_zone_vm_stats(zone);
+#ifdef CONFIG_NUMA
 		atomic_set(&zone->reclaim_in_progress, 0);
+#endif
 		if (!size)
 			continue;
diff --git a/mm/vmscan.c b/mm/vmscan.c
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -909,29 +909,24 @@ static unsigned long shrink_zone(int pri
 	unsigned long nr_to_scan;
 	unsigned long nr_reclaimed = 0;
 
+#ifdef CONFIG_NUMA
 	atomic_inc(&zone->reclaim_in_progress);
+#endif
 
 	/*
 	 * Add one to `nr_to_scan' just to make sure that the kernel will
 	 * slowly sift through the active list.
 	 */
-	zone->nr_scan_active +=
-		(zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
-	nr_active = zone->nr_scan_active;
-	if (nr_active >= sc->swap_cluster_max)
-		zone->nr_scan_active = 0;
-	else
+	nr_active = zone_page_state(zone, NR_ACTIVE) >> priority;
+	if (nr_active < sc->swap_cluster_max)
 		nr_active = 0;
-
-	zone->nr_scan_inactive +=
-		(zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
-	nr_inactive = zone->nr_scan_inactive;
-	if (nr_inactive >= sc->swap_cluster_max)
-		zone->nr_scan_inactive = 0;
-	else
+	nr_inactive = zone_page_state(zone, NR_INACTIVE) >> priority;
+	if (nr_inactive < sc->swap_cluster_max)
 		nr_inactive = 0;
 
 	while (nr_active || nr_inactive) {
+		if (is_VM_OOM())
+			break;
 		if (nr_active) {
 			nr_to_scan = min(nr_active,
 					(unsigned long)sc->swap_cluster_max);
@@ -945,12 +940,16 @@ static unsigned long shrink_zone(int pri
 			nr_inactive -= nr_to_scan;
 			nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
								sc);
+			if (nr_reclaimed >= sc->swap_cluster_max)
+				break;
 		}
 	}
 
 	throttle_vm_writeout(sc->gfp_mask);
 
+#ifdef CONFIG_NUMA
 	atomic_dec(&zone->reclaim_in_progress);
+#endif
 	return nr_reclaimed;
 }
@@ -1016,7 +1015,7 @@ unsigned long try_to_free_pages(struct z
 	int priority;
 	int ret = 0;
 	unsigned long total_scanned = 0;
-	unsigned long nr_reclaimed = 0;
+	unsigned long nr_reclaimed;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long lru_pages = 0;
 	int i;
@@ -1041,15 +1040,26 @@ unsigned long try_to_free_pages(struct z
 	}
 
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+		if (is_VM_OOM()) {
+			if (!test_thread_flag(TIF_MEMDIE)) {
+				/* get out of the way */
+				schedule_timeout_interruptible(1);
+				/* don't waste cpu if we're still oom */
+				if (is_VM_OOM())
+					goto out;
+			} else
+				goto out;
+		}
+
 		sc.nr_scanned = 0;
 		if (!priority)
 			disable_swap_token();
-		nr_reclaimed += shrink_zones(priority, zones, &sc);
+		nr_reclaimed = shrink_zones(priority, zones, &sc);
+		if (reclaim_state)
+			reclaim_state->reclaimed_slab = 0;
 		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
-		if (reclaim_state) {
+		if (reclaim_state)
 			nr_reclaimed += reclaim_state->reclaimed_slab;
-			reclaim_state->reclaimed_slab = 0;
-		}
 		total_scanned += sc.nr_scanned;
 		if (nr_reclaimed >= sc.swap_cluster_max) {
 			ret = 1;
@@ -1101,8 +1111,6 @@ out:
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at pages_high.
  *
- * Returns the number of pages which were actually freed.
- *
  * There is special handling here for zones which are full of pinned pages.
  * This can happen if the pages are all mlocked, or if they are all used by
  * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
@@ -1118,7 +1126,7 @@ out:
 * the page allocator fallback scheme to ensure that aging of pages is balanced
 * across the zones.
 */
-static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
+static void balance_pgdat(pg_data_t *pgdat, int order)
 {
 	int all_zones_ok;
 	int priority;
@@ -1140,7 +1148,6 @@ static unsigned long balance_pgdat(pg_da
 
loop_again:
 	total_scanned = 0;
-	nr_reclaimed = 0;
 	sc.may_writepage = !laptop_mode;
 	count_vm_event(PAGEOUTRUN);
@@ -1195,6 +1202,7 @@ loop_again:
 		 * pages behind kswapd's direction of progress, which would
 		 * cause too much scanning of the lower zones.
 		 */
+		nr_reclaimed = 0;
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 			int nr_slab;
@@ -1268,8 +1276,6 @@ out:
 		goto loop_again;
 	}
-
-	return nr_reclaimed;
 }
@@ -1392,22 +1398,14 @@ static unsigned long shrink_all_zones(un
 
 		/* For pass = 0 we don't shrink the active list */
 		if (pass > 0) {
-			zone->nr_scan_active +=
-				(zone_page_state(zone, NR_ACTIVE) >> prio) + 1;
-			if (zone->nr_scan_active >= nr_pages || pass > 3) {
-				zone->nr_scan_active = 0;
-				nr_to_scan = min(nr_pages,
-					zone_page_state(zone, NR_ACTIVE));
+			nr_to_scan = (zone_page_state(zone, NR_ACTIVE) >> prio) + 1;
+			if (nr_to_scan >= nr_pages || pass > 3) {
 				shrink_active_list(nr_to_scan, zone, sc, prio);
 			}
 		}
 
-		zone->nr_scan_inactive +=
-			(zone_page_state(zone, NR_INACTIVE) >> prio) + 1;
-		if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
-			zone->nr_scan_inactive = 0;
-			nr_to_scan = min(nr_pages,
-				zone_page_state(zone, NR_INACTIVE));
+		nr_to_scan = (zone_page_state(zone, NR_INACTIVE) >> prio) + 1;
+		if (nr_to_scan >= nr_pages || pass > 3) {
 			ret += shrink_inactive_list(nr_to_scan, zone, sc);
 			if (ret >= nr_pages)
 				return ret;
diff --git a/mm/vmstat.c b/mm/vmstat.c
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -554,7 +554,7 @@ static int zoneinfo_show(struct seq_file
 		   "\n        min      %lu"
 		   "\n        low      %lu"
 		   "\n        high     %lu"
-		   "\n        scanned  %lu (a: %lu i: %lu)"
+		   "\n        scanned  %lu"
 		   "\n        spanned  %lu"
 		   "\n        present  %lu",
 		   zone_page_state(zone, NR_FREE_PAGES),
@@ -562,7 +562,6 @@ static int zoneinfo_show(struct seq_file
 		   zone->pages_low,
 		   zone->pages_high,
 		   zone->pages_scanned,
-		   zone->nr_scan_active, zone->nr_scan_inactive,
 		   zone->spanned_pages,
 		   zone->present_pages);