LCOV - code coverage report
Current view: top level - mm - oom_kill.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 14 334 4.2 %
Date: 2023-04-06 08:38:28 Functions: 3 30 10.0 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  *  linux/mm/oom_kill.c
       4             :  * 
       5             :  *  Copyright (C)  1998,2000  Rik van Riel
       6             :  *      Thanks go out to Claus Fischer for some serious inspiration and
       7             :  *      for goading me into coding this file...
       8             :  *  Copyright (C)  2010  Google, Inc.
       9             :  *      Rewritten by David Rientjes
      10             :  *
      11             :  *  The routines in this file are used to kill a process when
      12             :  *  we're seriously out of memory. This gets called from __alloc_pages()
      13             :  *  in mm/page_alloc.c when we really run out of memory.
      14             :  *
      15             :  *  Since we won't call these routines often (on a well-configured
      16             :  *  machine) this file will double as a 'coding guide' and a signpost
      17             :  *  for newbie kernel hackers. It features several pointers to major
      18             :  *  kernel subsystems and hints as to where to find out what things do.
      19             :  */
      20             : 
      21             : #include <linux/oom.h>
      22             : #include <linux/mm.h>
      23             : #include <linux/err.h>
      24             : #include <linux/gfp.h>
      25             : #include <linux/sched.h>
      26             : #include <linux/sched/mm.h>
      27             : #include <linux/sched/coredump.h>
      28             : #include <linux/sched/task.h>
      29             : #include <linux/sched/debug.h>
      30             : #include <linux/swap.h>
      31             : #include <linux/syscalls.h>
      32             : #include <linux/timex.h>
      33             : #include <linux/jiffies.h>
      34             : #include <linux/cpuset.h>
      35             : #include <linux/export.h>
      36             : #include <linux/notifier.h>
      37             : #include <linux/memcontrol.h>
      38             : #include <linux/mempolicy.h>
      39             : #include <linux/security.h>
      40             : #include <linux/ptrace.h>
      41             : #include <linux/freezer.h>
      42             : #include <linux/ftrace.h>
      43             : #include <linux/ratelimit.h>
      44             : #include <linux/kthread.h>
      45             : #include <linux/init.h>
      46             : #include <linux/mmu_notifier.h>
      47             : 
      48             : #include <asm/tlb.h>
      49             : #include "internal.h"
      50             : #include "slab.h"
      51             : 
      52             : #define CREATE_TRACE_POINTS
      53             : #include <trace/events/oom.h>
      54             : 
      55             : static int sysctl_panic_on_oom;
      56             : static int sysctl_oom_kill_allocating_task;
      57             : static int sysctl_oom_dump_tasks = 1;
      58             : 
      59             : /*
      60             :  * Serializes oom killer invocations (out_of_memory()) from all contexts to
      61             :  * prevent over-eager oom killing (e.g. when the oom killer is invoked
      62             :  * from different domains).
      63             :  *
      64             :  * oom_killer_disable() relies on this lock to stabilize oom_killer_disabled
      65             :  * and mark_oom_victim().
      66             :  */
      67             : DEFINE_MUTEX(oom_lock);
      68             : /* Serializes oom_score_adj and oom_score_adj_min updates */
      69             : DEFINE_MUTEX(oom_adj_mutex);
      70             : /* True when this oom is confined to a memory cgroup, i.e. oc->memcg is set. */
      71             : static inline bool is_memcg_oom(struct oom_control *oc)
      72             : {
      73             :         return oc->memcg != NULL;
      74             : }
      75             : 
      76             : #ifdef CONFIG_NUMA
      77             : /**
      78             :  * oom_cpuset_eligible() - check task eligibility for kill
      79             :  * @start: task struct of which task to consider
      80             :  * @oc: pointer to struct oom_control
      81             :  *
      82             :  * Task eligibility is determined per thread of @start: a thread qualifies if
      83             :  * it shares the same mempolicy nodes as current when the oom is bound by such
      84             :  * a policy, or otherwise if it has intersecting allowed cpuset nodes.  The
      85             :  * first qualifying thread makes @start eligible.
      86             :  * This function is assuming oom-killer context and 'current' has triggered
      87             :  * the oom-killer.
      88             :  */
      89             : static bool oom_cpuset_eligible(struct task_struct *start,
      90             :                                 struct oom_control *oc)
      91             : {
      92             :         struct task_struct *tsk;
      93             :         bool ret = false;
      94             :         const nodemask_t *mask = oc->nodemask;
      95             : 
      96             :         rcu_read_lock();
      97             :         for_each_thread(start, tsk) {
      98             :                 if (mask) {
      99             :                         /*
     100             :                          * If this is a mempolicy constrained oom, tsk's
     101             :                          * cpuset is irrelevant.  Only return true if its
     102             :                          * mempolicy intersects current, otherwise it may be
     103             :                          * needlessly killed.
     104             :                          */
     105             :                         ret = mempolicy_in_oom_domain(tsk, mask);
     106             :                 } else {
     107             :                         /*
     108             :                          * This is not a mempolicy constrained oom, so only
     109             :                          * check the mems of tsk's cpuset.
     110             :                          */
     111             :                         ret = cpuset_mems_allowed_intersects(current, tsk);
     112             :                 }
     113             :                 if (ret)
     114             :                         break; /* one eligible thread is enough */
     115             :         }
     116             :         rcu_read_unlock();
     117             : 
     118             :         return ret;
     119             : }
     120             : #else
     121             : static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc)
     122             : {
     123             :         return true; /* without CONFIG_NUMA every task is eligible */
     124             : }
     125             : #endif /* CONFIG_NUMA */
     126             : 
     127             : /*
     128             :  * The process p may have detached its own ->mm while exiting or through
     129             :  * kthread_use_mm(), but one or more of its subthreads may still have a valid
     130             :  * pointer.  Return p, or any of its subthreads with a valid ->mm, with
     131             :  * task_lock() held, or NULL (with no lock held) if no thread has an ->mm.
     132             :  */
     133          15 : struct task_struct *find_lock_task_mm(struct task_struct *p)
     134             : {
     135             :         struct task_struct *t;
     136             : 
     137             :         rcu_read_lock();
     138             : 
     139          30 :         for_each_thread(p, t) {
     140          15 :                 task_lock(t);
     141          15 :                 if (likely(t->mm))
     142             :                         goto found; /* leave with t's task_lock still held */
     143          15 :                 task_unlock(t);
     144             :         }
     145             :         t = NULL; /* loop ran out: no thread has an mm */
     146             : found:
     147             :         rcu_read_unlock();
     148             : 
     149          15 :         return t;
     150             : }
     151             : 
     152             : /*
     153             :  * order == -1 means the oom kill is required by sysrq; any other value of
     154             :  * order is only used for display purposes.
     155             :  */
     156             : static inline bool is_sysrq_oom(struct oom_control *oc)
     157             : {
     158             :         return oc->order == -1;
     159             : }
     160             : 
     161             : /* Return true if @p must never be an oom victim: global init or a kernel thread. */
     162             : static bool oom_unkillable_task(struct task_struct *p)
     163             : {
     164           0 :         if (is_global_init(p))
     165             :                 return true;
     166           0 :         if (p->flags & PF_KTHREAD)
     167             :                 return true;
     168             :         return false;
     169             : }
     170             : 
     171             : /*
     172             :  * Check whether the amount of unreclaimable slab is greater than
     173             :  * all user memory (LRU pages).
     174             :  * dump_unreclaimable_slab() can help when the oom is caused by too
     175             :  * much unreclaimable slab being used by the kernel.
     176             :  */
     177             : static bool should_dump_unreclaim_slab(void)
     178             : {
     179             :         unsigned long nr_lru;
     180             : 
     181           0 :         nr_lru = global_node_page_state(NR_ACTIVE_ANON) +
     182           0 :                  global_node_page_state(NR_INACTIVE_ANON) +
     183           0 :                  global_node_page_state(NR_ACTIVE_FILE) +
     184           0 :                  global_node_page_state(NR_INACTIVE_FILE) +
     185           0 :                  global_node_page_state(NR_ISOLATED_ANON) +
     186           0 :                  global_node_page_state(NR_ISOLATED_FILE) +
     187           0 :                  global_node_page_state(NR_UNEVICTABLE);
     188             : 
     189           0 :         return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
     190             : }
     191             : 
     192             : /**
     193             :  * oom_badness - heuristic function to determine which candidate task to kill
     194             :  * @p: task struct of which task we should calculate
     195             :  * @totalpages: total present RAM allowed for page allocation
     196             :  *
     197             :  * Simple, predictable heuristic: the task consuming the most memory gets the
     198             :  * highest value, to avoid subsequent oom failures.  Tasks that must not be
     199             :  * killed (or that have no mm left) get LONG_MIN.
     200             :  */
     201           0 : long oom_badness(struct task_struct *p, unsigned long totalpages)
     202             : {
     203             :         long points;
     204             :         long adj;
     205             : 
     206           0 :         if (oom_unkillable_task(p))
     207             :                 return LONG_MIN;
     208             : 
     209           0 :         p = find_lock_task_mm(p); /* from here on p is the thread holding the mm, task_lock held */
     210           0 :         if (!p)
     211             :                 return LONG_MIN;
     212             : 
     213             :         /*
     214             :          * Do not even consider tasks which are explicitly marked oom
     215             :          * unkillable or have already been oom reaped or that are in
     216             :          * the middle of vfork
     217             :          */
     218           0 :         adj = (long)p->signal->oom_score_adj;
     219           0 :         if (adj == OOM_SCORE_ADJ_MIN ||
     220           0 :                         test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
     221           0 :                         in_vfork(p)) {
     222             :                 task_unlock(p);
     223           0 :                 return LONG_MIN;
     224             :         }
     225             : 
     226             :         /*
     227             :          * The baseline for the badness score is the proportion of RAM that each
     228             :          * task's rss, pagetable and swap space use.
     229             :          */
     230           0 :         points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
     231           0 :                 mm_pgtables_bytes(p->mm) / PAGE_SIZE;
     232             :         task_unlock(p);
     233             : 
     234             :         /* Normalize to oom_score_adj units */
     235           0 :         adj *= totalpages / 1000; /* one oom_score_adj unit is worth totalpages / 1000 pages */
     236           0 :         points += adj;
     237             : 
     238           0 :         return points;
     239             : }
     240             : /* Printable names of the oom constraints, indexed by enum oom_constraint. */
     241             : static const char * const oom_constraint_text[] = {
     242             :         [CONSTRAINT_NONE] = "CONSTRAINT_NONE",
     243             :         [CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET",
     244             :         [CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",
     245             :         [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
     246             : };
     247             : 
     248             : /*
     249             :  * Determine the type of allocation constraint and set oc->totalpages to match.
     250             :  */
     251             : static enum oom_constraint constrained_alloc(struct oom_control *oc)
     252             : {
     253             :         struct zone *zone;
     254             :         struct zoneref *z;
     255           0 :         enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask);
     256           0 :         bool cpuset_limited = false;
     257             :         int nid;
     258             : 
     259           0 :         if (is_memcg_oom(oc)) {
     260           0 :                 oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1; /* never leave totalpages at 0 */
     261             :                 return CONSTRAINT_MEMCG;
     262             :         }
     263             : 
     264             :         /* Default to all available memory */
     265           0 :         oc->totalpages = totalram_pages() + total_swap_pages;
     266             : 
     267             :         if (!IS_ENABLED(CONFIG_NUMA))
     268             :                 return CONSTRAINT_NONE;
     269             : 
     270             :         if (!oc->zonelist)
     271             :                 return CONSTRAINT_NONE;
     272             :         /*
     273             :          * Reach here only when __GFP_NOFAIL is used, so we should avoid
     274             :          * killing current.  We have to kill a random task in this case.
     275             :          * Ideally CONSTRAINT_THISNODE, but there is no way to handle it now.
     276             :          */
     277             :         if (oc->gfp_mask & __GFP_THISNODE)
     278             :                 return CONSTRAINT_NONE;
     279             : 
     280             :         /*
     281             :          * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
     282             :          * the page allocator means a mempolicy is in effect.  Cpuset policy
     283             :          * is enforced in get_page_from_freelist().
     284             :          */
     285             :         if (oc->nodemask &&
     286             :             !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
     287             :                 oc->totalpages = total_swap_pages;
     288             :                 for_each_node_mask(nid, *oc->nodemask)
     289             :                         oc->totalpages += node_present_pages(nid);
     290             :                 return CONSTRAINT_MEMORY_POLICY;
     291             :         }
     292             : 
     293             :         /* Check this allocation failure is caused by cpuset's wall function */
     294             :         for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
     295             :                         highest_zoneidx, oc->nodemask)
     296             :                 if (!cpuset_zone_allowed(zone, oc->gfp_mask))
     297             :                         cpuset_limited = true;
     298             : 
     299             :         if (cpuset_limited) {
     300             :                 oc->totalpages = total_swap_pages; /* swap plus the present pages of current's allowed nodes */
     301             :                 for_each_node_mask(nid, cpuset_current_mems_allowed)
     302             :                         oc->totalpages += node_present_pages(nid);
     303             :                 return CONSTRAINT_CPUSET;
     304             :         }
     305             :         return CONSTRAINT_NONE;
     306             : }
     307             : /* Scan callback: weigh @task against oc->chosen; returns 1 to abort the scan, 0 to continue. */
     308           0 : static int oom_evaluate_task(struct task_struct *task, void *arg)
     309             : {
     310           0 :         struct oom_control *oc = arg;
     311             :         long points;
     312             : 
     313           0 :         if (oom_unkillable_task(task))
     314             :                 goto next;
     315             : 
     316             :         /* task may not have freeable memory in nodemask */
     317           0 :         if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))
     318             :                 goto next;
     319             : 
     320             :         /*
     321             :          * This task already has access to memory reserves and is being killed.
     322             :          * Don't allow any other task to have access to the reserves unless
     323             :          * the task has MMF_OOM_SKIP because chances that it would release
     324             :          * any memory is quite low.
     325             :          */
     326           0 :         if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
     327           0 :                 if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
     328             :                         goto next;
     329             :                 goto abort;
     330             :         }
     331             : 
     332             :         /*
     333             :          * If task is allocating a lot of memory and has been marked to be
     334             :          * killed first if it triggers an oom, then select it.
     335             :          */
     336           0 :         if (oom_task_origin(task)) {
     337             :                 points = LONG_MAX;
     338             :                 goto select;
     339             :         }
     340             : 
     341           0 :         points = oom_badness(task, oc->totalpages);
     342           0 :         if (points == LONG_MIN || points < oc->chosen_points) /* on a tie the later task wins */
     343             :                 goto next;
     344             : 
     345             : select:
     346           0 :         if (oc->chosen)
     347           0 :                 put_task_struct(oc->chosen); /* drop the reference on the previous candidate */
     348           0 :         get_task_struct(task);
     349           0 :         oc->chosen = task;
     350           0 :         oc->chosen_points = points;
     351             : next:
     352             :         return 0;
     353             : abort:
     354           0 :         if (oc->chosen)
     355           0 :                 put_task_struct(oc->chosen);
     356           0 :         oc->chosen = (void *)-1UL; /* sentinel: the scan was aborted */
     357           0 :         return 1;
     358             : }
     359             : 
     360             : /*
     361             :  * Simple selection loop. We choose the process with the highest number of
     362             :  * 'points' and leave it in oc->chosen. If the scan was aborted, oc->chosen is -1.
     363             :  */
     364           0 : static void select_bad_process(struct oom_control *oc)
     365             : {
     366           0 :         oc->chosen_points = LONG_MIN; /* lowest possible score, so any scored task can replace it */
     367             : 
     368           0 :         if (is_memcg_oom(oc))
     369             :                 mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
     370             :         else {
     371             :                 struct task_struct *p;
     372             : 
     373             :                 rcu_read_lock();
     374           0 :                 for_each_process(p)
     375           0 :                         if (oom_evaluate_task(p, oc))
     376             :                                 break; /* non-zero return means abort the scan */
     377             :                 rcu_read_unlock();
     378             :         }
     379           0 : }
     380             : /* Scan callback: print one line of memory state for @p if it is an oom candidate; always returns 0. */
     381           0 : static int dump_task(struct task_struct *p, void *arg)
     382             : {
     383           0 :         struct oom_control *oc = arg;
     384             :         struct task_struct *task;
     385             : 
     386           0 :         if (oom_unkillable_task(p))
     387             :                 return 0;
     388             : 
     389             :         /* p may not have freeable memory in nodemask */
     390           0 :         if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
     391             :                 return 0;
     392             : 
     393           0 :         task = find_lock_task_mm(p); /* task_lock held until the task_unlock() below */
     394           0 :         if (!task) {
     395             :                 /*
     396             :                  * All of p's threads have already detached their mm's. There's
     397             :                  * no need to report them; they can't be oom killed anyway.
     398             :                  */
     399             :                 return 0;
     400             :         }
     401             : 
     402           0 :         pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
     403             :                 task->pid, from_kuid(&init_user_ns, task_uid(task)),
     404             :                 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
     405             :                 mm_pgtables_bytes(task->mm), /* bytes, unlike the page counts around it */
     406             :                 get_mm_counter(task->mm, MM_SWAPENTS),
     407             :                 task->signal->oom_score_adj, task->comm);
     408             :         task_unlock(task);
     409             : 
     410           0 :         return 0;
     411             : }
     412             : 
     413             : /**
     414             :  * dump_tasks - dump current memory state of all system tasks
     415             :  * @oc: pointer to struct oom_control
     416             :  *
     417             :  * Dumps the current memory state of all eligible tasks.  Tasks not in the same
     418             :  * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
     419             :  * are not shown.
     420             :  * State information includes task's pid, uid, tgid, vm size, rss,
     421             :  * pgtables_bytes, swapents, oom_score_adj value, and name.
     422             :  */
     423           0 : static void dump_tasks(struct oom_control *oc)
     424             : {
     425           0 :         pr_info("Tasks state (memory values in pages):\n");
     426           0 :         pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
     427             : 
     428           0 :         if (is_memcg_oom(oc))
     429             :                 mem_cgroup_scan_tasks(oc->memcg, dump_task, oc); /* memcg oom: only that memcg's tasks */
     430             :         else {
     431             :                 struct task_struct *p;
     432             : 
     433             :                 rcu_read_lock();
     434           0 :                 for_each_process(p) /* otherwise walk every process; dump_task() filters */
     435           0 :                         dump_task(p, oc);
     436             :                 rcu_read_unlock();
     437             :         }
     438           0 : }
     439             : /* Print the "oom-kill:" summary: constraint, nodemask, cpuset and memcg context, then the victim. */
     440           0 : static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
     441             : {
     442             :         /* one line summary of the oom killer context. */
     443           0 :         pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
     444             :                         oom_constraint_text[oc->constraint],
     445             :                         nodemask_pr_args(oc->nodemask));
     446             :         cpuset_print_current_mems_allowed();
     447           0 :         mem_cgroup_print_oom_context(oc->memcg, victim);
     448           0 :         pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid, /* pr_cont: finishes the line started above */
     449             :                 from_kuid(&init_user_ns, task_uid(victim)));
     450           0 : }
     451             : /* Log the oom report: invoking task, stack, memory state, optional task list and, if @p is set, the victim summary. */
     452           0 : static void dump_header(struct oom_control *oc, struct task_struct *p)
     453             : {
     454           0 :         pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
     455             :                 current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
     456             :                         current->signal->oom_score_adj);
     457             :         if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
     458             :                 pr_warn("COMPACTION is disabled!!!\n");
     459             : 
     460           0 :         dump_stack();
     461           0 :         if (is_memcg_oom(oc))
     462             :                 mem_cgroup_print_oom_meminfo(oc->memcg);
     463             :         else {
     464           0 :                 __show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask, gfp_zone(oc->gfp_mask));
     465           0 :                 if (should_dump_unreclaim_slab()) /* only when unreclaimable slab exceeds the LRU total */
     466           0 :                         dump_unreclaimable_slab();
     467             :         }
     468           0 :         if (sysctl_oom_dump_tasks)
     469           0 :                 dump_tasks(oc);
     470           0 :         if (p)
     471           0 :                 dump_oom_summary(oc, p);
     472           0 : }
     473             : 
     474             : /*
     475             :  * Number of OOM victims in flight
     476             :  */
     477             : static atomic_t oom_victims = ATOMIC_INIT(0);
     478             : static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
     479             : 
     480             : static bool oom_killer_disabled __read_mostly;
     481             : /* Convert a page count to kB (shift left by PAGE_SHIFT - 10). */
     482             : #define K(x) ((x) << (PAGE_SHIFT-10))
     483             : 
     484             : /*
     485             :  * task->mm can be NULL if the task is the exited group leader.  So to
     486             :  * determine whether the task is using a particular mm, we examine all the
     487             :  * task's threads: if one of those is using this mm then this task was also
     488             :  * using it.  The first thread found with a non-NULL ->mm decides the result.
     489             :  */
     490           0 : bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
     491             : {
     492             :         struct task_struct *t;
     493             : 
     494           0 :         for_each_thread(p, t) {
     495           0 :                 struct mm_struct *t_mm = READ_ONCE(t->mm);
     496           0 :                 if (t_mm)
     497           0 :                         return t_mm == mm;
     498             :         }
     499             :         return false; /* no thread has an mm at all */
     500             : }
     501             : 
     502             : #ifdef CONFIG_MMU
     503             : /*
     504             :  * OOM Reaper kernel thread which tries to reap the memory used by the OOM
     505             :  * victim (if that is possible) to help the OOM killer to move on.
     506             :  */
     507             : static struct task_struct *oom_reaper_th;
     508             : static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
     509             : static struct task_struct *oom_reaper_list;
     510             : static DEFINE_SPINLOCK(oom_reaper_lock);
     511             : /* Unmap the reapable VMAs of @mm; returns false if any VMA was skipped because its notifier call failed. */
     512           0 : static bool __oom_reap_task_mm(struct mm_struct *mm)
     513             : {
     514             :         struct vm_area_struct *vma;
     515           0 :         bool ret = true;
     516           0 :         VMA_ITERATOR(vmi, mm, 0);
     517             : 
     518             :         /*
     519             :          * Tell all users of get_user/copy_from_user etc... that the content
     520             :          * is no longer stable. No barriers really needed because unmapping
     521             :          * should imply barriers already and the reader would hit a page fault
     522             :          * if it stumbled over a reaped memory.
     523             :          */
     524           0 :         set_bit(MMF_UNSTABLE, &mm->flags);
     525             : 
     526           0 :         for_each_vma(vmi, vma) {
     527           0 :                 if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP)) /* hugetlb and PFN-mapped VMAs are not reaped */
     528           0 :                         continue;
     529             : 
     530             :                 /*
     531             :                  * Only anonymous pages have a good chance to be dropped
     532             :                  * without additional steps which we cannot afford as we
     533             :                  * are OOM already.
     534             :                  *
     535             :                  * We do not even care about fs backed pages because all
     536             :                  * which are reclaimable have already been reclaimed and
     537             :                  * we do not want to block exit_mmap by keeping mm ref
     538             :                  * count elevated without a good reason.
     539             :                  */
     540           0 :                 if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
     541             :                         struct mmu_notifier_range range;
     542             :                         struct mmu_gather tlb;
     543             : 
     544           0 :                         mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
     545             :                                                 mm, vma->vm_start,
     546             :                                                 vma->vm_end);
     547           0 :                         tlb_gather_mmu(&tlb, mm);
     548           0 :                         if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
     549             :                                 tlb_finish_mmu(&tlb);
     550             :                                 ret = false; /* remember the failure, but keep reaping the other VMAs */
     551             :                                 continue;
     552             :                         }
     553           0 :                         unmap_page_range(&tlb, vma, range.start, range.end, NULL);
     554           0 :                         mmu_notifier_invalidate_range_end(&range);
     555           0 :                         tlb_finish_mmu(&tlb);
     556             :                 }
     557             :         }
     558             : 
     559           0 :         return ret;
     560             : }
     561             : 
     562             : /*
     563             :  * Reaps the address space of the given task.
     564             :  *
     565             :  * Returns true on success and false if none or part of the address space
     566             :  * has been reclaimed and the caller should retry later.
     567             :  */
     568           0 : static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
     569             : {
     570           0 :         bool ret = true;
     571             : 
     572           0 :         if (!mmap_read_trylock(mm)) {
     573             :                 trace_skip_task_reaping(tsk->pid);
     574             :                 return false;
     575             :         }
     576             : 
     577             :         /*
     578             :          * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
     579             :          * work on the mm anymore. The check for MMF_OOM_SKIP must run
     580             :          * under mmap_lock for reading because it serializes against the
     581             :          * mmap_write_lock();mmap_write_unlock() cycle in exit_mmap().
     582             :          */
     583           0 :         if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
     584             :                 trace_skip_task_reaping(tsk->pid);
     585             :                 goto out_unlock;
     586             :         }
     587             : 
     588           0 :         trace_start_task_reaping(tsk->pid);
     589             : 
     590             :         /* failed to reap part of the address space. Try again later */
     591           0 :         ret = __oom_reap_task_mm(mm);
     592           0 :         if (!ret)
     593             :                 goto out_finish;
     594             : 
     595           0 :         pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
     596             :                         task_pid_nr(tsk), tsk->comm,
     597             :                         K(get_mm_counter(mm, MM_ANONPAGES)),
     598             :                         K(get_mm_counter(mm, MM_FILEPAGES)),
     599             :                         K(get_mm_counter(mm, MM_SHMEMPAGES)));
     600             : out_finish:
     601           0 :         trace_finish_task_reaping(tsk->pid);
     602             : out_unlock:
     603           0 :         mmap_read_unlock(mm);
     604             : 
     605           0 :         return ret;
     606             : }
     607             : 
     608             : #define MAX_OOM_REAP_RETRIES 10 /* cap on oom_reap_task_mm() attempts per victim */
     609           0 : static void oom_reap_task(struct task_struct *tsk)
     610             : {       /* Reap tsk->signal->oom_mm with bounded retries, then flag the mm MMF_OOM_SKIP and drop the task reference. */
     611           0 :         int attempts = 0;
     612           0 :         struct mm_struct *mm = tsk->signal->oom_mm;
     613             : 
     614             :         /* Retry oom_reap_task_mm(), and with it mmap_read_trylock(mm), calling schedule_timeout_idle(HZ/10) after each failure */
     615           0 :         while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
     616           0 :                 schedule_timeout_idle(HZ/10);
     617             : 
     618           0 :         if (attempts <= MAX_OOM_REAP_RETRIES || /* only exceeded when every attempt failed */
     619           0 :             test_bit(MMF_OOM_SKIP, &mm->flags))
     620             :                 goto done;
     621             : 
     622           0 :         pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
     623             :                 task_pid_nr(tsk), tsk->comm);
     624           0 :         sched_show_task(tsk);
     625             :         debug_show_all_locks();
     626             : 
     627             : done:
     628           0 :         tsk->oom_reaper_list = NULL;
     629             : 
     630             :         /*
     631             :          * Hide this mm from OOM killer because it has been either reaped or
     632             :          * somebody can't call mmap_write_unlock(mm).
     633             :          */
     634           0 :         set_bit(MMF_OOM_SKIP, &mm->flags);
     635             : 
     636             :         /* Drop a reference taken by queue_oom_reaper */
     637           0 :         put_task_struct(tsk);
     638           0 : }
     639             : 
     640           1 : static int oom_reaper(void *unused)
     641             : {       /* Thread function handed to kthread_run() in oom_init(): pops tasks off oom_reaper_list and reaps them, forever. */
     642           1 :         set_freezable();
     643             : 
     644             :         while (true) {
     645           1 :                 struct task_struct *tsk = NULL;
     646             : 
     647           1 :                 wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
     648           0 :                 spin_lock_irq(&oom_reaper_lock);
     649           0 :                 if (oom_reaper_list != NULL) {  /* re-checked under oom_reaper_lock */
     650           0 :                         tsk = oom_reaper_list;
     651           0 :                         oom_reaper_list = tsk->oom_reaper_list; /* unlink the list head */
     652             :                 }
     653           0 :                 spin_unlock_irq(&oom_reaper_lock);
     654             : 
     655           0 :                 if (tsk)
     656           0 :                         oom_reap_task(tsk);     /* called after oom_reaper_lock has been released */
     657             :         }
     658             : 
     659             :         return 0;       /* not reached: the loop above has no exit */
     660             : }
     661             : 
     662           0 : static void wake_oom_reaper(struct timer_list *timer)
     663             : {       /* Callback of the oom_reaper_timer armed in queue_oom_reaper(): hands the owning task over to the oom_reaper thread. */
     664           0 :         struct task_struct *tsk = container_of(timer, struct task_struct,
     665             :                         oom_reaper_timer);
     666           0 :         struct mm_struct *mm = tsk->signal->oom_mm;
     667             :         unsigned long flags;
     668             : 
     669             :         /* The victim managed to terminate on its own - see exit_mmap */
     670           0 :         if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
     671           0 :                 put_task_struct(tsk);   /* nothing to queue: drop the reference taken in queue_oom_reaper() */
     672           0 :                 return;
     673             :         }
     674             : 
     675           0 :         spin_lock_irqsave(&oom_reaper_lock, flags);
     676           0 :         tsk->oom_reaper_list = oom_reaper_list; /* push tsk at the head of the list */
     677           0 :         oom_reaper_list = tsk;
     678           0 :         spin_unlock_irqrestore(&oom_reaper_lock, flags);
     679           0 :         trace_wake_reaper(tsk->pid);
     680           0 :         wake_up(&oom_reaper_wait);      /* the wait queue oom_reaper() blocks on */
     681             : }
     682             : 
     683             : /*
     684             :  * Give the OOM victim time to exit naturally before invoking the oom_reaper.
     685             :  * The timer's timeout is arbitrary... the longer it is, the longer the worst
     686             :  * case scenario for the OOM can take. If it is too small, the oom_reaper can
     687             :  * get in the way and release resources needed by the process exit path.
     688             :  * e.g. The futex robust list can sit in Anon|Private memory that gets reaped
     689             :  * before the exit path is able to wake the futex waiters.
     690             :  */
     691             : #define OOM_REAPER_DELAY (2*HZ)
     692           0 : static void queue_oom_reaper(struct task_struct *tsk)
     693             : {       /* Arm tsk->oom_reaper_timer with wake_oom_reaper() as callback and an expiry of jiffies + OOM_REAPER_DELAY, at most once per mm. */
     694             :         /* mm is already queued? */
     695           0 :         if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
     696             :                 return;
     697             : 
     698           0 :         get_task_struct(tsk);   /* dropped again by wake_oom_reaper() or oom_reap_task() */
     699           0 :         timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
     700           0 :         tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
     701           0 :         add_timer(&tsk->oom_reaper_timer);
     702             : }
     703             : 
     704             : #ifdef CONFIG_SYSCTL
     705             : static struct ctl_table vm_oom_kill_table[] = {        /* passed to register_sysctl_init("vm", ...) in oom_init() */
     706             :         {
     707             :                 .procname       = "panic_on_oom",       /* value is interpreted by check_panic_on_oom() */
     708             :                 .data           = &sysctl_panic_on_oom,
     709             :                 .maxlen         = sizeof(sysctl_panic_on_oom),
     710             :                 .mode           = 0644,
     711             :                 .proc_handler   = proc_dointvec_minmax,
     712             :                 .extra1         = SYSCTL_ZERO,  /* NOTE(review): with extra2, presumably the 0..2 range enforced by proc_dointvec_minmax - confirm */
     713             :                 .extra2         = SYSCTL_TWO,
     714             :         },
     715             :         {
     716             :                 .procname       = "oom_kill_allocating_task",   /* read by out_of_memory() */
     717             :                 .data           = &sysctl_oom_kill_allocating_task,
     718             :                 .maxlen         = sizeof(sysctl_oom_kill_allocating_task),
     719             :                 .mode           = 0644,
     720             :                 .proc_handler   = proc_dointvec,
     721             :         },
     722             :         {
     723             :                 .procname       = "oom_dump_tasks",
     724             :                 .data           = &sysctl_oom_dump_tasks,
     725             :                 .maxlen         = sizeof(sysctl_oom_dump_tasks),
     726             :                 .mode           = 0644,
     727             :                 .proc_handler   = proc_dointvec,
     728             :         },
     729             :         {}      /* empty last entry (presumably the table terminator - confirm) */
     730             : };
     731             : #endif
     732             : 
     733           1 : static int __init oom_init(void)
     734             : {       /* Start the "oom_reaper" kernel thread and, with CONFIG_SYSCTL, register vm_oom_kill_table under "vm". Always returns 0. */
     735           2 :         oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");   /* result is stored, not checked here */
     736             : #ifdef CONFIG_SYSCTL
     737           1 :         register_sysctl_init("vm", vm_oom_kill_table);
     738             : #endif
     739           1 :         return 0;
     740             : }
     741             : subsys_initcall(oom_init)
     742             : #else
     743             : static inline void queue_oom_reaper(struct task_struct *tsk)
     744             : {       /* intentionally empty: stub compiled on the #else side of the CONFIG_MMU conditional */
     745             : }
     746             : #endif /* CONFIG_MMU */
     747             : 
     748             : /**
     749             :  * mark_oom_victim - mark the given task as OOM victim
     750             :  * @tsk: task to mark
     751             :  *
     752             :  * Has to be called with oom_lock held and never after
     753             :  * oom has been disabled already (WARN_ON(oom_killer_disabled) checks the latter).
     754             :  *
     755             :  * tsk->mm has to be non NULL and caller has to guarantee it is stable (either
     756             :  * under task_lock or operate on the current).
     757             :  */
     758           0 : static void mark_oom_victim(struct task_struct *tsk)
     759             : {
     760           0 :         struct mm_struct *mm = tsk->mm;
     761             : 
     762           0 :         WARN_ON(oom_killer_disabled);
     763             :         /* OOM killer might race with memcg OOM */
     764           0 :         if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
     765             :                 return; /* flag was already set: do not count this victim a second time */
     766             : 
     767             :         /* oom_mm is bound to the signal struct life time. */
     768           0 :         if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
     769           0 :                 mmgrab(tsk->signal->oom_mm);    /* only the caller that installed oom_mm grabs the mm */
     770             : 
     771             :         /*
     772             :          * Make sure that the task is woken up from uninterruptible sleep
     773             :          * if it is frozen because OOM killer wouldn't be able to free
     774             :          * any memory and livelock. freezing_slow_path will tell the freezer
     775             :          * that TIF_MEMDIE tasks should be ignored.
     776             :          */
     777           0 :         __thaw_task(tsk);
     778           0 :         atomic_inc(&oom_victims);       /* decremented again in exit_oom_victim() */
     779           0 :         trace_mark_victim(tsk->pid);
     780             : }
     781             : 
     782             : /**
     783             :  * exit_oom_victim - note the exit of an OOM victim
     784             :  */
     785           0 : void exit_oom_victim(void)
     786             : {
     787           0 :         clear_thread_flag(TIF_MEMDIE);  /* the flag mark_oom_victim() sets */
     788             : 
     789           0 :         if (!atomic_dec_return(&oom_victims))   /* counter reached zero */
     790           0 :                 wake_up_all(&oom_victims_wait); /* oom_killer_disable() waits here for oom_victims == 0 */
     791           0 : }
     792             : 
     793             : /**
     794             :  * oom_killer_enable - enable OOM killer
     795             :  */
     796           0 : void oom_killer_enable(void)
     797             : {
     798           0 :         oom_killer_disabled = false;    /* out_of_memory() returns false right away while this is true */
     799           0 :         pr_info("OOM killer enabled.\n");
     800           0 : }
     801             : 
     802             : /**
     803             :  * oom_killer_disable - disable OOM killer
     804             :  * @timeout: maximum timeout to wait for oom victims in jiffies
     805             :  *
     806             :  * Forces all page allocations to fail rather than trigger OOM killer.
     807             :  * Will block and wait until all OOM victims are killed or the given
     808             :  * timeout expires.
     809             :  *
     810             :  * The function cannot be called when there are runnable user tasks because
     811             :  * the userspace would see unexpected allocation failures as a result. Any
     812             :  * new usage of this function should be consulted with MM people.
     813             :  *
     814             :  * Returns true if successful and false if the OOM killer cannot be
     815             :  * disabled; when the wait fails, the OOM killer is re-enabled before returning.
     816             :  */
     817           0 : bool oom_killer_disable(signed long timeout)
     818             : {
     819             :         signed long ret;
     820             : 
     821             :         /*
     822             :          * Make sure to not race with an ongoing OOM killer. Check that the
     823             :          * current is not killed (possibly due to sharing the victim's memory).
     824             :          */
     825           0 :         if (mutex_lock_killable(&oom_lock))
     826             :                 return false;   /* lock not taken; oom_killer_disabled is left unchanged */
     827           0 :         oom_killer_disabled = true;     /* flipped while holding oom_lock */
     828           0 :         mutex_unlock(&oom_lock);
     829             : 
     830           0 :         ret = wait_event_interruptible_timeout(oom_victims_wait,
     831             :                         !atomic_read(&oom_victims), timeout);
     832           0 :         if (ret <= 0) { /* NOTE(review): presumably 0 = timed out, negative = interrupted - confirm */
     833             :                 oom_killer_enable();    /* undo the disable done above */
     834           0 :                 return false;
     835             :         }
     836           0 :         pr_info("OOM killer disabled.\n");
     837             : 
     838           0 :         return true;
     839             : }
     840             : 
     841             : static inline bool __task_will_free_mem(struct task_struct *task)
     842             : {       /* Per-task half of task_will_free_mem(): looks only at task's signal struct and flags, not at who else shares the mm. */
     843           0 :         struct signal_struct *sig = task->signal;
     844             : 
     845             :         /*
     846             :          * A coredumping process may sleep for an extended period in
     847             :          * coredump_task_exit(), so the oom killer cannot assume that
     848             :          * the process will promptly exit and release memory.
     849             :          */
     850           0 :         if (sig->core_state)
     851             :                 return false;
     852             : 
     853           0 :         if (sig->flags & SIGNAL_GROUP_EXIT)     /* checked on the shared signal struct, not per thread */
     854             :                 return true;
     855             : 
     856           0 :         if (thread_group_empty(task) && (task->flags & PF_EXITING))     /* PF_EXITING only counts when thread_group_empty() holds */
     857             :                 return true;
     858             : 
     859             :         return false;
     860             : }
     861             : 
     862             : /*
     863             :  * Checks whether the given task is dying or exiting and likely to
     864             :  * release its address space. This means that all threads and processes
     865             :  * sharing the same mm have to be killed or exiting.
     866             :  * Caller has to make sure that task->mm is stable (hold task_lock or
     867             :  * it operates on the current). Returns false for a task without mm or whose mm has MMF_OOM_SKIP set.
     868             :  */
     869           0 : static bool task_will_free_mem(struct task_struct *task)
     870             : {
     871           0 :         struct mm_struct *mm = task->mm;
     872             :         struct task_struct *p;
     873           0 :         bool ret = true;        /* result if no other sharing process objects in the loop below */
     874             : 
     875             :         /*
     876             :          * Skip tasks without mm because it might have passed its exit_mm and
     877             :          * exit_oom_victim. oom_reaper could have rescued that but do not rely
     878             :          * on that for now. We can consider find_lock_task_mm in future.
     879             :          */
     880           0 :         if (!mm)
     881             :                 return false;
     882             : 
     883           0 :         if (!__task_will_free_mem(task))
     884             :                 return false;
     885             : 
     886             :         /*
     887             :          * This task has already been drained by the oom reaper so there are
     888             :          * only small chances it will free some more
     889             :          */
     890           0 :         if (test_bit(MMF_OOM_SKIP, &mm->flags))
     891             :                 return false;
     892             : 
     893           0 :         if (atomic_read(&mm->mm_users) <= 1)    /* shortcut: skips the process scan below */
     894             :                 return true;
     895             : 
     896             :         /*
     897             :          * Make sure that all tasks which share the mm with the given tasks
     898             :          * are dying as well to make sure that a) nobody pins its mm and
     899             :          * b) the task is also reapable by the oom reaper.
     900             :          */
     901             :         rcu_read_lock();
     902           0 :         for_each_process(p) {
     903           0 :                 if (!process_shares_mm(p, mm))
     904           0 :                         continue;
     905           0 :                 if (same_thread_group(task, p)) /* task's own group was already checked above */
     906           0 :                         continue;
     907           0 :                 ret = __task_will_free_mem(p);
     908           0 :                 if (!ret)
     909             :                         break;  /* one sharer that is not exiting settles the answer */
     910             :         }
     911             :         rcu_read_unlock();
     912             : 
     913           0 :         return ret;
     914             : }
     915             : 
     916           0 : static void __oom_kill_process(struct task_struct *victim, const char *message)
     917             : {       /* SIGKILL victim and every other process sharing its mm, mark victim as OOM victim, and queue it for the reaper unless global init shares the mm. Consumes the caller's reference on victim. */
     918             :         struct task_struct *p;
     919             :         struct mm_struct *mm;
     920           0 :         bool can_oom_reap = true;
     921             : 
     922           0 :         p = find_lock_task_mm(victim);
     923           0 :         if (!p) {
     924           0 :                 pr_info("%s: OOM victim %d (%s) is already exiting. Skip killing the task\n",
     925             :                         message, task_pid_nr(victim), victim->comm);
     926           0 :                 put_task_struct(victim);
     927           0 :                 return;
     928           0 :         } else if (victim != p) {       /* a different task was returned: move our reference over to it */
     929           0 :                 get_task_struct(p);
     930           0 :                 put_task_struct(victim);
     931           0 :                 victim = p;
     932             :         }
     933             : 
     934             :         /* Get a reference to safely compare mm after task_unlock(victim) */
     935           0 :         mm = victim->mm;
     936           0 :         mmgrab(mm);     /* released by mmdrop() at the end of this function */
     937             : 
     938             :         /* Raise event before sending signal: task reaper must see this */
     939           0 :         count_vm_event(OOM_KILL);
     940           0 :         memcg_memory_event_mm(mm, MEMCG_OOM_KILL);
     941             : 
     942             :         /*
     943             :          * We should send SIGKILL before granting access to memory reserves
     944             :          * in order to prevent the OOM victim from depleting the memory
     945             :          * reserves from the user space under its control.
     946             :          */
     947           0 :         do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
     948           0 :         mark_oom_victim(victim);
     949           0 :         pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
     950             :                 message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
     951             :                 K(get_mm_counter(mm, MM_ANONPAGES)),
     952             :                 K(get_mm_counter(mm, MM_FILEPAGES)),
     953             :                 K(get_mm_counter(mm, MM_SHMEMPAGES)),
     954             :                 from_kuid(&init_user_ns, task_uid(victim)),
     955             :                 mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj);
     956           0 :         task_unlock(victim);    /* NOTE(review): presumably pairs with a task lock taken inside find_lock_task_mm() - confirm */
     957             : 
     958             :         /*
     959             :          * Kill all user processes sharing victim->mm in other thread groups, if
     960             :          * any.  They don't get access to memory reserves, though, to avoid
     961             :          * depletion of all memory.  This prevents mm->mmap_lock livelock when an
     962             :          * oom killed thread cannot exit because it requires the semaphore and
     963             :          * it's contended by another thread trying to allocate memory itself.
     964             :          * That thread will now get access to memory reserves since it has a
     965             :          * pending fatal signal.
     966             :          */
     967             :         rcu_read_lock();
     968           0 :         for_each_process(p) {
     969           0 :                 if (!process_shares_mm(p, mm))
     970           0 :                         continue;
     971           0 :                 if (same_thread_group(p, victim))       /* victim's group already got SIGKILL above */
     972           0 :                         continue;
     973           0 :                 if (is_global_init(p)) {        /* global init shares the mm: neither kill it nor let the reaper touch the mm */
     974           0 :                         can_oom_reap = false;
     975           0 :                         set_bit(MMF_OOM_SKIP, &mm->flags);
     976           0 :                         pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
     977             :                                         task_pid_nr(victim), victim->comm,
     978             :                                         task_pid_nr(p), p->comm);
     979           0 :                         continue;
     980             :                 }
     981             :                 /*
     982             :                  * No kthread_use_mm() user needs to read from the userspace so
     983             :                  * we are ok to reap it.
     984             :                  */
     985           0 :                 if (unlikely(p->flags & PF_KTHREAD))
     986           0 :                         continue;
     987           0 :                 do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
     988             :         }
     989             :         rcu_read_unlock();
     990             : 
     991           0 :         if (can_oom_reap)
     992           0 :                 queue_oom_reaper(victim);
     993             : 
     994           0 :         mmdrop(mm);
     995           0 :         put_task_struct(victim);        /* the reference handed in by the caller (or moved to p above) */
     996             : }
     997             : #undef K
     998             : 
     999             : /*
    1000             :  * Kill provided task unless it's secured by setting
    1001             :  * oom_score_adj to OOM_SCORE_ADJ_MIN or it is the global init. Always returns 0.
    1002             :  */
    1003             : static int oom_kill_memcg_member(struct task_struct *task, void *message)
    1004             : {
    1005             :         if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
    1006             :             !is_global_init(task)) {
    1007             :                 get_task_struct(task);  /* __oom_kill_process() consumes this reference */
    1008             :                 __oom_kill_process(task, message);
    1009             :         }
    1010             :         return 0;
    1011             : }
    1012             : 
    1013           0 : static void oom_kill_process(struct oom_control *oc, const char *message)
    1014             : {       /* Handle oc->chosen: an already-exiting task is only marked and queued for the reaper; otherwise it is killed, together with its memcg OOM group if one is returned. The reference on oc->chosen is dropped either way. */
    1015           0 :         struct task_struct *victim = oc->chosen;
    1016             :         struct mem_cgroup *oom_group;
    1017             :         static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
    1018             :                                               DEFAULT_RATELIMIT_BURST);
    1019             : 
    1020             :         /*
    1021             :          * If the task is already exiting, don't alarm the sysadmin or kill
    1022             :          * its children or threads, just give it access to memory reserves
    1023             :          * so it can die quickly
    1024             :          */
    1025           0 :         task_lock(victim);      /* task_will_free_mem() and mark_oom_victim() need a stable victim->mm */
    1026           0 :         if (task_will_free_mem(victim)) {
    1027           0 :                 mark_oom_victim(victim);
    1028           0 :                 queue_oom_reaper(victim);
    1029           0 :                 task_unlock(victim);
    1030           0 :                 put_task_struct(victim);
    1031           0 :                 return;
    1032             :         }
    1033           0 :         task_unlock(victim);
    1034             : 
    1035           0 :         if (__ratelimit(&oom_rs))       /* only the dump is rate limited, not the kill */
    1036           0 :                 dump_header(oc, victim);
    1037             : 
    1038             :         /*
    1039             :          * Do we need to kill the entire memory cgroup?
    1040             :          * Or even one of the ancestor memory cgroups?
    1041             :          * Check this out before killing the victim task.
    1042             :          */
    1043           0 :         oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
    1044             : 
    1045           0 :         __oom_kill_process(victim, message);    /* consumes the reference on victim */
    1046             : 
    1047             :         /*
    1048             :          * If necessary, kill all tasks in the selected memory cgroup.
    1049             :          */
    1050             :         if (oom_group) {
    1051             :                 memcg_memory_event(oom_group, MEMCG_OOM_GROUP_KILL);
    1052             :                 mem_cgroup_print_oom_group(oom_group);
    1053             :                 mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
    1054             :                                       (void *)message);
    1055             :                 mem_cgroup_put(oom_group);
    1056             :         }
    1057             : }
    1058             : 
    1059             : /*
    1060             :  * Determines whether the kernel must panic because of the panic_on_oom sysctl, and calls panic() if so.
    1061             :  */
    1062           0 : static void check_panic_on_oom(struct oom_control *oc)
    1063             : {
    1064           0 :         if (likely(!sysctl_panic_on_oom))       /* 0: never panic here */
    1065             :                 return;
    1066           0 :         if (sysctl_panic_on_oom != 2) {
    1067             :                 /*
    1068             :                  * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
    1069             :                  * does not panic for cpuset, mempolicy, or memcg allocation
    1070             :                  * failures.
    1071             :                  */
    1072           0 :                 if (oc->constraint != CONSTRAINT_NONE)
    1073             :                         return;
    1074             :         }
    1075             :         /* Do not panic for oom kills triggered by sysrq */
    1076           0 :         if (is_sysrq_oom(oc))
    1077             :                 return;
    1078           0 :         dump_header(oc, NULL);  /* no victim task is passed for the dump */
    1079           0 :         panic("Out of memory: %s panic_on_oom is enabled\n",
    1080           0 :                 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
    1081             : }
    1082             : 
    1083             : static BLOCKING_NOTIFIER_HEAD(oom_notify_list); /* chain that out_of_memory() calls for non-memcg OOMs */
    1084             : 
    1085           0 : int register_oom_notifier(struct notifier_block *nb)
    1086             : {       /* Add nb to oom_notify_list; the result of blocking_notifier_chain_register() is returned unchanged. */
    1087           0 :         return blocking_notifier_chain_register(&oom_notify_list, nb);
    1088             : }
    1089             : EXPORT_SYMBOL_GPL(register_oom_notifier);
    1090             : 
    1091           0 : int unregister_oom_notifier(struct notifier_block *nb)
    1092             : {       /* Remove nb from oom_notify_list; the result of blocking_notifier_chain_unregister() is returned unchanged. */
    1093           0 :         return blocking_notifier_chain_unregister(&oom_notify_list, nb);
    1094             : }
    1095             : EXPORT_SYMBOL_GPL(unregister_oom_notifier);
    1096             : 
    1097             : /**
    1098             :  * out_of_memory - kill the "best" process when we run out of memory
    1099             :  * @oc: pointer to struct oom_control; ->constraint, ->nodemask and ->chosen may be written here
    1100             :  *
    1101             :  * If we run out of memory, we have the choice between either
    1102             :  * killing a random task (bad), letting the system crash (worse)
    1103             :  * or trying to be smart about which process to kill. Note that we
    1104             :  * don't have to be perfect here, we just have to be good.
    1105             :  */
    1106           0 : bool out_of_memory(struct oom_control *oc)
    1107             : {
    1108           0 :         unsigned long freed = 0;
    1109             : 
    1110           0 :         if (oom_killer_disabled)
    1111             :                 return false; /* OOM killing is switched off: report that nothing was done */
    1112             : 
    1113           0 :         if (!is_memcg_oom(oc)) { /* the notifier chain is consulted only for non-memcg OOMs */
    1114           0 :                 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
    1115           0 :                 if (freed > 0 && !is_sysrq_oom(oc))
    1116             :                         /* Notifiers got some memory back in the last second; skip the kill unless this is a sysrq OOM. */
    1117             :                         return true;
    1118             :         }
    1119             : 
    1120             :         /*
    1121             :          * If current has a pending SIGKILL or is exiting, then automatically
    1122             :          * select it.  The goal is to allow it to allocate so that it may
    1123             :          * quickly exit and free its memory.
    1124             :          */
    1125           0 :         if (task_will_free_mem(current)) {
    1126           0 :                 mark_oom_victim(current);
    1127           0 :                 queue_oom_reaper(current);
    1128           0 :                 return true;
    1129             :         }
    1130             : 
    1131             :         /*
    1132             :          * The OOM killer does not compensate for IO-less reclaim.
    1133             :          * pagefault_out_of_memory lost its gfp context so we have to
    1134             :          * make sure to exclude the 0 mask - all other users should have at least
    1135             :          * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
    1136             :          * invoke the OOM killer even if it is a GFP_NOFS allocation.
    1137             :          */
    1138           0 :         if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
    1139             :                 return true; /* NOTE(review): pagefault_out_of_memory() as shown in this file does not call out_of_memory(); confirm the 0-mask remark is still current */
    1140             : 
    1141             :         /*
    1142             :          * Check if there were limitations on the allocation (only relevant for
    1143             :          * NUMA and memcg) that may require different handling.
    1144             :          */
    1145           0 :         oc->constraint = constrained_alloc(oc);
    1146           0 :         if (oc->constraint != CONSTRAINT_MEMORY_POLICY)
    1147           0 :                 oc->nodemask = NULL; /* the nodemask is kept only for memory-policy constrained allocations */
    1148           0 :         check_panic_on_oom(oc);
    1149             : 
    1150           0 :         if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
    1151           0 :             current->mm && !oom_unkillable_task(current) &&
    1152           0 :             oom_cpuset_eligible(current, oc) &&
    1153           0 :             current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
    1154           0 :                 get_task_struct(current); /* reference for oc->chosen; presumably dropped by oom_kill_process() - confirm */
    1155           0 :                 oc->chosen = current;
    1156           0 :                 oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
    1157           0 :                 return true;
    1158             :         }
    1159             : 
    1160           0 :         select_bad_process(oc); /* the result is read back from oc->chosen below */
    1161             :         /* Found nothing?!?! */
    1162           0 :         if (!oc->chosen) {
    1163           0 :                 dump_header(oc, NULL);
    1164           0 :                 pr_warn("Out of memory and no killable processes...\n");
    1165             :                 /*
    1166             :                  * If we got here due to an actual allocation at the
    1167             :                  * system level, we cannot survive this and will enter
    1168             :                  * an endless loop in the allocator. Bail out now.
    1169             :                  */
    1170           0 :                 if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
    1171           0 :                         panic("System is deadlocked on memory\n");
    1172             :         }
    1173           0 :         if (oc->chosen && oc->chosen != (void *)-1UL) /* (void *)-1UL: presumably a marker from select_bad_process() meaning no kill is to be issued - confirm */
    1174           0 :                 oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
    1175             :                                  "Memory cgroup out of memory");
    1176           0 :         return !!oc->chosen; /* false only when no victim was chosen; the (void *)-1UL marker still yields true */
    1177             : }
    1178             : 
    1179             : /*
    1180             :  * The pagefault handler calls here because some allocation has failed. We have
    1181             :  * to take care of the memcg OOM here because this is the only safe context without
    1182             :  * any locks held, but we leave the global OOM to the OOM killer triggered from the
    1183             :  * allocation context; apart from that only a rate-limited warning is printed.
    1184             :  */
    1185           0 : void pagefault_out_of_memory(void)
    1186             : {
    1187             :         static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL,
    1188             :                                       DEFAULT_RATELIMIT_BURST);
    1189             : 
    1190           0 :         if (mem_cgroup_oom_synchronize(true))
    1191             :                 return; /* a true result from the memcg OOM synchronization ends the handling here */
    1192             : 
    1193           0 :         if (fatal_signal_pending(current))
    1194             :                 return; /* current already has a fatal signal pending: skip the warning */
    1195             : 
    1196           0 :         if (__ratelimit(&pfoom_rs)) /* emit the warning only when the pfoom_rs rate limit allows it */
    1197           0 :                 pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n");
    1198             : }
    1199             : 
    1200           0 : SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) /* reap the mm of the task behind pidfd when task_will_free_mem() says it is going away */
    1201             : {
    1202             : #ifdef CONFIG_MMU
    1203           0 :         struct mm_struct *mm = NULL;
    1204             :         struct task_struct *task;
    1205             :         struct task_struct *p;
    1206             :         unsigned int f_flags; /* filled in by pidfd_get_task(); not read afterwards in this function */
    1207           0 :         bool reap = false;
    1208           0 :         long ret = 0;
    1209             : 
    1210           0 :         if (flags)
    1211             :                 return -EINVAL; /* no flag bits are accepted: any non-zero value is rejected */
    1212             : 
    1213           0 :         task = pidfd_get_task(pidfd, &f_flags);
    1214           0 :         if (IS_ERR(task))
    1215           0 :                 return PTR_ERR(task);
    1216             : 
    1217             :         /*
    1218             :          * Make sure to choose a thread which still has a reference to mm
    1219             :          * during the group exit
    1220             :          */
    1221           0 :         p = find_lock_task_mm(task);
    1222           0 :         if (!p) {
    1223             :                 ret = -ESRCH;
    1224             :                 goto put_task; /* mm is still NULL here, so the mmdrop() at drop_mm is skipped */
    1225             :         }
    1226             : 
    1227           0 :         mm = p->mm;
    1228           0 :         mmgrab(mm); /* paired with the mmdrop() at drop_mm */
    1229             : 
    1230           0 :         if (task_will_free_mem(p))
    1231             :                 reap = true;
    1232             :         else {
    1233             :                 /* Error only if the work has not been done already */
    1234           0 :                 if (!test_bit(MMF_OOM_SKIP, &mm->flags))
    1235           0 :                         ret = -EINVAL;
    1236             :         }
    1237             :         task_unlock(p); /* presumably pairs with the lock taken by find_lock_task_mm() - confirm */
    1238             : 
    1239           0 :         if (!reap)
    1240             :                 goto drop_mm; /* ret is 0 if MMF_OOM_SKIP was already set, -EINVAL otherwise */
    1241             : 
    1242           0 :         if (mmap_read_lock_killable(mm)) {
    1243             :                 ret = -EINTR; /* any failure to take the killable read lock is reported as -EINTR */
    1244             :                 goto drop_mm;
    1245             :         }
    1246             :         /*
    1247             :          * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure
    1248             :          * possible change in exit_mmap is seen
    1249             :          */
    1250           0 :         if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm))
    1251           0 :                 ret = -EAGAIN; /* reported when __oom_reap_task_mm() returns false */
    1252             :         mmap_read_unlock(mm);
    1253             : 
    1254             : drop_mm:
    1255             :         mmdrop(mm);
    1256             : put_task:
    1257           0 :         put_task_struct(task); /* drops the task obtained from pidfd_get_task() */
    1258           0 :         return ret;
    1259             : #else
    1260             :         return -ENOSYS; /* without CONFIG_MMU the syscall does nothing but fail */
    1261             : #endif /* CONFIG_MMU */
    1262             : }

Generated by: LCOV version 1.14