LCOV - code coverage report
Current view: top level - mm - page_alloc.c (source / functions)
Test: coverage.info
Date: 2023-08-24 13:40:31
             Hit    Total    Coverage
Lines:       555    1450     38.3 %
Functions:   50     122      41.0 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  *  linux/mm/page_alloc.c
       4             :  *
       5             :  *  Manages the free list; the system allocates free pages here.
       6             :  *  Note that kmalloc() lives in slab.c
       7             :  *
       8             :  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
       9             :  *  Swap reorganised 29.12.95, Stephen Tweedie
      10             :  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
      11             :  *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
      12             :  *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
      13             :  *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
      14             :  *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
      15             :  *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
      16             :  */
      17             : 
      18             : #include <linux/stddef.h>
      19             : #include <linux/mm.h>
      20             : #include <linux/highmem.h>
      21             : #include <linux/interrupt.h>
      22             : #include <linux/jiffies.h>
      23             : #include <linux/compiler.h>
      24             : #include <linux/kernel.h>
      25             : #include <linux/kasan.h>
      26             : #include <linux/kmsan.h>
      27             : #include <linux/module.h>
      28             : #include <linux/suspend.h>
      29             : #include <linux/ratelimit.h>
      30             : #include <linux/oom.h>
      31             : #include <linux/topology.h>
      32             : #include <linux/sysctl.h>
      33             : #include <linux/cpu.h>
      34             : #include <linux/cpuset.h>
      35             : #include <linux/memory_hotplug.h>
      36             : #include <linux/nodemask.h>
      37             : #include <linux/vmstat.h>
      38             : #include <linux/fault-inject.h>
      39             : #include <linux/compaction.h>
      40             : #include <trace/events/kmem.h>
      41             : #include <trace/events/oom.h>
      42             : #include <linux/prefetch.h>
      43             : #include <linux/mm_inline.h>
      44             : #include <linux/mmu_notifier.h>
      45             : #include <linux/migrate.h>
      46             : #include <linux/sched/mm.h>
      47             : #include <linux/page_owner.h>
      48             : #include <linux/page_table_check.h>
      49             : #include <linux/memcontrol.h>
      50             : #include <linux/ftrace.h>
      51             : #include <linux/lockdep.h>
      52             : #include <linux/psi.h>
      53             : #include <linux/khugepaged.h>
      54             : #include <linux/delayacct.h>
      55             : #include <asm/div64.h>
      56             : #include "internal.h"
      57             : #include "shuffle.h"
      58             : #include "page_reporting.h"
      59             : 
      60             : /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
      61             : typedef int __bitwise fpi_t;
      62             : 
      63             : /* No special request */
      64             : #define FPI_NONE                ((__force fpi_t)0)
      65             : 
      66             : /*
      67             :  * Skip free page reporting notification for the (possibly merged) page.
      68             :  * This does not hinder free page reporting from grabbing the page,
      69             :  * reporting it and marking it "reported" -  it only skips notifying
      70             :  * the free page reporting infrastructure about a newly freed page. For
      71             :  * example, used when temporarily pulling a page from a freelist and
      72             :  * putting it back unmodified.
      73             :  */
      74             : #define FPI_SKIP_REPORT_NOTIFY  ((__force fpi_t)BIT(0))
      75             : 
      76             : /*
      77             :  * Place the (possibly merged) page to the tail of the freelist. Will ignore
      78             :  * page shuffling (relevant code - e.g., memory onlining - is expected to
      79             :  * shuffle the whole zone).
      80             :  *
      81             :  * Note: No code should rely on this flag for correctness - it's purely
      82             :  *       to allow for optimizations when handing back either fresh pages
      83             :  *       (memory onlining) or untouched pages (page isolation, free page
      84             :  *       reporting).
      85             :  */
      86             : #define FPI_TO_TAIL             ((__force fpi_t)BIT(1))
      87             : 
      88             : /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
      89             : static DEFINE_MUTEX(pcp_batch_high_lock);
      90             : #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
      91             : 
      92             : #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
      93             : /*
      94             :  * On SMP, spin_trylock is sufficient protection.
      95             :  * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
      96             :  */
      97             : #define pcp_trylock_prepare(flags)      do { } while (0)
      98             : #define pcp_trylock_finish(flag)        do { } while (0)
      99             : #else
     100             : 
     101             : /* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
     102             : #define pcp_trylock_prepare(flags)      local_irq_save(flags)
     103             : #define pcp_trylock_finish(flags)       local_irq_restore(flags)
     104             : #endif
     105             : 
     106             : /*
     107             :  * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
     108             :  * a migration causing the wrong PCP to be locked and remote memory being
     109             :  * potentially allocated, pin the task to the CPU for the lookup+lock.
     110             :  * preempt_disable is used on !RT because it is faster than migrate_disable.
     111             :  * migrate_disable is used on RT because otherwise RT spinlock usage is
     112             :  * interfered with and a high priority task cannot preempt the allocator.
     113             :  */
     114             : #ifndef CONFIG_PREEMPT_RT
     115             : #define pcpu_task_pin()         preempt_disable()
     116             : #define pcpu_task_unpin()       preempt_enable()
     117             : #else
     118             : #define pcpu_task_pin()         migrate_disable()
     119             : #define pcpu_task_unpin()       migrate_enable()
     120             : #endif
     121             : 
     122             : /*
      123             :  * Generic helper to look up a per-cpu variable with an embedded spinlock.
     124             :  * Return value should be used with equivalent unlock helper.
     125             :  */
     126             : #define pcpu_spin_lock(type, member, ptr)                               \
     127             : ({                                                                      \
     128             :         type *_ret;                                                     \
     129             :         pcpu_task_pin();                                                \
     130             :         _ret = this_cpu_ptr(ptr);                                       \
     131             :         spin_lock(&_ret->member);                                        \
     132             :         _ret;                                                           \
     133             : })
     134             : 
     135             : #define pcpu_spin_trylock(type, member, ptr)                            \
     136             : ({                                                                      \
     137             :         type *_ret;                                                     \
     138             :         pcpu_task_pin();                                                \
     139             :         _ret = this_cpu_ptr(ptr);                                       \
     140             :         if (!spin_trylock(&_ret->member)) {                              \
     141             :                 pcpu_task_unpin();                                      \
     142             :                 _ret = NULL;                                            \
     143             :         }                                                               \
     144             :         _ret;                                                           \
     145             : })
     146             : 
     147             : #define pcpu_spin_unlock(member, ptr)                                   \
     148             : ({                                                                      \
     149             :         spin_unlock(&ptr->member);                                       \
     150             :         pcpu_task_unpin();                                              \
     151             : })
     152             : 
     153             : /* struct per_cpu_pages specific helpers. */
     154             : #define pcp_spin_lock(ptr)                                              \
     155             :         pcpu_spin_lock(struct per_cpu_pages, lock, ptr)
     156             : 
     157             : #define pcp_spin_trylock(ptr)                                           \
     158             :         pcpu_spin_trylock(struct per_cpu_pages, lock, ptr)
     159             : 
     160             : #define pcp_spin_unlock(ptr)                                            \
     161             :         pcpu_spin_unlock(lock, ptr)
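/*
 * Editorial example, not part of the original page_alloc.c: the helpers
 * above are intended to be used as a pin + trylock pair around per-CPU
 * pageset work, roughly as the allocation and free paths later in this
 * file do.  The variable names here are illustrative only:
 *
 *	struct per_cpu_pages *pcp;
 *	unsigned long UP_flags;
 *
 *	pcp_trylock_prepare(UP_flags);
 *	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
 *	if (pcp) {
 *		... operate on this CPU's pcp lists with the lock held ...
 *		pcp_spin_unlock(pcp);
 *	}
 *	pcp_trylock_finish(UP_flags);
 */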
     162             : 
     163             : #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
     164             : DEFINE_PER_CPU(int, numa_node);
     165             : EXPORT_PER_CPU_SYMBOL(numa_node);
     166             : #endif
     167             : 
     168             : DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
     169             : 
     170             : #ifdef CONFIG_HAVE_MEMORYLESS_NODES
     171             : /*
     172             :  * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
     173             :  * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
     174             :  * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
     175             :  * defined in <linux/topology.h>.
     176             :  */
     177             : DEFINE_PER_CPU(int, _numa_mem_);                /* Kernel "local memory" node */
     178             : EXPORT_PER_CPU_SYMBOL(_numa_mem_);
     179             : #endif
     180             : 
     181             : static DEFINE_MUTEX(pcpu_drain_mutex);
     182             : 
     183             : #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
     184             : volatile unsigned long latent_entropy __latent_entropy;
     185             : EXPORT_SYMBOL(latent_entropy);
     186             : #endif
     187             : 
     188             : /*
     189             :  * Array of node states.
     190             :  */
     191             : nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
     192             :         [N_POSSIBLE] = NODE_MASK_ALL,
     193             :         [N_ONLINE] = { { [0] = 1UL } },
     194             : #ifndef CONFIG_NUMA
     195             :         [N_NORMAL_MEMORY] = { { [0] = 1UL } },
     196             : #ifdef CONFIG_HIGHMEM
     197             :         [N_HIGH_MEMORY] = { { [0] = 1UL } },
     198             : #endif
     199             :         [N_MEMORY] = { { [0] = 1UL } },
     200             :         [N_CPU] = { { [0] = 1UL } },
     201             : #endif  /* NUMA */
     202             : };
     203             : EXPORT_SYMBOL(node_states);
     204             : 
     205             : gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
     206             : 
     207             : /*
     208             :  * A cached value of the page's pageblock's migratetype, used when the page is
     209             :  * put on a pcplist. Used to avoid the pageblock migratetype lookup when
     210             :  * freeing from pcplists in most cases, at the cost of possibly becoming stale.
     211             :  * Also the migratetype set in the page does not necessarily match the pcplist
     212             :  * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
     213             :  * other index - this ensures that it will be put on the correct CMA freelist.
     214             :  */
     215             : static inline int get_pcppage_migratetype(struct page *page)
     216             : {
     217           0 :         return page->index;
     218             : }
     219             : 
     220             : static inline void set_pcppage_migratetype(struct page *page, int migratetype)
     221             : {
     222         677 :         page->index = migratetype;
     223             : }
     224             : 
     225             : #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
     226             : unsigned int pageblock_order __read_mostly;
     227             : #endif
     228             : 
     229             : static void __free_pages_ok(struct page *page, unsigned int order,
     230             :                             fpi_t fpi_flags);
     231             : 
     232             : /*
     233             :  * results with 256, 32 in the lowmem_reserve sysctl:
     234             :  *      1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
     235             :  *      1G machine -> (16M dma, 784M normal, 224M high)
     236             :  *      NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
     237             :  *      HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
     238             :  *      HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
     239             :  *
     240             :  * TBD: should special case ZONE_DMA32 machines here - in those we normally
     241             :  * don't need any ZONE_NORMAL reservation
     242             :  */
     243             : static int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
     244             : #ifdef CONFIG_ZONE_DMA
     245             :         [ZONE_DMA] = 256,
     246             : #endif
     247             : #ifdef CONFIG_ZONE_DMA32
     248             :         [ZONE_DMA32] = 256,
     249             : #endif
     250             :         [ZONE_NORMAL] = 32,
     251             : #ifdef CONFIG_HIGHMEM
     252             :         [ZONE_HIGHMEM] = 0,
     253             : #endif
     254             :         [ZONE_MOVABLE] = 0,
     255             : };
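/*
 * Worked numbers (editorial) for the 1G example above: a ratio of 256 for
 * ZONE_DMA reserves roughly 784M / 256 ~= 3M of DMA memory against NORMAL
 * allocations, and a ratio of 32 for ZONE_NORMAL reserves 224M / 32 = 7M
 * of normal memory against HIGHMEM allocations.
 */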
     256             : 
     257             : char * const zone_names[MAX_NR_ZONES] = {
     258             : #ifdef CONFIG_ZONE_DMA
     259             :          "DMA",
     260             : #endif
     261             : #ifdef CONFIG_ZONE_DMA32
     262             :          "DMA32",
     263             : #endif
     264             :          "Normal",
     265             : #ifdef CONFIG_HIGHMEM
     266             :          "HighMem",
     267             : #endif
     268             :          "Movable",
     269             : #ifdef CONFIG_ZONE_DEVICE
     270             :          "Device",
     271             : #endif
     272             : };
     273             : 
     274             : const char * const migratetype_names[MIGRATE_TYPES] = {
     275             :         "Unmovable",
     276             :         "Movable",
     277             :         "Reclaimable",
     278             :         "HighAtomic",
     279             : #ifdef CONFIG_CMA
     280             :         "CMA",
     281             : #endif
     282             : #ifdef CONFIG_MEMORY_ISOLATION
     283             :         "Isolate",
     284             : #endif
     285             : };
     286             : 
     287             : static compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
     288             :         [NULL_COMPOUND_DTOR] = NULL,
     289             :         [COMPOUND_PAGE_DTOR] = free_compound_page,
     290             : #ifdef CONFIG_HUGETLB_PAGE
     291             :         [HUGETLB_PAGE_DTOR] = free_huge_page,
     292             : #endif
     293             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
     294             :         [TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
     295             : #endif
     296             : };
     297             : 
     298             : int min_free_kbytes = 1024;
     299             : int user_min_free_kbytes = -1;
     300             : static int watermark_boost_factor __read_mostly = 15000;
     301             : static int watermark_scale_factor = 10;
     302             : 
     303             : /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
     304             : int movable_zone;
     305             : EXPORT_SYMBOL(movable_zone);
     306             : 
     307             : #if MAX_NUMNODES > 1
     308             : unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
     309             : unsigned int nr_online_nodes __read_mostly = 1;
     310             : EXPORT_SYMBOL(nr_node_ids);
     311             : EXPORT_SYMBOL(nr_online_nodes);
     312             : #endif
     313             : 
     314             : static bool page_contains_unaccepted(struct page *page, unsigned int order);
     315             : static void accept_page(struct page *page, unsigned int order);
     316             : static bool try_to_accept_memory(struct zone *zone, unsigned int order);
     317             : static inline bool has_unaccepted_memory(void);
     318             : static bool __free_unaccepted(struct page *page);
     319             : 
     320             : int page_group_by_mobility_disabled __read_mostly;
     321             : 
     322             : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
     323             : /*
     324             :  * During boot we initialize deferred pages on-demand, as needed, but once
     325             :  * page_alloc_init_late() has finished, the deferred pages are all initialized,
     326             :  * and we can permanently disable that path.
     327             :  */
     328             : DEFINE_STATIC_KEY_TRUE(deferred_pages);
     329             : 
     330             : static inline bool deferred_pages_enabled(void)
     331             : {
     332             :         return static_branch_unlikely(&deferred_pages);
     333             : }
     334             : 
     335             : /*
     336             :  * deferred_grow_zone() is __init, but it is called from
     337             :  * get_page_from_freelist() during early boot until deferred_pages permanently
      338             :  * disables this call. This is why we have a __ref wrapper to avoid the
      339             :  * section mismatch warning and to ensure that the function body gets unloaded.
     340             :  */
     341             : static bool __ref
     342             : _deferred_grow_zone(struct zone *zone, unsigned int order)
     343             : {
     344             :        return deferred_grow_zone(zone, order);
     345             : }
     346             : #else
     347             : static inline bool deferred_pages_enabled(void)
     348             : {
     349             :         return false;
     350             : }
     351             : #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
     352             : 
     353             : /* Return a pointer to the bitmap storing bits affecting a block of pages */
     354             : static inline unsigned long *get_pageblock_bitmap(const struct page *page,
     355             :                                                         unsigned long pfn)
     356             : {
     357             : #ifdef CONFIG_SPARSEMEM
     358             :         return section_to_usemap(__pfn_to_section(pfn));
     359             : #else
     360         523 :         return page_zone(page)->pageblock_flags;
     361             : #endif /* CONFIG_SPARSEMEM */
     362             : }
     363             : 
     364             : static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
     365             : {
     366             : #ifdef CONFIG_SPARSEMEM
     367             :         pfn &= (PAGES_PER_SECTION-1);
     368             : #else
     369         523 :         pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn);
     370             : #endif /* CONFIG_SPARSEMEM */
     371         523 :         return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
     372             : }
     373             : 
     374             : static __always_inline
     375             : unsigned long __get_pfnblock_flags_mask(const struct page *page,
     376             :                                         unsigned long pfn,
     377             :                                         unsigned long mask)
     378             : {
     379             :         unsigned long *bitmap;
     380             :         unsigned long bitidx, word_bitidx;
     381             :         unsigned long word;
     382             : 
     383         522 :         bitmap = get_pageblock_bitmap(page, pfn);
     384         261 :         bitidx = pfn_to_bitidx(page, pfn);
     385         261 :         word_bitidx = bitidx / BITS_PER_LONG;
     386         261 :         bitidx &= (BITS_PER_LONG-1);
     387             :         /*
     388             :          * This races, without locks, with set_pfnblock_flags_mask(). Ensure
     389             :          * a consistent read of the memory array, so that results, even though
     390             :          * racy, are not corrupted.
     391             :          */
     392         261 :         word = READ_ONCE(bitmap[word_bitidx]);
     393         261 :         return (word >> bitidx) & mask;
     394             : }
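/*
 * Worked example (editorial): assuming a flat (non-SPARSEMEM) layout with
 * the zone starting at pfn 0, pageblock_order == 9 and
 * NR_PAGEBLOCK_BITS == 4, pfn 5120 lies in pageblock 5120 >> 9 = 10, so
 * bitidx = 10 * 4 = 40.  On a 64-bit machine word_bitidx = 40 / 64 = 0
 * and bitidx & (BITS_PER_LONG - 1) = 40, so the block's flags are read
 * as (bitmap[0] >> 40) & mask.
 */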
     395             : 
     396             : /**
     397             :  * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
     398             :  * @page: The page within the block of interest
     399             :  * @pfn: The target page frame number
     400             :  * @mask: mask of bits that the caller is interested in
     401             :  *
     402             :  * Return: pageblock_bits flags
     403             :  */
     404           0 : unsigned long get_pfnblock_flags_mask(const struct page *page,
     405             :                                         unsigned long pfn, unsigned long mask)
     406             : {
     407           2 :         return __get_pfnblock_flags_mask(page, pfn, mask);
     408             : }
     409             : 
     410             : static __always_inline int get_pfnblock_migratetype(const struct page *page,
     411             :                                         unsigned long pfn)
     412             : {
     413         259 :         return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
     414             : }
     415             : 
     416             : /**
     417             :  * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
     418             :  * @page: The page within the block of interest
     419             :  * @flags: The flags to set
     420             :  * @pfn: The target page frame number
     421             :  * @mask: mask of bits that the caller is interested in
     422             :  */
     423         262 : void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
     424             :                                         unsigned long pfn,
     425             :                                         unsigned long mask)
     426             : {
     427             :         unsigned long *bitmap;
     428             :         unsigned long bitidx, word_bitidx;
     429             :         unsigned long word;
     430             : 
     431             :         BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
     432             :         BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
     433             : 
     434         524 :         bitmap = get_pageblock_bitmap(page, pfn);
     435         262 :         bitidx = pfn_to_bitidx(page, pfn);
     436         262 :         word_bitidx = bitidx / BITS_PER_LONG;
     437         262 :         bitidx &= (BITS_PER_LONG-1);
     438             : 
     439             :         VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
     440             : 
     441         262 :         mask <<= bitidx;
     442         262 :         flags <<= bitidx;
     443             : 
     444         262 :         word = READ_ONCE(bitmap[word_bitidx]);
     445             :         do {
     446         786 :         } while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags));
     447         262 : }
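/*
 * Editorial note on the loop above: try_cmpxchg() performs a lock-free
 * read-modify-write of a single bitmap word.  On failure it updates
 * 'word' with the current value, so each retry recomputes
 * (word & ~mask) | flags against fresh data until the store succeeds
 * without clobbering concurrent updates to other pageblocks that share
 * the same word.
 */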
     448             : 
     449         262 : void set_pageblock_migratetype(struct page *page, int migratetype)
     450             : {
     451         262 :         if (unlikely(page_group_by_mobility_disabled &&
     452             :                      migratetype < MIGRATE_PCPTYPES))
     453           0 :                 migratetype = MIGRATE_UNMOVABLE;
     454             : 
     455         262 :         set_pfnblock_flags_mask(page, (unsigned long)migratetype,
     456         262 :                                 page_to_pfn(page), MIGRATETYPE_MASK);
     457         262 : }
     458             : 
     459             : #ifdef CONFIG_DEBUG_VM
     460             : static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
     461             : {
     462             :         int ret = 0;
     463             :         unsigned seq;
     464             :         unsigned long pfn = page_to_pfn(page);
     465             :         unsigned long sp, start_pfn;
     466             : 
     467             :         do {
     468             :                 seq = zone_span_seqbegin(zone);
     469             :                 start_pfn = zone->zone_start_pfn;
     470             :                 sp = zone->spanned_pages;
     471             :                 if (!zone_spans_pfn(zone, pfn))
     472             :                         ret = 1;
     473             :         } while (zone_span_seqretry(zone, seq));
     474             : 
     475             :         if (ret)
     476             :                 pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
     477             :                         pfn, zone_to_nid(zone), zone->name,
     478             :                         start_pfn, start_pfn + sp);
     479             : 
     480             :         return ret;
     481             : }
     482             : 
     483             : /*
     484             :  * Temporary debugging check for pages not lying within a given zone.
     485             :  */
     486             : static int __maybe_unused bad_range(struct zone *zone, struct page *page)
     487             : {
     488             :         if (page_outside_zone_boundaries(zone, page))
     489             :                 return 1;
     490             :         if (zone != page_zone(page))
     491             :                 return 1;
     492             : 
     493             :         return 0;
     494             : }
     495             : #else
     496             : static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
     497             : {
     498             :         return 0;
     499             : }
     500             : #endif
     501             : 
     502           0 : static void bad_page(struct page *page, const char *reason)
     503             : {
     504             :         static unsigned long resume;
     505             :         static unsigned long nr_shown;
     506             :         static unsigned long nr_unshown;
     507             : 
     508             :         /*
     509             :          * Allow a burst of 60 reports, then keep quiet for that minute;
     510             :          * or allow a steady drip of one report per second.
     511             :          */
     512           0 :         if (nr_shown == 60) {
     513           0 :                 if (time_before(jiffies, resume)) {
     514           0 :                         nr_unshown++;
     515           0 :                         goto out;
     516             :                 }
     517           0 :                 if (nr_unshown) {
     518           0 :                         pr_alert(
     519             :                               "BUG: Bad page state: %lu messages suppressed\n",
     520             :                                 nr_unshown);
     521           0 :                         nr_unshown = 0;
     522             :                 }
     523           0 :                 nr_shown = 0;
     524             :         }
     525           0 :         if (nr_shown++ == 0)
     526           0 :                 resume = jiffies + 60 * HZ;
     527             : 
     528           0 :         pr_alert("BUG: Bad page state in process %s  pfn:%05lx\n",
     529             :                 current->comm, page_to_pfn(page));
     530           0 :         dump_page(page, reason);
     531             : 
     532             :         print_modules();
     533           0 :         dump_stack();
     534             : out:
     535             :         /* Leave bad fields for debug, except PageBuddy could make trouble */
     536           0 :         page_mapcount_reset(page); /* remove PageBuddy */
     537           0 :         add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
     538           0 : }
     539             : 
     540             : static inline unsigned int order_to_pindex(int migratetype, int order)
     541             : {
     542         457 :         int base = order;
     543             : 
     544             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
     545             :         if (order > PAGE_ALLOC_COSTLY_ORDER) {
     546             :                 VM_BUG_ON(order != pageblock_order);
     547             :                 return NR_LOWORDER_PCP_LISTS;
     548             :         }
     549             : #else
     550             :         VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
     551             : #endif
     552             : 
     553         457 :         return (MIGRATE_PCPTYPES * base) + migratetype;
     554             : }
     555             : 
     556             : static inline int pindex_to_order(unsigned int pindex)
     557             : {
     558           0 :         int order = pindex / MIGRATE_PCPTYPES;
     559             : 
     560             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
     561             :         if (pindex == NR_LOWORDER_PCP_LISTS)
     562             :                 order = pageblock_order;
     563             : #else
     564             :         VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
     565             : #endif
     566             : 
     567             :         return order;
     568             : }
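/*
 * Worked example (editorial), assuming the typical values
 * MIGRATE_PCPTYPES == 3 and PAGE_ALLOC_COSTLY_ORDER == 3: an order-1
 * page of migratetype MIGRATE_MOVABLE (== 1) maps to pcp list index
 * 3 * 1 + 1 = 4, and pindex_to_order(4) recovers order 4 / 3 = 1.
 */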
     569             : 
     570             : static inline bool pcp_allowed_order(unsigned int order)
     571             : {
     572         441 :         if (order <= PAGE_ALLOC_COSTLY_ORDER)
     573             :                 return true;
     574             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
     575             :         if (order == pageblock_order)
     576             :                 return true;
     577             : #endif
     578             :         return false;
     579             : }
     580             : 
     581           0 : static inline void free_the_page(struct page *page, unsigned int order)
     582             : {
     583           0 :         if (pcp_allowed_order(order))           /* Via pcp? */
     584           0 :                 free_unref_page(page, order);
     585             :         else
     586           0 :                 __free_pages_ok(page, order, FPI_NONE);
     587           0 : }
     588             : 
     589             : /*
     590             :  * Higher-order pages are called "compound pages".  They are structured thusly:
     591             :  *
      592             :  * The first PAGE_SIZE page is called the "head page" and has PG_head set.
     593             :  *
     594             :  * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
      595             :  * in bit 0 of page->compound_head. The remaining bits are a pointer to the head page.
     596             :  *
      597             :  * The first tail page's ->compound_dtor holds the offset into the array of
      598             :  * compound page destructors. See compound_page_dtors.
     599             :  *
     600             :  * The first tail page's ->compound_order holds the order of allocation.
     601             :  * This usage means that zero-order pages may not be compound.
     602             :  */
     603             : 
     604           0 : void free_compound_page(struct page *page)
     605             : {
     606           0 :         mem_cgroup_uncharge(page_folio(page));
     607           0 :         free_the_page(page, compound_order(page));
     608           0 : }
     609             : 
     610          96 : void prep_compound_page(struct page *page, unsigned int order)
     611             : {
     612             :         int i;
     613          96 :         int nr_pages = 1 << order;
     614             : 
     615          96 :         __SetPageHead(page);
     616         238 :         for (i = 1; i < nr_pages; i++)
     617         142 :                 prep_compound_tail(page, i);
     618             : 
     619          96 :         prep_compound_head(page, order);
     620          96 : }
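/*
 * Editorial example: for an order-2 compound page, prep_compound_page()
 * touches 1 << 2 = 4 struct pages.  page[0] becomes the head (PG_head
 * set), page[1..3] become tails whose compound_head encodes a pointer
 * to page[0] with bit 0 set (PageTail()), and prep_compound_head()
 * records the order and destructor in the first tail page's fields, as
 * described in the comment block above.
 */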
     621             : 
     622           0 : void destroy_large_folio(struct folio *folio)
     623             : {
     624           0 :         enum compound_dtor_id dtor = folio->_folio_dtor;
     625             : 
     626             :         VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio);
     627           0 :         compound_page_dtors[dtor](&folio->page);
     628           0 : }
     629             : 
     630             : static inline void set_buddy_order(struct page *page, unsigned int order)
     631             : {
     632        1886 :         set_page_private(page, order);
     633         943 :         __SetPageBuddy(page);
     634             : }
     635             : 
     636             : #ifdef CONFIG_COMPACTION
     637         259 : static inline struct capture_control *task_capc(struct zone *zone)
     638             : {
     639         259 :         struct capture_control *capc = current->capture_control;
     640             : 
     641         259 :         return unlikely(capc) &&
     642           0 :                 !(current->flags & PF_KTHREAD) &&
     643           0 :                 !capc->page &&
     644         518 :                 capc->cc->zone == zone ? capc : NULL;
     645             : }
     646             : 
     647             : static inline bool
     648             : compaction_capture(struct capture_control *capc, struct page *page,
     649             :                    int order, int migratetype)
     650             : {
     651          11 :         if (!capc || order != capc->cc->order)
     652             :                 return false;
     653             : 
      654             :         /* Do not accidentally pollute CMA or isolated regions */
     655             :         if (is_migrate_cma(migratetype) ||
     656           0 :             is_migrate_isolate(migratetype))
     657             :                 return false;
     658             : 
     659             :         /*
     660             :          * Do not let lower order allocations pollute a movable pageblock.
     661             :          * This might let an unmovable request use a reclaimable pageblock
     662             :          * and vice-versa but no more than normal fallback logic which can
     663             :          * have trouble finding a high-order free page.
     664             :          */
     665           0 :         if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
     666             :                 return false;
     667             : 
     668           0 :         capc->page = page;
     669             :         return true;
     670             : }
     671             : 
     672             : #else
     673             : static inline struct capture_control *task_capc(struct zone *zone)
     674             : {
     675             :         return NULL;
     676             : }
     677             : 
     678             : static inline bool
     679             : compaction_capture(struct capture_control *capc, struct page *page,
     680             :                    int order, int migratetype)
     681             : {
     682             :         return false;
     683             : }
     684             : #endif /* CONFIG_COMPACTION */
     685             : 
     686             : /* Used for pages not on another list */
     687             : static inline void add_to_free_list(struct page *page, struct zone *zone,
     688             :                                     unsigned int order, int migratetype)
     689             : {
     690         684 :         struct free_area *area = &zone->free_area[order];
     691             : 
     692        1368 :         list_add(&page->buddy_list, &area->free_list[migratetype]);
     693         684 :         area->nr_free++;
     694             : }
     695             : 
     696             : /* Used for pages not on another list */
     697             : static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
     698             :                                          unsigned int order, int migratetype)
     699             : {
     700         259 :         struct free_area *area = &zone->free_area[order];
     701             : 
     702         518 :         list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
     703         259 :         area->nr_free++;
     704             : }
     705             : 
     706             : /*
     707             :  * Used for pages which are on another list. Move the pages to the tail
     708             :  * of the list - so the moved pages won't immediately be considered for
     709             :  * allocation again (e.g., optimization for memory onlining).
     710             :  */
     711             : static inline void move_to_free_list(struct page *page, struct zone *zone,
     712             :                                      unsigned int order, int migratetype)
     713             : {
     714           2 :         struct free_area *area = &zone->free_area[order];
     715             : 
     716           4 :         list_move_tail(&page->buddy_list, &area->free_list[migratetype]);
     717             : }
     718             : 
     719             : static inline void del_page_from_free_list(struct page *page, struct zone *zone,
     720             :                                            unsigned int order)
     721             : {
     722             :         /* clear reported state and update reported page count */
     723             :         if (page_reported(page))
     724             :                 __ClearPageReported(page);
     725             : 
     726        1354 :         list_del(&page->buddy_list);
     727         677 :         __ClearPageBuddy(page);
     728        1354 :         set_page_private(page, 0);
     729         677 :         zone->free_area[order].nr_free--;
     730             : }
     731             : 
     732             : static inline struct page *get_page_from_free_area(struct free_area *area,
     733             :                                             int migratetype)
     734             : {
     735        1384 :         return list_first_entry_or_null(&area->free_list[migratetype],
     736             :                                         struct page, buddy_list);
     737             : }
     738             : 
     739             : /*
     740             :  * If this is not the largest possible page, check if the buddy
     741             :  * of the next-highest order is free. If it is, it's possible
      742             :  * that pages are being freed that will coalesce soon. In case
     743             :  * that is happening, add the free page to the tail of the list
     744             :  * so it's less likely to be used soon and more likely to be merged
      745             :  * as a higher-order page.
     746             :  */
     747             : static inline bool
     748           0 : buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
     749             :                    struct page *page, unsigned int order)
     750             : {
     751             :         unsigned long higher_page_pfn;
     752             :         struct page *higher_page;
     753             : 
     754           0 :         if (order >= MAX_ORDER - 1)
     755             :                 return false;
     756             : 
     757           0 :         higher_page_pfn = buddy_pfn & pfn;
     758           0 :         higher_page = page + (higher_page_pfn - pfn);
     759             : 
     760           0 :         return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1,
     761           0 :                         NULL) != NULL;
     762             : }
     763             : 
     764             : /*
     765             :  * Freeing function for a buddy system allocator.
     766             :  *
     767             :  * The concept of a buddy system is to maintain direct-mapped table
     768             :  * (containing bit values) for memory blocks of various "orders".
     769             :  * The bottom level table contains the map for the smallest allocatable
     770             :  * units of memory (here, pages), and each level above it describes
     771             :  * pairs of units from the levels below, hence, "buddies".
     772             :  * At a high level, all that happens here is marking the table entry
     773             :  * at the bottom level available, and propagating the changes upward
     774             :  * as necessary, plus some accounting needed to play nicely with other
     775             :  * parts of the VM system.
      776             :  * At each level, we keep a list of pages, which are heads of contiguous
      777             :  * free pages of length (1 << order) and marked with PageBuddy.
     778             :  * Page's order is recorded in page_private(page) field.
     779             :  * So when we are allocating or freeing one, we can derive the state of the
     780             :  * other.  That is, if we allocate a small block, and both were
     781             :  * free, the remainder of the region must be split into blocks.
     782             :  * If a block is freed, and its buddy is also free, then this
     783             :  * triggers coalescing into a block of larger size.
     784             :  *
     785             :  * -- nyc
     786             :  */
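/*
 * Worked example (editorial): the order-n buddy of the block at 'pfn' is
 * found by flipping bit n of the pfn, i.e. pfn ^ (1 << n).  At order 0
 * the buddy of pfn 8 is 9; if both are free they merge into the order-1
 * block at 8 & 9 = 8, whose own buddy is 8 ^ (1 << 1) = 10, and so on
 * until a buddy is missing or the maximum order is reached.
 */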
     787             : 
     788         259 : static inline void __free_one_page(struct page *page,
     789             :                 unsigned long pfn,
     790             :                 struct zone *zone, unsigned int order,
     791             :                 int migratetype, fpi_t fpi_flags)
     792             : {
     793         259 :         struct capture_control *capc = task_capc(zone);
     794         259 :         unsigned long buddy_pfn = 0;
     795             :         unsigned long combined_pfn;
     796             :         struct page *buddy;
     797             :         bool to_tail;
     798             : 
     799             :         VM_BUG_ON(!zone_is_initialized(zone));
     800             :         VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
     801             : 
     802             :         VM_BUG_ON(migratetype == -1);
     803         259 :         if (likely(!is_migrate_isolate(migratetype)))
     804         259 :                 __mod_zone_freepage_state(zone, 1 << order, migratetype);
     805             : 
     806             :         VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
     807             :         VM_BUG_ON_PAGE(bad_range(zone, page), page);
     808             : 
     809         259 :         while (order < MAX_ORDER) {
     810          22 :                 if (compaction_capture(capc, page, order, migratetype)) {
     811           0 :                         __mod_zone_freepage_state(zone, -(1 << order),
     812             :                                                                 migratetype);
     813           0 :                         return;
     814             :                 }
     815             : 
     816          11 :                 buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
     817          11 :                 if (!buddy)
     818             :                         goto done_merging;
     819             : 
     820             :                 if (unlikely(order >= pageblock_order)) {
     821             :                         /*
     822             :                          * We want to prevent merge between freepages on pageblock
     823             :                          * without fallbacks and normal pageblock. Without this,
     824             :                          * pageblock isolation could cause incorrect freepage or CMA
     825             :                          * accounting or HIGHATOMIC accounting.
     826             :                          */
     827             :                         int buddy_mt = get_pageblock_migratetype(buddy);
     828             : 
     829             :                         if (migratetype != buddy_mt
     830             :                                         && (!migratetype_is_mergeable(migratetype) ||
     831             :                                                 !migratetype_is_mergeable(buddy_mt)))
     832             :                                 goto done_merging;
     833             :                 }
     834             : 
     835             :                 /*
     836             :                  * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
     837             :                  * merge with it and move up one order.
     838             :                  */
     839             :                 if (page_is_guard(buddy))
     840             :                         clear_page_guard(zone, buddy, order, migratetype);
     841             :                 else
     842             :                         del_page_from_free_list(buddy, zone, order);
     843           0 :                 combined_pfn = buddy_pfn & pfn;
     844           0 :                 page = page + (combined_pfn - pfn);
     845           0 :                 pfn = combined_pfn;
     846           0 :                 order++;
     847             :         }
     848             : 
     849             : done_merging:
     850         259 :         set_buddy_order(page, order);
     851             : 
     852         259 :         if (fpi_flags & FPI_TO_TAIL)
     853             :                 to_tail = true;
     854           0 :         else if (is_shuffle_order(order))
     855             :                 to_tail = shuffle_pick_tail();
     856             :         else
     857           0 :                 to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
     858             : 
     859         259 :         if (to_tail)
     860             :                 add_to_free_list_tail(page, zone, order, migratetype);
     861             :         else
     862             :                 add_to_free_list(page, zone, order, migratetype);
     863             : 
     864             :         /* Notify page reporting subsystem of freed page */
     865             :         if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
     866             :                 page_reporting_notify_free(order);
     867             : }
     868             : 
     869             : /**
     870             :  * split_free_page() -- split a free page at split_pfn_offset
     871             :  * @free_page:          the original free page
     872             :  * @order:              the order of the page
     873             :  * @split_pfn_offset:   split offset within the page
     874             :  *
     875             :  * Return -ENOENT if the free page is changed, otherwise 0
     876             :  *
     877             :  * It is used when the free page crosses two pageblocks with different migratetypes
     878             :  * at split_pfn_offset within the page. The split free page will be put into
     879             :  * separate migratetype lists afterwards. Otherwise, the function achieves
     880             :  * nothing.
     881             :  */
     882           0 : int split_free_page(struct page *free_page,
     883             :                         unsigned int order, unsigned long split_pfn_offset)
     884             : {
     885           0 :         struct zone *zone = page_zone(free_page);
     886           0 :         unsigned long free_page_pfn = page_to_pfn(free_page);
     887             :         unsigned long pfn;
     888             :         unsigned long flags;
     889             :         int free_page_order;
     890             :         int mt;
     891           0 :         int ret = 0;
     892             : 
     893           0 :         if (split_pfn_offset == 0)
     894             :                 return ret;
     895             : 
     896           0 :         spin_lock_irqsave(&zone->lock, flags);
     897             : 
     898           0 :         if (!PageBuddy(free_page) || buddy_order(free_page) != order) {
     899             :                 ret = -ENOENT;
     900             :                 goto out;
     901             :         }
     902             : 
     903           0 :         mt = get_pageblock_migratetype(free_page);
     904           0 :         if (likely(!is_migrate_isolate(mt)))
     905           0 :                 __mod_zone_freepage_state(zone, -(1UL << order), mt);
     906             : 
     907           0 :         del_page_from_free_list(free_page, zone, order);
     908           0 :         for (pfn = free_page_pfn;
     909           0 :              pfn < free_page_pfn + (1UL << order);) {
     910           0 :                 int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
     911             : 
     912           0 :                 free_page_order = min_t(unsigned int,
     913             :                                         pfn ? __ffs(pfn) : order,
     914             :                                         __fls(split_pfn_offset));
     915           0 :                 __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
     916             :                                 mt, FPI_NONE);
     917           0 :                 pfn += 1UL << free_page_order;
     918           0 :                 split_pfn_offset -= (1UL << free_page_order);
     919             :                 /* we have done the first part, now switch to second part */
     920           0 :                 if (split_pfn_offset == 0)
     921           0 :                         split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
     922             :         }
     923             : out:
     924           0 :         spin_unlock_irqrestore(&zone->lock, flags);
     925           0 :         return ret;
     926             : }
     927             : /*
      928             :  * A page can be bad because of any of several fields. Instead of testing them
      929             :  * with multiple branches, check them all with a single combined test. The
      930             :  * caller must do a detailed check if necessary.
     931             :  */
     932             : static inline bool page_expected_state(struct page *page,
     933             :                                         unsigned long check_flags)
     934             : {
     935           0 :         if (unlikely(atomic_read(&page->_mapcount) != -1))
     936             :                 return false;
     937             : 
     938           0 :         if (unlikely((unsigned long)page->mapping |
     939             :                         page_ref_count(page) |
     940             : #ifdef CONFIG_MEMCG
     941             :                         page->memcg_data |
     942             : #endif
     943             :                         (page->flags & check_flags)))
     944             :                 return false;
     945             : 
     946             :         return true;
     947             : }
     948             : 
     949             : static const char *page_bad_reason(struct page *page, unsigned long flags)
     950             : {
     951           0 :         const char *bad_reason = NULL;
     952             : 
     953           0 :         if (unlikely(atomic_read(&page->_mapcount) != -1))
     954           0 :                 bad_reason = "nonzero mapcount";
     955           0 :         if (unlikely(page->mapping != NULL))
     956           0 :                 bad_reason = "non-NULL mapping";
     957           0 :         if (unlikely(page_ref_count(page) != 0))
     958           0 :                 bad_reason = "nonzero _refcount";
     959           0 :         if (unlikely(page->flags & flags)) {
     960             :                 if (flags == PAGE_FLAGS_CHECK_AT_PREP)
     961             :                         bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
     962             :                 else
     963           0 :                         bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
     964             :         }
     965             : #ifdef CONFIG_MEMCG
     966             :         if (unlikely(page->memcg_data))
     967             :                 bad_reason = "page still charged to cgroup";
     968             : #endif
     969             :         return bad_reason;
     970             : }
     971             : 
     972           0 : static void free_page_is_bad_report(struct page *page)
     973             : {
     974           0 :         bad_page(page,
     975             :                  page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
     976           0 : }
     977             : 
     978           0 : static inline bool free_page_is_bad(struct page *page)
     979             : {
     980           0 :         if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
     981             :                 return false;
     982             : 
     983             :         /* Something has gone sideways, find it */
     984           0 :         free_page_is_bad_report(page);
     985           0 :         return true;
     986             : }
     987             : 
     988           0 : static inline bool is_check_pages_enabled(void)
     989             : {
     990      255250 :         return static_branch_unlikely(&check_pages_enabled);
     991             : }
     992             : 
     993           0 : static int free_tail_page_prepare(struct page *head_page, struct page *page)
     994             : {
     995           0 :         struct folio *folio = (struct folio *)head_page;
     996           0 :         int ret = 1;
     997             : 
     998             :         /*
      999             :          * We rely on page->lru.next never having bit 0 set, unless the page
    1000             :          * is PageTail(). Let's make sure that's true even for poisoned ->lru.
    1001             :          */
    1002             :         BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
    1003             : 
    1004           0 :         if (!is_check_pages_enabled()) {
    1005             :                 ret = 0;
    1006             :                 goto out;
    1007             :         }
    1008           0 :         switch (page - head_page) {
    1009             :         case 1:
    1010             :                 /* the first tail page: these may be in place of ->mapping */
    1011           0 :                 if (unlikely(folio_entire_mapcount(folio))) {
    1012           0 :                         bad_page(page, "nonzero entire_mapcount");
    1013           0 :                         goto out;
    1014             :                 }
    1015           0 :                 if (unlikely(atomic_read(&folio->_nr_pages_mapped))) {
    1016           0 :                         bad_page(page, "nonzero nr_pages_mapped");
    1017           0 :                         goto out;
    1018             :                 }
    1019           0 :                 if (unlikely(atomic_read(&folio->_pincount))) {
    1020           0 :                         bad_page(page, "nonzero pincount");
    1021           0 :                         goto out;
    1022             :                 }
    1023             :                 break;
    1024             :         case 2:
    1025             :                 /*
    1026             :                  * the second tail page: ->mapping is
    1027             :                  * deferred_list.next -- ignore value.
    1028             :                  */
    1029             :                 break;
    1030             :         default:
    1031           0 :                 if (page->mapping != TAIL_MAPPING) {
    1032           0 :                         bad_page(page, "corrupted mapping in tail page");
    1033           0 :                         goto out;
    1034             :                 }
    1035             :                 break;
    1036             :         }
    1037           0 :         if (unlikely(!PageTail(page))) {
    1038           0 :                 bad_page(page, "PageTail not set");
    1039           0 :                 goto out;
    1040             :         }
    1041           0 :         if (unlikely(compound_head(page) != head_page)) {
    1042           0 :                 bad_page(page, "compound_head not consistent");
    1043           0 :                 goto out;
    1044             :         }
    1045             :         ret = 0;
    1046             : out:
    1047           0 :         page->mapping = NULL;
    1048           0 :         clear_compound_head(page);
    1049           0 :         return ret;
    1050             : }
    1051             : 
    1052             : /*
    1053             :  * Skip KASAN memory poisoning when either:
    1054             :  *
    1055             :  * 1. For generic KASAN: deferred memory initialization has not yet completed.
    1056             :  *    Tag-based KASAN modes skip pages freed via deferred memory initialization
    1057             :  *    using page tags instead (see below).
    1058             :  * 2. For tag-based KASAN modes: the page has a match-all KASAN tag, indicating
    1059             :  *    that error detection is disabled for accesses via the page address.
    1060             :  *
    1061             :  * Pages will have match-all tags in the following circumstances:
    1062             :  *
    1063             :  * 1. Pages are being initialized for the first time, including during deferred
    1064             :  *    memory init; see the call to page_kasan_tag_reset in __init_single_page.
    1065             :  * 2. The allocation was not unpoisoned due to __GFP_SKIP_KASAN, with the
    1066             :  *    exception of pages unpoisoned by kasan_unpoison_vmalloc.
    1067             :  * 3. The allocation was excluded from being checked due to sampling,
    1068             :  *    see the call to kasan_unpoison_pages.
    1069             :  *
    1070             :  * Poisoning pages during deferred memory init will greatly lengthen the
     1071             :  * process and cause problems on large memory systems, as the deferred pages
     1072             :  * initialization is done with interrupts disabled.
    1073             :  *
    1074             :  * Assuming that there will be no reference to those newly initialized
    1075             :  * pages before they are ever allocated, this should have no effect on
    1076             :  * KASAN memory tracking as the poison will be properly inserted at page
    1077             :  * allocation time. The only corner case is when pages are allocated by
    1078             :  * on-demand allocation and then freed again before the deferred pages
    1079             :  * initialization is done, but this is not likely to happen.
    1080             :  */
    1081             : static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
    1082             : {
    1083             :         if (IS_ENABLED(CONFIG_KASAN_GENERIC))
    1084             :                 return deferred_pages_enabled();
    1085             : 
    1086         259 :         return page_kasan_tag(page) == 0xff;
    1087             : }
    1088             : 
    1089           0 : static void kernel_init_pages(struct page *page, int numpages)
    1090             : {
    1091             :         int i;
    1092             : 
    1093             :         /* s390's use of memset() could override KASAN redzones. */
    1094             :         kasan_disable_current();
    1095          74 :         for (i = 0; i < numpages; i++)
    1096          74 :                 clear_highpage_kasan_tagged(page + i);
    1097             :         kasan_enable_current();
    1098           0 : }
    1099             : 
    1100             : static __always_inline bool free_pages_prepare(struct page *page,
    1101             :                         unsigned int order, fpi_t fpi_flags)
    1102             : {
    1103         259 :         int bad = 0;
    1104         518 :         bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
    1105         259 :         bool init = want_init_on_free();
    1106             : 
    1107             :         VM_BUG_ON_PAGE(PageTail(page), page);
    1108             : 
    1109         259 :         trace_mm_page_free(page, order);
    1110         259 :         kmsan_free_page(page, order);
    1111             : 
    1112         259 :         if (unlikely(PageHWPoison(page)) && !order) {
    1113             :                 /*
    1114             :                  * Do not let hwpoison pages hit pcplists/buddy
    1115             :                  * Untie memcg state and reset page's owner
    1116             :                  */
    1117             :                 if (memcg_kmem_online() && PageMemcgKmem(page))
    1118             :                         __memcg_kmem_uncharge_page(page, order);
    1119             :                 reset_page_owner(page, order);
    1120             :                 page_table_check_free(page, order);
    1121             :                 return false;
    1122             :         }
    1123             : 
    1124             :         /*
    1125             :          * Check tail pages before head page information is cleared to
    1126             :          * avoid checking PageCompound for order-0 pages.
    1127             :          */
    1128         259 :         if (unlikely(order)) {
    1129         258 :                 bool compound = PageCompound(page);
    1130             :                 int i;
    1131             : 
    1132             :                 VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
    1133             : 
    1134             :                 if (compound)
    1135             :                         ClearPageHasHWPoisoned(page);
    1136      254486 :                 for (i = 1; i < (1 << order); i++) {
    1137      254486 :                         if (compound)
    1138           0 :                                 bad += free_tail_page_prepare(page, page + i);
    1139      254486 :                         if (is_check_pages_enabled()) {
    1140           0 :                                 if (free_page_is_bad(page + i)) {
    1141           0 :                                         bad++;
    1142           0 :                                         continue;
    1143             :                                 }
    1144             :                         }
    1145      254486 :                         (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
    1146             :                 }
    1147             :         }
    1148         259 :         if (PageMappingFlags(page))
    1149           0 :                 page->mapping = NULL;
    1150             :         if (memcg_kmem_online() && PageMemcgKmem(page))
    1151             :                 __memcg_kmem_uncharge_page(page, order);
    1152         259 :         if (is_check_pages_enabled()) {
    1153           0 :                 if (free_page_is_bad(page))
    1154           0 :                         bad++;
    1155           0 :                 if (bad)
    1156             :                         return false;
    1157             :         }
    1158             : 
    1159         259 :         page_cpupid_reset_last(page);
    1160         259 :         page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
    1161             :         reset_page_owner(page, order);
    1162         259 :         page_table_check_free(page, order);
    1163             : 
    1164         259 :         if (!PageHighMem(page)) {
    1165             :                 debug_check_no_locks_freed(page_address(page),
    1166             :                                            PAGE_SIZE << order);
    1167             :                 debug_check_no_obj_freed(page_address(page),
    1168             :                                            PAGE_SIZE << order);
    1169             :         }
    1170             : 
    1171         259 :         kernel_poison_pages(page, 1 << order);
    1172             : 
    1173             :         /*
    1174             :          * As memory initialization might be integrated into KASAN,
    1175             :          * KASAN poisoning and memory initialization code must be
    1176             :          * kept together to avoid discrepancies in behavior.
    1177             :          *
    1178             :          * With hardware tag-based KASAN, memory tags must be set before the
    1179             :          * page becomes unavailable via debug_pagealloc or arch_free_page.
    1180             :          */
    1181             :         if (!skip_kasan_poison) {
    1182             :                 kasan_poison_pages(page, order, init);
    1183             : 
    1184             :                 /* Memory is already initialized if KASAN did it internally. */
    1185             :                 if (kasan_has_integrated_init())
    1186             :                         init = false;
    1187             :         }
    1188         259 :         if (init)
    1189           0 :                 kernel_init_pages(page, 1 << order);
    1190             : 
    1191             :         /*
    1192             :          * arch_free_page() can make the page's contents inaccessible.  s390
    1193             :          * does this.  So nothing which can access the page's contents should
    1194             :          * happen after this.
    1195             :          */
    1196             :         arch_free_page(page, order);
    1197             : 
    1198             :         debug_pagealloc_unmap_pages(page, 1 << order);
    1199             : 
    1200             :         return true;
    1201             : }
    1202             : 
    1203             : /*
    1204             :  * Frees a number of pages from the PCP lists
    1205             :  * Assumes all pages on list are in same zone.
    1206             :  * count is the number of pages to free.
    1207             :  */
    1208           0 : static void free_pcppages_bulk(struct zone *zone, int count,
    1209             :                                         struct per_cpu_pages *pcp,
    1210             :                                         int pindex)
    1211             : {
    1212             :         unsigned long flags;
    1213           0 :         int min_pindex = 0;
    1214           0 :         int max_pindex = NR_PCP_LISTS - 1;
    1215             :         unsigned int order;
    1216             :         bool isolated_pageblocks;
    1217             :         struct page *page;
    1218             : 
    1219             :         /*
     1220             :          * Ensure a proper count is passed; otherwise we would get stuck in
     1221             :          * the below while (list_empty(list)) loop.
    1222             :          */
    1223           0 :         count = min(pcp->count, count);
    1224             : 
    1225             :         /* Ensure requested pindex is drained first. */
    1226           0 :         pindex = pindex - 1;
    1227             : 
    1228           0 :         spin_lock_irqsave(&zone->lock, flags);
    1229           0 :         isolated_pageblocks = has_isolate_pageblock(zone);
    1230             : 
    1231           0 :         while (count > 0) {
    1232             :                 struct list_head *list;
    1233             :                 int nr_pages;
    1234             : 
    1235             :                 /* Remove pages from lists in a round-robin fashion. */
    1236             :                 do {
    1237           0 :                         if (++pindex > max_pindex)
    1238           0 :                                 pindex = min_pindex;
    1239           0 :                         list = &pcp->lists[pindex];
    1240           0 :                         if (!list_empty(list))
    1241             :                                 break;
    1242             : 
    1243           0 :                         if (pindex == max_pindex)
    1244           0 :                                 max_pindex--;
    1245           0 :                         if (pindex == min_pindex)
    1246           0 :                                 min_pindex++;
    1247             :                 } while (1);
    1248             : 
    1249           0 :                 order = pindex_to_order(pindex);
    1250           0 :                 nr_pages = 1 << order;
    1251             :                 do {
    1252             :                         int mt;
    1253             : 
    1254           0 :                         page = list_last_entry(list, struct page, pcp_list);
    1255           0 :                         mt = get_pcppage_migratetype(page);
    1256             : 
    1257             :                         /* must delete to avoid corrupting pcp list */
    1258           0 :                         list_del(&page->pcp_list);
    1259           0 :                         count -= nr_pages;
    1260           0 :                         pcp->count -= nr_pages;
    1261             : 
    1262             :                         /* MIGRATE_ISOLATE page should not go to pcplists */
    1263             :                         VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
    1264             :                         /* Pageblock could have been isolated meanwhile */
    1265             :                         if (unlikely(isolated_pageblocks))
    1266             :                                 mt = get_pageblock_migratetype(page);
    1267             : 
    1268           0 :                         __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);
    1269           0 :                         trace_mm_page_pcpu_drain(page, order, mt);
    1270           0 :                 } while (count > 0 && !list_empty(list));
    1271             :         }
    1272             : 
    1273           0 :         spin_unlock_irqrestore(&zone->lock, flags);
    1274           0 : }
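/*
 * Editor's sketch: a standalone userspace model (not kernel code) of the
 * round-robin drain in free_pcppages_bulk() above.  It shows how pindex
 * cycles through the per-cpu lists and how the [min_pindex, max_pindex]
 * window shrinks as lists are found empty.  NR_LISTS, the per-list counts
 * and the starting pindex are assumed example values.
 */
#include <stdio.h>

#define NR_LISTS 4

int main(void)
{
	int list_count[NR_LISTS] = { 3, 0, 2, 1 };	/* pages left per list */
	int count = 6;		/* pages to drain; never more than the total */
	int min_pindex = 0, max_pindex = NR_LISTS - 1;
	int pindex = 2 - 1;	/* the caller asked to drain list 2 first */

	while (count > 0) {
		/* pick the next non-empty list, shrinking the window */
		do {
			if (++pindex > max_pindex)
				pindex = min_pindex;
			if (list_count[pindex])
				break;
			if (pindex == max_pindex)
				max_pindex--;
			if (pindex == min_pindex)
				min_pindex++;
		} while (1);

		/* "free" one page from that list (orders are ignored here) */
		list_count[pindex]--;
		count--;
		printf("drained one page from list %d\n", pindex);
	}
	return 0;
}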
    1275             : 
    1276           0 : static void free_one_page(struct zone *zone,
    1277             :                                 struct page *page, unsigned long pfn,
    1278             :                                 unsigned int order,
    1279             :                                 int migratetype, fpi_t fpi_flags)
    1280             : {
    1281             :         unsigned long flags;
    1282             : 
    1283           0 :         spin_lock_irqsave(&zone->lock, flags);
    1284           0 :         if (unlikely(has_isolate_pageblock(zone) ||
    1285             :                 is_migrate_isolate(migratetype))) {
    1286             :                 migratetype = get_pfnblock_migratetype(page, pfn);
    1287             :         }
    1288           0 :         __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
    1289           0 :         spin_unlock_irqrestore(&zone->lock, flags);
    1290           0 : }
    1291             : 
    1292         259 : static void __free_pages_ok(struct page *page, unsigned int order,
    1293             :                             fpi_t fpi_flags)
    1294             : {
    1295             :         unsigned long flags;
    1296             :         int migratetype;
    1297         259 :         unsigned long pfn = page_to_pfn(page);
    1298         259 :         struct zone *zone = page_zone(page);
    1299             : 
    1300         259 :         if (!free_pages_prepare(page, order, fpi_flags))
    1301             :                 return;
    1302             : 
    1303             :         /*
     1304             :          * Call get_pfnblock_migratetype() without spin_lock_irqsave() here,
     1305             :          * so that the migratetype lookup is not done under the zone lock.
     1306             :          * This reduces the lock holding time.
    1307             :          */
    1308         259 :         migratetype = get_pfnblock_migratetype(page, pfn);
    1309             : 
    1310         259 :         spin_lock_irqsave(&zone->lock, flags);
    1311             :         if (unlikely(has_isolate_pageblock(zone) ||
    1312             :                 is_migrate_isolate(migratetype))) {
    1313             :                 migratetype = get_pfnblock_migratetype(page, pfn);
    1314             :         }
    1315         259 :         __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
    1316         518 :         spin_unlock_irqrestore(&zone->lock, flags);
    1317             : 
    1318         259 :         __count_vm_events(PGFREE, 1 << order);
    1319             : }
    1320             : 
    1321         259 : void __free_pages_core(struct page *page, unsigned int order)
    1322             : {
    1323         259 :         unsigned int nr_pages = 1 << order;
    1324         259 :         struct page *p = page;
    1325             :         unsigned int loop;
    1326             : 
    1327             :         /*
    1328             :          * When initializing the memmap, __init_single_page() sets the refcount
    1329             :          * of all pages to 1 ("allocated"/"not free"). We have to set the
    1330             :          * refcount of all involved pages to 0.
    1331             :          */
    1332         259 :         prefetchw(p);
    1333      254745 :         for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
    1334      254486 :                 prefetchw(p + 1);
    1335      254486 :                 __ClearPageReserved(p);
    1336      254486 :                 set_page_count(p, 0);
    1337             :         }
    1338         259 :         __ClearPageReserved(p);
    1339         259 :         set_page_count(p, 0);
    1340             : 
    1341         518 :         atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
    1342             : 
    1343         259 :         if (page_contains_unaccepted(page, order)) {
    1344             :                 if (order == MAX_ORDER && __free_unaccepted(page))
    1345             :                         return;
    1346             : 
    1347             :                 accept_page(page, order);
    1348             :         }
    1349             : 
    1350             :         /*
    1351             :          * Bypass PCP and place fresh pages right to the tail, primarily
    1352             :          * relevant for memory onlining.
    1353             :          */
    1354         259 :         __free_pages_ok(page, order, FPI_TO_TAIL);
    1355             : }
    1356             : 
    1357             : /*
     1358             :  * Check that a whole pageblock (or a subset of one) given by the interval
     1359             :  * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
     1360             :  * with the migration or free compaction scanner.
    1361             :  *
    1362             :  * Return struct page pointer of start_pfn, or NULL if checks were not passed.
    1363             :  *
    1364             :  * It's possible on some configurations to have a setup like node0 node1 node0
     1365             :  * i.e. it's possible that all pages within a zone's range of pages do not
    1366             :  * belong to a single zone. We assume that a border between node0 and node1
    1367             :  * can occur within a single pageblock, but not a node0 node1 node0
    1368             :  * interleaving within a single pageblock. It is therefore sufficient to check
    1369             :  * the first and last page of a pageblock and avoid checking each individual
    1370             :  * page in a pageblock.
    1371             :  *
    1372             :  * Note: the function may return non-NULL struct page even for a page block
    1373             :  * which contains a memory hole (i.e. there is no physical memory for a subset
     1374             :  * of the pfn range). For example, if the pageblock order is MAX_ORDER, the block
     1375             :  * will fall into 2 sub-sections, and the end pfn of the pageblock may be in a hole
    1376             :  * even though the start pfn is online and valid. This should be safe most of
    1377             :  * the time because struct pages are still initialized via init_unavailable_range()
    1378             :  * and pfn walkers shouldn't touch any physical memory range for which they do
    1379             :  * not recognize any specific metadata in struct pages.
    1380             :  */
    1381         260 : struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
    1382             :                                      unsigned long end_pfn, struct zone *zone)
    1383             : {
    1384             :         struct page *start_page;
    1385             :         struct page *end_page;
    1386             : 
    1387             :         /* end_pfn is one past the range we are checking */
    1388         260 :         end_pfn--;
    1389             : 
    1390         260 :         if (!pfn_valid(end_pfn))
    1391             :                 return NULL;
    1392             : 
    1393         520 :         start_page = pfn_to_online_page(start_pfn);
    1394         260 :         if (!start_page)
    1395             :                 return NULL;
    1396             : 
    1397         260 :         if (page_zone(start_page) != zone)
    1398             :                 return NULL;
    1399             : 
    1400         260 :         end_page = pfn_to_page(end_pfn);
    1401             : 
    1402             :         /* This gives a shorter code than deriving page_zone(end_page) */
    1403         780 :         if (page_zone_id(start_page) != page_zone_id(end_page))
    1404             :                 return NULL;
    1405             : 
    1406         260 :         return start_page;
    1407             : }
    1408             : 
    1409             : /*
    1410             :  * The order of subdivision here is critical for the IO subsystem.
    1411             :  * Please do not alter this order without good reasons and regression
    1412             :  * testing. Specifically, as large blocks of memory are subdivided,
    1413             :  * the order in which smaller blocks are delivered depends on the order
    1414             :  * they're subdivided in this function. This is the primary factor
    1415             :  * influencing the order in which pages are delivered to the IO
    1416             :  * subsystem according to empirical testing, and this is also justified
    1417             :  * by considering the behavior of a buddy system containing a single
    1418             :  * large block of memory acted on by a series of small allocations.
    1419             :  * This behavior is a critical factor in sglist merging's success.
    1420             :  *
    1421             :  * -- nyc
    1422             :  */
    1423             : static inline void expand(struct zone *zone, struct page *page,
    1424             :         int low, int high, int migratetype)
    1425             : {
    1426         677 :         unsigned long size = 1 << high;
    1427             : 
    1428        1361 :         while (high > low) {
    1429         684 :                 high--;
    1430         684 :                 size >>= 1;
    1431             :                 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
    1432             : 
    1433             :                 /*
     1434             :                  * Mark as guard pages (or page), which allows them to be
     1435             :                  * merged back into the allocator when the buddy is freed.
     1436             :                  * Corresponding page table entries will not be touched,
     1437             :                  * pages will stay not present in the virtual address space.
    1438             :                  */
    1439         684 :                 if (set_page_guard(zone, &page[size], high, migratetype))
    1440             :                         continue;
    1441             : 
    1442        1368 :                 add_to_free_list(&page[size], zone, high, migratetype);
    1443         684 :                 set_buddy_order(&page[size], high);
    1444             :         }
    1445             : }
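/*
 * Editor's sketch: a standalone worked example (not kernel code) of the
 * subdivision performed by expand() above.  Splitting an order-3 block
 * (8 pages) to satisfy an order-0 request returns buddies of order 2, 1
 * and 0 to the free lists; guard-page handling is omitted and the orders
 * are assumed example values.
 */
#include <stdio.h>

int main(void)
{
	unsigned int low = 0, high = 3;		/* requested and found order */
	unsigned long size = 1UL << high;

	while (high > low) {
		high--;
		size >>= 1;
		printf("free buddy at page offset %lu, order %u (%lu pages)\n",
		       size, high, size);
	}
	printf("the page at offset 0 is handed out at order %u\n", low);
	return 0;
}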
    1446             : 
    1447           0 : static void check_new_page_bad(struct page *page)
    1448             : {
    1449             :         if (unlikely(page->flags & __PG_HWPOISON)) {
    1450             :                 /* Don't complain about hwpoisoned pages */
    1451             :                 page_mapcount_reset(page); /* remove PageBuddy */
    1452             :                 return;
    1453             :         }
    1454             : 
    1455           0 :         bad_page(page,
    1456             :                  page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
    1457             : }
    1458             : 
    1459             : /*
    1460             :  * This page is about to be returned from the page allocator
    1461             :  */
    1462           0 : static int check_new_page(struct page *page)
    1463             : {
    1464           0 :         if (likely(page_expected_state(page,
    1465             :                                 PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
    1466             :                 return 0;
    1467             : 
    1468           0 :         check_new_page_bad(page);
    1469           0 :         return 1;
    1470             : }
    1471             : 
    1472         505 : static inline bool check_new_pages(struct page *page, unsigned int order)
    1473             : {
    1474         505 :         if (is_check_pages_enabled()) {
    1475           0 :                 for (int i = 0; i < (1 << order); i++) {
    1476           0 :                         struct page *p = page + i;
    1477             : 
    1478           0 :                         if (check_new_page(p))
    1479             :                                 return true;
    1480             :                 }
    1481             :         }
    1482             : 
    1483             :         return false;
    1484             : }
    1485             : 
    1486             : static inline bool should_skip_kasan_unpoison(gfp_t flags)
    1487             : {
    1488             :         /* Don't skip if a software KASAN mode is enabled. */
    1489             :         if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
    1490             :             IS_ENABLED(CONFIG_KASAN_SW_TAGS))
    1491             :                 return false;
    1492             : 
     1493             :         /* Skip if hardware tag-based KASAN is not enabled. */
    1494             :         if (!kasan_hw_tags_enabled())
    1495             :                 return true;
    1496             : 
    1497             :         /*
    1498             :          * With hardware tag-based KASAN enabled, skip if this has been
    1499             :          * requested via __GFP_SKIP_KASAN.
    1500             :          */
    1501             :         return flags & __GFP_SKIP_KASAN;
    1502             : }
    1503             : 
    1504             : static inline bool should_skip_init(gfp_t flags)
    1505             : {
     1506             :         /* Don't skip if hardware tag-based KASAN is not enabled. */
    1507             :         if (!kasan_hw_tags_enabled())
    1508             :                 return false;
    1509             : 
    1510             :         /* For hardware tag-based KASAN, skip if requested. */
    1511             :         return (flags & __GFP_SKIP_ZERO);
    1512             : }
    1513             : 
    1514         505 : inline void post_alloc_hook(struct page *page, unsigned int order,
    1515             :                                 gfp_t gfp_flags)
    1516             : {
    1517        1010 :         bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
    1518             :                         !should_skip_init(gfp_flags);
    1519         505 :         bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS);
    1520             :         int i;
    1521             : 
    1522        1010 :         set_page_private(page, 0);
    1523         505 :         set_page_refcounted(page);
    1524             : 
    1525         505 :         arch_alloc_page(page, order);
    1526         505 :         debug_pagealloc_map_pages(page, 1 << order);
    1527             : 
    1528             :         /*
    1529             :          * Page unpoisoning must happen before memory initialization.
    1530             :          * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
    1531             :          * allocations and the page unpoisoning code will complain.
    1532             :          */
    1533         505 :         kernel_unpoison_pages(page, 1 << order);
    1534             : 
    1535             :         /*
    1536             :          * As memory initialization might be integrated into KASAN,
     1537             :          * KASAN unpoisoning and memory initialization code must be
    1538             :          * kept together to avoid discrepancies in behavior.
    1539             :          */
    1540             : 
    1541             :         /*
     1542             :          * Zero the memory tags if requested (this happens only when the
     1543             :          * memory should be initialized as well).
    1544             :          */
    1545         505 :         if (zero_tags) {
    1546             :                 /* Initialize both memory and memory tags. */
    1547             :                 for (i = 0; i != 1 << order; ++i)
    1548             :                         tag_clear_highpage(page + i);
    1549             : 
    1550             :                 /* Take note that memory was initialized by the loop above. */
    1551             :                 init = false;
    1552             :         }
    1553         505 :         if (!should_skip_kasan_unpoison(gfp_flags) &&
    1554             :             kasan_unpoison_pages(page, order, init)) {
    1555             :                 /* Take note that memory was initialized by KASAN. */
    1556             :                 if (kasan_has_integrated_init())
    1557             :                         init = false;
    1558             :         } else {
    1559             :                 /*
    1560             :                  * If memory tags have not been set by KASAN, reset the page
    1561             :                  * tags to ensure page_address() dereferencing does not fault.
    1562             :                  */
    1563         505 :                 for (i = 0; i != 1 << order; ++i)
    1564             :                         page_kasan_tag_reset(page + i);
    1565             :         }
    1566             :         /* If memory is still not initialized, initialize it now. */
    1567         505 :         if (init)
    1568             :                 kernel_init_pages(page, 1 << order);
    1569             : 
    1570         505 :         set_page_owner(page, order, gfp_flags);
    1571         505 :         page_table_check_alloc(page, order);
    1572         505 : }
    1573             : 
    1574         441 : static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
    1575             :                                                         unsigned int alloc_flags)
    1576             : {
    1577         505 :         post_alloc_hook(page, order, gfp_flags);
    1578             : 
    1579         441 :         if (order && (gfp_flags & __GFP_COMP))
    1580          96 :                 prep_compound_page(page, order);
    1581             : 
    1582             :         /*
    1583             :          * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
    1584             :          * allocate the page. The expectation is that the caller is taking
    1585             :          * steps that will free more memory. The caller should avoid the page
    1586             :          * being used for !PFMEMALLOC purposes.
    1587             :          */
    1588         441 :         if (alloc_flags & ALLOC_NO_WATERMARKS)
    1589           0 :                 set_page_pfmemalloc(page);
    1590             :         else
    1591         505 :                 clear_page_pfmemalloc(page);
    1592         441 : }
    1593             : 
    1594             : /*
    1595             :  * Go through the free lists for the given migratetype and remove
    1596             :  * the smallest available page from the freelists
    1597             :  */
    1598             : static __always_inline
    1599             : struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
    1600             :                                                 int migratetype)
    1601             : {
    1602             :         unsigned int current_order;
    1603             :         struct free_area *area;
    1604             :         struct page *page;
    1605             : 
    1606             :         /* Find a page of the appropriate size in the preferred list */
    1607        2768 :         for (current_order = order; current_order <= MAX_ORDER; ++current_order) {
    1608        1382 :                 area = &(zone->free_area[current_order]);
    1609        1382 :                 page = get_page_from_free_area(area, migratetype);
    1610        1382 :                 if (!page)
    1611         705 :                         continue;
    1612         677 :                 del_page_from_free_list(page, zone, current_order);
    1613        1354 :                 expand(zone, page, order, current_order, migratetype);
    1614         677 :                 set_pcppage_migratetype(page, migratetype);
    1615             :                 trace_mm_page_alloc_zone_locked(page, order, migratetype,
    1616             :                                 pcp_allowed_order(order) &&
    1617             :                                 migratetype < MIGRATE_PCPTYPES);
    1618             :                 return page;
    1619             :         }
    1620             : 
    1621             :         return NULL;
    1622             : }
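/*
 * Editor's sketch: a standalone model (not kernel code) of the smallest-
 * first search in __rmqueue_smallest() above: scan the free areas from the
 * requested order upwards and take the first non-empty one.  MAX_ORDER and
 * the nr_free[] occupancy below are assumed example values.
 */
#include <stdio.h>

#define MAX_ORDER 10	/* assumed: orders 0..10 inclusive */

int main(void)
{
	int nr_free[MAX_ORDER + 1] = { 0, 0, 0, 5, 2, 0, 0, 0, 0, 0, 1 };
	unsigned int order = 1;		/* requested order */

	for (unsigned int current_order = order; current_order <= MAX_ORDER;
	     current_order++) {
		if (!nr_free[current_order])
			continue;
		printf("order-%u request satisfied from the order-%u free list\n",
		       order, current_order);
		return 0;
	}
	printf("no free page of order %u or larger\n", order);
	return 1;
}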
    1623             : 
    1624             : 
    1625             : /*
     1626             :  * This array describes the order in which free lists are fallen back to
     1627             :  * when the free lists for the desired migratetype are depleted.
    1628             :  *
    1629             :  * The other migratetypes do not have fallbacks.
    1630             :  */
    1631             : static int fallbacks[MIGRATE_TYPES][MIGRATE_PCPTYPES - 1] = {
    1632             :         [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE   },
    1633             :         [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE },
    1634             :         [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE   },
    1635             : };
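/*
 * Editor's sketch: a standalone example (not kernel code) of how the
 * fallbacks[] table above is consulted: an unmovable allocation first
 * falls back to the reclaimable free lists, then to the movable ones.
 * The enum is trimmed to the three per-cpu migratetypes for illustration.
 */
#include <stdio.h>

enum { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE,
       MIGRATE_PCPTYPES, MIGRATE_TYPES = MIGRATE_PCPTYPES };

static const int fallbacks[MIGRATE_TYPES][MIGRATE_PCPTYPES - 1] = {
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE   },
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE   },
};

int main(void)
{
	int mt = MIGRATE_UNMOVABLE;

	for (int i = 0; i < MIGRATE_PCPTYPES - 1; i++)
		printf("fallback %d for migratetype %d: migratetype %d\n",
		       i, mt, fallbacks[mt][i]);
	return 0;
}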
    1636             : 
    1637             : #ifdef CONFIG_CMA
    1638             : static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
    1639             :                                         unsigned int order)
    1640             : {
    1641             :         return __rmqueue_smallest(zone, order, MIGRATE_CMA);
    1642             : }
    1643             : #else
    1644             : static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
    1645             :                                         unsigned int order) { return NULL; }
    1646             : #endif
    1647             : 
    1648             : /*
    1649             :  * Move the free pages in a range to the freelist tail of the requested type.
     1650             :  * Note that start_pfn and end_pfn are not aligned on a pageblock
    1651             :  * boundary. If alignment is required, use move_freepages_block()
    1652             :  */
    1653           0 : static int move_freepages(struct zone *zone,
    1654             :                           unsigned long start_pfn, unsigned long end_pfn,
    1655             :                           int migratetype, int *num_movable)
    1656             : {
    1657             :         struct page *page;
    1658             :         unsigned long pfn;
    1659             :         unsigned int order;
    1660           0 :         int pages_moved = 0;
    1661             : 
    1662           0 :         for (pfn = start_pfn; pfn <= end_pfn;) {
    1663           0 :                 page = pfn_to_page(pfn);
    1664           0 :                 if (!PageBuddy(page)) {
    1665             :                         /*
    1666             :                          * We assume that pages that could be isolated for
    1667             :                          * migration are movable. But we don't actually try
    1668             :                          * isolating, as that would be expensive.
    1669             :                          */
    1670           0 :                         if (num_movable &&
    1671           0 :                                         (PageLRU(page) || __PageMovable(page)))
    1672           0 :                                 (*num_movable)++;
    1673           0 :                         pfn++;
    1674           0 :                         continue;
    1675             :                 }
    1676             : 
    1677             :                 /* Make sure we are not inadvertently changing nodes */
    1678             :                 VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
    1679             :                 VM_BUG_ON_PAGE(page_zone(page) != zone, page);
    1680             : 
    1681           0 :                 order = buddy_order(page);
    1682           0 :                 move_to_free_list(page, zone, order, migratetype);
    1683           0 :                 pfn += 1 << order;
    1684           0 :                 pages_moved += 1 << order;
    1685             :         }
    1686             : 
    1687           0 :         return pages_moved;
    1688             : }
    1689             : 
    1690           0 : int move_freepages_block(struct zone *zone, struct page *page,
    1691             :                                 int migratetype, int *num_movable)
    1692             : {
    1693             :         unsigned long start_pfn, end_pfn, pfn;
    1694             : 
    1695           0 :         if (num_movable)
    1696           0 :                 *num_movable = 0;
    1697             : 
    1698           0 :         pfn = page_to_pfn(page);
    1699           0 :         start_pfn = pageblock_start_pfn(pfn);
    1700           0 :         end_pfn = pageblock_end_pfn(pfn) - 1;
    1701             : 
    1702             :         /* Do not cross zone boundaries */
    1703           0 :         if (!zone_spans_pfn(zone, start_pfn))
    1704           0 :                 start_pfn = pfn;
    1705           0 :         if (!zone_spans_pfn(zone, end_pfn))
    1706             :                 return 0;
    1707             : 
    1708           0 :         return move_freepages(zone, start_pfn, end_pfn, migratetype,
    1709             :                                                                 num_movable);
    1710             : }
    1711             : 
    1712             : static void change_pageblock_range(struct page *pageblock_page,
    1713             :                                         int start_order, int migratetype)
    1714             : {
    1715           2 :         int nr_pageblocks = 1 << (start_order - pageblock_order);
    1716             : 
    1717           4 :         while (nr_pageblocks--) {
    1718           2 :                 set_pageblock_migratetype(pageblock_page, migratetype);
    1719           2 :                 pageblock_page += pageblock_nr_pages;
    1720             :         }
    1721             : }
    1722             : 
    1723             : /*
    1724             :  * When we are falling back to another migratetype during allocation, try to
    1725             :  * steal extra free pages from the same pageblocks to satisfy further
    1726             :  * allocations, instead of polluting multiple pageblocks.
    1727             :  *
    1728             :  * If we are stealing a relatively large buddy page, it is likely there will
    1729             :  * be more free pages in the pageblock, so try to steal them all. For
    1730             :  * reclaimable and unmovable allocations, we steal regardless of page size,
    1731             :  * as fragmentation caused by those allocations polluting movable pageblocks
    1732             :  * is worse than movable allocations stealing from unmovable and reclaimable
    1733             :  * pageblocks.
    1734             :  */
    1735             : static bool can_steal_fallback(unsigned int order, int start_mt)
    1736             : {
    1737             :         /*
     1738             :          * This order check is intentionally kept, even though the next
     1739             :          * check uses a relaxed order check. The reason is that we can
     1740             :          * actually steal the whole pageblock if this condition is met,
     1741             :          * while the check below does not guarantee that and is just a
     1742             :          * heuristic, so it could be changed at any time.
    1743             :          */
    1744           2 :         if (order >= pageblock_order)
    1745             :                 return true;
    1746             : 
    1747           0 :         if (order >= pageblock_order / 2 ||
    1748           0 :                 start_mt == MIGRATE_RECLAIMABLE ||
    1749           0 :                 start_mt == MIGRATE_UNMOVABLE ||
    1750             :                 page_group_by_mobility_disabled)
    1751             :                 return true;
    1752             : 
    1753             :         return false;
    1754             : }
    1755             : 
    1756           0 : static inline bool boost_watermark(struct zone *zone)
    1757             : {
    1758             :         unsigned long max_boost;
    1759             : 
    1760           0 :         if (!watermark_boost_factor)
    1761             :                 return false;
    1762             :         /*
    1763             :          * Don't bother in zones that are unlikely to produce results.
    1764             :          * On small machines, including kdump capture kernels running
    1765             :          * in a small area, boosting the watermark can cause an out of
    1766             :          * memory situation immediately.
    1767             :          */
    1768           0 :         if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
    1769             :                 return false;
    1770             : 
    1771           0 :         max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
    1772             :                         watermark_boost_factor, 10000);
    1773             : 
    1774             :         /*
     1775             :          * The high watermark may be uninitialised if fragmentation occurs
     1776             :          * very early in boot, so do not boost. We do not fall
     1777             :          * through and boost by pageblock_nr_pages, as failing
     1778             :          * allocations that early means that reclaim is not going
     1779             :          * to help, and it may even be impossible to reclaim the
     1780             :          * boosted watermark, resulting in a hang.
    1781             :          */
    1782           0 :         if (!max_boost)
    1783             :                 return false;
    1784             : 
    1785           0 :         max_boost = max(pageblock_nr_pages, max_boost);
    1786             : 
    1787           0 :         zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
    1788             :                 max_boost);
    1789             : 
    1790           0 :         return true;
    1791             : }
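/*
 * Editor's sketch: a standalone arithmetic example (not kernel code) of the
 * watermark boost computed above.  The high watermark, boost factor and
 * pageblock size are assumed example values, and mult_frac() is a local
 * simplified stand-in for the kernel helper of the same name.
 */
#include <stdio.h>

#define mult_frac(x, n, d)	((x) / (d) * (n) + (x) % (d) * (n) / (d))

int main(void)
{
	unsigned long wmark_high = 1000;		/* pages, assumed */
	unsigned long watermark_boost_factor = 15000;	/* i.e. boost up to 150% */
	unsigned long pageblock_nr_pages = 512;		/* assumed */
	unsigned long boost = 0;

	unsigned long max_boost = mult_frac(wmark_high,
					    watermark_boost_factor, 10000);
	if (max_boost < pageblock_nr_pages)
		max_boost = pageblock_nr_pages;

	/* each fallback event boosts by one pageblock, capped at max_boost */
	for (int event = 1; event <= 4; event++) {
		boost += pageblock_nr_pages;
		if (boost > max_boost)
			boost = max_boost;
		printf("after fallback %d: boost = %lu pages (max %lu)\n",
		       event, boost, max_boost);
	}
	return 0;
}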
    1792             : 
    1793             : /*
     1794             :  * This function implements the actual steal behaviour. If the order is large
     1795             :  * enough, we can steal a whole pageblock. If not, we first move freepages in this
    1796             :  * pageblock to our migratetype and determine how many already-allocated pages
    1797             :  * are there in the pageblock with a compatible migratetype. If at least half
    1798             :  * of pages are free or compatible, we can change migratetype of the pageblock
    1799             :  * itself, so pages freed in the future will be put on the correct free list.
    1800             :  */
    1801           2 : static void steal_suitable_fallback(struct zone *zone, struct page *page,
    1802             :                 unsigned int alloc_flags, int start_type, bool whole_block)
    1803             : {
    1804           4 :         unsigned int current_order = buddy_order(page);
    1805             :         int free_pages, movable_pages, alike_pages;
    1806             :         int old_block_type;
    1807             : 
    1808           4 :         old_block_type = get_pageblock_migratetype(page);
    1809             : 
    1810             :         /*
    1811             :          * This can happen due to races and we want to prevent broken
    1812             :          * highatomic accounting.
    1813             :          */
    1814           2 :         if (is_migrate_highatomic(old_block_type))
    1815             :                 goto single_page;
    1816             : 
    1817             :         /* Take ownership for orders >= pageblock_order */
    1818           2 :         if (current_order >= pageblock_order) {
    1819           2 :                 change_pageblock_range(page, current_order, start_type);
    1820             :                 goto single_page;
    1821             :         }
    1822             : 
    1823             :         /*
    1824             :          * Boost watermarks to increase reclaim pressure to reduce the
    1825             :          * likelihood of future fallbacks. Wake kswapd now as the node
    1826             :          * may be balanced overall and kswapd will not wake naturally.
    1827             :          */
    1828           0 :         if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
    1829           0 :                 set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
    1830             : 
    1831             :         /* We are not allowed to try stealing from the whole block */
    1832           0 :         if (!whole_block)
    1833             :                 goto single_page;
    1834             : 
    1835           0 :         free_pages = move_freepages_block(zone, page, start_type,
    1836             :                                                 &movable_pages);
    1837             :         /*
    1838             :          * Determine how many pages are compatible with our allocation.
    1839             :          * For movable allocation, it's the number of movable pages which
    1840             :          * we just obtained. For other types it's a bit more tricky.
    1841             :          */
    1842           0 :         if (start_type == MIGRATE_MOVABLE) {
    1843           0 :                 alike_pages = movable_pages;
    1844             :         } else {
    1845             :                 /*
    1846             :                  * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
    1847             :                  * to MOVABLE pageblock, consider all non-movable pages as
    1848             :                  * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
    1849             :                  * vice versa, be conservative since we can't distinguish the
    1850             :                  * exact migratetype of non-movable pages.
    1851             :                  */
    1852           0 :                 if (old_block_type == MIGRATE_MOVABLE)
    1853           0 :                         alike_pages = pageblock_nr_pages
    1854           0 :                                                 - (free_pages + movable_pages);
    1855             :                 else
    1856             :                         alike_pages = 0;
    1857             :         }
    1858             : 
    1859             :         /* moving whole block can fail due to zone boundary conditions */
    1860           0 :         if (!free_pages)
    1861             :                 goto single_page;
    1862             : 
    1863             :         /*
    1864             :          * If a sufficient number of pages in the block are either free or of
    1865             :          * comparable migratability as our allocation, claim the whole block.
    1866             :          */
    1867           0 :         if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
    1868             :                         page_group_by_mobility_disabled)
    1869           0 :                 set_pageblock_migratetype(page, start_type);
    1870             : 
    1871           0 :         return;
    1872             : 
    1873             : single_page:
    1874           2 :         move_to_free_list(page, zone, current_order, start_type);
    1875             : }
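/*
 * Editor's sketch: a standalone example (not kernel code) of the "claim the
 * whole pageblock" decision at the end of steal_suitable_fallback() above.
 * pageblock_order, free_pages and alike_pages are assumed example values.
 */
#include <stdio.h>

int main(void)
{
	unsigned int pageblock_order = 9;	/* 512-page pageblocks, assumed */
	int free_pages = 180;			/* free pages moved to our list */
	int alike_pages = 100;			/* allocated but compatible pages */
	int threshold = 1 << (pageblock_order - 1);	/* half a pageblock */

	if (free_pages + alike_pages >= threshold)
		printf("claim the pageblock: %d + %d >= %d\n",
		       free_pages, alike_pages, threshold);
	else
		printf("leave the pageblock type alone: %d + %d < %d\n",
		       free_pages, alike_pages, threshold);
	return 0;
}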
    1876             : 
    1877             : /*
     1878             :  * Check whether there is a suitable fallback freepage with the requested order.
     1879             :  * If only_stealable is true, this function returns fallback_mt only if
     1880             :  * we can steal other freepages altogether. This helps to reduce
    1881             :  * fragmentation due to mixed migratetype pages in one pageblock.
    1882             :  */
    1883           2 : int find_suitable_fallback(struct free_area *area, unsigned int order,
    1884             :                         int migratetype, bool only_stealable, bool *can_steal)
    1885             : {
    1886             :         int i;
    1887             :         int fallback_mt;
    1888             : 
    1889           2 :         if (area->nr_free == 0)
    1890             :                 return -1;
    1891             : 
    1892           2 :         *can_steal = false;
    1893           4 :         for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
    1894           4 :                 fallback_mt = fallbacks[migratetype][i];
    1895           4 :                 if (free_area_empty(area, fallback_mt))
    1896           2 :                         continue;
    1897             : 
    1898           2 :                 if (can_steal_fallback(order, migratetype))
    1899           2 :                         *can_steal = true;
    1900             : 
    1901           2 :                 if (!only_stealable)
    1902             :                         return fallback_mt;
    1903             : 
    1904           0 :                 if (*can_steal)
    1905             :                         return fallback_mt;
    1906             :         }
    1907             : 
    1908             :         return -1;
    1909             : }
    1910             : 
    1911             : /*
    1912             :  * Reserve a pageblock for exclusive use of high-order atomic allocations if
    1913             :  * there are no empty page blocks that contain a page with a suitable order
    1914             :  */
    1915           0 : static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
    1916             :                                 unsigned int alloc_order)
    1917             : {
    1918             :         int mt;
    1919             :         unsigned long max_managed, flags;
    1920             : 
    1921             :         /*
     1922             :          * Limit the number reserved to roughly 1% of a zone plus one
     1923             :          * pageblock. The check is race-prone but harmless.
    1924             :          */
    1925           0 :         max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
    1926           0 :         if (zone->nr_reserved_highatomic >= max_managed)
    1927             :                 return;
    1928             : 
    1929           0 :         spin_lock_irqsave(&zone->lock, flags);
    1930             : 
    1931             :         /* Recheck the nr_reserved_highatomic limit under the lock */
    1932           0 :         if (zone->nr_reserved_highatomic >= max_managed)
    1933             :                 goto out_unlock;
    1934             : 
    1935             :         /* Yoink! */
    1936           0 :         mt = get_pageblock_migratetype(page);
    1937             :         /* Only reserve normal pageblocks (i.e., they can merge with others) */
    1938           0 :         if (migratetype_is_mergeable(mt)) {
    1939           0 :                 zone->nr_reserved_highatomic += pageblock_nr_pages;
    1940           0 :                 set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
    1941           0 :                 move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
    1942             :         }
    1943             : 
    1944             : out_unlock:
    1945           0 :         spin_unlock_irqrestore(&zone->lock, flags);
    1946             : }
    1947             : 
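A minimal sketch of the reservation cap computed above: at most roughly 1% of the zone's managed pages, plus one pageblock, may sit in the highatomic reserve. The 512-page pageblock size used below is an assumption (e.g. x86-64 with 2MB pageblocks); real values depend on the architecture.

#include <stdbool.h>
#include <stdio.h>

#define TOY_PAGEBLOCK_NR_PAGES 512UL    /* assumed pageblock size */

static bool can_reserve_another_block(unsigned long managed_pages,
                                      unsigned long reserved_pages)
{
        /* 1% of the zone plus one pageblock, as in the code above. */
        unsigned long max_managed = managed_pages / 100 + TOY_PAGEBLOCK_NR_PAGES;

        return reserved_pages < max_managed;
}

int main(void)
{
        /* A 1M-page zone allows ~10485 + 512 reserved pages. */
        printf("%d\n", can_reserve_another_block(1UL << 20, 10 * TOY_PAGEBLOCK_NR_PAGES));
        printf("%d\n", can_reserve_another_block(1UL << 20, 30 * TOY_PAGEBLOCK_NR_PAGES));
        return 0;
}
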
    1948             : /*
    1949             :  * Used when an allocation is about to fail under memory pressure. This
    1950             :  * potentially hurts the reliability of high-order allocations when under
    1951             :  * intense memory pressure but failed atomic allocations should be easier
    1952             :  * to recover from than an OOM.
    1953             :  *
    1954             :  * If @force is true, unreserve even the last remaining highatomic
    1955             :  * pageblock instead of always preserving one.
    1956             :  */
    1957           0 : static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
    1958             :                                                 bool force)
    1959             : {
    1960           0 :         struct zonelist *zonelist = ac->zonelist;
    1961             :         unsigned long flags;
    1962             :         struct zoneref *z;
    1963             :         struct zone *zone;
    1964             :         struct page *page;
    1965             :         int order;
    1966             :         bool ret;
    1967             : 
    1968           0 :         for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
    1969             :                                                                 ac->nodemask) {
    1970             :                 /*
    1971             :                  * Preserve at least one pageblock unless memory pressure
    1972             :                  * is really high.
    1973             :                  */
    1974           0 :                 if (!force && zone->nr_reserved_highatomic <=
    1975             :                                         pageblock_nr_pages)
    1976           0 :                         continue;
    1977             : 
    1978           0 :                 spin_lock_irqsave(&zone->lock, flags);
    1979           0 :                 for (order = 0; order <= MAX_ORDER; order++) {
    1980           0 :                         struct free_area *area = &(zone->free_area[order]);
    1981             : 
    1982           0 :                         page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
    1983           0 :                         if (!page)
    1984           0 :                                 continue;
    1985             : 
    1986             :                         /*
    1987             :                          * In the page freeing path, migratetype changes are
    1988             :                          * racy, so we can encounter several free pages of one
    1989             :                          * pageblock in this loop even though we changed the
    1990             :                          * pageblock type from highatomic to ac->migratetype.
    1991             :                          * So we should adjust the count only once.
    1992             :                          */
    1993           0 :                         if (is_migrate_highatomic_page(page)) {
    1994             :                                 /*
    1995             :                                  * It should never happen but changes to
    1996             :                                  * locking could inadvertently allow a per-cpu
    1997             :                                  * drain to add pages to MIGRATE_HIGHATOMIC
    1998             :                                  * while unreserving so be safe and watch for
    1999             :                                  * underflows.
    2000             :                                  */
    2001           0 :                                 zone->nr_reserved_highatomic -= min(
    2002             :                                                 pageblock_nr_pages,
    2003             :                                                 zone->nr_reserved_highatomic);
    2004             :                         }
    2005             : 
    2006             :                         /*
    2007             :                          * Convert to ac->migratetype and avoid the normal
    2008             :                          * pageblock stealing heuristics. Minimally, the caller
    2009             :                          * is doing the work and needs the pages. More
    2010             :                          * importantly, if the block was always converted to
    2011             :                          * MIGRATE_UNMOVABLE or another type then the number
    2012             :                          * of pageblocks that cannot be completely freed
    2013             :                          * may increase.
    2014             :                          */
    2015           0 :                         set_pageblock_migratetype(page, ac->migratetype);
    2016           0 :                         ret = move_freepages_block(zone, page, ac->migratetype,
    2017             :                                                                         NULL);
    2018           0 :                         if (ret) {
    2019           0 :                                 spin_unlock_irqrestore(&zone->lock, flags);
    2020           0 :                                 return ret;
    2021             :                         }
    2022             :                 }
    2023           0 :                 spin_unlock_irqrestore(&zone->lock, flags);
    2024             :         }
    2025             : 
    2026             :         return false;
    2027             : }
    2028             : 
    2029             : /*
    2030             :  * Try finding a free buddy page on the fallback list and put it on the free
    2031             :  * list of requested migratetype, possibly along with other pages from the same
    2032             :  * block, depending on fragmentation avoidance heuristics. Returns true if
    2033             :  * fallback was found so that __rmqueue_smallest() can grab it.
    2034             :  *
    2035             :  * The use of signed ints for order and current_order is a deliberate
    2036             :  * deviation from the rest of this file, to make the for loop
    2037             :  * condition simpler.
    2038             :  */
    2039             : static __always_inline bool
    2040             : __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
    2041             :                                                 unsigned int alloc_flags)
    2042             : {
    2043             :         struct free_area *area;
    2044             :         int current_order;
    2045           2 :         int min_order = order;
    2046             :         struct page *page;
    2047             :         int fallback_mt;
    2048             :         bool can_steal;
    2049             : 
    2050             :         /*
    2051             :          * Do not steal pages from freelists belonging to other pageblocks
    2052             :          * i.e. orders < pageblock_order. If there are no local zones free,
    2053             :          * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
    2054             :          */
    2055             :         if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT)
    2056             :                 min_order = pageblock_order;
    2057             : 
    2058             :         /*
    2059             :          * Find the largest available free page in the other list. This roughly
    2060             :          * approximates finding the pageblock with the most free pages, which
    2061             :          * would be too costly to do exactly.
    2062             :          */
    2063           4 :         for (current_order = MAX_ORDER; current_order >= min_order;
    2064           0 :                                 --current_order) {
    2065           2 :                 area = &(zone->free_area[current_order]);
    2066           2 :                 fallback_mt = find_suitable_fallback(area, current_order,
    2067             :                                 start_migratetype, false, &can_steal);
    2068           2 :                 if (fallback_mt == -1)
    2069           0 :                         continue;
    2070             : 
    2071             :                 /*
    2072             :                  * If we cannot steal the whole pageblock and the requested
    2073             :                  * migratetype is movable, it is better to steal and split the
    2074             :                  * smallest available page instead of the largest one: even if
    2075             :                  * the next movable allocation falls back into a different
    2076             :                  * pageblock than this one, it will not cause permanent
    2077             :                  * fragmentation.
    2078             :                  */
    2079           2 :                 if (!can_steal && start_migratetype == MIGRATE_MOVABLE
    2080           0 :                                         && current_order > order)
    2081             :                         goto find_smallest;
    2082             : 
    2083             :                 goto do_steal;
    2084             :         }
    2085             : 
    2086             :         return false;
    2087             : 
    2088             : find_smallest:
    2089           0 :         for (current_order = order; current_order <= MAX_ORDER;
    2090           0 :                                                         current_order++) {
    2091           0 :                 area = &(zone->free_area[current_order]);
    2092           0 :                 fallback_mt = find_suitable_fallback(area, current_order,
    2093             :                                 start_migratetype, false, &can_steal);
    2094           0 :                 if (fallback_mt != -1)
    2095             :                         break;
    2096             :         }
    2097             : 
    2098             :         /*
    2099             :          * This should not happen - we already found a suitable fallback
    2100             :          * when looking for the largest page.
    2101             :          */
    2102             :         VM_BUG_ON(current_order > MAX_ORDER);
    2103             : 
    2104             : do_steal:
    2105           2 :         page = get_page_from_free_area(area, fallback_mt);
    2106             : 
    2107           2 :         steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
    2108             :                                                                 can_steal);
    2109             : 
    2110           2 :         trace_mm_page_alloc_extfrag(page, order, current_order,
    2111             :                 start_migratetype, fallback_mt);
    2112             : 
    2113             :         return true;
    2114             : 
    2115             : }
    2116             : 
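The order-selection policy above can be illustrated with a small userspace sketch: prefer the largest free fallback page, but if the whole pageblock cannot be stolen and the request is movable, split the smallest suitable page instead. The toy free-area array and MAX_TOY_ORDER are assumptions for illustration; the actual stealing and tracing are omitted.

#include <stdbool.h>
#include <stdio.h>

#define MAX_TOY_ORDER 10

/* nr_free[order]: free fallback pages available at each order. */
static int pick_order(const unsigned long nr_free[MAX_TOY_ORDER + 1],
                      int order, bool can_steal_block, bool movable)
{
        int current_order;

        /* First pass: largest available order, as in the loop above. */
        for (current_order = MAX_TOY_ORDER; current_order >= order; current_order--) {
                if (!nr_free[current_order])
                        continue;
                if (can_steal_block || !movable || current_order == order)
                        return current_order;
                /* movable request, partial steal: prefer the smallest page */
                break;
        }
        if (current_order < order)
                return -1;      /* no fallback page of a suitable order */

        /* Second pass: smallest order that still satisfies the request. */
        for (current_order = order; current_order <= MAX_TOY_ORDER; current_order++)
                if (nr_free[current_order])
                        return current_order;
        return -1;
}

int main(void)
{
        unsigned long nr_free[MAX_TOY_ORDER + 1] = { [3] = 1, [9] = 1 };

        /* Unmovable request steals the order-9 block; movable splits order-3. */
        printf("%d %d\n", pick_order(nr_free, 2, true, false),
                          pick_order(nr_free, 2, false, true));
        return 0;
}
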
    2117             : /*
    2118             :  * Do the hard work of removing an element from the buddy allocator.
    2119             :  * Call me with the zone->lock already held.
    2120             :  */
    2121             : static __always_inline struct page *
    2122             : __rmqueue(struct zone *zone, unsigned int order, int migratetype,
    2123             :                                                 unsigned int alloc_flags)
    2124             : {
    2125             :         struct page *page;
    2126             : 
    2127             :         if (IS_ENABLED(CONFIG_CMA)) {
    2128             :                 /*
    2129             :                  * Balance movable allocations between regular and CMA areas by
    2130             :                  * allocating from CMA when over half of the zone's free memory
    2131             :                  * is in the CMA area.
    2132             :                  */
    2133             :                 if (alloc_flags & ALLOC_CMA &&
    2134             :                     zone_page_state(zone, NR_FREE_CMA_PAGES) >
    2135             :                     zone_page_state(zone, NR_FREE_PAGES) / 2) {
    2136             :                         page = __rmqueue_cma_fallback(zone, order);
    2137             :                         if (page)
    2138             :                                 return page;
    2139             :                 }
    2140             :         }
    2141             : retry:
    2142         679 :         page = __rmqueue_smallest(zone, order, migratetype);
    2143         679 :         if (unlikely(!page)) {
    2144           2 :                 if (alloc_flags & ALLOC_CMA)
    2145           0 :                         page = __rmqueue_cma_fallback(zone, order);
    2146             : 
    2147           4 :                 if (!page && __rmqueue_fallback(zone, order, migratetype,
    2148             :                                                                 alloc_flags))
    2149             :                         goto retry;
    2150             :         }
    2151             :         return page;
    2152             : }
    2153             : 
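The CMA balancing heuristic above reduces to a single comparison: steer movable allocations into CMA once more than half of the zone's free pages are CMA pages. A minimal sketch, with plain numbers standing in for the zone_page_state() counters:

#include <stdbool.h>
#include <stdio.h>

static bool prefer_cma(unsigned long free_cma_pages, unsigned long free_pages)
{
        /* Allocate from CMA when it holds over half of the free memory. */
        return free_cma_pages > free_pages / 2;
}

int main(void)
{
        printf("%d\n", prefer_cma(300, 1000));  /* 0: plenty of non-CMA memory */
        printf("%d\n", prefer_cma(600, 1000));  /* 1: CMA dominates free memory */
        return 0;
}
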
    2154             : /*
    2155             :  * Obtain a specified number of elements from the buddy allocator, all under
    2156             :  * a single hold of the lock, for efficiency.  Add them to the supplied list.
    2157             :  * Returns the number of new pages which were placed at *list.
    2158             :  */
    2159          27 : static int rmqueue_bulk(struct zone *zone, unsigned int order,
    2160             :                         unsigned long count, struct list_head *list,
    2161             :                         int migratetype, unsigned int alloc_flags)
    2162             : {
    2163             :         unsigned long flags;
    2164             :         int i;
    2165             : 
    2166          27 :         spin_lock_irqsave(&zone->lock, flags);
    2167         704 :         for (i = 0; i < count; ++i) {
    2168         677 :                 struct page *page = __rmqueue(zone, order, migratetype,
    2169             :                                                                 alloc_flags);
    2170         677 :                 if (unlikely(page == NULL))
    2171             :                         break;
    2172             : 
    2173             :                 /*
    2174             :                  * Split buddy pages returned by expand() are received here
    2175             :                  * in physical page order and added to the tail of the
    2176             :                  * caller's list, so from the caller's perspective the linked
    2177             :                  * list is, under some conditions, ordered by page number.
    2178             :                  * This is useful for IO devices that stream in the forward
    2179             :                  * direction from the head of the list, and for IO devices
    2180             :                  * that can merge requests when the physical pages they cover
    2181             :                  * are ordered properly.
    2182             :                  */
    2183        1354 :                 list_add_tail(&page->pcp_list, list);
    2184             :                 if (is_migrate_cma(get_pcppage_migratetype(page)))
    2185             :                         __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
    2186             :                                               -(1 << order));
    2187             :         }
    2188             : 
    2189          54 :         __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
    2190          54 :         spin_unlock_irqrestore(&zone->lock, flags);
    2191             : 
    2192          27 :         return i;
    2193             : }
    2194             : 
    2195             : #ifdef CONFIG_NUMA
    2196             : /*
    2197             :  * Called from the vmstat counter updater to drain pagesets of this
    2198             :  * currently executing processor on remote nodes after they have
    2199             :  * expired.
    2200             :  */
    2201             : void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
    2202             : {
    2203             :         int to_drain, batch;
    2204             : 
    2205             :         batch = READ_ONCE(pcp->batch);
    2206             :         to_drain = min(pcp->count, batch);
    2207             :         if (to_drain > 0) {
    2208             :                 spin_lock(&pcp->lock);
    2209             :                 free_pcppages_bulk(zone, to_drain, pcp, 0);
    2210             :                 spin_unlock(&pcp->lock);
    2211             :         }
    2212             : }
    2213             : #endif
    2214             : 
    2215             : /*
    2216             :  * Drain pcplists of the indicated processor and zone.
    2217             :  */
    2218           0 : static void drain_pages_zone(unsigned int cpu, struct zone *zone)
    2219             : {
    2220             :         struct per_cpu_pages *pcp;
    2221             : 
    2222           0 :         pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
    2223           0 :         if (pcp->count) {
    2224           0 :                 spin_lock(&pcp->lock);
    2225           0 :                 free_pcppages_bulk(zone, pcp->count, pcp, 0);
    2226           0 :                 spin_unlock(&pcp->lock);
    2227             :         }
    2228           0 : }
    2229             : 
    2230             : /*
    2231             :  * Drain pcplists of all zones on the indicated processor.
    2232             :  */
    2233           0 : static void drain_pages(unsigned int cpu)
    2234             : {
    2235             :         struct zone *zone;
    2236             : 
    2237           0 :         for_each_populated_zone(zone) {
    2238           0 :                 drain_pages_zone(cpu, zone);
    2239             :         }
    2240           0 : }
    2241             : 
    2242             : /*
    2243             :  * Spill all of this CPU's per-cpu pages back into the buddy allocator.
    2244             :  */
    2245           0 : void drain_local_pages(struct zone *zone)
    2246             : {
    2247           0 :         int cpu = smp_processor_id();
    2248             : 
    2249           0 :         if (zone)
    2250           0 :                 drain_pages_zone(cpu, zone);
    2251             :         else
    2252           0 :                 drain_pages(cpu);
    2253           0 : }
    2254             : 
    2255             : /*
    2256             :  * The implementation of drain_all_pages(), exposing an extra parameter to
    2257             :  * drain on all cpus.
    2258             :  *
    2259             :  * drain_all_pages() is optimized to only execute on cpus where pcplists are
    2260             :  * not empty. The check for non-emptiness can however race with a free to
    2261             :  * pcplist that has not yet increased the pcp->count from 0 to 1. Callers
    2262             :  * that need the guarantee that every CPU has drained can disable the
    2263             :  * optimizing racy check.
    2264             :  */
    2265           0 : static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
    2266             : {
    2267             :         int cpu;
    2268             : 
    2269             :         /*
    2270             :          * Allocate in the BSS so we won't require allocation in
    2271             :          * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
    2272             :          */
    2273             :         static cpumask_t cpus_with_pcps;
    2274             : 
    2275             :         /*
    2276             :          * Do not drain if one is already in progress unless it's specific to
    2277             :          * a zone. Such callers are primarily CMA and memory hotplug and need
    2278             :          * the drain to be complete when the call returns.
    2279             :          */
    2280           0 :         if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
    2281           0 :                 if (!zone)
    2282             :                         return;
    2283           0 :                 mutex_lock(&pcpu_drain_mutex);
    2284             :         }
    2285             : 
    2286             :         /*
    2287             :          * We don't care about racing with a CPU hotplug event:
    2288             :          * the offline notification will cause the notified cpu
    2289             :          * to drain its own pcps, and on_each_cpu_mask()
    2290             :          * disables preemption as part of its processing.
    2291             :          */
    2292           0 :         for_each_online_cpu(cpu) {
    2293             :                 struct per_cpu_pages *pcp;
    2294             :                 struct zone *z;
    2295           0 :                 bool has_pcps = false;
    2296             : 
    2297           0 :                 if (force_all_cpus) {
    2298             :                         /*
    2299             :                          * The pcp.count check is racy; some callers need a
    2300             :                          * guarantee that no cpu is missed.
    2301             :                          */
    2302             :                         has_pcps = true;
    2303           0 :                 } else if (zone) {
    2304           0 :                         pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
    2305           0 :                         if (pcp->count)
    2306           0 :                                 has_pcps = true;
    2307             :                 } else {
    2308           0 :                         for_each_populated_zone(z) {
    2309           0 :                                 pcp = per_cpu_ptr(z->per_cpu_pageset, cpu);
    2310           0 :                                 if (pcp->count) {
    2311             :                                         has_pcps = true;
    2312             :                                         break;
    2313             :                                 }
    2314             :                         }
    2315             :                 }
    2316             : 
    2317           0 :                 if (has_pcps)
    2318           0 :                         cpumask_set_cpu(cpu, &cpus_with_pcps);
    2319             :                 else
    2320             :                         cpumask_clear_cpu(cpu, &cpus_with_pcps);
    2321             :         }
    2322             : 
    2323           0 :         for_each_cpu(cpu, &cpus_with_pcps) {
    2324           0 :                 if (zone)
    2325           0 :                         drain_pages_zone(cpu, zone);
    2326             :                 else
    2327           0 :                         drain_pages(cpu);
    2328             :         }
    2329             : 
    2330           0 :         mutex_unlock(&pcpu_drain_mutex);
    2331             : }
    2332             : 
    2333             : /*
    2334             :  * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
    2335             :  *
    2336             :  * When zone parameter is non-NULL, spill just the single zone's pages.
    2337             :  */
    2338           0 : void drain_all_pages(struct zone *zone)
    2339             : {
    2340           0 :         __drain_all_pages(zone, false);
    2341           0 : }
    2342             : 
    2343           0 : static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
    2344             :                                                         unsigned int order)
    2345             : {
    2346             :         int migratetype;
    2347             : 
    2348           0 :         if (!free_pages_prepare(page, order, FPI_NONE))
    2349             :                 return false;
    2350             : 
    2351           0 :         migratetype = get_pfnblock_migratetype(page, pfn);
    2352           0 :         set_pcppage_migratetype(page, migratetype);
    2353           0 :         return true;
    2354             : }
    2355             : 
    2356             : static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch,
    2357             :                        bool free_high)
    2358             : {
    2359             :         int min_nr_free, max_nr_free;
    2360             : 
    2361             :         /* Free everything if batch freeing high-order pages. */
    2362           0 :         if (unlikely(free_high))
    2363             :                 return pcp->count;
    2364             : 
    2365             :         /* Check for PCP disabled or boot pageset */
    2366           0 :         if (unlikely(high < batch))
    2367             :                 return 1;
    2368             : 
    2369             :         /* Leave at least pcp->batch pages on the list */
    2370           0 :         min_nr_free = batch;
    2371           0 :         max_nr_free = high - batch;
    2372             : 
    2373             :         /*
    2374             :          * Double the number of pages freed each time there is subsequent
    2375             :          * freeing of pages without any allocation.
    2376             :          */
    2377           0 :         batch <<= pcp->free_factor;
    2378           0 :         if (batch < max_nr_free)
    2379           0 :                 pcp->free_factor++;
    2380           0 :         batch = clamp(batch, min_nr_free, max_nr_free);
    2381             : 
    2382             :         return batch;
    2383             : }
    2384             : 
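The adaptive batch sizing above can be walked through in isolation: each consecutive round of freeing without an intervening allocation doubles the number of pages returned to the buddy allocator, clamped between `batch` and `high - batch` so that roughly a batch worth of pages always stays cached. A minimal sketch, where clamp_int() is a local stand-in for the kernel's clamp() macro:

#include <stdio.h>

static int clamp_int(int val, int lo, int hi)
{
        return val < lo ? lo : (val > hi ? hi : val);
}

static int toy_nr_pcp_free(int high, int batch, int *free_factor)
{
        int min_nr_free = batch;
        int max_nr_free = high - batch;
        int nr = batch << *free_factor;

        if (nr < max_nr_free)
                (*free_factor)++;       /* free even more next time */

        return clamp_int(nr, min_nr_free, max_nr_free);
}

int main(void)
{
        int free_factor = 0;

        /* Repeated frees with high=512, batch=32: 32 64 128 256 480. */
        for (int i = 0; i < 5; i++)
                printf("%d ", toy_nr_pcp_free(512, 32, &free_factor));
        printf("\n");
        return 0;
}
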
    2385           0 : static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
    2386             :                        bool free_high)
    2387             : {
    2388           0 :         int high = READ_ONCE(pcp->high);
    2389             : 
    2390           0 :         if (unlikely(!high || free_high))
    2391             :                 return 0;
    2392             : 
    2393           0 :         if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
    2394             :                 return high;
    2395             : 
    2396             :         /*
    2397             :          * If reclaim is active, limit the number of pages that can be
    2398             :          * stored on pcp lists
    2399             :          */
    2400           0 :         return min(READ_ONCE(pcp->batch) << 2, high);
    2401             : }
    2402             : 
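A short sketch of the pcp "high" limit above: normally the configured high mark applies, but while reclaim is active on the zone the pcp list is capped at four batches so that freed pages reach the buddy allocator (and therefore reclaim) sooner. Plain parameters stand in for the pcp and zone fields:

#include <stdbool.h>
#include <stdio.h>

static int toy_nr_pcp_high(int high, int batch, bool free_high, bool reclaim_active)
{
        if (!high || free_high)
                return 0;                       /* drain everything */
        if (!reclaim_active)
                return high;                    /* normal limit */
        return batch * 4 < high ? batch * 4 : high;
}

int main(void)
{
        /* high=512, batch=32: 512 normally, 128 while reclaim is active. */
        printf("%d %d\n", toy_nr_pcp_high(512, 32, false, false),
                          toy_nr_pcp_high(512, 32, false, true));
        return 0;
}
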
    2403           0 : static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
    2404             :                                    struct page *page, int migratetype,
    2405             :                                    unsigned int order)
    2406             : {
    2407             :         int high;
    2408             :         int pindex;
    2409             :         bool free_high;
    2410             : 
    2411           0 :         __count_vm_events(PGFREE, 1 << order);
    2412           0 :         pindex = order_to_pindex(migratetype, order);
    2413           0 :         list_add(&page->pcp_list, &pcp->lists[pindex]);
    2414           0 :         pcp->count += 1 << order;
    2415             : 
    2416             :         /*
    2417             :          * As high-order pages other than THPs stored on PCP can contribute
    2418             :          * to fragmentation, limit the number stored when PCP is heavily
    2419             :          * freeing without allocation. The remainder after bulk freeing
    2420             :          * stops will be drained from vmstat refresh context.
    2421             :          */
    2422           0 :         free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER);
    2423             : 
    2424           0 :         high = nr_pcp_high(pcp, zone, free_high);
    2425           0 :         if (pcp->count >= high) {
    2426           0 :                 int batch = READ_ONCE(pcp->batch);
    2427             : 
    2428           0 :                 free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex);
    2429             :         }
    2430           0 : }
    2431             : 
    2432             : /*
    2433             :  * Free a pcp page
    2434             :  */
    2435           0 : void free_unref_page(struct page *page, unsigned int order)
    2436             : {
    2437             :         unsigned long __maybe_unused UP_flags;
    2438             :         struct per_cpu_pages *pcp;
    2439             :         struct zone *zone;
    2440           0 :         unsigned long pfn = page_to_pfn(page);
    2441             :         int migratetype;
    2442             : 
    2443           0 :         if (!free_unref_page_prepare(page, pfn, order))
    2444             :                 return;
    2445             : 
    2446             :         /*
    2447             :          * We only track unmovable, reclaimable and movable pages on pcp
    2448             :          * lists. Place ISOLATE pages on the isolated list because they are
    2449             :          * being offlined, but treat HIGHATOMIC as movable pages so we can
    2450             :          * get those areas back if necessary. Otherwise, we may have to free
    2451             :          * excessively into the page allocator.
    2452             :          */
    2453           0 :         migratetype = get_pcppage_migratetype(page);
    2454           0 :         if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
    2455             :                 if (unlikely(is_migrate_isolate(migratetype))) {
    2456             :                         free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
    2457             :                         return;
    2458             :                 }
    2459           0 :                 migratetype = MIGRATE_MOVABLE;
    2460             :         }
    2461             : 
    2462           0 :         zone = page_zone(page);
    2463           0 :         pcp_trylock_prepare(UP_flags);
    2464           0 :         pcp = pcp_spin_trylock(zone->per_cpu_pageset);
    2465           0 :         if (pcp) {
    2466           0 :                 free_unref_page_commit(zone, pcp, page, migratetype, order);
    2467           0 :                 pcp_spin_unlock(pcp);
    2468             :         } else {
    2469           0 :                 free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
    2470             :         }
    2471           0 :         pcp_trylock_finish(UP_flags);
    2472             : }
    2473             : 
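A sketch of the trylock pattern above: a freed page first tries the per-cpu list under a trylock (the pcp lock may already be taken by a drain, or by the same CPU if the free happens from IRQ context); if the trylock fails, the page is handed straight back to the buddy allocator instead of spinning. The int-based lock and the two counters below are a single-threaded stand-in for the real locking, used purely to show the two paths:

#include <stdbool.h>
#include <stdio.h>

static int pcp_lock;            /* 0 = free, 1 = held */
static int pcp_count;           /* pages parked on the toy pcp list */
static int buddy_count;         /* pages freed directly to the toy buddy */

static bool toy_trylock(int *lock)
{
        if (*lock)
                return false;
        *lock = 1;
        return true;
}

static void toy_free_page(void)
{
        if (toy_trylock(&pcp_lock)) {
                pcp_count++;            /* fast path: batch on the pcp list */
                pcp_lock = 0;
        } else {
                buddy_count++;          /* fallback: free one page directly */
        }
}

int main(void)
{
        toy_free_page();                /* lock free: page goes to the pcp list */
        pcp_lock = 1;                   /* pretend a drain holds the lock */
        toy_free_page();                /* trylock fails: page goes to the buddy */
        printf("pcp=%d buddy=%d\n", pcp_count, buddy_count);
        return 0;
}
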
    2474             : /*
    2475             :  * Free a list of 0-order pages
    2476             :  */
    2477           0 : void free_unref_page_list(struct list_head *list)
    2478             : {
    2479             :         unsigned long __maybe_unused UP_flags;
    2480             :         struct page *page, *next;
    2481           0 :         struct per_cpu_pages *pcp = NULL;
    2482           0 :         struct zone *locked_zone = NULL;
    2483           0 :         int batch_count = 0;
    2484             :         int migratetype;
    2485             : 
    2486             :         /* Prepare pages for freeing */
    2487           0 :         list_for_each_entry_safe(page, next, list, lru) {
    2488           0 :                 unsigned long pfn = page_to_pfn(page);
    2489           0 :                 if (!free_unref_page_prepare(page, pfn, 0)) {
    2490           0 :                         list_del(&page->lru);
    2491           0 :                         continue;
    2492             :                 }
    2493             : 
    2494             :                 /*
    2495             :                  * Free isolated pages directly to the allocator, see
    2496             :                  * comment in free_unref_page.
    2497             :                  */
    2498             :                 migratetype = get_pcppage_migratetype(page);
    2499             :                 if (unlikely(is_migrate_isolate(migratetype))) {
    2500             :                         list_del(&page->lru);
    2501             :                         free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE);
    2502             :                         continue;
    2503             :                 }
    2504             :         }
    2505             : 
    2506           0 :         list_for_each_entry_safe(page, next, list, lru) {
    2507           0 :                 struct zone *zone = page_zone(page);
    2508             : 
    2509           0 :                 list_del(&page->lru);
    2510           0 :                 migratetype = get_pcppage_migratetype(page);
    2511             : 
    2512             :                 /*
    2513             :                  * Relock if the zone changed (it needs a different pcp lock)
    2514             :                  * or if the lock would otherwise be held for too long while
    2515             :                  * freeing a large list of pages.
    2516             :                  */
    2517           0 :                 if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) {
    2518           0 :                         if (pcp) {
    2519           0 :                                 pcp_spin_unlock(pcp);
    2520           0 :                                 pcp_trylock_finish(UP_flags);
    2521             :                         }
    2522             : 
    2523           0 :                         batch_count = 0;
    2524             : 
    2525             :                         /*
    2526             :                          * trylock is necessary as pages may be getting freed
    2527             :                          * from IRQ or SoftIRQ context after an IO completion.
    2528             :                          */
    2529           0 :                         pcp_trylock_prepare(UP_flags);
    2530           0 :                         pcp = pcp_spin_trylock(zone->per_cpu_pageset);
    2531           0 :                         if (unlikely(!pcp)) {
    2532           0 :                                 pcp_trylock_finish(UP_flags);
    2533           0 :                                 free_one_page(zone, page, page_to_pfn(page),
    2534             :                                               0, migratetype, FPI_NONE);
    2535           0 :                                 locked_zone = NULL;
    2536           0 :                                 continue;
    2537             :                         }
    2538             :                         locked_zone = zone;
    2539             :                 }
    2540             : 
    2541             :                 /*
    2542             :                  * Non-isolated types over MIGRATE_PCPTYPES get added
    2543             :                  * to the MIGRATE_MOVABLE pcp list.
    2544             :                  */
    2545           0 :                 if (unlikely(migratetype >= MIGRATE_PCPTYPES))
    2546           0 :                         migratetype = MIGRATE_MOVABLE;
    2547             : 
    2548           0 :                 trace_mm_page_free_batched(page);
    2549           0 :                 free_unref_page_commit(zone, pcp, page, migratetype, 0);
    2550           0 :                 batch_count++;
    2551             :         }
    2552             : 
    2553           0 :         if (pcp) {
    2554           0 :                 pcp_spin_unlock(pcp);
    2555           0 :                 pcp_trylock_finish(UP_flags);
    2556             :         }
    2557           0 : }
    2558             : 
    2559             : /*
    2560             :  * split_page takes a non-compound higher-order page, and splits it into
    2561             :  * n (1<<order) sub-pages: page[0..n-1]
    2562             :  * Each sub-page must be freed individually.
    2563             :  *
    2564             :  * Note: this is probably too low level an operation for use in drivers.
    2565             :  * Please consult with lkml before using this in your driver.
    2566             :  */
    2567           0 : void split_page(struct page *page, unsigned int order)
    2568             : {
    2569             :         int i;
    2570             : 
    2571             :         VM_BUG_ON_PAGE(PageCompound(page), page);
    2572             :         VM_BUG_ON_PAGE(!page_count(page), page);
    2573             : 
    2574           0 :         for (i = 1; i < (1 << order); i++)
    2575           0 :                 set_page_refcounted(page + i);
    2576           0 :         split_page_owner(page, 1 << order);
    2577           0 :         split_page_memcg(page, 1 << order);
    2578           0 : }
    2579             : EXPORT_SYMBOL_GPL(split_page);
    2580             : 
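A sketch of what split_page() leaves behind: a non-compound order-N block becomes 1 << N independently refcounted order-0 pages, so each sub-page can later be freed on its own. struct toy_page below is an illustration-only stand-in for struct page:

#include <stdio.h>

struct toy_page { int refcount; };

static void toy_split_page(struct toy_page *page, unsigned int order)
{
        /* page[0] already holds the original reference. */
        for (unsigned long i = 1; i < (1UL << order); i++)
                page[i].refcount = 1;
}

int main(void)
{
        struct toy_page block[8] = { { .refcount = 1 } };   /* order-3 block */

        toy_split_page(block, 3);
        for (int i = 0; i < 8; i++)
                printf("page[%d].refcount = %d\n", i, block[i].refcount);
        return 0;
}
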
    2581           0 : int __isolate_free_page(struct page *page, unsigned int order)
    2582             : {
    2583           0 :         struct zone *zone = page_zone(page);
    2584           0 :         int mt = get_pageblock_migratetype(page);
    2585             : 
    2586           0 :         if (!is_migrate_isolate(mt)) {
    2587             :                 unsigned long watermark;
    2588             :                 /*
    2589             :                  * Obey watermarks as if the page was being allocated. We can
    2590             :                  * emulate a high-order watermark check with a raised order-0
    2591             :                  * watermark, because we already know our high-order page
    2592             :                  * exists.
    2593             :                  */
    2594           0 :                 watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
    2595           0 :                 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
    2596             :                         return 0;
    2597             : 
    2598           0 :                 __mod_zone_freepage_state(zone, -(1UL << order), mt);
    2599             :         }
    2600             : 
    2601           0 :         del_page_from_free_list(page, zone, order);
    2602             : 
    2603             :         /*
    2604             :          * Set the pageblock if the isolated page is at least half of a
    2605             :          * pageblock
    2606             :          */
    2607           0 :         if (order >= pageblock_order - 1) {
    2608           0 :                 struct page *endpage = page + (1 << order) - 1;
    2609           0 :                 for (; page < endpage; page += pageblock_nr_pages) {
    2610           0 :                         int mt = get_pageblock_migratetype(page);
    2611             :                         /*
    2612             :                          * Only change normal pageblocks (i.e., they can merge
    2613             :                          * with others)
    2614             :                          */
    2615           0 :                         if (migratetype_is_mergeable(mt))
    2616           0 :                                 set_pageblock_migratetype(page,
    2617             :                                                           MIGRATE_MOVABLE);
    2618             :                 }
    2619             :         }
    2620             : 
    2621           0 :         return 1UL << order;
    2622             : }
    2623             : 
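A sketch of the watermark trick above: instead of a full high-order watermark check, raise the order-0 minimum watermark by the size of the page being isolated and test against that, since the high-order page itself is already known to exist. Plain counters stand in for the zone state, and the lowmem_reserve and CMA details are omitted:

#include <stdbool.h>
#include <stdio.h>

static bool may_isolate(unsigned long free_pages, unsigned long min_wmark,
                        unsigned int order)
{
        unsigned long watermark = min_wmark + (1UL << order);

        return free_pages > watermark;  /* leave the reserves intact */
}

int main(void)
{
        printf("%d\n", may_isolate(2000, 1024, 9));   /* 2000 > 1024 + 512 */
        printf("%d\n", may_isolate(1400, 1024, 9));   /* 1400 < 1536: refuse */
        return 0;
}
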
    2624             : /**
    2625             :  * __putback_isolated_page - Return a now-isolated page back where we got it
    2626             :  * @page: Page that was isolated
    2627             :  * @order: Order of the isolated page
    2628             :  * @mt: The page's pageblock's migratetype
    2629             :  *
    2630             :  * This function is meant to return a page pulled from the free lists via
    2631             :  * __isolate_free_page back to the free list it was pulled from.
    2632             :  */
    2633           0 : void __putback_isolated_page(struct page *page, unsigned int order, int mt)
    2634             : {
    2635           0 :         struct zone *zone = page_zone(page);
    2636             : 
    2637             :         /* zone lock should be held when this function is called */
    2638             :         lockdep_assert_held(&zone->lock);
    2639             : 
    2640             :         /* Return isolated page to tail of freelist. */
    2641           0 :         __free_one_page(page, page_to_pfn(page), zone, order, mt,
    2642             :                         FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
    2643           0 : }
    2644             : 
    2645             : /*
    2646             :  * Update NUMA hit/miss statistics
    2647             :  */
    2648             : static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
    2649             :                                    long nr_account)
    2650             : {
    2651             : #ifdef CONFIG_NUMA
    2652             :         enum numa_stat_item local_stat = NUMA_LOCAL;
    2653             : 
    2654             :         /* skip NUMA counter updates if NUMA stats are disabled */
    2655             :         if (!static_branch_likely(&vm_numa_stat_key))
    2656             :                 return;
    2657             : 
    2658             :         if (zone_to_nid(z) != numa_node_id())
    2659             :                 local_stat = NUMA_OTHER;
    2660             : 
    2661             :         if (zone_to_nid(z) == zone_to_nid(preferred_zone))
    2662             :                 __count_numa_events(z, NUMA_HIT, nr_account);
    2663             :         else {
    2664             :                 __count_numa_events(z, NUMA_MISS, nr_account);
    2665             :                 __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account);
    2666             :         }
    2667             :         __count_numa_events(z, local_stat, nr_account);
    2668             : #endif
    2669             : }
    2670             : 
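The NUMA accounting above follows a simple policy: an allocation served by the preferred node counts as NUMA_HIT, otherwise as NUMA_MISS (on the serving node) plus NUMA_FOREIGN (on the preferred node), and in either case as NUMA_LOCAL or NUMA_OTHER depending on whether the serving node is the one the caller runs on. In the sketch below a single counter array stands in for the per-zone vmstat counters, so the per-node attribution is collapsed:

#include <stdio.h>

enum toy_numa_event { TOY_HIT, TOY_MISS, TOY_FOREIGN, TOY_LOCAL, TOY_OTHER, TOY_NR };

static void toy_zone_statistics(long stats[TOY_NR], int preferred_nid,
                                int serving_nid, int running_nid, long nr)
{
        if (serving_nid == preferred_nid) {
                stats[TOY_HIT] += nr;
        } else {
                stats[TOY_MISS] += nr;
                stats[TOY_FOREIGN] += nr;
        }
        stats[serving_nid == running_nid ? TOY_LOCAL : TOY_OTHER] += nr;
}

int main(void)
{
        long stats[TOY_NR] = { 0 };

        /* Preferred node 0 was full; node 1 (where we run) served the page. */
        toy_zone_statistics(stats, 0, 1, 1, 1);
        printf("hit=%ld miss=%ld foreign=%ld local=%ld other=%ld\n",
               stats[TOY_HIT], stats[TOY_MISS], stats[TOY_FOREIGN],
               stats[TOY_LOCAL], stats[TOY_OTHER]);
        return 0;
}
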
    2671             : static __always_inline
    2672             : struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
    2673             :                            unsigned int order, unsigned int alloc_flags,
    2674             :                            int migratetype)
    2675             : {
    2676             :         struct page *page;
    2677             :         unsigned long flags;
    2678             : 
    2679             :         do {
    2680           0 :                 page = NULL;
    2681           0 :                 spin_lock_irqsave(&zone->lock, flags);
    2682             :                 /*
    2683             :                  * An order-0 request can reach here when the pcplist is
    2684             :                  * skipped due to a non-CMA allocation context. The HIGHATOMIC
    2685             :                  * area is reserved for high-order atomic allocations, so an
    2686             :                  * order-0 request should skip it.
    2687             :                  */
    2688           0 :                 if (alloc_flags & ALLOC_HIGHATOMIC)
    2689             :                         page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
    2690           0 :                 if (!page) {
    2691           0 :                         page = __rmqueue(zone, order, migratetype, alloc_flags);
    2692             : 
    2693             :                         /*
    2694             :                          * If the allocation fails, allow OOM handling access
    2695             :                          * to HIGHATOMIC reserves as failing now is worse than
    2696             :                          * failing a high-order atomic allocation in the
    2697             :                          * future.
    2698             :                          */
    2699           0 :                         if (!page && (alloc_flags & ALLOC_OOM))
    2700             :                                 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
    2701             : 
    2702           0 :                         if (!page) {
    2703           0 :                                 spin_unlock_irqrestore(&zone->lock, flags);
    2704             :                                 return NULL;
    2705             :                         }
    2706             :                 }
    2707           0 :                 __mod_zone_freepage_state(zone, -(1 << order),
    2708             :                                           get_pcppage_migratetype(page));
    2709           0 :                 spin_unlock_irqrestore(&zone->lock, flags);
    2710           0 :         } while (check_new_pages(page, order));
    2711             : 
    2712           0 :         __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
    2713             :         zone_statistics(preferred_zone, zone, 1);
    2714             : 
    2715             :         return page;
    2716             : }
    2717             : 
    2718             : /* Remove page from the per-cpu list, caller must protect the list */
    2719             : static inline
    2720         505 : struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
    2721             :                         int migratetype,
    2722             :                         unsigned int alloc_flags,
    2723             :                         struct per_cpu_pages *pcp,
    2724             :                         struct list_head *list)
    2725             : {
    2726             :         struct page *page;
    2727             : 
    2728             :         do {
    2729         505 :                 if (list_empty(list)) {
    2730          27 :                         int batch = READ_ONCE(pcp->batch);
    2731             :                         int alloced;
    2732             : 
    2733             :                         /*
    2734             :                          * Scale batch relative to order if batch implies
    2735             :                          * free pages can be stored on the PCP. Batch can
    2736             :                          * be 1 for small zones or for boot pagesets which
    2737             :                          * should never store free pages as the pages may
    2738             :                          * belong to arbitrary zones.
    2739             :                          */
    2740          27 :                         if (batch > 1)
    2741          15 :                                 batch = max(batch >> order, 2);
    2742          27 :                         alloced = rmqueue_bulk(zone, order,
    2743             :                                         batch, list,
    2744             :                                         migratetype, alloc_flags);
    2745             : 
    2746          27 :                         pcp->count += alloced << order;
    2747          27 :                         if (unlikely(list_empty(list)))
    2748             :                                 return NULL;
    2749             :                 }
    2750             : 
    2751         505 :                 page = list_first_entry(list, struct page, pcp_list);
    2752        1010 :                 list_del(&page->pcp_list);
    2753         505 :                 pcp->count -= 1 << order;
    2754         505 :         } while (check_new_pages(page, order));
    2755             : 
    2756             :         return page;
    2757             : }
    2758             : 
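The batch scaling in the refill path above is worth a worked example: when refilling the pcp list for a high-order request, the order-0 batch size is shrunk by the order so that roughly the same number of base pages is pulled from the buddy allocator, but never fewer than two entries. A minimal sketch:

#include <stdio.h>

static int scaled_batch(int batch, unsigned int order)
{
        if (batch > 1) {
                int shrunk = batch >> order;

                batch = shrunk > 2 ? shrunk : 2;
        }
        return batch;
}

int main(void)
{
        /* batch=63: order 0 -> 63 entries, order 3 -> 7, order 9 -> 2. */
        printf("%d %d %d\n", scaled_batch(63, 0), scaled_batch(63, 3),
               scaled_batch(63, 9));
        return 0;
}
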
    2759             : /* Lock and remove page from the per-cpu list */
    2760         441 : static struct page *rmqueue_pcplist(struct zone *preferred_zone,
    2761             :                         struct zone *zone, unsigned int order,
    2762             :                         int migratetype, unsigned int alloc_flags)
    2763             : {
    2764             :         struct per_cpu_pages *pcp;
    2765             :         struct list_head *list;
    2766             :         struct page *page;
    2767             :         unsigned long __maybe_unused UP_flags;
    2768             : 
    2769             :         /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
    2770         441 :         pcp_trylock_prepare(UP_flags);
    2771         882 :         pcp = pcp_spin_trylock(zone->per_cpu_pageset);
    2772         441 :         if (!pcp) {
    2773           0 :                 pcp_trylock_finish(UP_flags);
    2774             :                 return NULL;
    2775             :         }
    2776             : 
    2777             :         /*
    2778             :          * On allocation, reduce the number of pages that are batch freed.
    2779             :          * See nr_pcp_free() where free_factor is increased for subsequent
    2780             :          * frees.
    2781             :          */
    2782         441 :         pcp->free_factor >>= 1;
    2783         882 :         list = &pcp->lists[order_to_pindex(migratetype, order)];
    2784         441 :         page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
    2785         882 :         pcp_spin_unlock(pcp);
    2786         882 :         pcp_trylock_finish(UP_flags);
    2787         441 :         if (page) {
    2788         882 :                 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
    2789             :                 zone_statistics(preferred_zone, zone, 1);
    2790             :         }
    2791             :         return page;
    2792             : }
    2793             : 
    2794             : /*
    2795             :  * Allocate a page from the given zone.
    2796             :  * Use pcplists for THP or "cheap" high-order allocations.
    2797             :  */
    2798             : 
    2799             : /*
    2800             :  * Do not instrument rmqueue() with KMSAN. This function may call
    2801             :  * __msan_poison_alloca() through a call to set_pfnblock_flags_mask().
    2802             :  * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it
    2803             :  * may call rmqueue() again, which will result in a deadlock.
    2804             :  */
    2805             : __no_sanitize_memory
    2806             : static inline
    2807         441 : struct page *rmqueue(struct zone *preferred_zone,
    2808             :                         struct zone *zone, unsigned int order,
    2809             :                         gfp_t gfp_flags, unsigned int alloc_flags,
    2810             :                         int migratetype)
    2811             : {
    2812             :         struct page *page;
    2813             : 
    2814             :         /*
    2815             :          * We most definitely don't want callers attempting to
    2816             :          * allocate greater than order-1 page units with __GFP_NOFAIL.
    2817             :          */
    2818         441 :         WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
    2819             : 
    2820         441 :         if (likely(pcp_allowed_order(order))) {
    2821             :                 /*
    2822             :                  * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
    2823             :                  * we need to skip it when CMA area isn't allowed.
    2824             :                  */
    2825             :                 if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
    2826             :                                 migratetype != MIGRATE_MOVABLE) {
    2827         441 :                         page = rmqueue_pcplist(preferred_zone, zone, order,
    2828             :                                         migratetype, alloc_flags);
    2829         441 :                         if (likely(page))
    2830             :                                 goto out;
    2831             :                 }
    2832             :         }
    2833             : 
    2834             :         page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
    2835             :                                                         migratetype);
    2836             : 
    2837             : out:
    2838             :         /* Separate test+clear to avoid unnecessary atomics */
    2839         774 :         if ((alloc_flags & ALLOC_KSWAPD) &&
    2840         666 :             unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
    2841           0 :                 clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
    2842           0 :                 wakeup_kswapd(zone, 0, 0, zone_idx(zone));
    2843             :         }
    2844             : 
    2845             :         VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
    2846         441 :         return page;
    2847             : }
    2848             : 
    2849         457 : noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
    2850             : {
    2851         457 :         return __should_fail_alloc_page(gfp_mask, order);
    2852             : }
    2853             : ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
    2854             : 
    2855             : static inline long __zone_watermark_unusable_free(struct zone *z,
    2856             :                                 unsigned int order, unsigned int alloc_flags)
    2857             : {
    2858         458 :         long unusable_free = (1 << order) - 1;
    2859             : 
    2860             :         /*
    2861             :          * If the caller does not have rights to reserves below the min
    2862             :          * watermark then subtract the high-atomic reserves. This will
    2863             :          * over-estimate the size of the atomic reserve but it avoids a search.
    2864             :          */
    2865         458 :         if (likely(!(alloc_flags & ALLOC_RESERVES)))
    2866         458 :                 unusable_free += z->nr_reserved_highatomic;
    2867             : 
    2868             : #ifdef CONFIG_CMA
    2869             :         /* If allocation can't use CMA areas don't use free CMA pages */
    2870             :         if (!(alloc_flags & ALLOC_CMA))
    2871             :                 unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
    2872             : #endif
    2873             : #ifdef CONFIG_UNACCEPTED_MEMORY
    2874             :         unusable_free += zone_page_state(z, NR_UNACCEPTED);
    2875             : #endif
    2876             : 
    2877             :         return unusable_free;
    2878             : }
    2879             : 
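A sketch of the "unusable free" adjustment above: pages that the current request is not allowed to use (the rounding slack below one full page of the requested order, the highatomic reserve for requests without reserve rights, and free CMA pages for non-CMA requests) are subtracted from the zone's free count before the watermark comparison. The boolean flags below stand in for the ALLOC_* bits, and the unaccepted-memory term is omitted:

#include <stdbool.h>
#include <stdio.h>

static long unusable_free(unsigned int order, bool has_reserve_rights,
                          unsigned long nr_highatomic, bool can_use_cma,
                          unsigned long nr_free_cma)
{
        long unusable = (1L << order) - 1;      /* rounding slack */

        if (!has_reserve_rights)
                unusable += nr_highatomic;      /* can't touch the atomic reserve */
        if (!can_use_cma)
                unusable += nr_free_cma;        /* can't touch free CMA pages */
        return unusable;
}

int main(void)
{
        /* Order-3 request without reserve or CMA rights: 7 + 1024 + 2048. */
        printf("%ld\n", unusable_free(3, false, 1024, false, 2048));
        return 0;
}
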
    2880             : /*
    2881             :  * Return true if free base pages are above 'mark'. For high-order checks it
    2882             :  * will return true if the order-0 watermark is reached and there is at least
    2883             :  * one free page of a suitable size. Checking now avoids taking the zone lock
    2884             :  * to check in the allocation paths if no pages are free.
    2885             :  */
    2886         100 : bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
    2887             :                          int highest_zoneidx, unsigned int alloc_flags,
    2888             :                          long free_pages)
    2889             : {
    2890         100 :         long min = mark;
    2891             :         int o;
    2892             : 
    2893             :         /* free_pages may go negative - that's OK */
    2894         200 :         free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
    2895             : 
    2896         100 :         if (unlikely(alloc_flags & ALLOC_RESERVES)) {
    2897             :                 /*
    2898             :                  * __GFP_HIGH allows access to 50% of the min reserve as well
    2899             :                  * as OOM.
    2900             :                  */
    2901           0 :                 if (alloc_flags & ALLOC_MIN_RESERVE) {
    2902           0 :                         min -= min / 2;
    2903             : 
    2904             :                         /*
    2905             :                          * Non-blocking allocations (e.g. GFP_ATOMIC) can
    2906             :                          * access more reserves than just __GFP_HIGH. Other
     2907             :                          * non-blocking allocation requests such as GFP_NOWAIT
    2908             :                          * or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get
    2909             :                          * access to the min reserve.
    2910             :                          */
    2911           0 :                         if (alloc_flags & ALLOC_NON_BLOCK)
    2912           0 :                                 min -= min / 4;
    2913             :                 }
    2914             : 
    2915             :                 /*
     2916             :                  * OOM victims can try even harder than normal reserve
     2917             :                  * users on the grounds that they are definitely going to be in
     2918             :                  * the exit path shortly and will free memory. Any allocation they
     2919             :                  * make during the free path will be small and short-lived.
    2920             :                  */
    2921           0 :                 if (alloc_flags & ALLOC_OOM)
    2922           0 :                         min -= min / 2;
    2923             :         }
    2924             : 
    2925             :         /*
    2926             :          * Check watermarks for an order-0 allocation request. If these
    2927             :          * are not met, then a high-order request also cannot go ahead
    2928             :          * even if a suitable page happened to be free.
    2929             :          */
    2930         100 :         if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
    2931             :                 return false;
    2932             : 
    2933             :         /* If this is an order-0 request then the watermark is fine */
    2934         100 :         if (!order)
    2935             :                 return true;
    2936             : 
    2937             :         /* For a high-order request, check at least one suitable page is free */
    2938         103 :         for (o = order; o <= MAX_ORDER; o++) {
    2939         103 :                 struct free_area *area = &z->free_area[o];
    2940             :                 int mt;
    2941             : 
    2942         103 :                 if (!area->nr_free)
    2943           4 :                         continue;
    2944             : 
    2945          75 :                 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
    2946         174 :                         if (!free_area_empty(area, mt))
    2947             :                                 return true;
    2948             :                 }
    2949             : 
    2950             : #ifdef CONFIG_CMA
    2951             :                 if ((alloc_flags & ALLOC_CMA) &&
    2952             :                     !free_area_empty(area, MIGRATE_CMA)) {
    2953             :                         return true;
    2954             :                 }
    2955             : #endif
    2956           0 :                 if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) &&
    2957           0 :                     !free_area_empty(area, MIGRATE_HIGHATOMIC)) {
    2958             :                         return true;
    2959             :                 }
    2960             :         }
    2961             :         return false;
    2962             : }
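
The reserve handling above composes multiplicatively: ALLOC_MIN_RESERVE (__GFP_HIGH) drops the min watermark by half, ALLOC_NON_BLOCK takes a further quarter off what remains, and ALLOC_OOM halves it again before the order-0 comparison against free pages plus the lowmem reserve. The following standalone sketch models only that arithmetic; the sketch_*/SK_* names and flag values are illustrative stand-ins, not kernel definitions:

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical stand-ins for the kernel's alloc_flags bits. */
    #define SK_ALLOC_MIN_RESERVE    0x1
    #define SK_ALLOC_NON_BLOCK      0x2
    #define SK_ALLOC_OOM            0x4

    /*
     * Order-0 shape of the check above: usable free pages must exceed the
     * discounted min watermark plus the lowmem reserve.
     */
    static bool sketch_watermark_ok(long free_pages, long mark,
                                    long lowmem_reserve, unsigned int flags)
    {
            long min = mark;

            if (flags & SK_ALLOC_MIN_RESERVE) {
                    min -= min / 2;                 /* __GFP_HIGH: 50% of min */
                    if (flags & SK_ALLOC_NON_BLOCK)
                            min -= min / 4;         /* atomic: a further 25% off */
            }
            if (flags & SK_ALLOC_OOM)
                    min -= min / 2;                 /* OOM victim: half again */

            return free_pages > min + lowmem_reserve;
    }

    int main(void)
    {
            /* With mark = 1000: a plain request needs more than 1000 free
             * pages, while an atomic __GFP_HIGH request needs only 376+. */
            printf("%d %d\n",
                   sketch_watermark_ok(400, 1000, 0, 0),
                   sketch_watermark_ok(400, 1000, 0,
                                       SK_ALLOC_MIN_RESERVE | SK_ALLOC_NON_BLOCK));
            return 0;
    }
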
    2963             : 
    2964           0 : bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
    2965             :                       int highest_zoneidx, unsigned int alloc_flags)
    2966             : {
    2967           0 :         return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
    2968           0 :                                         zone_page_state(z, NR_FREE_PAGES));
    2969             : }
    2970             : 
    2971         457 : static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
    2972             :                                 unsigned long mark, int highest_zoneidx,
    2973             :                                 unsigned int alloc_flags, gfp_t gfp_mask)
    2974             : {
    2975             :         long free_pages;
    2976             : 
    2977         457 :         free_pages = zone_page_state(z, NR_FREE_PAGES);
    2978             : 
    2979             :         /*
    2980             :          * Fast check for order-0 only. If this fails then the reserves
    2981             :          * need to be calculated.
    2982             :          */
    2983         457 :         if (!order) {
    2984             :                 long usable_free;
    2985             :                 long reserved;
    2986             : 
    2987         358 :                 usable_free = free_pages;
    2988         716 :                 reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
    2989             : 
     2990             :                 /* reserved may overestimate high-atomic reserves. */
    2991         358 :                 usable_free -= min(usable_free, reserved);
    2992         358 :                 if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
    2993             :                         return true;
    2994             :         }
    2995             : 
    2996          99 :         if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
    2997             :                                         free_pages))
    2998             :                 return true;
    2999             : 
    3000             :         /*
    3001             :          * Ignore watermark boosting for __GFP_HIGH order-0 allocations
    3002             :          * when checking the min watermark. The min watermark is the
    3003             :          * point where boosting is ignored so that kswapd is woken up
    3004             :          * when below the low watermark.
    3005             :          */
    3006           0 :         if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost
    3007             :                 && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
    3008           0 :                 mark = z->_watermark[WMARK_MIN];
    3009           0 :                 return __zone_watermark_ok(z, order, mark, highest_zoneidx,
    3010             :                                         alloc_flags, free_pages);
    3011             :         }
    3012             : 
    3013             :         return false;
    3014             : }
    3015             : 
    3016           1 : bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
    3017             :                         unsigned long mark, int highest_zoneidx)
    3018             : {
    3019           1 :         long free_pages = zone_page_state(z, NR_FREE_PAGES);
    3020             : 
    3021           1 :         if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
    3022           0 :                 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
    3023             : 
    3024           1 :         return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
    3025             :                                                                 free_pages);
    3026             : }
    3027             : 
    3028             : #ifdef CONFIG_NUMA
    3029             : int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
    3030             : 
    3031             : static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
    3032             : {
    3033             :         return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
    3034             :                                 node_reclaim_distance;
    3035             : }
    3036             : #else   /* CONFIG_NUMA */
    3037             : static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
    3038             : {
    3039             :         return true;
    3040             : }
    3041             : #endif  /* CONFIG_NUMA */
    3042             : 
    3043             : /*
    3044             :  * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
    3045             :  * fragmentation is subtle. If the preferred zone was HIGHMEM then
    3046             :  * premature use of a lower zone may cause lowmem pressure problems that
    3047             :  * are worse than fragmentation. If the next zone is ZONE_DMA then it is
    3048             :  * probably too small. It only makes sense to spread allocations to avoid
    3049             :  * fragmentation between the Normal and DMA32 zones.
    3050             :  */
    3051             : static inline unsigned int
    3052             : alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
    3053             : {
    3054             :         unsigned int alloc_flags;
    3055             : 
    3056             :         /*
    3057             :          * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
    3058             :          * to save a branch.
    3059             :          */
    3060         441 :         alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
    3061             : 
    3062             : #ifdef CONFIG_ZONE_DMA32
    3063             :         if (!zone)
    3064             :                 return alloc_flags;
    3065             : 
    3066             :         if (zone_idx(zone) != ZONE_NORMAL)
    3067             :                 return alloc_flags;
    3068             : 
    3069             :         /*
    3070             :          * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
    3071             :          * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
    3072             :          * on UMA that if Normal is populated then so is DMA32.
    3073             :          */
    3074             :         BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
    3075             :         if (nr_online_nodes > 1 && !populated_zone(--zone))
    3076             :                 return alloc_flags;
    3077             : 
    3078             :         alloc_flags |= ALLOC_NOFRAGMENT;
    3079             : #endif /* CONFIG_ZONE_DMA32 */
    3080             :         return alloc_flags;
    3081             : }
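
As a worked illustration of the comment above alloc_flags_nofragment(): spreading allocations for fragmentation avoidance is only attempted when the preferred zone is ZONE_NORMAL and a populated ZONE_DMA32 sits directly below it. The standalone model below captures just that decision; the sk_* names are hypothetical and not part of the kernel:

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative zone indices; only their relative order matters here. */
    enum sk_zone { SK_DMA, SK_DMA32, SK_NORMAL, SK_MOVABLE };

    /*
     * Spreading allocations to dodge fragmentation is only worthwhile
     * between Normal and DMA32: spilling a HIGHMEM-preferred request into
     * lower zones risks lowmem pressure, and ZONE_DMA is usually too small
     * to help.
     */
    static bool sk_nofragment(enum sk_zone preferred, bool dma32_populated)
    {
            return preferred == SK_NORMAL && dma32_populated;
    }

    int main(void)
    {
            printf("normal, dma32 populated: %d\n", sk_nofragment(SK_NORMAL, true));
            printf("movable preferred:       %d\n", sk_nofragment(SK_MOVABLE, true));
            return 0;
    }
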
    3082             : 
    3083             : /* Must be called after current_gfp_context() which can change gfp_mask */
    3084             : static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask,
    3085             :                                                   unsigned int alloc_flags)
    3086             : {
    3087             : #ifdef CONFIG_CMA
    3088             :         if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
    3089             :                 alloc_flags |= ALLOC_CMA;
    3090             : #endif
    3091             :         return alloc_flags;
    3092             : }
    3093             : 
    3094             : /*
    3095             :  * get_page_from_freelist goes through the zonelist trying to allocate
    3096             :  * a page.
    3097             :  */
    3098             : static struct page *
    3099         441 : get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
    3100             :                                                 const struct alloc_context *ac)
    3101             : {
    3102             :         struct zoneref *z;
    3103             :         struct zone *zone;
    3104         441 :         struct pglist_data *last_pgdat = NULL;
    3105         441 :         bool last_pgdat_dirty_ok = false;
    3106             :         bool no_fallback;
    3107             : 
    3108             : retry:
    3109             :         /*
    3110             :          * Scan zonelist, looking for a zone with enough free.
    3111             :          * See also cpuset_node_allowed() comment in kernel/cgroup/cpuset.c.
    3112             :          */
    3113         441 :         no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
    3114         441 :         z = ac->preferred_zoneref;
    3115         441 :         for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
    3116             :                                         ac->nodemask) {
    3117             :                 struct page *page;
    3118             :                 unsigned long mark;
    3119             : 
    3120             :                 if (cpusets_enabled() &&
    3121             :                         (alloc_flags & ALLOC_CPUSET) &&
    3122             :                         !__cpuset_zone_allowed(zone, gfp_mask))
    3123             :                                 continue;
    3124             :                 /*
    3125             :                  * When allocating a page cache page for writing, we
    3126             :                  * want to get it from a node that is within its dirty
    3127             :                  * limit, such that no single node holds more than its
    3128             :                  * proportional share of globally allowed dirty pages.
    3129             :                  * The dirty limits take into account the node's
    3130             :                  * lowmem reserves and high watermark so that kswapd
    3131             :                  * should be able to balance it without having to
    3132             :                  * write pages from its LRU list.
    3133             :                  *
    3134             :                  * XXX: For now, allow allocations to potentially
    3135             :                  * exceed the per-node dirty limit in the slowpath
    3136             :                  * (spread_dirty_pages unset) before going into reclaim,
    3137             :                  * which is important when on a NUMA setup the allowed
    3138             :                  * nodes are together not big enough to reach the
    3139             :                  * global limit.  The proper fix for these situations
    3140             :                  * will require awareness of nodes in the
    3141             :                  * dirty-throttling and the flusher threads.
    3142             :                  */
    3143         441 :                 if (ac->spread_dirty_pages) {
    3144           0 :                         if (last_pgdat != zone->zone_pgdat) {
    3145           0 :                                 last_pgdat = zone->zone_pgdat;
    3146           0 :                                 last_pgdat_dirty_ok = node_dirty_ok(zone->zone_pgdat);
    3147             :                         }
    3148             : 
    3149           0 :                         if (!last_pgdat_dirty_ok)
    3150           0 :                                 continue;
    3151             :                 }
    3152             : 
    3153             :                 if (no_fallback && nr_online_nodes > 1 &&
    3154             :                     zone != ac->preferred_zoneref->zone) {
    3155             :                         int local_nid;
    3156             : 
    3157             :                         /*
    3158             :                          * If moving to a remote node, retry but allow
    3159             :                          * fragmenting fallbacks. Locality is more important
    3160             :                          * than fragmentation avoidance.
    3161             :                          */
    3162             :                         local_nid = zone_to_nid(ac->preferred_zoneref->zone);
    3163             :                         if (zone_to_nid(zone) != local_nid) {
    3164             :                                 alloc_flags &= ~ALLOC_NOFRAGMENT;
    3165             :                                 goto retry;
    3166             :                         }
    3167             :                 }
    3168             : 
    3169         441 :                 mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
    3170         882 :                 if (!zone_watermark_fast(zone, order, mark,
    3171         441 :                                        ac->highest_zoneidx, alloc_flags,
    3172             :                                        gfp_mask)) {
    3173             :                         int ret;
    3174             : 
    3175             :                         if (has_unaccepted_memory()) {
    3176             :                                 if (try_to_accept_memory(zone, order))
    3177             :                                         goto try_this_zone;
    3178             :                         }
    3179             : 
    3180             : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
    3181             :                         /*
    3182             :                          * Watermark failed for this zone, but see if we can
    3183             :                          * grow this zone if it contains deferred pages.
    3184             :                          */
    3185             :                         if (deferred_pages_enabled()) {
    3186             :                                 if (_deferred_grow_zone(zone, order))
    3187             :                                         goto try_this_zone;
    3188             :                         }
    3189             : #endif
    3190             :                         /* Checked here to keep the fast path fast */
    3191             :                         BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
    3192           0 :                         if (alloc_flags & ALLOC_NO_WATERMARKS)
    3193             :                                 goto try_this_zone;
    3194             : 
    3195             :                         if (!node_reclaim_enabled() ||
    3196             :                             !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
    3197           0 :                                 continue;
    3198             : 
    3199             :                         ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
    3200             :                         switch (ret) {
    3201             :                         case NODE_RECLAIM_NOSCAN:
    3202             :                                 /* did not scan */
    3203             :                                 continue;
    3204             :                         case NODE_RECLAIM_FULL:
    3205             :                                 /* scanned but unreclaimable */
    3206             :                                 continue;
    3207             :                         default:
     3208             :                                 /* did we reclaim enough? */
    3209             :                                 if (zone_watermark_ok(zone, order, mark,
    3210             :                                         ac->highest_zoneidx, alloc_flags))
    3211             :                                         goto try_this_zone;
    3212             : 
    3213             :                                 continue;
    3214             :                         }
    3215             :                 }
    3216             : 
    3217             : try_this_zone:
    3218         441 :                 page = rmqueue(ac->preferred_zoneref->zone, zone, order,
    3219             :                                 gfp_mask, alloc_flags, ac->migratetype);
    3220         441 :                 if (page) {
    3221         441 :                         prep_new_page(page, order, gfp_mask, alloc_flags);
    3222             : 
    3223             :                         /*
    3224             :                          * If this is a high-order atomic allocation then check
    3225             :                          * if the pageblock should be reserved for the future
    3226             :                          */
    3227         441 :                         if (unlikely(alloc_flags & ALLOC_HIGHATOMIC))
    3228           0 :                                 reserve_highatomic_pageblock(page, zone, order);
    3229             : 
    3230             :                         return page;
    3231             :                 } else {
    3232             :                         if (has_unaccepted_memory()) {
    3233             :                                 if (try_to_accept_memory(zone, order))
    3234             :                                         goto try_this_zone;
    3235             :                         }
    3236             : 
    3237             : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
    3238             :                         /* Try again if zone has deferred pages */
    3239             :                         if (deferred_pages_enabled()) {
    3240             :                                 if (_deferred_grow_zone(zone, order))
    3241             :                                         goto try_this_zone;
    3242             :                         }
    3243             : #endif
    3244             :                 }
    3245             :         }
    3246             : 
    3247             :         /*
    3248             :          * It's possible on a UMA machine to get through all zones that are
    3249             :          * fragmented. If avoiding fragmentation, reset and try again.
    3250             :          */
    3251             :         if (no_fallback) {
    3252             :                 alloc_flags &= ~ALLOC_NOFRAGMENT;
    3253             :                 goto retry;
    3254             :         }
    3255             : 
    3256             :         return NULL;
    3257             : }
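
get_page_from_freelist() above follows a two-pass shape: scan the zonelist honouring the optional ALLOC_NOFRAGMENT constraint, and if nothing qualifies, clear the constraint and rescan. A minimal standalone sketch of that pattern over a hypothetical zone array (the sk_* names are illustrative; this is not the kernel's zonelist machinery):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct sk_zone { const char *name; bool fragmented; long free; };

    /*
     * Pass 1 prefers unfragmented zones; pass 2 takes anything with enough
     * free pages, mirroring the "clear ALLOC_NOFRAGMENT and retry" path.
     */
    static const struct sk_zone *sk_pick(const struct sk_zone *zones, size_t n,
                                         long want)
    {
            for (int strict = 1; strict >= 0; strict--) {
                    for (size_t i = 0; i < n; i++) {
                            if (strict && zones[i].fragmented)
                                    continue;
                            if (zones[i].free >= want)
                                    return &zones[i];
                    }
            }
            return NULL;
    }

    int main(void)
    {
            struct sk_zone zones[] = {
                    { "Normal", true,  64 },        /* fragmented but has pages */
                    { "DMA32",  false,  0 },        /* clean but empty */
            };
            const struct sk_zone *z = sk_pick(zones, 2, 16);

            printf("picked: %s\n", z ? z->name : "none"); /* falls back to Normal */
            return 0;
    }
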
    3258             : 
    3259           0 : static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
    3260             : {
    3261           0 :         unsigned int filter = SHOW_MEM_FILTER_NODES;
    3262             : 
    3263             :         /*
    3264             :          * This documents exceptions given to allocations in certain
    3265             :          * contexts that are allowed to allocate outside current's set
    3266             :          * of allowed nodes.
    3267             :          */
    3268           0 :         if (!(gfp_mask & __GFP_NOMEMALLOC))
    3269           0 :                 if (tsk_is_oom_victim(current) ||
    3270           0 :                     (current->flags & (PF_MEMALLOC | PF_EXITING)))
    3271             :                         filter &= ~SHOW_MEM_FILTER_NODES;
    3272           0 :         if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
    3273           0 :                 filter &= ~SHOW_MEM_FILTER_NODES;
    3274             : 
    3275           0 :         __show_mem(filter, nodemask, gfp_zone(gfp_mask));
    3276           0 : }
    3277             : 
    3278           0 : void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
    3279             : {
    3280             :         struct va_format vaf;
    3281             :         va_list args;
    3282             :         static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
    3283             : 
    3284           0 :         if ((gfp_mask & __GFP_NOWARN) ||
    3285           0 :              !__ratelimit(&nopage_rs) ||
    3286           0 :              ((gfp_mask & __GFP_DMA) && !has_managed_dma()))
    3287           0 :                 return;
    3288             : 
    3289           0 :         va_start(args, fmt);
    3290           0 :         vaf.fmt = fmt;
    3291           0 :         vaf.va = &args;
    3292           0 :         pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
    3293             :                         current->comm, &vaf, gfp_mask, &gfp_mask,
    3294             :                         nodemask_pr_args(nodemask));
    3295           0 :         va_end(args);
    3296             : 
    3297             :         cpuset_print_current_mems_allowed();
    3298           0 :         pr_cont("\n");
    3299           0 :         dump_stack();
    3300           0 :         warn_alloc_show_mem(gfp_mask, nodemask);
    3301             : }
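
warn_alloc() above is rate-limited (DEFINE_RATELIMIT_STATE with a 10*HZ interval) so repeated allocation failures cannot flood the log. A userspace sketch of the same idea, assuming CLOCK_MONOTONIC and an illustrative 10-second window rather than the kernel's __ratelimit():

    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    /*
     * Allow at most one message per 'interval' seconds; later callers in
     * the same window are silently dropped, like a tripped __ratelimit().
     */
    static bool sk_ratelimit(double interval)
    {
            static double last = -1e18;
            struct timespec ts;
            double now;

            clock_gettime(CLOCK_MONOTONIC, &ts);
            now = ts.tv_sec + ts.tv_nsec / 1e9;
            if (now - last < interval)
                    return false;
            last = now;
            return true;
    }

    int main(void)
    {
            for (int i = 0; i < 3; i++)
                    if (sk_ratelimit(10.0))
                            printf("page allocation failure (attempt %d)\n", i);
            /* Only the first attempt prints; the other two are rate-limited. */
            return 0;
    }
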
    3302             : 
    3303             : static inline struct page *
    3304           0 : __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
    3305             :                               unsigned int alloc_flags,
    3306             :                               const struct alloc_context *ac)
    3307             : {
    3308             :         struct page *page;
    3309             : 
    3310           0 :         page = get_page_from_freelist(gfp_mask, order,
    3311           0 :                         alloc_flags|ALLOC_CPUSET, ac);
    3312             :         /*
    3313             :          * fallback to ignore cpuset restriction if our nodes
    3314             :          * are depleted
    3315             :          */
    3316           0 :         if (!page)
    3317           0 :                 page = get_page_from_freelist(gfp_mask, order,
    3318             :                                 alloc_flags, ac);
    3319             : 
    3320           0 :         return page;
    3321             : }
    3322             : 
    3323             : static inline struct page *
    3324           0 : __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
    3325             :         const struct alloc_context *ac, unsigned long *did_some_progress)
    3326             : {
    3327           0 :         struct oom_control oc = {
    3328           0 :                 .zonelist = ac->zonelist,
    3329           0 :                 .nodemask = ac->nodemask,
    3330             :                 .memcg = NULL,
    3331             :                 .gfp_mask = gfp_mask,
    3332             :                 .order = order,
    3333             :         };
    3334             :         struct page *page;
    3335             : 
    3336           0 :         *did_some_progress = 0;
    3337             : 
    3338             :         /*
    3339             :          * Acquire the oom lock.  If that fails, somebody else is
    3340             :          * making progress for us.
    3341             :          */
    3342           0 :         if (!mutex_trylock(&oom_lock)) {
    3343           0 :                 *did_some_progress = 1;
    3344           0 :                 schedule_timeout_uninterruptible(1);
    3345           0 :                 return NULL;
    3346             :         }
    3347             : 
    3348             :         /*
    3349             :          * Go through the zonelist yet one more time, keep very high watermark
    3350             :          * here, this is only to catch a parallel oom killing, we must fail if
     3351             :  * we're still under heavy pressure. But make sure that this reclaim
     3352             :  * attempt does not depend on a __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
     3353             :  * allocation, which would never fail while oom_lock is already held.
    3354             :          */
    3355           0 :         page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
    3356             :                                       ~__GFP_DIRECT_RECLAIM, order,
    3357             :                                       ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
    3358           0 :         if (page)
    3359             :                 goto out;
    3360             : 
    3361             :         /* Coredumps can quickly deplete all memory reserves */
    3362           0 :         if (current->flags & PF_DUMPCORE)
    3363             :                 goto out;
    3364             :         /* The OOM killer will not help higher order allocs */
    3365           0 :         if (order > PAGE_ALLOC_COSTLY_ORDER)
    3366             :                 goto out;
    3367             :         /*
    3368             :          * We have already exhausted all our reclaim opportunities without any
    3369             :          * success so it is time to admit defeat. We will skip the OOM killer
    3370             :          * because it is very likely that the caller has a more reasonable
    3371             :          * fallback than shooting a random task.
    3372             :          *
    3373             :          * The OOM killer may not free memory on a specific node.
    3374             :          */
    3375           0 :         if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
    3376             :                 goto out;
    3377             :         /* The OOM killer does not needlessly kill tasks for lowmem */
    3378             :         if (ac->highest_zoneidx < ZONE_NORMAL)
    3379             :                 goto out;
    3380           0 :         if (pm_suspended_storage())
    3381             :                 goto out;
    3382             :         /*
    3383             :          * XXX: GFP_NOFS allocations should rather fail than rely on
    3384             :          * other request to make a forward progress.
    3385             :          * We are in an unfortunate situation where out_of_memory cannot
    3386             :          * do much for this context but let's try it to at least get
    3387             :          * access to memory reserved if the current task is killed (see
    3388             :          * out_of_memory). Once filesystems are ready to handle allocation
    3389             :          * failures more gracefully we should just bail out here.
    3390             :          */
    3391             : 
    3392             :         /* Exhausted what can be done so it's blame time */
    3393           0 :         if (out_of_memory(&oc) ||
    3394           0 :             WARN_ON_ONCE_GFP(gfp_mask & __GFP_NOFAIL, gfp_mask)) {
    3395           0 :                 *did_some_progress = 1;
    3396             : 
    3397             :                 /*
    3398             :                  * Help non-failing allocations by giving them access to memory
    3399             :                  * reserves
    3400             :                  */
    3401           0 :                 if (gfp_mask & __GFP_NOFAIL)
    3402           0 :                         page = __alloc_pages_cpuset_fallback(gfp_mask, order,
    3403             :                                         ALLOC_NO_WATERMARKS, ac);
    3404             :         }
    3405             : out:
    3406           0 :         mutex_unlock(&oom_lock);
    3407           0 :         return page;
    3408             : }
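
__alloc_pages_may_oom() above serializes OOM handling with a trylock: if another task already holds oom_lock, the caller simply naps and reports progress instead of queuing a second kill. A standalone pthread sketch of that trylock-or-back-off shape, with hypothetical sk_* names standing in for the kernel's oom path:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t sk_oom_lock = PTHREAD_MUTEX_INITIALIZER;

    /*
     * Returns true if this caller performed the expensive recovery step,
     * false if someone else was already doing it and we only backed off.
     */
    static bool sk_try_recover(void)
    {
            if (pthread_mutex_trylock(&sk_oom_lock) != 0) {
                    /* Somebody else is making progress for us: just wait. */
                    usleep(1000);
                    return false;
            }
            puts("performing recovery (stand-in for the OOM kill)");
            pthread_mutex_unlock(&sk_oom_lock);
            return true;
    }

    int main(void)
    {
            printf("recovered: %d\n", sk_try_recover());
            return 0;
    }
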
    3409             : 
    3410             : /*
     3411             :  * Maximum number of compaction retries with progress before the OOM
     3412             :  * killer is considered the only way to move forward.
    3413             :  */
    3414             : #define MAX_COMPACT_RETRIES 16
    3415             : 
    3416             : #ifdef CONFIG_COMPACTION
    3417             : /* Try memory compaction for high-order allocations before reclaim */
    3418             : static struct page *
    3419           0 : __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
    3420             :                 unsigned int alloc_flags, const struct alloc_context *ac,
    3421             :                 enum compact_priority prio, enum compact_result *compact_result)
    3422             : {
    3423           0 :         struct page *page = NULL;
    3424             :         unsigned long pflags;
    3425             :         unsigned int noreclaim_flag;
    3426             : 
    3427           0 :         if (!order)
    3428             :                 return NULL;
    3429             : 
    3430           0 :         psi_memstall_enter(&pflags);
    3431             :         delayacct_compact_start();
    3432           0 :         noreclaim_flag = memalloc_noreclaim_save();
    3433             : 
    3434           0 :         *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
    3435             :                                                                 prio, &page);
    3436             : 
    3437           0 :         memalloc_noreclaim_restore(noreclaim_flag);
    3438           0 :         psi_memstall_leave(&pflags);
    3439             :         delayacct_compact_end();
    3440             : 
    3441           0 :         if (*compact_result == COMPACT_SKIPPED)
    3442             :                 return NULL;
    3443             :         /*
    3444             :          * At least in one zone compaction wasn't deferred or skipped, so let's
    3445             :          * count a compaction stall
    3446             :          */
    3447           0 :         count_vm_event(COMPACTSTALL);
    3448             : 
    3449             :         /* Prep a captured page if available */
    3450           0 :         if (page)
    3451           0 :                 prep_new_page(page, order, gfp_mask, alloc_flags);
    3452             : 
    3453             :         /* Try get a page from the freelist if available */
    3454           0 :         if (!page)
    3455           0 :                 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
    3456             : 
    3457           0 :         if (page) {
    3458           0 :                 struct zone *zone = page_zone(page);
    3459             : 
    3460           0 :                 zone->compact_blockskip_flush = false;
    3461           0 :                 compaction_defer_reset(zone, order, true);
    3462           0 :                 count_vm_event(COMPACTSUCCESS);
    3463           0 :                 return page;
    3464             :         }
    3465             : 
    3466             :         /*
     3467             :          * It's bad if a compaction run occurs and fails. The most likely reason
    3468             :          * is that pages exist, but not enough to satisfy watermarks.
    3469             :          */
    3470           0 :         count_vm_event(COMPACTFAIL);
    3471             : 
    3472           0 :         cond_resched();
    3473             : 
    3474           0 :         return NULL;
    3475             : }
    3476             : 
    3477             : static inline bool
    3478           0 : should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
    3479             :                      enum compact_result compact_result,
    3480             :                      enum compact_priority *compact_priority,
    3481             :                      int *compaction_retries)
    3482             : {
    3483           0 :         int max_retries = MAX_COMPACT_RETRIES;
    3484             :         int min_priority;
    3485           0 :         bool ret = false;
    3486           0 :         int retries = *compaction_retries;
    3487           0 :         enum compact_priority priority = *compact_priority;
    3488             : 
    3489           0 :         if (!order)
    3490             :                 return false;
    3491             : 
    3492           0 :         if (fatal_signal_pending(current))
    3493             :                 return false;
    3494             : 
    3495             :         /*
    3496             :          * Compaction was skipped due to a lack of free order-0
    3497             :          * migration targets. Continue if reclaim can help.
    3498             :          */
    3499           0 :         if (compact_result == COMPACT_SKIPPED) {
    3500           0 :                 ret = compaction_zonelist_suitable(ac, order, alloc_flags);
    3501           0 :                 goto out;
    3502             :         }
    3503             : 
    3504             :         /*
    3505             :          * Compaction managed to coalesce some page blocks, but the
    3506             :          * allocation failed presumably due to a race. Retry some.
    3507             :          */
    3508           0 :         if (compact_result == COMPACT_SUCCESS) {
    3509             :                 /*
    3510             :                  * !costly requests are much more important than
    3511             :                  * __GFP_RETRY_MAYFAIL costly ones because they are de
    3512             :                  * facto nofail and invoke OOM killer to move on while
    3513             :                  * costly can fail and users are ready to cope with
    3514             :                  * that. 1/4 retries is rather arbitrary but we would
    3515             :                  * need much more detailed feedback from compaction to
    3516             :                  * make a better decision.
    3517             :                  */
    3518           0 :                 if (order > PAGE_ALLOC_COSTLY_ORDER)
    3519           0 :                         max_retries /= 4;
    3520             : 
    3521           0 :                 if (++(*compaction_retries) <= max_retries) {
    3522             :                         ret = true;
    3523             :                         goto out;
    3524             :                 }
    3525             :         }
    3526             : 
    3527             :         /*
    3528             :          * Compaction failed. Retry with increasing priority.
    3529             :          */
    3530           0 :         min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
    3531           0 :                         MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
    3532             : 
    3533           0 :         if (*compact_priority > min_priority) {
    3534           0 :                 (*compact_priority)--;
    3535           0 :                 *compaction_retries = 0;
    3536           0 :                 ret = true;
    3537             :         }
    3538             : out:
    3539           0 :         trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
    3540           0 :         return ret;
    3541             : }
    3542             : #else
    3543             : static inline struct page *
    3544             : __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
    3545             :                 unsigned int alloc_flags, const struct alloc_context *ac,
    3546             :                 enum compact_priority prio, enum compact_result *compact_result)
    3547             : {
    3548             :         *compact_result = COMPACT_SKIPPED;
    3549             :         return NULL;
    3550             : }
    3551             : 
    3552             : static inline bool
    3553             : should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
    3554             :                      enum compact_result compact_result,
    3555             :                      enum compact_priority *compact_priority,
    3556             :                      int *compaction_retries)
    3557             : {
    3558             :         struct zone *zone;
    3559             :         struct zoneref *z;
    3560             : 
    3561             :         if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
    3562             :                 return false;
    3563             : 
    3564             :         /*
    3565             :          * There are setups with compaction disabled which would prefer to loop
    3566             :          * inside the allocator rather than hit the oom killer prematurely.
     3567             :  * Let's give them some hope and keep retrying while the order-0
    3568             :          * watermarks are OK.
    3569             :          */
    3570             :         for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
    3571             :                                 ac->highest_zoneidx, ac->nodemask) {
    3572             :                 if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
    3573             :                                         ac->highest_zoneidx, alloc_flags))
    3574             :                         return true;
    3575             :         }
    3576             :         return false;
    3577             : }
    3578             : #endif /* CONFIG_COMPACTION */
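
The CONFIG_COMPACTION version of should_compact_retry() above keeps a shrinking retry budget: costly orders get only a quarter of MAX_COMPACT_RETRIES, and once the budget is spent the compaction priority is escalated and the counter reset. The standalone model below reproduces just that bookkeeping with illustrative constants; it is not the kernel's compact_priority handling:

    #include <stdbool.h>
    #include <stdio.h>

    #define SK_MAX_RETRIES          16
    #define SK_COSTLY_ORDER         3
    #define SK_MIN_PRIORITY         0       /* illustrative: 0 = most aggressive */

    /* Returns true if another compaction attempt is still worthwhile. */
    static bool sk_should_retry(int order, int *retries, int *priority)
    {
            int max_retries = SK_MAX_RETRIES;

            if (order > SK_COSTLY_ORDER)
                    max_retries /= 4;               /* costly orders get 1/4 budget */

            if (++(*retries) <= max_retries)
                    return true;

            if (*priority > SK_MIN_PRIORITY) {      /* escalate and start over */
                    (*priority)--;
                    *retries = 0;
                    return true;
            }
            return false;
    }

    int main(void)
    {
            int retries = 0, priority = 2, attempts = 0;

            while (sk_should_retry(9, &retries, &priority))
                    attempts++;
            /* 4 retries per level across 3 priority levels, plus the two
             * escalation calls that also return true: 14 attempts. */
            printf("attempts before giving up: %d\n", attempts);
            return 0;
    }
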
    3579             : 
    3580             : #ifdef CONFIG_LOCKDEP
    3581             : static struct lockdep_map __fs_reclaim_map =
    3582             :         STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
    3583             : 
    3584             : static bool __need_reclaim(gfp_t gfp_mask)
    3585             : {
    3586             :         /* no reclaim without waiting on it */
    3587             :         if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
    3588             :                 return false;
    3589             : 
    3590             :         /* this guy won't enter reclaim */
    3591             :         if (current->flags & PF_MEMALLOC)
    3592             :                 return false;
    3593             : 
    3594             :         if (gfp_mask & __GFP_NOLOCKDEP)
    3595             :                 return false;
    3596             : 
    3597             :         return true;
    3598             : }
    3599             : 
    3600             : void __fs_reclaim_acquire(unsigned long ip)
    3601             : {
    3602             :         lock_acquire_exclusive(&__fs_reclaim_map, 0, 0, NULL, ip);
    3603             : }
    3604             : 
    3605             : void __fs_reclaim_release(unsigned long ip)
    3606             : {
    3607             :         lock_release(&__fs_reclaim_map, ip);
    3608             : }
    3609             : 
    3610             : void fs_reclaim_acquire(gfp_t gfp_mask)
    3611             : {
    3612             :         gfp_mask = current_gfp_context(gfp_mask);
    3613             : 
    3614             :         if (__need_reclaim(gfp_mask)) {
    3615             :                 if (gfp_mask & __GFP_FS)
    3616             :                         __fs_reclaim_acquire(_RET_IP_);
    3617             : 
    3618             : #ifdef CONFIG_MMU_NOTIFIER
    3619             :                 lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
    3620             :                 lock_map_release(&__mmu_notifier_invalidate_range_start_map);
    3621             : #endif
    3622             : 
    3623             :         }
    3624             : }
    3625             : EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
    3626             : 
    3627             : void fs_reclaim_release(gfp_t gfp_mask)
    3628             : {
    3629             :         gfp_mask = current_gfp_context(gfp_mask);
    3630             : 
    3631             :         if (__need_reclaim(gfp_mask)) {
    3632             :                 if (gfp_mask & __GFP_FS)
    3633             :                         __fs_reclaim_release(_RET_IP_);
    3634             :         }
    3635             : }
    3636             : EXPORT_SYMBOL_GPL(fs_reclaim_release);
    3637             : #endif
    3638             : 
    3639             : /*
    3640             :  * Zonelists may change due to hotplug during allocation. Detect when zonelists
    3641             :  * have been rebuilt so allocation retries. Reader side does not lock and
     3642             :  * have been rebuilt so allocations can be retried. Reader side does not lock and
    3643             :  * embedded spin_lock.
    3644             :  */
    3645             : static DEFINE_SEQLOCK(zonelist_update_seq);
    3646             : 
    3647             : static unsigned int zonelist_iter_begin(void)
    3648             : {
    3649             :         if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
    3650             :                 return read_seqbegin(&zonelist_update_seq);
    3651             : 
    3652             :         return 0;
    3653             : }
    3654             : 
    3655             : static unsigned int check_retry_zonelist(unsigned int seq)
    3656             : {
    3657             :         if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
    3658             :                 return read_seqretry(&zonelist_update_seq, seq);
    3659             : 
    3660             :         return seq;
    3661             : }
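
zonelist_iter_begin()/check_retry_zonelist() above are the reader side of a seqlock: sample the sequence count, do the work, and redo it if a writer bumped the count in the meantime. Below is a minimal single-threaded sketch of that read-retry protocol in C11 atomics; the sk_* names are hypothetical, and a real multi-threaded seqlock additionally needs acquire/release fences around the data access:

    #include <stdatomic.h>
    #include <stdio.h>

    /*
     * Writers make the count odd while updating and even when done;
     * readers retry if the count changed (or was odd) during the read.
     */
    static atomic_uint sk_seq;
    static int sk_data;

    static unsigned int sk_read_begin(void)
    {
            unsigned int s;

            while ((s = atomic_load(&sk_seq)) & 1)
                    ;                               /* writer in progress: spin */
            return s;
    }

    static int sk_read_retry(unsigned int s)
    {
            return atomic_load(&sk_seq) != s;
    }

    static void sk_write(int v)
    {
            atomic_fetch_add(&sk_seq, 1);           /* odd: update in progress */
            sk_data = v;
            atomic_fetch_add(&sk_seq, 1);           /* even again: update done */
    }

    int main(void)
    {
            unsigned int seq;
            int snapshot;

            sk_write(42);
            do {
                    seq = sk_read_begin();
                    snapshot = sk_data;             /* the "allocation attempt" */
            } while (sk_read_retry(seq));
            printf("read %d\n", snapshot);
            return 0;
    }
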
    3662             : 
    3663             : /* Perform direct synchronous page reclaim */
    3664             : static unsigned long
    3665           0 : __perform_reclaim(gfp_t gfp_mask, unsigned int order,
    3666             :                                         const struct alloc_context *ac)
    3667             : {
    3668             :         unsigned int noreclaim_flag;
    3669             :         unsigned long progress;
    3670             : 
    3671           0 :         cond_resched();
    3672             : 
    3673             :         /* We now go into synchronous reclaim */
    3674             :         cpuset_memory_pressure_bump();
    3675           0 :         fs_reclaim_acquire(gfp_mask);
    3676           0 :         noreclaim_flag = memalloc_noreclaim_save();
    3677             : 
    3678           0 :         progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
    3679             :                                                                 ac->nodemask);
    3680             : 
    3681           0 :         memalloc_noreclaim_restore(noreclaim_flag);
    3682           0 :         fs_reclaim_release(gfp_mask);
    3683             : 
    3684           0 :         cond_resched();
    3685             : 
    3686           0 :         return progress;
    3687             : }
    3688             : 
    3689             : /* The really slow allocator path where we enter direct reclaim */
    3690             : static inline struct page *
    3691           0 : __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
    3692             :                 unsigned int alloc_flags, const struct alloc_context *ac,
    3693             :                 unsigned long *did_some_progress)
    3694             : {
    3695           0 :         struct page *page = NULL;
    3696             :         unsigned long pflags;
    3697           0 :         bool drained = false;
    3698             : 
    3699           0 :         psi_memstall_enter(&pflags);
    3700           0 :         *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
    3701           0 :         if (unlikely(!(*did_some_progress)))
    3702             :                 goto out;
    3703             : 
    3704             : retry:
    3705           0 :         page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
    3706             : 
    3707             :         /*
    3708             :          * If an allocation failed after direct reclaim, it could be because
    3709             :          * pages are pinned on the per-cpu lists or in high alloc reserves.
    3710             :          * Shrink them and try again
    3711             :          */
    3712           0 :         if (!page && !drained) {
    3713           0 :                 unreserve_highatomic_pageblock(ac, false);
    3714           0 :                 drain_all_pages(NULL);
    3715           0 :                 drained = true;
    3716           0 :                 goto retry;
    3717             :         }
    3718             : out:
    3719           0 :         psi_memstall_leave(&pflags);
    3720             : 
    3721           0 :         return page;
    3722             : }
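
The direct-reclaim path above retries exactly once after draining the per-cpu lists, on the theory that the pages it freed may be parked in pcp caches rather than on the buddy freelists. A tiny standalone sketch of that drain-once-then-retry shape with stub helpers (purely illustrative, not the kernel's drain_all_pages()):

    #include <stdbool.h>
    #include <stdio.h>

    /* Stubs standing in for the buddy freelist and the per-cpu caches. */
    static int sk_freelist;
    static int sk_pcp_cached = 8;

    static bool sk_take_from_freelist(void)
    {
            if (sk_freelist > 0) {
                    sk_freelist--;
                    return true;
            }
            return false;
    }

    static void sk_drain_pcp(void)
    {
            sk_freelist += sk_pcp_cached;           /* flush cached pages back */
            sk_pcp_cached = 0;
    }

    /* Try once; on failure drain the caches a single time and retry. */
    static bool sk_alloc_after_reclaim(void)
    {
            bool drained = false;
    retry:
            if (sk_take_from_freelist())
                    return true;
            if (!drained) {
                    sk_drain_pcp();
                    drained = true;
                    goto retry;
            }
            return false;
    }

    int main(void)
    {
            printf("allocated: %d\n", sk_alloc_after_reclaim()); /* prints 1 */
            return 0;
    }
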
    3723             : 
    3724           0 : static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
    3725             :                              const struct alloc_context *ac)
    3726             : {
    3727             :         struct zoneref *z;
    3728             :         struct zone *zone;
    3729           0 :         pg_data_t *last_pgdat = NULL;
    3730           0 :         enum zone_type highest_zoneidx = ac->highest_zoneidx;
    3731             : 
    3732           0 :         for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
    3733             :                                         ac->nodemask) {
    3734           0 :                 if (!managed_zone(zone))
    3735           0 :                         continue;
    3736           0 :                 if (last_pgdat != zone->zone_pgdat) {
    3737           0 :                         wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
    3738           0 :                         last_pgdat = zone->zone_pgdat;
    3739             :                 }
    3740             :         }
    3741           0 : }
    3742             : 
    3743             : static inline unsigned int
    3744           0 : gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
    3745             : {
    3746           0 :         unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
    3747             : 
    3748             :         /*
    3749             :          * __GFP_HIGH is assumed to be the same as ALLOC_MIN_RESERVE
    3750             :          * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
    3751             :          * to save two branches.
    3752             :          */
    3753             :         BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_MIN_RESERVE);
    3754             :         BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
    3755             : 
    3756             :         /*
    3757             :          * The caller may dip into page reserves a bit more if the caller
    3758             :          * cannot run direct reclaim, or if the caller has realtime scheduling
    3759             :          * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
    3760             :          * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH).
    3761             :          */
    3762           0 :         alloc_flags |= (__force int)
    3763             :                 (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
    3764             : 
    3765           0 :         if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
    3766             :                 /*
    3767             :                  * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
    3768             :                  * if it can't schedule.
    3769             :                  */
    3770           0 :                 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
    3771           0 :                         alloc_flags |= ALLOC_NON_BLOCK;
    3772             : 
    3773           0 :                         if (order > 0)
    3774           0 :                                 alloc_flags |= ALLOC_HIGHATOMIC;
    3775             :                 }
    3776             : 
    3777             :                 /*
    3778             :                  * Ignore cpuset mems for non-blocking __GFP_HIGH (probably
    3779             :                  * GFP_ATOMIC) rather than fail, see the comment for
    3780             :                  * cpuset_node_allowed().
    3781             :                  */
    3782           0 :                 if (alloc_flags & ALLOC_MIN_RESERVE)
    3783           0 :                         alloc_flags &= ~ALLOC_CPUSET;
    3784           0 :         } else if (unlikely(rt_task(current)) && in_task())
    3785           0 :                 alloc_flags |= ALLOC_MIN_RESERVE;
    3786             : 
    3787           0 :         alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
    3788             : 
    3789           0 :         return alloc_flags;
    3790             : }
    3791             : 
    3792             : static bool oom_reserves_allowed(struct task_struct *tsk)
    3793             : {
    3794           0 :         if (!tsk_is_oom_victim(tsk))
    3795             :                 return false;
    3796             : 
    3797             :         /*
    3798             :          * !MMU doesn't have oom reaper so give access to memory reserves
    3799             :          * only to the thread with TIF_MEMDIE set
    3800             :          */
    3801             :         if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
    3802             :                 return false;
    3803             : 
    3804             :         return true;
    3805             : }
    3806             : 
    3807             : /*
    3808             :  * Distinguish requests which really need access to full memory
    3809             :  * reserves from oom victims which can live with a portion of it
    3810             :  */
    3811           0 : static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
    3812             : {
    3813           0 :         if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
    3814             :                 return 0;
    3815           0 :         if (gfp_mask & __GFP_MEMALLOC)
    3816             :                 return ALLOC_NO_WATERMARKS;
    3817           0 :         if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
    3818             :                 return ALLOC_NO_WATERMARKS;
    3819           0 :         if (!in_interrupt()) {
    3820           0 :                 if (current->flags & PF_MEMALLOC)
    3821             :                         return ALLOC_NO_WATERMARKS;
    3822           0 :                 else if (oom_reserves_allowed(current))
    3823             :                         return ALLOC_OOM;
    3824             :         }
    3825             : 
    3826             :         return 0;
    3827             : }
    3828             : 
    3829           0 : bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
    3830             : {
    3831           0 :         return !!__gfp_pfmemalloc_flags(gfp_mask);
    3832             : }
    3833             : 
    3834             : /*
    3835             :  * Checks whether it makes sense to retry the reclaim to make a forward progress
    3836             :  * for the given allocation request.
    3837             :  *
    3838             :  * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
    3839             :  * without success, or when we couldn't even meet the watermark if we
    3840             :  * reclaimed all remaining pages on the LRU lists.
    3841             :  *
    3842             :  * Returns true if a retry is viable or false to enter the oom path.
    3843             :  */
    3844             : static inline bool
    3845           0 : should_reclaim_retry(gfp_t gfp_mask, unsigned order,
    3846             :                      struct alloc_context *ac, int alloc_flags,
    3847             :                      bool did_some_progress, int *no_progress_loops)
    3848             : {
    3849             :         struct zone *zone;
    3850             :         struct zoneref *z;
    3851           0 :         bool ret = false;
    3852             : 
    3853             :         /*
     3854             :          * Costly allocations might have made progress but this doesn't mean
    3855             :          * their order will become available due to high fragmentation so
    3856             :          * always increment the no progress counter for them
    3857             :          */
    3858           0 :         if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
    3859           0 :                 *no_progress_loops = 0;
    3860             :         else
    3861           0 :                 (*no_progress_loops)++;
    3862             : 
    3863             :         /*
    3864             :          * Make sure we converge to OOM if we cannot make any progress
    3865             :          * several times in a row.
    3866             :          */
    3867           0 :         if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
    3868             :                 /* Before OOM, exhaust highatomic_reserve */
    3869           0 :                 return unreserve_highatomic_pageblock(ac, true);
    3870             :         }
    3871             : 
    3872             :         /*
    3873             :          * Keep reclaiming pages while there is a chance this will lead
    3874             :          * somewhere.  If none of the target zones can satisfy our allocation
    3875             :          * request, even with all reclaimable pages considered, then we are
    3876             :          * screwed and have to go OOM.
    3877             :          */
    3878           0 :         for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
    3879             :                                 ac->highest_zoneidx, ac->nodemask) {
    3880             :                 unsigned long available;
    3881             :                 unsigned long reclaimable;
    3882           0 :                 unsigned long min_wmark = min_wmark_pages(zone);
    3883             :                 bool wmark;
    3884             : 
    3885           0 :                 available = reclaimable = zone_reclaimable_pages(zone);
    3886           0 :                 available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
    3887             : 
    3888             :                 /*
    3889             :                  * Would the allocation succeed if we reclaimed all
    3890             :                  * reclaimable pages?
    3891             :                  */
    3892           0 :                 wmark = __zone_watermark_ok(zone, order, min_wmark,
    3893           0 :                                 ac->highest_zoneidx, alloc_flags, available);
    3894           0 :                 trace_reclaim_retry_zone(z, order, reclaimable,
    3895             :                                 available, min_wmark, *no_progress_loops, wmark);
    3896           0 :                 if (wmark) {
    3897             :                         ret = true;
    3898             :                         break;
    3899             :                 }
    3900             :         }
    3901             : 
    3902             :         /*
    3903             :          * Memory allocation/reclaim might be called from a WQ context and the
    3904             :          * current implementation of the WQ concurrency control doesn't
    3905             :          * recognize that a particular WQ is congested if the worker thread is
    3906             :          * looping without ever sleeping. Therefore we have to do a short sleep
    3907             :          * here rather than calling cond_resched().
    3908             :          */
    3909           0 :         if (current->flags & PF_WQ_WORKER)
    3910           0 :                 schedule_timeout_uninterruptible(1);
    3911             :         else
    3912           0 :                 cond_resched();
    3913             :         return ret;
    3914             : }
    3915             : 
    3916             : static inline bool
    3917             : check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
    3918             : {
    3919             :         /*
    3920             :          * It's possible that cpuset's mems_allowed and the nodemask from
    3921             :          * mempolicy don't intersect. This should normally be dealt with by
    3922             :          * policy_nodemask(), but it's possible to race with a cpuset update in
    3923             :          * such a way that the check therein was true, and then it became false
    3924             :          * before we got our cpuset_mems_cookie here.
    3925             :          * This assumes that for all allocations, ac->nodemask can come only
    3926             :          * from an MPOL_BIND mempolicy (whose documented semantics are to be ignored
    3927             :          * when it does not intersect with the cpuset restrictions) or the
    3928             :          * caller can deal with a violated nodemask.
    3929             :          */
    3930             :         if (cpusets_enabled() && ac->nodemask &&
    3931             :                         !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
    3932             :                 ac->nodemask = NULL;
    3933             :                 return true;
    3934             :         }
    3935             : 
    3936             :         /*
    3937             :          * When updating a task's mems_allowed or mempolicy nodemask, it is
    3938             :          * possible to race with parallel threads in such a way that our
    3939             :          * allocation can fail while the mask is being updated. If we are about
    3940             :          * to fail, check if the cpuset changed during allocation and if so,
    3941             :          * retry.
    3942             :          */
    3943           0 :         if (read_mems_allowed_retry(cpuset_mems_cookie))
    3944             :                 return true;
    3945             : 
    3946             :         return false;
    3947             : }
    3948             : 
    3949             : static inline struct page *
    3950           0 : __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
    3951             :                                                 struct alloc_context *ac)
    3952             : {
    3953           0 :         bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
    3954           0 :         const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
    3955           0 :         struct page *page = NULL;
    3956             :         unsigned int alloc_flags;
    3957             :         unsigned long did_some_progress;
    3958             :         enum compact_priority compact_priority;
    3959             :         enum compact_result compact_result;
    3960             :         int compaction_retries;
    3961             :         int no_progress_loops;
    3962             :         unsigned int cpuset_mems_cookie;
    3963             :         unsigned int zonelist_iter_cookie;
    3964             :         int reserve_flags;
    3965             : 
    3966             : restart:
    3967           0 :         compaction_retries = 0;
    3968           0 :         no_progress_loops = 0;
    3969           0 :         compact_priority = DEF_COMPACT_PRIORITY;
    3970           0 :         cpuset_mems_cookie = read_mems_allowed_begin();
    3971           0 :         zonelist_iter_cookie = zonelist_iter_begin();
    3972             : 
    3973             :         /*
    3974             :          * The fast path uses conservative alloc_flags to succeed only until
    3975             :          * kswapd needs to be woken up, and to avoid the cost of setting up
    3976             :          * alloc_flags precisely. So we do that now.
    3977             :          */
    3978           0 :         alloc_flags = gfp_to_alloc_flags(gfp_mask, order);
    3979             : 
    3980             :         /*
    3981             :          * We need to recalculate the starting point for the zonelist iterator
    3982             :          * because we might have used different nodemask in the fast path, or
    3983             :          * there was a cpuset modification and we are retrying - otherwise we
    3984             :          * could end up iterating over non-eligible zones endlessly.
    3985             :          */
    3986           0 :         ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
    3987             :                                         ac->highest_zoneidx, ac->nodemask);
    3988           0 :         if (!ac->preferred_zoneref->zone)
    3989             :                 goto nopage;
    3990             : 
    3991             :         /*
    3992             :          * Check for insane configurations where the cpuset doesn't contain
    3993             :          * any suitable zone to satisfy the request - e.g. non-movable
    3994             :          * GFP_HIGHUSER allocations from MOVABLE nodes only.
    3995             :          */
    3996             :         if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) {
    3997             :                 struct zoneref *z = first_zones_zonelist(ac->zonelist,
    3998             :                                         ac->highest_zoneidx,
    3999             :                                         &cpuset_current_mems_allowed);
    4000             :                 if (!z->zone)
    4001             :                         goto nopage;
    4002             :         }
    4003             : 
    4004           0 :         if (alloc_flags & ALLOC_KSWAPD)
    4005           0 :                 wake_all_kswapds(order, gfp_mask, ac);
    4006             : 
    4007             :         /*
    4008             :          * The adjusted alloc_flags might result in immediate success, so try
    4009             :          * that first
    4010             :          */
    4011           0 :         page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
    4012           0 :         if (page)
    4013             :                 goto got_pg;
    4014             : 
    4015             :         /*
    4016             :          * For costly allocations, try direct compaction first, as it's likely
    4017             :          * that we have enough base pages and don't need to reclaim. For non-
    4018             :          * movable high-order allocations, do that as well, as compaction will
    4019             :          * try to prevent permanent fragmentation by migrating from blocks of the
    4020             :          * same migratetype.
    4021             :          * Don't try this for allocations that are allowed to ignore
    4022             :          * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
    4023             :          */
    4024           0 :         if (can_direct_reclaim &&
    4025           0 :                         (costly_order ||
    4026           0 :                            (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
    4027           0 :                         && !gfp_pfmemalloc_allowed(gfp_mask)) {
    4028           0 :                 page = __alloc_pages_direct_compact(gfp_mask, order,
    4029             :                                                 alloc_flags, ac,
    4030             :                                                 INIT_COMPACT_PRIORITY,
    4031             :                                                 &compact_result);
    4032           0 :                 if (page)
    4033             :                         goto got_pg;
    4034             : 
    4035             :                 /*
    4036             :                  * Checks for costly allocations with __GFP_NORETRY, which
    4037             :                  * includes some THP page fault allocations
    4038             :                  */
    4039           0 :                 if (costly_order && (gfp_mask & __GFP_NORETRY)) {
    4040             :                         /*
    4041             :                          * If we are allocating entire pageblock(s) and
    4042             :                          * compaction failed, either because all zones are
    4043             :                          * below their low watermarks or because it recently
    4044             :                          * failed at this order, fail immediately unless the
    4045             :                          * allocator has requested compaction and reclaim retry.
    4046             :                          *
    4047             :                          * Reclaim is
    4048             :                          *  - potentially very expensive because zones are far
    4049             :                          *    below their low watermarks or this is part of very
    4050             :                          *    bursty high order allocations,
    4051             :                          *  - not guaranteed to help because isolate_freepages()
    4052             :                          *    may not iterate over freed pages as part of its
    4053             :                          *    linear scan, and
    4054             :                          *  - unlikely to make entire pageblocks free on its
    4055             :                          *    own.
    4056             :                          */
    4057           0 :                         if (compact_result == COMPACT_SKIPPED ||
    4058             :                             compact_result == COMPACT_DEFERRED)
    4059             :                                 goto nopage;
    4060             : 
    4061             :                         /*
    4062             :                          * Looks like reclaim/compaction is worth trying, but
    4063             :                          * sync compaction could be very expensive, so keep
    4064             :                          * using async compaction.
    4065             :                          */
    4066           0 :                         compact_priority = INIT_COMPACT_PRIORITY;
    4067             :                 }
    4068             :         }
    4069             : 
    4070             : retry:
    4071             :         /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
    4072           0 :         if (alloc_flags & ALLOC_KSWAPD)
    4073           0 :                 wake_all_kswapds(order, gfp_mask, ac);
    4074             : 
    4075           0 :         reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
    4076           0 :         if (reserve_flags)
    4077           0 :                 alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags) |
    4078             :                                           (alloc_flags & ALLOC_KSWAPD);
    4079             : 
    4080             :         /*
    4081             :          * Reset the nodemask and zonelist iterators if memory policies can be
    4082             :          * ignored. These allocations are high priority and system-oriented
    4083             :          * rather than user-oriented.
    4084             :          */
    4085           0 :         if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
    4086           0 :                 ac->nodemask = NULL;
    4087           0 :                 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
    4088             :                                         ac->highest_zoneidx, ac->nodemask);
    4089             :         }
    4090             : 
    4091             :         /* Attempt with potentially adjusted zonelist and alloc_flags */
    4092           0 :         page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
    4093           0 :         if (page)
    4094             :                 goto got_pg;
    4095             : 
    4096             :         /* Caller is not willing to reclaim, we can't balance anything */
    4097           0 :         if (!can_direct_reclaim)
    4098             :                 goto nopage;
    4099             : 
    4100             :         /* Avoid recursion of direct reclaim */
    4101           0 :         if (current->flags & PF_MEMALLOC)
    4102             :                 goto nopage;
    4103             : 
    4104             :         /* Try direct reclaim and then allocating */
    4105           0 :         page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
    4106             :                                                         &did_some_progress);
    4107           0 :         if (page)
    4108             :                 goto got_pg;
    4109             : 
    4110             :         /* Try direct compaction and then allocating */
    4111           0 :         page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
    4112             :                                         compact_priority, &compact_result);
    4113           0 :         if (page)
    4114             :                 goto got_pg;
    4115             : 
    4116             :         /* Do not loop if specifically requested */
    4117           0 :         if (gfp_mask & __GFP_NORETRY)
    4118             :                 goto nopage;
    4119             : 
    4120             :         /*
    4121             :          * Do not retry costly high order allocations unless they are
    4122             :          * __GFP_RETRY_MAYFAIL
    4123             :          */
    4124           0 :         if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
    4125             :                 goto nopage;
    4126             : 
    4127           0 :         if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
    4128             :                                  did_some_progress > 0, &no_progress_loops))
    4129             :                 goto retry;
    4130             : 
    4131             :         /*
    4132             :          * It doesn't make any sense to retry compaction if order-0
    4133             :          * reclaim is not able to make any progress, because the current
    4134             :          * implementation of compaction depends on a sufficient amount
    4135             :          * of free memory (see __compaction_suitable).
    4136             :          */
    4137           0 :         if (did_some_progress > 0 &&
    4138           0 :                         should_compact_retry(ac, order, alloc_flags,
    4139             :                                 compact_result, &compact_priority,
    4140             :                                 &compaction_retries))
    4141             :                 goto retry;
    4142             : 
    4143             : 
    4144             :         /*
    4145             :          * Deal with possible cpuset update races or zonelist updates to avoid
    4146             :          * an unnecessary OOM kill.
    4147             :          */
    4148           0 :         if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
    4149           0 :             check_retry_zonelist(zonelist_iter_cookie))
    4150             :                 goto restart;
    4151             : 
    4152             :         /* Reclaim has failed us, start killing things */
    4153           0 :         page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
    4154           0 :         if (page)
    4155             :                 goto got_pg;
    4156             : 
    4157             :         /* Prevent allocations with no watermarks from looping endlessly */
    4158           0 :         if (tsk_is_oom_victim(current) &&
    4159           0 :             (alloc_flags & ALLOC_OOM ||
    4160           0 :              (gfp_mask & __GFP_NOMEMALLOC)))
    4161             :                 goto nopage;
    4162             : 
    4163             :         /* Retry as long as the OOM killer is making progress */
    4164           0 :         if (did_some_progress) {
    4165           0 :                 no_progress_loops = 0;
    4166           0 :                 goto retry;
    4167             :         }
    4168             : 
    4169             : nopage:
    4170             :         /*
    4171             :          * Deal with possible cpuset update races or zonelist updates to avoid
    4172             :          * an unnecessary OOM kill.
    4173             :          */
    4174           0 :         if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
    4175           0 :             check_retry_zonelist(zonelist_iter_cookie))
    4176             :                 goto restart;
    4177             : 
    4178             :         /*
    4179             :          * Make sure that a __GFP_NOFAIL request doesn't leak out and make sure
    4180             :          * we always retry.
    4181             :          */
    4182           0 :         if (gfp_mask & __GFP_NOFAIL) {
    4183             :                 /*
    4184             :                  * All existing users of __GFP_NOFAIL are blockable, so warn
    4185             :                  * about any new users that actually require GFP_NOWAIT.
    4186             :                  */
    4187           0 :                 if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask))
    4188             :                         goto fail;
    4189             : 
    4190             :                 /*
    4191             :                  * A PF_MEMALLOC request from this context is rather bizarre
    4192             :                  * because we cannot reclaim anything and can only loop waiting
    4193             :                  * for somebody else to do the work for us.
    4194             :                  */
    4195           0 :                 WARN_ON_ONCE_GFP(current->flags & PF_MEMALLOC, gfp_mask);
    4196             : 
    4197             :                 /*
    4198             :                  * Non-failing costly orders are a hard requirement which we
    4199             :                  * are not well prepared for, so let's warn about these users
    4200             :                  * so that we can identify them and convert them to something
    4201             :                  * else.
    4202             :                  */
    4203           0 :                 WARN_ON_ONCE_GFP(costly_order, gfp_mask);
    4204             : 
    4205             :                 /*
    4206             :                  * Help non-failing allocations by giving some access to memory
    4207             :                  * reserves normally used for high priority non-blocking
    4208             :                  * allocations but do not use ALLOC_NO_WATERMARKS because this
    4209             :                  * could deplete whole memory reserves which would just make
    4210             :                  * the situation worse.
    4211             :                  */
    4212           0 :                 page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac);
    4213           0 :                 if (page)
    4214             :                         goto got_pg;
    4215             : 
    4216           0 :                 cond_resched();
    4217           0 :                 goto retry;
    4218             :         }
    4219             : fail:
    4220           0 :         warn_alloc(gfp_mask, ac->nodemask,
    4221             :                         "page allocation failure: order:%u", order);
    4222             : got_pg:
    4223           0 :         return page;
    4224             : }
    4225             : 
    4226         457 : static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
    4227             :                 int preferred_nid, nodemask_t *nodemask,
    4228             :                 struct alloc_context *ac, gfp_t *alloc_gfp,
    4229             :                 unsigned int *alloc_flags)
    4230             : {
    4231         457 :         ac->highest_zoneidx = gfp_zone(gfp_mask);
    4232         914 :         ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
    4233         457 :         ac->nodemask = nodemask;
    4234         457 :         ac->migratetype = gfp_migratetype(gfp_mask);
    4235             : 
    4236             :         if (cpusets_enabled()) {
    4237             :                 *alloc_gfp |= __GFP_HARDWALL;
    4238             :                 /*
    4239             :                  * When we are in interrupt context, the cpuset of the current
    4240             :                  * task is irrelevant, meaning that any node is OK.
    4241             :                  */
    4242             :                 if (in_task() && !ac->nodemask)
    4243             :                         ac->nodemask = &cpuset_current_mems_allowed;
    4244             :                 else
    4245             :                         *alloc_flags |= ALLOC_CPUSET;
    4246             :         }
    4247             : 
    4248         457 :         might_alloc(gfp_mask);
    4249             : 
    4250         457 :         if (should_fail_alloc_page(gfp_mask, order))
    4251             :                 return false;
    4252             : 
    4253         457 :         *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
    4254             : 
    4255             :         /* Dirty zone balancing only done in the fast path */
    4256         457 :         ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
    4257             : 
    4258             :         /*
    4259             :          * The preferred zone is used for statistics but crucially it is
    4260             :          * also used as the starting point for the zonelist iterator. It
    4261             :          * may get reset for allocations that ignore memory policies.
    4262             :          */
    4263         914 :         ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
    4264             :                                         ac->highest_zoneidx, ac->nodemask);
    4265             : 
    4266             :         return true;
    4267             : }
    4268             : 
    4269             : /*
    4270             :  * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
    4271             :  * @gfp: GFP flags for the allocation
    4272             :  * @preferred_nid: The preferred NUMA node ID to allocate from
    4273             :  * @nodemask: Set of nodes to allocate from, may be NULL
    4274             :  * @nr_pages: The number of pages desired on the list or array
    4275             :  * @page_list: Optional list to store the allocated pages
    4276             :  * @page_array: Optional array to store the pages
    4277             :  *
    4278             :  * This is a batched version of the page allocator that attempts to
    4279             :  * allocate nr_pages quickly. Pages are added to page_list if page_list
    4280             :  * is not NULL, otherwise it is assumed that the page_array is valid.
    4281             :  *
    4282             :  * For lists, nr_pages is the number of pages that should be allocated.
    4283             :  *
    4284             :  * For arrays, only NULL elements are populated with pages and nr_pages
    4285             :  * is the maximum number of pages that will be stored in the array.
    4286             :  *
    4287             :  * Returns the number of pages on the list or array.
    4288             :  */
    4289          16 : unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
    4290             :                         nodemask_t *nodemask, int nr_pages,
    4291             :                         struct list_head *page_list,
    4292             :                         struct page **page_array)
    4293             : {
    4294             :         struct page *page;
    4295             :         unsigned long __maybe_unused UP_flags;
    4296             :         struct zone *zone;
    4297             :         struct zoneref *z;
    4298             :         struct per_cpu_pages *pcp;
    4299             :         struct list_head *pcp_list;
    4300             :         struct alloc_context ac;
    4301             :         gfp_t alloc_gfp;
    4302          16 :         unsigned int alloc_flags = ALLOC_WMARK_LOW;
    4303          16 :         int nr_populated = 0, nr_account = 0;
    4304             : 
    4305             :         /*
    4306             :          * Skip populated array elements to determine if any pages need
    4307             :          * to be allocated before disabling IRQs.
    4308             :          */
    4309          32 :         while (page_array && nr_populated < nr_pages && page_array[nr_populated])
    4310           0 :                 nr_populated++;
    4311             : 
    4312             :         /* No pages requested? */
    4313          16 :         if (unlikely(nr_pages <= 0))
    4314             :                 goto out;
    4315             : 
    4316             :         /* Already populated array? */
    4317          16 :         if (unlikely(page_array && nr_pages - nr_populated == 0))
    4318             :                 goto out;
    4319             : 
    4320             :         /* Bulk allocator does not support memcg accounting. */
    4321             :         if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT))
    4322             :                 goto failed;
    4323             : 
    4324             :         /* Use the single page allocator for one page. */
    4325          16 :         if (nr_pages - nr_populated == 1)
    4326             :                 goto failed;
    4327             : 
    4328             : #ifdef CONFIG_PAGE_OWNER
    4329             :         /*
    4330             :          * PAGE_OWNER may recurse into the allocator to allocate space to
    4331             :          * save the stack with pagesets.lock held. Releasing/reacquiring
    4332             :          * removes much of the performance benefit of bulk allocation, so
    4333             :          * force the caller to allocate one page at a time: it will see
    4334             :          * similar performance without adding complexity to the bulk allocator.
    4335             :          */
    4336             :         if (static_branch_unlikely(&page_owner_inited))
    4337             :                 goto failed;
    4338             : #endif
    4339             : 
    4340             :         /* May set ALLOC_NOFRAGMENT; if avoiding fragmentation fails, only 1 page is returned. */
    4341          16 :         gfp &= gfp_allowed_mask;
    4342          16 :         alloc_gfp = gfp;
    4343          16 :         if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
    4344             :                 goto out;
    4345          16 :         gfp = alloc_gfp;
    4346             : 
    4347             :         /* Find an allowed local zone that meets the low watermark. */
    4348          32 :         for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) {
    4349             :                 unsigned long mark;
    4350             : 
    4351             :                 if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
    4352             :                     !__cpuset_zone_allowed(zone, gfp)) {
    4353             :                         continue;
    4354             :                 }
    4355             : 
    4356             :                 if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone &&
    4357             :                     zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) {
    4358             :                         goto failed;
    4359             :                 }
    4360             : 
    4361          16 :                 mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
    4362          16 :                 if (zone_watermark_fast(zone, 0,  mark,
    4363             :                                 zonelist_zone_idx(ac.preferred_zoneref),
    4364             :                                 alloc_flags, gfp)) {
    4365             :                         break;
    4366             :                 }
    4367             :         }
    4368             : 
    4369             :         /*
    4370             :          * If there are no allowed local zones that meet the watermarks then
    4371             :          * try to allocate a single page and reclaim if necessary.
    4372             :          */
    4373          16 :         if (unlikely(!zone))
    4374             :                 goto failed;
    4375             : 
    4376             :         /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
    4377          16 :         pcp_trylock_prepare(UP_flags);
    4378          32 :         pcp = pcp_spin_trylock(zone->per_cpu_pageset);
    4379          16 :         if (!pcp)
    4380             :                 goto failed_irq;
    4381             : 
    4382             :         /* Attempt the batch allocation */
    4383          32 :         pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
    4384          96 :         while (nr_populated < nr_pages) {
    4385             : 
    4386             :                 /* Skip existing pages */
    4387          64 :                 if (page_array && page_array[nr_populated]) {
    4388           0 :                         nr_populated++;
    4389           0 :                         continue;
    4390             :                 }
    4391             : 
    4392          64 :                 page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
    4393             :                                                                 pcp, pcp_list);
    4394          64 :                 if (unlikely(!page)) {
    4395             :                         /* Try and allocate at least one page */
    4396           0 :                         if (!nr_account) {
    4397           0 :                                 pcp_spin_unlock(pcp);
    4398           0 :                                 goto failed_irq;
    4399             :                         }
    4400             :                         break;
    4401             :                 }
    4402          64 :                 nr_account++;
    4403             : 
    4404          64 :                 prep_new_page(page, 0, gfp, 0);
    4405          64 :                 if (page_list)
    4406           0 :                         list_add(&page->lru, page_list);
    4407             :                 else
    4408          64 :                         page_array[nr_populated] = page;
    4409          64 :                 nr_populated++;
    4410             :         }
    4411             : 
    4412          32 :         pcp_spin_unlock(pcp);
    4413          32 :         pcp_trylock_finish(UP_flags);
    4414             : 
    4415          32 :         __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
    4416          16 :         zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
    4417             : 
    4418             : out:
    4419          16 :         return nr_populated;
    4420             : 
    4421             : failed_irq:
    4422           0 :         pcp_trylock_finish(UP_flags);
    4423             : 
    4424             : failed:
    4425           0 :         page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
    4426           0 :         if (page) {
    4427           0 :                 if (page_list)
    4428           0 :                         list_add(&page->lru, page_list);
    4429             :                 else
    4430           0 :                         page_array[nr_populated] = page;
    4431           0 :                 nr_populated++;
    4432             :         }
    4433             : 
    4434             :         goto out;
    4435             : }
    4436             : EXPORT_SYMBOL_GPL(__alloc_pages_bulk);
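
A hedged usage sketch for the bulk interface documented above: the array variant fills only NULL slots and returns the number of populated entries. example_bulk_fill() is a hypothetical caller; numa_mem_id(), ARRAY_SIZE() and __free_pages() are standard kernel helpers:

        /* Hypothetical caller, for illustration only. */
        static int example_bulk_fill(void)
        {
                struct page *pages[8] = { NULL };
                unsigned long filled, i;

                filled = __alloc_pages_bulk(GFP_KERNEL, numa_mem_id(), NULL,
                                            ARRAY_SIZE(pages), NULL, pages);

                /* pages[0..filled-1] now hold order-0 pages. */
                for (i = 0; i < filled; i++)
                        __free_pages(pages[i], 0);

                return filled == ARRAY_SIZE(pages) ? 0 : -ENOMEM;
        }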
    4437             : 
    4438             : /*
    4439             :  * This is the 'heart' of the zoned buddy allocator.
    4440             :  */
    4441         441 : struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
    4442             :                                                         nodemask_t *nodemask)
    4443             : {
    4444             :         struct page *page;
    4445         441 :         unsigned int alloc_flags = ALLOC_WMARK_LOW;
    4446             :         gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */
    4447         441 :         struct alloc_context ac = { };
    4448             : 
    4449             :         /*
    4450             :          * There are several places where we assume that the order value is sane
    4451             :          * so bail out early if the request is out of bounds.
    4452             :          */
    4453         441 :         if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp))
    4454             :                 return NULL;
    4455             : 
    4456         441 :         gfp &= gfp_allowed_mask;
    4457             :         /*
    4458             :          * Apply scoped allocation constraints. This is mainly about GFP_NOFS
    4459             :          * and GFP_NOIO, which have to be inherited by all allocation requests
    4460             :          * from a particular context which has been marked by
    4461             :          * memalloc_no{fs,io}_{save,restore}, and about PF_MEMALLOC_PIN, which
    4462             :          * ensures movable zones are not used during allocation.
    4463             :          */
    4464         441 :         gfp = current_gfp_context(gfp);
    4465         441 :         alloc_gfp = gfp;
    4466         441 :         if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
    4467             :                         &alloc_gfp, &alloc_flags))
    4468             :                 return NULL;
    4469             : 
    4470             :         /*
    4471             :          * Forbid the first pass from falling back to types that fragment
    4472             :          * memory until all local zones are considered.
    4473             :          */
    4474         882 :         alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp);
    4475             : 
    4476             :         /* First allocation attempt */
    4477         441 :         page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
    4478         441 :         if (likely(page))
    4479             :                 goto out;
    4480             : 
    4481           0 :         alloc_gfp = gfp;
    4482           0 :         ac.spread_dirty_pages = false;
    4483             : 
    4484             :         /*
    4485             :          * Restore the original nodemask if it was potentially replaced with
    4486             :          * &cpuset_current_mems_allowed to optimize the fast-path attempt.
    4487             :          */
    4488           0 :         ac.nodemask = nodemask;
    4489             : 
    4490           0 :         page = __alloc_pages_slowpath(alloc_gfp, order, &ac);
    4491             : 
    4492             : out:
    4493             :         if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page &&
    4494             :             unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
    4495             :                 __free_pages(page, order);
    4496             :                 page = NULL;
    4497             :         }
    4498             : 
    4499         441 :         trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
    4500         441 :         kmsan_alloc_page(page, order, alloc_gfp);
    4501             : 
    4502         441 :         return page;
    4503             : }
    4504             : EXPORT_SYMBOL(__alloc_pages);
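
A minimal sketch of a typical caller. alloc_pages() is the standard wrapper that supplies the local node and a NULL nodemask; example_order1_buffer() is a hypothetical name:

        /* Hypothetical caller, for illustration only. */
        static void *example_order1_buffer(void)
        {
                struct page *page = alloc_pages(GFP_KERNEL, 1); /* order-1: two contiguous pages */

                /* The caller later releases the pair with __free_pages(page, 1). */
                return page ? page_address(page) : NULL;
        }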
    4505             : 
    4506           0 : struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
    4507             :                 nodemask_t *nodemask)
    4508             : {
    4509           0 :         struct page *page = __alloc_pages(gfp | __GFP_COMP, order,
    4510             :                         preferred_nid, nodemask);
    4511             : 
    4512             :         if (page && order > 1)
    4513             :                 prep_transhuge_page(page);
    4514           0 :         return (struct folio *)page;
    4515             : }
    4516             : EXPORT_SYMBOL(__folio_alloc);
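
A short sketch of the folio interface above. folio_alloc() is the standard wrapper around __folio_alloc() for the local node; example_folio_alloc() is a hypothetical name:

        /* Hypothetical caller, for illustration only. */
        static struct folio *example_folio_alloc(void)
        {
                struct folio *folio = folio_alloc(GFP_KERNEL, 2); /* a 4-page folio */

                /* The reference is dropped later with folio_put(folio). */
                return folio;
        }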
    4517             : 
    4518             : /*
    4519             :  * Common helper functions. Never use with __GFP_HIGHMEM because the returned
    4520             :  * address cannot represent highmem pages. Use alloc_pages and then kmap if
    4521             :  * you need to access highmem.
    4522             :  */
    4523           4 : unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
    4524             : {
    4525             :         struct page *page;
    4526             : 
    4527           8 :         page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
    4528           4 :         if (!page)
    4529             :                 return 0;
    4530           4 :         return (unsigned long) page_address(page);
    4531             : }
    4532             : EXPORT_SYMBOL(__get_free_pages);
    4533             : 
    4534           0 : unsigned long get_zeroed_page(gfp_t gfp_mask)
    4535             : {
    4536           0 :         return __get_free_page(gfp_mask | __GFP_ZERO);
    4537             : }
    4538             : EXPORT_SYMBOL(get_zeroed_page);
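
A hedged sketch of the helpers above, which hand back a kernel virtual address rather than a struct page and therefore pair with free_page()/free_pages(); example_zeroed_buffer() is a hypothetical name:

        /* Hypothetical caller, for illustration only. */
        static int example_zeroed_buffer(void)
        {
                unsigned long addr = get_zeroed_page(GFP_KERNEL);

                if (!addr)
                        return -ENOMEM;

                /* ... use the zeroed page at (void *)addr ... */

                free_page(addr);
                return 0;
        }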
    4539             : 
    4540             : /**
    4541             :  * __free_pages - Free pages allocated with alloc_pages().
    4542             :  * @page: The page pointer returned from alloc_pages().
    4543             :  * @order: The order of the allocation.
    4544             :  *
    4545             :  * This function can free multi-page allocations that are not compound
    4546             :  * pages.  It does not check that the @order passed in matches that of
    4547             :  * the allocation, so it is easy to leak memory.  Freeing more memory
    4548             :  * than was allocated will probably emit a warning.
    4549             :  *
    4550             :  * If the last reference to this page is speculative, it will be released
    4551             :  * by put_page() which only frees the first page of a non-compound
    4552             :  * allocation.  To prevent the remaining pages from being leaked, we free
    4553             :  * the subsequent pages here.  If you want to use the page's reference
    4554             :  * count to decide when to free the allocation, you should allocate a
    4555             :  * compound page, and use put_page() instead of __free_pages().
    4556             :  *
    4557             :  * Context: May be called in interrupt context or while holding a normal
    4558             :  * spinlock, but not in NMI context or while holding a raw spinlock.
    4559             :  */
    4560           0 : void __free_pages(struct page *page, unsigned int order)
    4561             : {
    4562             :         /* get PageHead before we drop reference */
    4563           0 :         int head = PageHead(page);
    4564             : 
    4565           0 :         if (put_page_testzero(page))
    4566           0 :                 free_the_page(page, order);
    4567           0 :         else if (!head)
    4568           0 :                 while (order-- > 0)
    4569           0 :                         free_the_page(page + (1 << order), order);
    4570           0 : }
    4571             : EXPORT_SYMBOL(__free_pages);
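
A sketch of the choice described in the comment above: refcount-based freeing wants a compound (__GFP_COMP) allocation and put_page(), while a plain multi-page allocation should be returned with __free_pages(). example_compound_vs_plain() is a hypothetical name:

        /* Hypothetical caller, for illustration only. */
        static void example_compound_vs_plain(void)
        {
                struct page *plain = alloc_pages(GFP_KERNEL, 2);
                struct page *comp  = alloc_pages(GFP_KERNEL | __GFP_COMP, 2);

                if (plain)
                        __free_pages(plain, 2); /* frees all four pages */
                if (comp)
                        put_page(comp);         /* dropping the last ref frees the compound page */
        }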
    4572             : 
    4573           0 : void free_pages(unsigned long addr, unsigned int order)
    4574             : {
    4575           0 :         if (addr != 0) {
    4576             :                 VM_BUG_ON(!virt_addr_valid((void *)addr));
    4577           0 :                 __free_pages(virt_to_page((void *)addr), order);
    4578             :         }
    4579           0 : }
    4580             : 
    4581             : EXPORT_SYMBOL(free_pages);
    4582             : 
    4583             : /*
    4584             :  * Page Fragment:
    4585             :  *  An arbitrary-length arbitrary-offset area of memory which resides
    4586             :  *  within a 0 or higher order page.  Multiple fragments within that page
    4587             :  *  are individually refcounted, in the page's reference counter.
    4588             :  *
    4589             :  * The page_frag functions below provide a simple allocation framework for
    4590             :  * page fragments.  This is used by the network stack and network device
    4591             :  * drivers to provide a backing region of memory for use either as an
    4592             :  * sk_buff->head or in the "frags" portion of skb_shared_info.
    4593             :  */
    4594           0 : static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
    4595             :                                              gfp_t gfp_mask)
    4596             : {
    4597           0 :         struct page *page = NULL;
    4598           0 :         gfp_t gfp = gfp_mask;
    4599             : 
    4600             : #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
    4601           0 :         gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
    4602             :                     __GFP_NOMEMALLOC;
    4603           0 :         page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
    4604           0 :                                 PAGE_FRAG_CACHE_MAX_ORDER);
    4605           0 :         nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
    4606             : #endif
    4607           0 :         if (unlikely(!page))
    4608           0 :                 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
    4609             : 
    4610           0 :         nc->va = page ? page_address(page) : NULL;
    4611             : 
    4612           0 :         return page;
    4613             : }
    4614             : 
    4615           0 : void __page_frag_cache_drain(struct page *page, unsigned int count)
    4616             : {
    4617             :         VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
    4618             : 
    4619           0 :         if (page_ref_sub_and_test(page, count))
    4620           0 :                 free_the_page(page, compound_order(page));
    4621           0 : }
    4622             : EXPORT_SYMBOL(__page_frag_cache_drain);
    4623             : 
    4624           0 : void *page_frag_alloc_align(struct page_frag_cache *nc,
    4625             :                       unsigned int fragsz, gfp_t gfp_mask,
    4626             :                       unsigned int align_mask)
    4627             : {
    4628           0 :         unsigned int size = PAGE_SIZE;
    4629             :         struct page *page;
    4630             :         int offset;
    4631             : 
    4632           0 :         if (unlikely(!nc->va)) {
    4633             : refill:
    4634           0 :                 page = __page_frag_cache_refill(nc, gfp_mask);
    4635           0 :                 if (!page)
    4636             :                         return NULL;
    4637             : 
    4638             : #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
    4639             :                 /* if size can vary use size else just use PAGE_SIZE */
    4640           0 :                 size = nc->size;
    4641             : #endif
    4642             :                 /* Even if we own the page, we do not use atomic_set().
    4643             :                  * This would break get_page_unless_zero() users.
    4644             :                  */
    4645           0 :                 page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
    4646             : 
    4647             :                 /* reset page count bias and offset to start of new frag */
    4648           0 :                 nc->pfmemalloc = page_is_pfmemalloc(page);
    4649           0 :                 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
    4650           0 :                 nc->offset = size;
    4651             :         }
    4652             : 
    4653           0 :         offset = nc->offset - fragsz;
    4654           0 :         if (unlikely(offset < 0)) {
    4655           0 :                 page = virt_to_page(nc->va);
    4656             : 
    4657           0 :                 if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
    4658             :                         goto refill;
    4659             : 
    4660           0 :                 if (unlikely(nc->pfmemalloc)) {
    4661           0 :                         free_the_page(page, compound_order(page));
    4662           0 :                         goto refill;
    4663             :                 }
    4664             : 
    4665             : #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
    4666             :                 /* if size can vary use size else just use PAGE_SIZE */
    4667           0 :                 size = nc->size;
    4668             : #endif
    4669             :                 /* OK, page count is 0, we can safely set it */
    4670           0 :                 set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
    4671             : 
    4672             :                 /* reset page count bias and offset to start of new frag */
    4673           0 :                 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
    4674           0 :                 offset = size - fragsz;
    4675           0 :                 if (unlikely(offset < 0)) {
    4676             :                         /*
    4677             :                          * The caller is trying to allocate a fragment
    4678             :                          * with fragsz > PAGE_SIZE but the cache isn't big
    4679             :                          * enough to satisfy the request; this may
    4680             :                          * happen in low-memory conditions.
    4681             :                          * We don't release the cache page because
    4682             :                          * doing so could make memory pressure worse,
    4683             :                          * so we simply return NULL here.
    4684             :                          */
    4685             :                         return NULL;
    4686             :                 }
    4687             :         }
    4688             : 
    4689           0 :         nc->pagecnt_bias--;
    4690           0 :         offset &= align_mask;
    4691           0 :         nc->offset = offset;
    4692             : 
    4693           0 :         return nc->va + offset;
    4694             : }
    4695             : EXPORT_SYMBOL(page_frag_alloc_align);
    4696             : 
    4697             : /*
    4698             :  * Frees a page fragment allocated out of either a compound or order 0 page.
    4699             :  */
    4700           0 : void page_frag_free(void *addr)
    4701             : {
    4702           0 :         struct page *page = virt_to_head_page(addr);
    4703             : 
    4704           0 :         if (unlikely(put_page_testzero(page)))
    4705           0 :                 free_the_page(page, compound_order(page));
    4706           0 : }
    4707             : EXPORT_SYMBOL(page_frag_free);
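
A simplified usage sketch for the page_frag API above. page_frag_alloc() is the standard unaligned wrapper around page_frag_alloc_align(); the on-stack cache and fragment sizes are illustrative, and a long-lived user would keep the cache around and drain it when done:

        /* Hypothetical caller, for illustration only. */
        static void example_page_frag(void)
        {
                struct page_frag_cache nc = {}; /* .va == NULL triggers a refill */
                void *a, *b;

                a = page_frag_alloc(&nc, 256, GFP_ATOMIC);
                b = page_frag_alloc(&nc, 512, GFP_ATOMIC);

                /* Fragments are individually refcounted against the backing page. */
                if (a)
                        page_frag_free(a);
                if (b)
                        page_frag_free(b);

                /* NB: the cache still holds its bias reference here; real users
                 * eventually release it, e.g. via __page_frag_cache_drain(). */
        }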
    4708             : 
    4709           3 : static void *make_alloc_exact(unsigned long addr, unsigned int order,
    4710             :                 size_t size)
    4711             : {
    4712           3 :         if (addr) {
    4713           3 :                 unsigned long nr = DIV_ROUND_UP(size, PAGE_SIZE);
    4714           6 :                 struct page *page = virt_to_page((void *)addr);
    4715           3 :                 struct page *last = page + nr;
    4716             : 
    4717           3 :                 split_page_owner(page, 1 << order);
    4718           3 :                 split_page_memcg(page, 1 << order);
    4719          10 :                 while (page < --last)
    4720             :                         set_page_refcounted(last);
    4721             : 
    4722           3 :                 last = page + (1UL << order);
    4723           3 :                 for (page += nr; page < last; page++)
    4724           0 :                         __free_pages_ok(page, 0, FPI_TO_TAIL);
    4725             :         }
    4726           3 :         return (void *)addr;
    4727             : }
    4728             : 
    4729             : /**
    4730             :  * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
    4731             :  * @size: the number of bytes to allocate
    4732             :  * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
    4733             :  *
    4734             :  * This function is similar to alloc_pages(), except that it allocates the
    4735             :  * minimum number of pages to satisfy the request.  alloc_pages() can only
    4736             :  * allocate memory in power-of-two pages.
    4737             :  *
    4738             :  * This function is also limited by MAX_ORDER.
    4739             :  *
    4740             :  * Memory allocated by this function must be released by free_pages_exact().
    4741             :  *
    4742             :  * Return: pointer to the allocated area or %NULL in case of error.
    4743             :  */
    4744           3 : void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
    4745             : {
    4746           3 :         unsigned int order = get_order(size);
    4747             :         unsigned long addr;
    4748             : 
    4749           3 :         if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
    4750           0 :                 gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
    4751             : 
    4752           3 :         addr = __get_free_pages(gfp_mask, order);
    4753           3 :         return make_alloc_exact(addr, order, size);
    4754             : }
    4755             : EXPORT_SYMBOL(alloc_pages_exact);
    4756             : 
    4757             : /**
    4758             :  * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
    4759             :  *                         pages on a node.
    4760             :  * @nid: the preferred node ID where memory should be allocated
    4761             :  * @size: the number of bytes to allocate
    4762             :  * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
    4763             :  *
    4764             :  * Like alloc_pages_exact(), but try to allocate on node nid first before falling
    4765             :  * back.
    4766             :  *
    4767             :  * Return: pointer to the allocated area or %NULL in case of error.
    4768             :  */
    4769           0 : void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
    4770             : {
    4771           0 :         unsigned int order = get_order(size);
    4772             :         struct page *p;
    4773             : 
    4774           0 :         if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
    4775           0 :                 gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
    4776             : 
    4777           0 :         p = alloc_pages_node(nid, gfp_mask, order);
    4778           0 :         if (!p)
    4779             :                 return NULL;
    4780           0 :         return make_alloc_exact((unsigned long)page_address(p), order, size);
    4781             : }
    4782             : 
    4783             : /**
    4784             :  * free_pages_exact - release memory allocated via alloc_pages_exact()
    4785             :  * @virt: the value returned by alloc_pages_exact().
    4786             :  * @size: size of allocation, same value as passed to alloc_pages_exact().
    4787             :  *
    4788             :  * Release the memory allocated by a previous call to alloc_pages_exact().
    4789             :  */
    4790           0 : void free_pages_exact(void *virt, size_t size)
    4791             : {
    4792           0 :         unsigned long addr = (unsigned long)virt;
    4793           0 :         unsigned long end = addr + PAGE_ALIGN(size);
    4794             : 
    4795           0 :         while (addr < end) {
    4796           0 :                 free_page(addr);
    4797           0 :                 addr += PAGE_SIZE;
    4798             :         }
    4799           0 : }
    4800             : EXPORT_SYMBOL(free_pages_exact);
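/*
 * Usage sketch (illustrative, not part of page_alloc.c): how a caller might
 * use the exact-size API above.  The names example_buf, example_alloc(),
 * example_free() and the 40 KiB size are hypothetical.
 */
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>

#define EXAMPLE_BUF_SIZE	(40 * 1024)	/* with 4 KiB pages: 10 pages, not 16 */

static void *example_buf;

static int example_alloc(void)
{
	/* Plain GFP_KERNEL; __GFP_COMP and __GFP_HIGHMEM would be stripped with a warning. */
	example_buf = alloc_pages_exact(EXAMPLE_BUF_SIZE, GFP_KERNEL | __GFP_ZERO);
	if (!example_buf)
		return -ENOMEM;
	return 0;
}

static void example_free(void)
{
	/* Must pass the same size that was passed to alloc_pages_exact(). */
	free_pages_exact(example_buf, EXAMPLE_BUF_SIZE);
	example_buf = NULL;
}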
    4801             : 
    4802             : /**
    4803             :  * nr_free_zone_pages - count number of pages beyond high watermark
    4804             :  * @offset: The zone index of the highest zone
    4805             :  *
    4806             :  * nr_free_zone_pages() counts the number of pages which are beyond the
    4807             :  * high watermark within all zones at or below a given zone index.  For each
    4808             :  * zone, the number of pages is calculated as:
    4809             :  *
    4810             :  *     nr_free_zone_pages = managed_pages - high_pages
    4811             :  *
    4812             :  * Return: number of pages beyond high watermark.
    4813             :  */
    4814           3 : static unsigned long nr_free_zone_pages(int offset)
    4815             : {
    4816             :         struct zoneref *z;
    4817             :         struct zone *zone;
    4818             : 
    4819             :         /* Just pick one node, since fallback list is circular */
    4820           3 :         unsigned long sum = 0;
    4821             : 
    4822           6 :         struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
    4823             : 
    4824          12 :         for_each_zone_zonelist(zone, z, zonelist, offset) {
    4825           3 :                 unsigned long size = zone_managed_pages(zone);
    4826           3 :                 unsigned long high = high_wmark_pages(zone);
    4827           3 :                 if (size > high)
    4828           3 :                         sum += size - high;
    4829             :         }
    4830             : 
    4831           3 :         return sum;
    4832             : }
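/*
 * Worked example (illustrative numbers): if the zonelist covers two zones,
 * one with 1,000,000 managed pages and a high watermark of 10,000 and
 * another with 250,000 managed pages and a high watermark of 2,500, then
 * nr_free_zone_pages() returns (1000000 - 10000) + (250000 - 2500) = 1,237,500.
 */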
    4833             : 
    4834             : /**
    4835             :  * nr_free_buffer_pages - count number of pages beyond high watermark
    4836             :  *
    4837             :  * nr_free_buffer_pages() counts the number of pages which are beyond the high
    4838             :  * watermark within ZONE_DMA and ZONE_NORMAL.
    4839             :  *
    4840             :  * Return: number of pages beyond high watermark within ZONE_DMA and
    4841             :  * ZONE_NORMAL.
    4842             :  */
    4843           1 : unsigned long nr_free_buffer_pages(void)
    4844             : {
    4845           2 :         return nr_free_zone_pages(gfp_zone(GFP_USER));
    4846             : }
    4847             : EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
    4848             : 
    4849             : static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
    4850             : {
    4851           1 :         zoneref->zone = zone;
    4852           1 :         zoneref->zone_idx = zone_idx(zone);
    4853             : }
    4854             : 
    4855             : /*
    4856             :  * Builds allocation fallback zone lists.
    4857             :  *
    4858             :  * Add all populated zones of a node to the zonelist.
    4859             :  */
    4860             : static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
    4861             : {
    4862             :         struct zone *zone;
    4863           1 :         enum zone_type zone_type = MAX_NR_ZONES;
    4864           1 :         int nr_zones = 0;
    4865             : 
    4866             :         do {
    4867           2 :                 zone_type--;
    4868           2 :                 zone = pgdat->node_zones + zone_type;
    4869           2 :                 if (populated_zone(zone)) {
    4870           2 :                         zoneref_set_zone(zone, &zonerefs[nr_zones++]);
    4871           1 :                         check_highest_zone(zone_type);
    4872             :                 }
    4873           2 :         } while (zone_type);
    4874             : 
    4875             :         return nr_zones;
    4876             : }
    4877             : 
    4878             : #ifdef CONFIG_NUMA
    4879             : 
    4880             : static int __parse_numa_zonelist_order(char *s)
    4881             : {
    4882             :         /*
    4883             :          * We used to support different zonelist modes but they turned
    4884             :          * out to be just not useful. Keep the warning in place in case
    4885             :          * somebody still uses the command line parameter so that we do
    4886             :          * not fail it silently.
    4887             :          */
    4888             :         if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
    4889             :                 pr_warn("Ignoring unsupported numa_zonelist_order value:  %s\n", s);
    4890             :                 return -EINVAL;
    4891             :         }
    4892             :         return 0;
    4893             : }
    4894             : 
    4895             : static char numa_zonelist_order[] = "Node";
    4896             : #define NUMA_ZONELIST_ORDER_LEN 16
    4897             : /*
    4898             :  * sysctl handler for numa_zonelist_order
    4899             :  */
    4900             : static int numa_zonelist_order_handler(struct ctl_table *table, int write,
    4901             :                 void *buffer, size_t *length, loff_t *ppos)
    4902             : {
    4903             :         if (write)
    4904             :                 return __parse_numa_zonelist_order(buffer);
    4905             :         return proc_dostring(table, write, buffer, length, ppos);
    4906             : }
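/*
 * Usage note (illustrative): per __parse_numa_zonelist_order() above, only
 * strings starting with 'd'/'D' or 'n'/'N' are accepted, e.g.
 *
 *	echo Node > /proc/sys/vm/numa_zonelist_order
 *
 * Any other value is rejected with -EINVAL and the warning above.
 */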
    4907             : 
    4908             : static int node_load[MAX_NUMNODES];
    4909             : 
    4910             : /**
    4911             :  * find_next_best_node - find the next node that should appear in a given node's fallback list
    4912             :  * @node: node whose fallback list we're appending
    4913             :  * @used_node_mask: nodemask_t of already used nodes
    4914             :  *
    4915             :  * We use a number of factors to determine which is the next node that should
    4916             :  * appear on a given node's fallback list.  The node should not have appeared
    4917             :  * already in @node's fallback list, and it should be the next closest node
    4918             :  * according to the distance array (which contains arbitrary distance values
    4919             :  * from each node to each node in the system).  Nodes with no CPUs are also
    4920             :  * preferred, since presumably they'll have very little allocation pressure
    4921             :  * on them otherwise.
    4922             :  *
    4923             :  * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
    4924             :  */
    4925             : int find_next_best_node(int node, nodemask_t *used_node_mask)
    4926             : {
    4927             :         int n, val;
    4928             :         int min_val = INT_MAX;
    4929             :         int best_node = NUMA_NO_NODE;
    4930             : 
    4931             :         /* Use the local node if we haven't already */
    4932             :         if (!node_isset(node, *used_node_mask)) {
    4933             :                 node_set(node, *used_node_mask);
    4934             :                 return node;
    4935             :         }
    4936             : 
    4937             :         for_each_node_state(n, N_MEMORY) {
    4938             : 
    4939             :                 /* Don't want a node to appear more than once */
    4940             :                 if (node_isset(n, *used_node_mask))
    4941             :                         continue;
    4942             : 
    4943             :                 /* Use the distance array to find the distance */
    4944             :                 val = node_distance(node, n);
    4945             : 
    4946             :                 /* Penalize nodes under us ("prefer the next node") */
    4947             :                 val += (n < node);
    4948             : 
    4949             :                 /* Give preference to headless and unused nodes */
    4950             :                 if (!cpumask_empty(cpumask_of_node(n)))
    4951             :                         val += PENALTY_FOR_NODE_WITH_CPUS;
    4952             : 
    4953             :                 /* Slight preference for less loaded node */
    4954             :                 val *= MAX_NUMNODES;
    4955             :                 val += node_load[n];
    4956             : 
    4957             :                 if (val < min_val) {
    4958             :                         min_val = val;
    4959             :                         best_node = n;
    4960             :                 }
    4961             :         }
    4962             : 
    4963             :         if (best_node >= 0)
    4964             :                 node_set(best_node, *used_node_mask);
    4965             : 
    4966             :         return best_node;
    4967             : }
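/*
 * Worked example (illustrative numbers): suppose node 0 is choosing between
 * node 1 (distance 20, has CPUs) and node 2 (distance 20, CPU-less), with
 * PENALTY_FOR_NODE_WITH_CPUS == 1, MAX_NUMNODES == 4 and node_load[] all
 * zero.  Node 1 scores (20 + 0 + 1) * 4 + 0 = 84 while node 2 scores
 * (20 + 0 + 0) * 4 + 0 = 80, so the CPU-less node 2 is appended to the
 * fallback list first.
 */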
    4968             : 
    4969             : 
    4970             : /*
    4971             :  * Build zonelists ordered by node and zones within node.
    4972             :  * This results in maximum locality--normal zone overflows into local
    4973             :  * DMA zone, if any--but risks exhausting DMA zone.
    4974             :  */
    4975             : static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
    4976             :                 unsigned nr_nodes)
    4977             : {
    4978             :         struct zoneref *zonerefs;
    4979             :         int i;
    4980             : 
    4981             :         zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
    4982             : 
    4983             :         for (i = 0; i < nr_nodes; i++) {
    4984             :                 int nr_zones;
    4985             : 
    4986             :                 pg_data_t *node = NODE_DATA(node_order[i]);
    4987             : 
    4988             :                 nr_zones = build_zonerefs_node(node, zonerefs);
    4989             :                 zonerefs += nr_zones;
    4990             :         }
    4991             :         zonerefs->zone = NULL;
    4992             :         zonerefs->zone_idx = 0;
    4993             : }
    4994             : 
    4995             : /*
    4996             :  * Build gfp_thisnode zonelists
    4997             :  */
    4998             : static void build_thisnode_zonelists(pg_data_t *pgdat)
    4999             : {
    5000             :         struct zoneref *zonerefs;
    5001             :         int nr_zones;
    5002             : 
    5003             :         zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
    5004             :         nr_zones = build_zonerefs_node(pgdat, zonerefs);
    5005             :         zonerefs += nr_zones;
    5006             :         zonerefs->zone = NULL;
    5007             :         zonerefs->zone_idx = 0;
    5008             : }
    5009             : 
    5010             : /*
    5011             :  * Build zonelists ordered by zone and nodes within zones.
    5012             :  * This results in conserving DMA zone[s] until all Normal memory is
    5013             :  * exhausted, but may result in overflowing to a remote node while memory
    5014             :  * may still exist in the local DMA zone.
    5015             :  */
    5016             : 
    5017             : static void build_zonelists(pg_data_t *pgdat)
    5018             : {
    5019             :         static int node_order[MAX_NUMNODES];
    5020             :         int node, nr_nodes = 0;
    5021             :         nodemask_t used_mask = NODE_MASK_NONE;
    5022             :         int local_node, prev_node;
    5023             : 
    5024             :         /* NUMA-aware ordering of nodes */
    5025             :         local_node = pgdat->node_id;
    5026             :         prev_node = local_node;
    5027             : 
    5028             :         memset(node_order, 0, sizeof(node_order));
    5029             :         while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
    5030             :                 /*
    5031             :                  * We don't want to pressure a particular node.
    5032             :                  * So add a penalty to the first node in the same
    5033             :                  * distance group to make the selection round-robin.
    5034             :                  */
    5035             :                 if (node_distance(local_node, node) !=
    5036             :                     node_distance(local_node, prev_node))
    5037             :                         node_load[node] += 1;
    5038             : 
    5039             :                 node_order[nr_nodes++] = node;
    5040             :                 prev_node = node;
    5041             :         }
    5042             : 
    5043             :         build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
    5044             :         build_thisnode_zonelists(pgdat);
    5045             :         pr_info("Fallback order for Node %d: ", local_node);
    5046             :         for (node = 0; node < nr_nodes; node++)
    5047             :                 pr_cont("%d ", node_order[node]);
    5048             :         pr_cont("\n");
    5049             : }
    5050             : 
    5051             : #ifdef CONFIG_HAVE_MEMORYLESS_NODES
    5052             : /*
    5053             :  * Return node id of node used for "local" allocations.
    5054             :  * I.e., first node id of first zone in arg node's generic zonelist.
    5055             :  * Used for initializing percpu 'numa_mem', which is used primarily
    5056             :  * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
    5057             :  */
    5058             : int local_memory_node(int node)
    5059             : {
    5060             :         struct zoneref *z;
    5061             : 
    5062             :         z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
    5063             :                                    gfp_zone(GFP_KERNEL),
    5064             :                                    NULL);
    5065             :         return zone_to_nid(z->zone);
    5066             : }
    5067             : #endif
    5068             : 
    5069             : static void setup_min_unmapped_ratio(void);
    5070             : static void setup_min_slab_ratio(void);
    5071             : #else   /* CONFIG_NUMA */
    5072             : 
    5073           1 : static void build_zonelists(pg_data_t *pgdat)
    5074             : {
    5075             :         int node, local_node;
    5076             :         struct zoneref *zonerefs;
    5077             :         int nr_zones;
    5078             : 
    5079           1 :         local_node = pgdat->node_id;
    5080             : 
    5081           1 :         zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
    5082           1 :         nr_zones = build_zonerefs_node(pgdat, zonerefs);
    5083           1 :         zonerefs += nr_zones;
    5084             : 
    5085             :         /*
    5086             :          * Now we build the zonelist so that it contains the zones
    5087             :          * of all the other nodes.
    5088             :          * We don't want to pressure a particular node, so when
    5089             :          * building the zones for node N, we make sure that the
    5090             :          * zones coming right after the local ones are those from
    5091             :          * node N+1, wrapping around to node 0 after the last node
    5092             :          */
    5093           1 :         for (node = local_node + 1; node < MAX_NUMNODES; node++) {
    5094           0 :                 if (!node_online(node))
    5095           0 :                         continue;
    5096           0 :                 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
    5097           0 :                 zonerefs += nr_zones;
    5098             :         }
    5099           0 :         for (node = 0; node < local_node; node++) {
    5100           0 :                 if (!node_online(node))
    5101           0 :                         continue;
    5102           0 :                 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
    5103           0 :                 zonerefs += nr_zones;
    5104             :         }
    5105             : 
    5106           1 :         zonerefs->zone = NULL;
    5107           1 :         zonerefs->zone_idx = 0;
    5108           1 : }
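/*
 * Note (illustrative): in this !NUMA build MAX_NUMNODES is normally 1, so
 * both loops above are effectively no-ops.  The intent they express is a
 * simple wrap-around order: if there were four nodes and local_node were 2,
 * the fallback zonelist would visit the zones of nodes 2, 3, 0, 1 in that
 * order.
 */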
    5109             : 
    5110             : #endif  /* CONFIG_NUMA */
    5111             : 
    5112             : /*
    5113             :  * Boot pageset table. One per cpu which is going to be used for all
    5114             :  * zones and all nodes. The parameters will be set in such a way
    5115             :  * that an item put on a list will immediately be handed over to
    5116             :  * the buddy list. This is safe since pageset manipulation is done
    5117             :  * with interrupts disabled.
    5118             :  *
    5119             :  * The boot_pagesets must be kept even after bootup is complete for
    5120             :  * unused processors and/or zones. They do play a role for bootstrapping
    5121             :  * hotplugged processors.
    5122             :  *
    5123             :  * zoneinfo_show() and maybe other functions do
    5124             :  * not check if the processor is online before following the pageset pointer.
    5125             :  * Other parts of the kernel may not check if the zone is available.
    5126             :  */
    5127             : static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats);
    5128             : /* These effectively disable the pcplists in the boot pageset completely */
    5129             : #define BOOT_PAGESET_HIGH       0
    5130             : #define BOOT_PAGESET_BATCH      1
    5131             : static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
    5132             : static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
    5133             : 
    5134           1 : static void __build_all_zonelists(void *data)
    5135             : {
    5136             :         int nid;
    5137             :         int __maybe_unused cpu;
    5138           1 :         pg_data_t *self = data;
    5139             :         unsigned long flags;
    5140             : 
    5141             :         /*
    5142             :          * Explicitly disable this CPU's interrupts before taking seqlock
    5143             :          * to prevent any IRQ handler from calling into the page allocator
    5144             :          * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock.
    5145             :          */
    5146           1 :         local_irq_save(flags);
    5147             :         /*
    5148             :          * Explicitly disable this CPU's synchronous printk() before taking
    5149             :          * seqlock to prevent any printk() from trying to hold port->lock, for
    5150             :          * tty_insert_flip_string_and_push_buffer() on other CPU might be
    5151             :          * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held.
    5152             :          */
    5153           1 :         printk_deferred_enter();
    5154           1 :         write_seqlock(&zonelist_update_seq);
    5155             : 
    5156             : #ifdef CONFIG_NUMA
    5157             :         memset(node_load, 0, sizeof(node_load));
    5158             : #endif
    5159             : 
    5160             :         /*
    5161             :          * This node is hotadded and no memory is yet present.   So just
    5162             :          * building zonelists is fine - no need to touch other nodes.
    5163             :          */
    5164           1 :         if (self && !node_online(self->node_id)) {
    5165           0 :                 build_zonelists(self);
    5166             :         } else {
    5167             :                 /*
    5168             :                  * All possible nodes have pgdat preallocated
    5169             :                  * in free_area_init
    5170             :                  */
    5171           1 :                 for_each_node(nid) {
    5172           1 :                         pg_data_t *pgdat = NODE_DATA(nid);
    5173             : 
    5174           1 :                         build_zonelists(pgdat);
    5175             :                 }
    5176             : 
    5177             : #ifdef CONFIG_HAVE_MEMORYLESS_NODES
    5178             :                 /*
    5179             :                  * We now know the "local memory node" for each node--
    5180             :                  * i.e., the node of the first zone in the generic zonelist.
    5181             :                  * Set up numa_mem percpu variable for on-line cpus.  During
    5182             :                  * boot, only the boot cpu should be on-line;  we'll init the
    5183             :                  * secondary cpus' numa_mem as they come on-line.  During
    5184             :                  * node/memory hotplug, we'll fixup all on-line cpus.
    5185             :                  */
    5186             :                 for_each_online_cpu(cpu)
    5187             :                         set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
    5188             : #endif
    5189             :         }
    5190             : 
    5191           1 :         write_sequnlock(&zonelist_update_seq);
    5192           1 :         printk_deferred_exit();
    5193           2 :         local_irq_restore(flags);
    5194           1 : }
    5195             : 
    5196             : static noinline void __init
    5197           1 : build_all_zonelists_init(void)
    5198             : {
    5199             :         int cpu;
    5200             : 
    5201           1 :         __build_all_zonelists(NULL);
    5202             : 
    5203             :         /*
    5204             :          * Initialize the boot_pagesets that are going to be used
    5205             :          * for bootstrapping processors. The real pagesets for
    5206             :          * each zone will be allocated later when the per cpu
    5207             :          * allocator is available.
    5208             :          *
    5209             :          * boot_pagesets are used also for bootstrapping offline
    5210             :          * cpus if the system is already booted because the pagesets
    5211             :          * are needed to initialize allocators on a specific cpu too.
    5212             :          * E.g. the percpu allocator needs the page allocator, which
    5213             :          * needs the percpu allocator in order to allocate its pagesets
    5214             :          * (a chicken-egg dilemma).
    5215             :          */
    5216           2 :         for_each_possible_cpu(cpu)
    5217           1 :                 per_cpu_pages_init(&per_cpu(boot_pageset, cpu), &per_cpu(boot_zonestats, cpu));
    5218             : 
    5219           1 :         mminit_verify_zonelist();
    5220             :         cpuset_init_current_mems_allowed();
    5221           1 : }
    5222             : 
    5223             : /*
    5224             :  * __ref due to the call of the __init annotated helper
    5225             :  * build_all_zonelists_init(), which is protected by the
    5226             :  * system_state == SYSTEM_BOOTING check below and therefore
    5227             :  * only runs during boot.
    5228             :  */
    5229           1 : void __ref build_all_zonelists(pg_data_t *pgdat)
    5230             : {
    5231             :         unsigned long vm_total_pages;
    5232             : 
    5233           1 :         if (system_state == SYSTEM_BOOTING) {
    5234           1 :                 build_all_zonelists_init();
    5235             :         } else {
    5236           0 :                 __build_all_zonelists(pgdat);
    5237             :                 /* cpuset refresh routine should be here */
    5238             :         }
    5239             :         /* Get the number of free pages beyond high watermark in all zones. */
    5240           1 :         vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
    5241             :         /*
    5242             :          * Disable grouping by mobility if the number of pages in the
    5243             :          * system is too low to allow the mechanism to work. It would be
    5244             :          * more accurate, but expensive to check per-zone. This check is
    5245             :          * made on memory-hotadd so a system can start with mobility
    5246             :          * disabled and enable it later
    5247             :          */
    5248           1 :         if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
    5249           0 :                 page_group_by_mobility_disabled = 1;
    5250             :         else
    5251           1 :                 page_group_by_mobility_disabled = 0;
    5252             : 
    5253           1 :         pr_info("Built %u zonelists, mobility grouping %s.  Total pages: %ld\n",
    5254             :                 nr_online_nodes,
    5255             :                 page_group_by_mobility_disabled ? "off" : "on",
    5256             :                 vm_total_pages);
    5257             : #ifdef CONFIG_NUMA
    5258             :         pr_info("Policy zone: %s\n", zone_names[policy_zone]);
    5259             : #endif
    5260           1 : }
    5261             : 
    5262           3 : static int zone_batchsize(struct zone *zone)
    5263             : {
    5264             : #ifdef CONFIG_MMU
    5265             :         int batch;
    5266             : 
    5267             :         /*
    5268             :          * The number of pages to batch allocate is either ~0.1%
    5269             :          * of the zone or 1MB, whichever is smaller. The batch
    5270             :          * size strikes a balance between allocation latency
    5271             :          * and zone lock contention.
    5272             :          */
    5273           3 :         batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE);
    5274           3 :         batch /= 4;             /* We effectively *= 4 below */
    5275           3 :         if (batch < 1)
    5276           1 :                 batch = 1;
    5277             : 
    5278             :         /*
    5279             :          * Clamp the batch to a 2^n - 1 value. Having a power
    5280             :          * of 2 value was found to be more likely to have
    5281             :          * suboptimal cache aliasing properties in some cases.
    5282             :          *
    5283             :          * For example if 2 tasks are alternately allocating
    5284             :          * batches of pages, one task can end up with a lot
    5285             :          * of pages of one half of the possible page colors
    5286             :          * and the other with pages of the other colors.
    5287             :          */
    5288           5 :         batch = rounddown_pow_of_two(batch + batch/2) - 1;
    5289             : 
    5290           3 :         return batch;
    5291             : 
    5292             : #else
    5293             :         /* The deferral and batching of frees should be suppressed under NOMMU
    5294             :          * conditions.
    5295             :          *
    5296             :          * The problem is that NOMMU needs to be able to allocate large chunks
    5297             :          * of contiguous memory as there's no hardware page translation to
    5298             :          * assemble apparent contiguous memory from discontiguous pages.
    5299             :          *
    5300             :          * Queueing large contiguous runs of pages for batching, however,
    5301             :          * causes the pages to actually be freed in smaller chunks.  As there
    5302             :          * can be a significant delay between the individual batches being
    5303             :          * recycled, this leads to the once large chunks of space being
    5304             :          * fragmented and becoming unavailable for high-order allocations.
    5305             :          */
    5306             :         return 0;
    5307             : #endif
    5308             : }
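/*
 * Worked example (assuming 4 KiB pages): a zone with 1 GiB of managed memory
 * has 262144 pages, so min(262144 >> 10, SZ_1M / PAGE_SIZE) is
 * min(256, 256) = 256.  Dividing by 4 gives 64, and
 * rounddown_pow_of_two(64 + 32) - 1 = 64 - 1 = 63, so zone_batchsize()
 * returns 63.
 */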
    5309             : 
    5310             : static int percpu_pagelist_high_fraction;
    5311           3 : static int zone_highsize(struct zone *zone, int batch, int cpu_online)
    5312             : {
    5313             : #ifdef CONFIG_MMU
    5314             :         int high;
    5315             :         int nr_split_cpus;
    5316             :         unsigned long total_pages;
    5317             : 
    5318           3 :         if (!percpu_pagelist_high_fraction) {
    5319             :                 /*
    5320             :                  * By default, the high value of the pcp is based on the zone
    5321             :                  * low watermark so that if they are full then background
    5322             :                  * reclaim will not be started prematurely.
    5323             :                  */
    5324           3 :                 total_pages = low_wmark_pages(zone);
    5325             :         } else {
    5326             :                 /*
    5327             :                  * If percpu_pagelist_high_fraction is configured, the high
    5328             :                  * value is based on a fraction of the managed pages in the
    5329             :                  * zone.
    5330             :                  */
    5331           0 :                 total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction;
    5332             :         }
    5333             : 
    5334             :         /*
    5335             :          * Split the high value across all online CPUs local to the zone. Note
    5336             :          * that early in boot that CPUs may not be online yet and that during
    5337             :          * that early in boot CPUs may not be online yet, and that during
    5338             :          * CPU hotplug the cpumask is not yet updated when a CPU is being
    5339             :          * all online CPUs to mitigate the risk that reclaim is triggered
    5340             :          * prematurely due to pages stored on pcp lists.
    5341             :          */
    5342           6 :         nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online;
    5343           3 :         if (!nr_split_cpus)
    5344           0 :                 nr_split_cpus = num_online_cpus();
    5345           3 :         high = total_pages / nr_split_cpus;
    5346             : 
    5347             :         /*
    5348             :          * Ensure high is at least batch*4. The multiple is based on the
    5349             :          * historical relationship between high and batch.
    5350             :          */
    5351           3 :         high = max(high, batch << 2);
    5352             : 
    5353           3 :         return high;
    5354             : #else
    5355             :         return 0;
    5356             : #endif
    5357             : }
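/*
 * Worked example (illustrative numbers): with percpu_pagelist_high_fraction
 * unset, a zone low watermark of 16384 pages, 8 online CPUs local to the
 * zone and batch == 63, zone_highsize() returns
 * max(16384 / 8, 63 << 2) = max(2048, 252) = 2048 pages per CPU.
 */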
    5358             : 
    5359             : /*
    5360             :  * pcp->high and pcp->batch values are related and generally batch is lower
    5361             :  * than high. They are also related to pcp->count such that count is lower
    5362             :  * than high, and as soon as it reaches high, the pcplist is flushed.
    5363             :  *
    5364             :  * However, guaranteeing these relations at all times would require e.g. write
    5365             :  * barriers here but also careful use of read barriers on the read side, which
    5366             :  * would be error-prone and bad for performance. Thus the update only prevents
    5367             :  * store tearing. Any new users of pcp->batch and pcp->high should ensure they
    5368             :  * can cope with those fields changing asynchronously, and fully trust only the
    5369             :  * pcp->count field on the local CPU with interrupts disabled.
    5370             :  *
    5371             :  * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
    5372             :  * outside of boot time (or some other assurance that no concurrent updaters
    5373             :  * exist).
    5374             :  */
    5375             : static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
    5376             :                 unsigned long batch)
    5377             : {
    5378           3 :         WRITE_ONCE(pcp->batch, batch);
    5379           3 :         WRITE_ONCE(pcp->high, high);
    5380             : }
    5381             : 
    5382           2 : static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
    5383             : {
    5384             :         int pindex;
    5385             : 
    5386           4 :         memset(pcp, 0, sizeof(*pcp));
    5387           2 :         memset(pzstats, 0, sizeof(*pzstats));
    5388             : 
    5389           2 :         spin_lock_init(&pcp->lock);
    5390          26 :         for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
    5391          48 :                 INIT_LIST_HEAD(&pcp->lists[pindex]);
    5392             : 
    5393             :         /*
    5394             :          * Set batch and high values safe for a boot pageset. A true percpu
    5395             :          * pageset's initialization will update them subsequently. Here we don't
    5396             :          * need to be as careful as pageset_update() as nobody can access the
    5397             :          * pageset yet.
    5398             :          */
    5399           2 :         pcp->high = BOOT_PAGESET_HIGH;
    5400           2 :         pcp->batch = BOOT_PAGESET_BATCH;
    5401           2 :         pcp->free_factor = 0;
    5402           2 : }
    5403             : 
    5404             : static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
    5405             :                 unsigned long batch)
    5406             : {
    5407             :         struct per_cpu_pages *pcp;
    5408             :         int cpu;
    5409             : 
    5410           3 :         for_each_possible_cpu(cpu) {
    5411           3 :                 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
    5412           3 :                 pageset_update(pcp, high, batch);
    5413             :         }
    5414             : }
    5415             : 
    5416             : /*
    5417             :  * Calculate and set new high and batch values for all per-cpu pagesets of a
    5418             :  * zone based on the zone's size.
    5419             :  */
    5420           3 : static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
    5421             : {
    5422             :         int new_high, new_batch;
    5423             : 
    5424           3 :         new_batch = max(1, zone_batchsize(zone));
    5425           3 :         new_high = zone_highsize(zone, new_batch, cpu_online);
    5426             : 
    5427           3 :         if (zone->pageset_high == new_high &&
    5428           0 :             zone->pageset_batch == new_batch)
    5429             :                 return;
    5430             : 
    5431           3 :         zone->pageset_high = new_high;
    5432           3 :         zone->pageset_batch = new_batch;
    5433             : 
    5434           3 :         __zone_set_pageset_high_and_batch(zone, new_high, new_batch);
    5435             : }
    5436             : 
    5437           1 : void __meminit setup_zone_pageset(struct zone *zone)
    5438             : {
    5439             :         int cpu;
    5440             : 
    5441             :         /* Size may be 0 on !SMP && !NUMA */
    5442             :         if (sizeof(struct per_cpu_zonestat) > 0)
    5443             :                 zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat);
    5444             : 
    5445           1 :         zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages);
    5446           2 :         for_each_possible_cpu(cpu) {
    5447             :                 struct per_cpu_pages *pcp;
    5448             :                 struct per_cpu_zonestat *pzstats;
    5449             : 
    5450           1 :                 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
    5451           1 :                 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
    5452           1 :                 per_cpu_pages_init(pcp, pzstats);
    5453             :         }
    5454             : 
    5455           1 :         zone_set_pageset_high_and_batch(zone, 0);
    5456           1 : }
    5457             : 
    5458             : /*
    5459             :  * The zone indicated has a new number of managed_pages; batch sizes and percpu
    5460             :  * page high values need to be recalculated.
    5461             :  */
    5462           2 : static void zone_pcp_update(struct zone *zone, int cpu_online)
    5463             : {
    5464           2 :         mutex_lock(&pcp_batch_high_lock);
    5465           2 :         zone_set_pageset_high_and_batch(zone, cpu_online);
    5466           2 :         mutex_unlock(&pcp_batch_high_lock);
    5467           2 : }
    5468             : 
    5469             : /*
    5470             :  * Allocate per cpu pagesets and initialize them.
    5471             :  * Before this call only boot pagesets were available.
    5472             :  */
    5473           1 : void __init setup_per_cpu_pageset(void)
    5474             : {
    5475             :         struct pglist_data *pgdat;
    5476             :         struct zone *zone;
    5477             :         int __maybe_unused cpu;
    5478             : 
    5479           3 :         for_each_populated_zone(zone)
    5480           1 :                 setup_zone_pageset(zone);
    5481             : 
    5482             : #ifdef CONFIG_NUMA
    5483             :         /*
    5484             :          * Unpopulated zones continue using the boot pagesets.
    5485             :          * The numa stats for these pagesets need to be reset.
    5486             :          * Otherwise, they will end up skewing the stats of
    5487             :          * the nodes these zones are associated with.
    5488             :          */
    5489             :         for_each_possible_cpu(cpu) {
    5490             :                 struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu);
    5491             :                 memset(pzstats->vm_numa_event, 0,
    5492             :                        sizeof(pzstats->vm_numa_event));
    5493             :         }
    5494             : #endif
    5495             : 
    5496           2 :         for_each_online_pgdat(pgdat)
    5497           1 :                 pgdat->per_cpu_nodestats =
    5498           1 :                         alloc_percpu(struct per_cpu_nodestat);
    5499           1 : }
    5500             : 
    5501           2 : __meminit void zone_pcp_init(struct zone *zone)
    5502             : {
    5503             :         /*
    5504             :          * per cpu subsystem is not up at this point. The following code
    5505             :          * relies on the ability of the linker to provide the
    5506             :          * offset of a (static) per cpu variable into the per cpu area.
    5507             :          */
    5508           2 :         zone->per_cpu_pageset = &boot_pageset;
    5509           2 :         zone->per_cpu_zonestats = &boot_zonestats;
    5510           2 :         zone->pageset_high = BOOT_PAGESET_HIGH;
    5511           2 :         zone->pageset_batch = BOOT_PAGESET_BATCH;
    5512             : 
    5513           2 :         if (populated_zone(zone))
    5514             :                 pr_debug("  %s zone: %lu pages, LIFO batch:%u\n", zone->name,
    5515             :                          zone->present_pages, zone_batchsize(zone));
    5516           2 : }
    5517             : 
    5518           0 : void adjust_managed_page_count(struct page *page, long count)
    5519             : {
    5520           0 :         atomic_long_add(count, &page_zone(page)->managed_pages);
    5521           0 :         totalram_pages_add(count);
    5522             : #ifdef CONFIG_HIGHMEM
    5523             :         if (PageHighMem(page))
    5524             :                 totalhigh_pages_add(count);
    5525             : #endif
    5526           0 : }
    5527             : EXPORT_SYMBOL(adjust_managed_page_count);
    5528             : 
    5529           0 : unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
    5530             : {
    5531             :         void *pos;
    5532           0 :         unsigned long pages = 0;
    5533             : 
    5534           0 :         start = (void *)PAGE_ALIGN((unsigned long)start);
    5535           0 :         end = (void *)((unsigned long)end & PAGE_MASK);
    5536           0 :         for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
    5537           0 :                 struct page *page = virt_to_page(pos);
    5538             :                 void *direct_map_addr;
    5539             : 
    5540             :                 /*
    5541             :                  * 'direct_map_addr' might be different from 'pos'
    5542             :                  * because some architectures' virt_to_page() implementations
    5543             :                  * work with aliases.  Getting the direct map
    5544             :                  * address ensures that we get a _writeable_
    5545             :                  * alias for the memset().
    5546             :                  */
    5547           0 :                 direct_map_addr = page_address(page);
    5548             :                 /*
    5549             :                  * Perform a kasan-unchecked memset() since this memory
    5550             :                  * has not been initialized.
    5551             :                  */
    5552           0 :                 direct_map_addr = kasan_reset_tag(direct_map_addr);
    5553           0 :                 if ((unsigned int)poison <= 0xFF)
    5554           0 :                         memset(direct_map_addr, poison, PAGE_SIZE);
    5555             : 
    5556           0 :                 free_reserved_page(page);
    5557             :         }
    5558             : 
    5559           0 :         if (pages && s)
    5560           0 :                 pr_info("Freeing %s memory: %ldK\n", s, K(pages));
    5561             : 
    5562           0 :         return pages;
    5563             : }
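/*
 * Usage sketch (illustrative, modelled on the generic free_initmem_default()
 * helper): an architecture hands its .init sections back to the buddy
 * allocator once boot has finished.  The wrapper name is hypothetical;
 * __init_begin/__init_end are the usual linker-provided section bounds.
 */
static unsigned long example_free_initmem(void)
{
	extern char __init_begin[], __init_end[];

	/* A poison value outside 0..0xFF (here -1) skips the memset() above. */
	return free_reserved_area(&__init_begin, &__init_end, -1,
				  "unused kernel image (initmem)");
}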
    5564             : 
    5565           0 : static int page_alloc_cpu_dead(unsigned int cpu)
    5566             : {
    5567             :         struct zone *zone;
    5568             : 
    5569           0 :         lru_add_drain_cpu(cpu);
    5570           0 :         mlock_drain_remote(cpu);
    5571           0 :         drain_pages(cpu);
    5572             : 
    5573             :         /*
    5574             :          * Spill the event counters of the dead processor
    5575             :          * into the current processors event counters.
    5576             :          * This artificially elevates the count of the current
    5577             :          * processor.
    5578             :          */
    5579           0 :         vm_events_fold_cpu(cpu);
    5580             : 
    5581             :         /*
    5582             :          * Zero the differential counters of the dead processor
    5583             :          * so that the vm statistics are consistent.
    5584             :          *
    5585             :          * This is only okay since the processor is dead and cannot
    5586             :          * race with what we are doing.
    5587             :          */
    5588           0 :         cpu_vm_stats_fold(cpu);
    5589             : 
    5590           0 :         for_each_populated_zone(zone)
    5591           0 :                 zone_pcp_update(zone, 0);
    5592             : 
    5593           0 :         return 0;
    5594             : }
    5595             : 
    5596           0 : static int page_alloc_cpu_online(unsigned int cpu)
    5597             : {
    5598             :         struct zone *zone;
    5599             : 
    5600           0 :         for_each_populated_zone(zone)
    5601           0 :                 zone_pcp_update(zone, 1);
    5602           0 :         return 0;
    5603             : }
    5604             : 
    5605           1 : void __init page_alloc_init_cpuhp(void)
    5606             : {
    5607             :         int ret;
    5608             : 
    5609           1 :         ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC,
    5610             :                                         "mm/page_alloc:pcp",
    5611             :                                         page_alloc_cpu_online,
    5612             :                                         page_alloc_cpu_dead);
    5613           1 :         WARN_ON(ret < 0);
    5614           1 : }
    5615             : 
    5616             : /*
    5617             :  * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
    5618             :  *      or min_free_kbytes changes.
    5619             :  */
    5620           2 : static void calculate_totalreserve_pages(void)
    5621             : {
    5622             :         struct pglist_data *pgdat;
    5623           2 :         unsigned long reserve_pages = 0;
    5624             :         enum zone_type i, j;
    5625             : 
    5626           4 :         for_each_online_pgdat(pgdat) {
    5627             : 
    5628           2 :                 pgdat->totalreserve_pages = 0;
    5629             : 
    5630           6 :                 for (i = 0; i < MAX_NR_ZONES; i++) {
    5631           4 :                         struct zone *zone = pgdat->node_zones + i;
    5632           4 :                         long max = 0;
    5633           4 :                         unsigned long managed_pages = zone_managed_pages(zone);
    5634             : 
    5635             :                         /* Find valid and maximum lowmem_reserve in the zone */
    5636          10 :                         for (j = i; j < MAX_NR_ZONES; j++) {
    5637           6 :                                 if (zone->lowmem_reserve[j] > max)
    5638           0 :                                         max = zone->lowmem_reserve[j];
    5639             :                         }
    5640             : 
    5641             :                         /* we treat the high watermark as reserved pages. */
    5642           4 :                         max += high_wmark_pages(zone);
    5643             : 
    5644           4 :                         if (max > managed_pages)
    5645           0 :                                 max = managed_pages;
    5646             : 
    5647           4 :                         pgdat->totalreserve_pages += max;
    5648             : 
    5649           4 :                         reserve_pages += max;
    5650             :                 }
    5651             :         }
    5652           2 :         totalreserve_pages = reserve_pages;
    5653           2 : }
    5654             : 
    5655             : /*
    5656             :  * setup_per_zone_lowmem_reserve - called whenever
    5657             :  *      sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
    5658             :  *      has a correct pages reserved value, so an adequate number of
    5659             :  *      pages are left in the zone after a successful __alloc_pages().
    5660             :  */
    5661           1 : static void setup_per_zone_lowmem_reserve(void)
    5662             : {
    5663             :         struct pglist_data *pgdat;
    5664             :         enum zone_type i, j;
    5665             : 
    5666           2 :         for_each_online_pgdat(pgdat) {
    5667           2 :                 for (i = 0; i < MAX_NR_ZONES - 1; i++) {
    5668           1 :                         struct zone *zone = &pgdat->node_zones[i];
    5669           1 :                         int ratio = sysctl_lowmem_reserve_ratio[i];
    5670           2 :                         bool clear = !ratio || !zone_managed_pages(zone);
    5671           1 :                         unsigned long managed_pages = 0;
    5672             : 
    5673           2 :                         for (j = i + 1; j < MAX_NR_ZONES; j++) {
    5674           1 :                                 struct zone *upper_zone = &pgdat->node_zones[j];
    5675             : 
    5676           1 :                                 managed_pages += zone_managed_pages(upper_zone);
    5677             : 
    5678           1 :                                 if (clear)
    5679           0 :                                         zone->lowmem_reserve[j] = 0;
    5680             :                                 else
    5681           1 :                                         zone->lowmem_reserve[j] = managed_pages / ratio;
    5682             :                         }
    5683             :                 }
    5684             :         }
    5685             : 
    5686             :         /* update totalreserve_pages */
    5687           1 :         calculate_totalreserve_pages();
    5688           1 : }
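/*
 * Worked example (illustrative numbers): with sysctl_lowmem_reserve_ratio
 * for a DMA32 zone set to 256 and a Normal zone above it managing 262144
 * pages, the DMA32 zone gets lowmem_reserve[ZONE_NORMAL] = 262144 / 256
 * = 1024 pages held back from allocations that could also have been
 * satisfied from ZONE_NORMAL.
 */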
    5689             : 
    5690           1 : static void __setup_per_zone_wmarks(void)
    5691             : {
    5692           1 :         unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
    5693           1 :         unsigned long lowmem_pages = 0;
    5694             :         struct zone *zone;
    5695             :         unsigned long flags;
    5696             : 
    5697             :         /* Calculate total number of !ZONE_HIGHMEM pages */
    5698           3 :         for_each_zone(zone) {
    5699           2 :                 if (!is_highmem(zone))
    5700           2 :                         lowmem_pages += zone_managed_pages(zone);
    5701             :         }
    5702             : 
    5703           3 :         for_each_zone(zone) {
    5704             :                 u64 tmp;
    5705             : 
    5706           2 :                 spin_lock_irqsave(&zone->lock, flags);
    5707           2 :                 tmp = (u64)pages_min * zone_managed_pages(zone);
    5708           2 :                 do_div(tmp, lowmem_pages);
    5709           2 :                 if (is_highmem(zone)) {
    5710             :                         /*
    5711             :                          * __GFP_HIGH and PF_MEMALLOC allocations usually don't
    5712             :                          * need highmem pages, so cap pages_min to a small
    5713             :                          * value here.
    5714             :                          *
    5715             :                          * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
    5716             :                          * deltas control async page reclaim, and so should
    5717             :                          * not be capped for highmem.
    5718             :                          */
    5719             :                         unsigned long min_pages;
    5720             : 
    5721             :                         min_pages = zone_managed_pages(zone) / 1024;
    5722             :                         min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
    5723             :                         zone->_watermark[WMARK_MIN] = min_pages;
    5724             :                 } else {
    5725             :                         /*
    5726             :                          * If it's a lowmem zone, reserve a number of pages
    5727             :                          * proportionate to the zone's size.
    5728             :                          */
    5729           2 :                         zone->_watermark[WMARK_MIN] = tmp;
    5730             :                 }
    5731             : 
    5732             :                 /*
    5733             :                  * Set the kswapd watermarks distance according to the
    5734             :                  * scale factor in proportion to available memory, but
    5735             :                  * ensure a minimum size on small systems.
    5736             :                  */
    5737           4 :                 tmp = max_t(u64, tmp >> 2,
    5738             :                             mult_frac(zone_managed_pages(zone),
    5739             :                                       watermark_scale_factor, 10000));
    5740             : 
    5741           2 :                 zone->watermark_boost = 0;
    5742           2 :                 zone->_watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
    5743           2 :                 zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
    5744           2 :                 zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
    5745             : 
    5746           4 :                 spin_unlock_irqrestore(&zone->lock, flags);
    5747             :         }
    5748             : 
    5749             :         /* update totalreserve_pages */
    5750           1 :         calculate_totalreserve_pages();
    5751           1 : }
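/*
 * Worked example (single lowmem zone of 262144 pages, 4 KiB pages,
 * watermark_scale_factor == 10): min_free_kbytes = 4096 gives
 * pages_min = 4096 >> 2 = 1024, all attributed to the one zone, so
 * WMARK_MIN = 1024.  The kswapd delta is
 * max(1024 >> 2, 262144 * 10 / 10000) = max(256, 262) = 262, giving
 * WMARK_LOW = 1286, WMARK_HIGH = 1548 and WMARK_PROMO = 1810.
 */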
    5752             : 
    5753             : /**
    5754             :  * setup_per_zone_wmarks - called when min_free_kbytes changes
    5755             :  * or when memory is hot-{added|removed}
    5756             :  *
    5757             :  * Ensures that the watermark[min,low,high] values for each zone are set
    5758             :  * correctly with respect to min_free_kbytes.
    5759             :  */
    5760           1 : void setup_per_zone_wmarks(void)
    5761             : {
    5762             :         struct zone *zone;
    5763             :         static DEFINE_SPINLOCK(lock);
    5764             : 
    5765           1 :         spin_lock(&lock);
    5766           1 :         __setup_per_zone_wmarks();
    5767           1 :         spin_unlock(&lock);
    5768             : 
    5769             :         /*
    5770             :          * The watermark levels have changed, so update the pcpu batch
    5771             :          * and high limits or the limits may be inappropriate.
    5772             :          */
    5773           3 :         for_each_zone(zone)
    5774           2 :                 zone_pcp_update(zone, 0);
    5775           1 : }
    5776             : 
    5777             : /*
    5778             :  * Initialise min_free_kbytes.
    5779             :  *
    5780             :  * For small machines we want it small (128k min).  For large machines
    5781             :  * we want it large (256MB max).  But it is not linear, because network
    5782             :  * bandwidth does not increase linearly with machine size.  We use
    5783             :  *
    5784             :  *      min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
    5785             :  *      min_free_kbytes = sqrt(lowmem_kbytes * 16)
    5786             :  *
    5787             :  * which yields
    5788             :  *
    5789             :  * 16MB:        512k
    5790             :  * 32MB:        724k
    5791             :  * 64MB:        1024k
    5792             :  * 128MB:       1448k
    5793             :  * 256MB:       2048k
    5794             :  * 512MB:       2896k
    5795             :  * 1024MB:      4096k
    5796             :  * 2048MB:      5792k
    5797             :  * 4096MB:      8192k
    5798             :  * 8192MB:      11584k
    5799             :  * 16384MB:     16384k
    5800             :  */
    5801           1 : void calculate_min_free_kbytes(void)
    5802             : {
    5803             :         unsigned long lowmem_kbytes;
    5804             :         int new_min_free_kbytes;
    5805             : 
    5806           1 :         lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
    5807           1 :         new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
    5808             : 
    5809           1 :         if (new_min_free_kbytes > user_min_free_kbytes)
    5810           1 :                 min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144);
    5811             :         else
    5812           0 :                 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
    5813             :                                 new_min_free_kbytes, user_min_free_kbytes);
    5814             : 
    5815           1 : }
    5816             : 
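
A quick check of the heuristic against the table above, as a standalone userspace program (the kernel uses int_sqrt(); plain sqrt() is close enough for illustration). For an assumed 4 GiB of lowmem, sqrt(4194304 * 16) = 8192, i.e. the 4096MB row:

/* Reproduce the min_free_kbytes heuristic for a hypothetical lowmem size.
 * Build with: cc -o minfree minfree.c -lm */
#include <math.h>
#include <stdio.h>

int main(void)
{
        unsigned long lowmem_kbytes = 4UL * 1024 * 1024;  /* assumed 4 GiB lowmem */
        unsigned long min_free = (unsigned long)sqrt((double)lowmem_kbytes * 16);

        /* Same clamp as calculate_min_free_kbytes(): 128k .. 256MB */
        if (min_free < 128)
                min_free = 128;
        if (min_free > 262144)
                min_free = 262144;

        printf("min_free_kbytes = %lu\n", min_free);  /* 8192 for 4 GiB */
        return 0;
}
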
    5817           1 : int __meminit init_per_zone_wmark_min(void)
    5818             : {
    5819           1 :         calculate_min_free_kbytes();
    5820           1 :         setup_per_zone_wmarks();
    5821             :         refresh_zone_stat_thresholds();
    5822           1 :         setup_per_zone_lowmem_reserve();
    5823             : 
    5824             : #ifdef CONFIG_NUMA
    5825             :         setup_min_unmapped_ratio();
    5826             :         setup_min_slab_ratio();
    5827             : #endif
    5828             : 
    5829             :         khugepaged_min_free_kbytes_update();
    5830             : 
    5831           1 :         return 0;
    5832             : }
    5833             : postcore_initcall(init_per_zone_wmark_min)
    5834             : 
    5835             : /*
    5836             :  * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
    5837             :  *      that we can call two helper functions whenever min_free_kbytes
    5838             :  *      changes.
    5839             :  */
    5840           0 : static int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
    5841             :                 void *buffer, size_t *length, loff_t *ppos)
    5842             : {
    5843             :         int rc;
    5844             : 
    5845           0 :         rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
    5846           0 :         if (rc)
    5847             :                 return rc;
    5848             : 
    5849           0 :         if (write) {
    5850           0 :                 user_min_free_kbytes = min_free_kbytes;
    5851           0 :                 setup_per_zone_wmarks();
    5852             :         }
    5853             :         return 0;
    5854             : }
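
Because the handler above records user_min_free_kbytes and re-runs setup_per_zone_wmarks() on every write, the knob can be exercised from userspace through /proc/sys/vm/min_free_kbytes. A minimal sketch (needs root; the value 65536 is purely an example, not a recommendation):

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/vm/min_free_kbytes", "w");

        if (!f) {
                perror("min_free_kbytes");
                return 1;
        }
        fprintf(f, "%d\n", 65536);   /* example value only */
        return fclose(f) ? 1 : 0;
}
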
    5855             : 
    5856           0 : static int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
    5857             :                 void *buffer, size_t *length, loff_t *ppos)
    5858             : {
    5859             :         int rc;
    5860             : 
    5861           0 :         rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
    5862           0 :         if (rc)
    5863             :                 return rc;
    5864             : 
    5865           0 :         if (write)
    5866           0 :                 setup_per_zone_wmarks();
    5867             : 
    5868             :         return 0;
    5869             : }
    5870             : 
    5871             : #ifdef CONFIG_NUMA
    5872             : static void setup_min_unmapped_ratio(void)
    5873             : {
    5874             :         pg_data_t *pgdat;
    5875             :         struct zone *zone;
    5876             : 
    5877             :         for_each_online_pgdat(pgdat)
    5878             :                 pgdat->min_unmapped_pages = 0;
    5879             : 
    5880             :         for_each_zone(zone)
    5881             :                 zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
    5882             :                                                          sysctl_min_unmapped_ratio) / 100;
    5883             : }
    5884             : 
    5885             : 
    5886             : static int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
    5887             :                 void *buffer, size_t *length, loff_t *ppos)
    5888             : {
    5889             :         int rc;
    5890             : 
    5891             :         rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
    5892             :         if (rc)
    5893             :                 return rc;
    5894             : 
    5895             :         setup_min_unmapped_ratio();
    5896             : 
    5897             :         return 0;
    5898             : }
    5899             : 
    5900             : static void setup_min_slab_ratio(void)
    5901             : {
    5902             :         pg_data_t *pgdat;
    5903             :         struct zone *zone;
    5904             : 
    5905             :         for_each_online_pgdat(pgdat)
    5906             :                 pgdat->min_slab_pages = 0;
    5907             : 
    5908             :         for_each_zone(zone)
    5909             :                 zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
    5910             :                                                      sysctl_min_slab_ratio) / 100;
    5911             : }
    5912             : 
    5913             : static int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
    5914             :                 void *buffer, size_t *length, loff_t *ppos)
    5915             : {
    5916             :         int rc;
    5917             : 
    5918             :         rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
    5919             :         if (rc)
    5920             :                 return rc;
    5921             : 
    5922             :         setup_min_slab_ratio();
    5923             : 
    5924             :         return 0;
    5925             : }
    5926             : #endif
    5927             : 
    5928             : /*
    5929             :  * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
    5930             :  *      proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
    5931             :  *      whenever sysctl_lowmem_reserve_ratio changes.
    5932             :  *
    5933             :  * The reserve ratio has absolutely no relation to the minimum
    5934             :  * watermarks. The lowmem reserve ratio only makes sense as a
    5935             :  * function of the boot-time zone sizes.
    5936             :  */
    5937           0 : static int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table,
    5938             :                 int write, void *buffer, size_t *length, loff_t *ppos)
    5939             : {
    5940             :         int i;
    5941             : 
    5942           0 :         proc_dointvec_minmax(table, write, buffer, length, ppos);
    5943             : 
    5944           0 :         for (i = 0; i < MAX_NR_ZONES; i++) {
    5945           0 :                 if (sysctl_lowmem_reserve_ratio[i] < 1)
    5946           0 :                         sysctl_lowmem_reserve_ratio[i] = 0;
    5947             :         }
    5948             : 
    5949           0 :         setup_per_zone_lowmem_reserve();
    5950           0 :         return 0;
    5951             : }
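
As a rough illustration of what the ratio means (the actual computation lives in setup_per_zone_lowmem_reserve(), earlier in this file): a lower zone keeps back approximately the managed pages of the zones above it divided by its ratio. With a hypothetical 7.5 GiB of Normal memory above DMA32 and an assumed ratio of 256, that is about 30 MiB of DMA32 withheld from Normal-zone allocations:

/* Back-of-the-envelope lowmem reserve; all numbers are hypothetical. */
#include <stdio.h>

int main(void)
{
        unsigned long higher_zone_pages = 7680UL * 256;  /* 7.5 GiB in 4 KiB pages */
        unsigned long ratio = 256;                       /* assumed DMA32 ratio    */
        unsigned long reserve = higher_zone_pages / ratio;

        printf("reserve: %lu pages (%lu MiB)\n", reserve, reserve / 256);
        return 0;
}
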
    5952             : 
    5953             : /*
    5954             :  * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each
    5955             :  * cpu. It is the fraction of total pages in each zone that a hot per cpu
    5956             :  * pagelist can have before it gets flushed back to the buddy allocator.
    5957             :  */
    5958           0 : static int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table,
    5959             :                 int write, void *buffer, size_t *length, loff_t *ppos)
    5960             : {
    5961             :         struct zone *zone;
    5962             :         int old_percpu_pagelist_high_fraction;
    5963             :         int ret;
    5964             : 
    5965           0 :         mutex_lock(&pcp_batch_high_lock);
    5966           0 :         old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction;
    5967             : 
    5968           0 :         ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
    5969           0 :         if (!write || ret < 0)
    5970             :                 goto out;
    5971             : 
    5972             :         /* Sanity checking to avoid pcp imbalance */
    5973           0 :         if (percpu_pagelist_high_fraction &&
    5974             :             percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) {
    5975           0 :                 percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction;
    5976           0 :                 ret = -EINVAL;
    5977           0 :                 goto out;
    5978             :         }
    5979             : 
    5980             :         /* No change? */
    5981           0 :         if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction)
    5982             :                 goto out;
    5983             : 
    5984           0 :         for_each_populated_zone(zone)
    5985           0 :                 zone_set_pageset_high_and_batch(zone, 0);
    5986             : out:
    5987           0 :         mutex_unlock(&pcp_batch_high_lock);
    5988           0 :         return ret;
    5989             : }
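
A worked example of the fraction, with made-up numbers: each pcplist's high mark ends up at roughly the zone's managed pages divided by the fraction, split across the CPUs that use the zone. A 4 GiB zone, fraction = 8 (the minimum accepted above) and 16 local CPUs gives about 8192 pages per CPU before the list is drained back to the buddy allocator:

/* Rough pcp->high estimate; all inputs are hypothetical. */
#include <stdio.h>

int main(void)
{
        unsigned long managed = 1048576UL;  /* 4 GiB zone in 4 KiB pages */
        unsigned long fraction = 8;         /* example sysctl value      */
        unsigned long cpus = 16;            /* CPUs sharing the zone     */

        printf("approx per-cpu high: %lu pages\n", managed / fraction / cpus);
        return 0;
}
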
    5990             : 
    5991             : static struct ctl_table page_alloc_sysctl_table[] = {
    5992             :         {
    5993             :                 .procname       = "min_free_kbytes",
    5994             :                 .data           = &min_free_kbytes,
    5995             :                 .maxlen         = sizeof(min_free_kbytes),
    5996             :                 .mode           = 0644,
    5997             :                 .proc_handler   = min_free_kbytes_sysctl_handler,
    5998             :                 .extra1         = SYSCTL_ZERO,
    5999             :         },
    6000             :         {
    6001             :                 .procname       = "watermark_boost_factor",
    6002             :                 .data           = &watermark_boost_factor,
    6003             :                 .maxlen         = sizeof(watermark_boost_factor),
    6004             :                 .mode           = 0644,
    6005             :                 .proc_handler   = proc_dointvec_minmax,
    6006             :                 .extra1         = SYSCTL_ZERO,
    6007             :         },
    6008             :         {
    6009             :                 .procname       = "watermark_scale_factor",
    6010             :                 .data           = &watermark_scale_factor,
    6011             :                 .maxlen         = sizeof(watermark_scale_factor),
    6012             :                 .mode           = 0644,
    6013             :                 .proc_handler   = watermark_scale_factor_sysctl_handler,
    6014             :                 .extra1         = SYSCTL_ONE,
    6015             :                 .extra2         = SYSCTL_THREE_THOUSAND,
    6016             :         },
    6017             :         {
    6018             :                 .procname       = "percpu_pagelist_high_fraction",
    6019             :                 .data           = &percpu_pagelist_high_fraction,
    6020             :                 .maxlen         = sizeof(percpu_pagelist_high_fraction),
    6021             :                 .mode           = 0644,
    6022             :                 .proc_handler   = percpu_pagelist_high_fraction_sysctl_handler,
    6023             :                 .extra1         = SYSCTL_ZERO,
    6024             :         },
    6025             :         {
    6026             :                 .procname       = "lowmem_reserve_ratio",
    6027             :                 .data           = &sysctl_lowmem_reserve_ratio,
    6028             :                 .maxlen         = sizeof(sysctl_lowmem_reserve_ratio),
    6029             :                 .mode           = 0644,
    6030             :                 .proc_handler   = lowmem_reserve_ratio_sysctl_handler,
    6031             :         },
    6032             : #ifdef CONFIG_NUMA
    6033             :         {
    6034             :                 .procname       = "numa_zonelist_order",
    6035             :                 .data           = &numa_zonelist_order,
    6036             :                 .maxlen         = NUMA_ZONELIST_ORDER_LEN,
    6037             :                 .mode           = 0644,
    6038             :                 .proc_handler   = numa_zonelist_order_handler,
    6039             :         },
    6040             :         {
    6041             :                 .procname       = "min_unmapped_ratio",
    6042             :                 .data           = &sysctl_min_unmapped_ratio,
    6043             :                 .maxlen         = sizeof(sysctl_min_unmapped_ratio),
    6044             :                 .mode           = 0644,
    6045             :                 .proc_handler   = sysctl_min_unmapped_ratio_sysctl_handler,
    6046             :                 .extra1         = SYSCTL_ZERO,
    6047             :                 .extra2         = SYSCTL_ONE_HUNDRED,
    6048             :         },
    6049             :         {
    6050             :                 .procname       = "min_slab_ratio",
    6051             :                 .data           = &sysctl_min_slab_ratio,
    6052             :                 .maxlen         = sizeof(sysctl_min_slab_ratio),
    6053             :                 .mode           = 0644,
    6054             :                 .proc_handler   = sysctl_min_slab_ratio_sysctl_handler,
    6055             :                 .extra1         = SYSCTL_ZERO,
    6056             :                 .extra2         = SYSCTL_ONE_HUNDRED,
    6057             :         },
    6058             : #endif
    6059             :         {}
    6060             : };
    6061             : 
    6062           1 : void __init page_alloc_sysctl_init(void)
    6063             : {
    6064           1 :         register_sysctl_init("vm", page_alloc_sysctl_table);
    6065           1 : }
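
register_sysctl_init("vm", ...) exposes each entry of the table above as /proc/sys/vm/<procname>. A small userspace sketch that dumps the knobs this file registers (min_unmapped_ratio and min_slab_ratio appear only on CONFIG_NUMA kernels, hence the silent skip):

#include <stdio.h>

int main(void)
{
        static const char * const knobs[] = {
                "min_free_kbytes", "watermark_boost_factor",
                "watermark_scale_factor", "percpu_pagelist_high_fraction",
                "lowmem_reserve_ratio", "min_unmapped_ratio", "min_slab_ratio",
        };
        unsigned int i;

        for (i = 0; i < sizeof(knobs) / sizeof(knobs[0]); i++) {
                char path[96], line[128];
                FILE *f;

                snprintf(path, sizeof(path), "/proc/sys/vm/%s", knobs[i]);
                f = fopen(path, "r");
                if (!f)
                        continue;       /* e.g. NUMA-only knobs */
                if (fgets(line, sizeof(line), f))
                        printf("%-32s %s", knobs[i], line);
                fclose(f);
        }
        return 0;
}
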
    6066             : 
    6067             : #ifdef CONFIG_CONTIG_ALLOC
    6068             : /* Usage: See admin-guide/dynamic-debug-howto.rst */
    6069             : static void alloc_contig_dump_pages(struct list_head *page_list)
    6070             : {
    6071             :         DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");
    6072             : 
    6073             :         if (DYNAMIC_DEBUG_BRANCH(descriptor)) {
    6074             :                 struct page *page;
    6075             : 
    6076             :                 dump_stack();
    6077             :                 list_for_each_entry(page, page_list, lru)
    6078             :                         dump_page(page, "migration failure");
    6079             :         }
    6080             : }
    6081             : 
    6082             : /* [start, end) must belong to a single zone. */
    6083             : int __alloc_contig_migrate_range(struct compact_control *cc,
    6084             :                                         unsigned long start, unsigned long end)
    6085             : {
    6086             :         /* This function is based on compact_zone() from compaction.c. */
    6087             :         unsigned int nr_reclaimed;
    6088             :         unsigned long pfn = start;
    6089             :         unsigned int tries = 0;
    6090             :         int ret = 0;
    6091             :         struct migration_target_control mtc = {
    6092             :                 .nid = zone_to_nid(cc->zone),
    6093             :                 .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
    6094             :         };
    6095             : 
    6096             :         lru_cache_disable();
    6097             : 
    6098             :         while (pfn < end || !list_empty(&cc->migratepages)) {
    6099             :                 if (fatal_signal_pending(current)) {
    6100             :                         ret = -EINTR;
    6101             :                         break;
    6102             :                 }
    6103             : 
    6104             :                 if (list_empty(&cc->migratepages)) {
    6105             :                         cc->nr_migratepages = 0;
    6106             :                         ret = isolate_migratepages_range(cc, pfn, end);
    6107             :                         if (ret && ret != -EAGAIN)
    6108             :                                 break;
    6109             :                         pfn = cc->migrate_pfn;
    6110             :                         tries = 0;
    6111             :                 } else if (++tries == 5) {
    6112             :                         ret = -EBUSY;
    6113             :                         break;
    6114             :                 }
    6115             : 
    6116             :                 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
    6117             :                                                         &cc->migratepages);
    6118             :                 cc->nr_migratepages -= nr_reclaimed;
    6119             : 
    6120             :                 ret = migrate_pages(&cc->migratepages, alloc_migration_target,
    6121             :                         NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL);
    6122             : 
    6123             :                 /*
    6124             :                  * On -ENOMEM, migrate_pages() bails out right away. It is pointless
    6125             :                  * to retry on this error, so do the same here.
    6126             :                  */
    6127             :                 if (ret == -ENOMEM)
    6128             :                         break;
    6129             :         }
    6130             : 
    6131             :         lru_cache_enable();
    6132             :         if (ret < 0) {
    6133             :                 if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY)
    6134             :                         alloc_contig_dump_pages(&cc->migratepages);
    6135             :                 putback_movable_pages(&cc->migratepages);
    6136             :                 return ret;
    6137             :         }
    6138             :         return 0;
    6139             : }
    6140             : 
    6141             : /**
    6142             :  * alloc_contig_range() -- tries to allocate given range of pages
    6143             :  * @start:      start PFN to allocate
    6144             :  * @end:        one-past-the-last PFN to allocate
    6145             :  * @migratetype:        migratetype of the underlying pageblocks (either
    6146             :  *                      #MIGRATE_MOVABLE or #MIGRATE_CMA).  All pageblocks
    6147             :  *                      in range must have the same migratetype and it must
    6148             :  *                      be either of the two.
    6149             :  * @gfp_mask:   GFP mask to use during compaction
    6150             :  *
    6151             :  * The PFN range does not have to be pageblock aligned. The PFN range must
    6152             :  * belong to a single zone.
    6153             :  *
    6154             :  * The first thing this routine does is attempt to MIGRATE_ISOLATE all
    6155             :  * pageblocks in the range.  Once isolated, the pageblocks should not
    6156             :  * be modified by others.
    6157             :  *
    6158             :  * Return: zero on success or negative error code.  On success all
    6159             :  * pages whose PFN is in [start, end) are allocated for the caller and
    6160             :  * need to be freed with free_contig_range().
    6161             :  */
    6162             : int alloc_contig_range(unsigned long start, unsigned long end,
    6163             :                        unsigned migratetype, gfp_t gfp_mask)
    6164             : {
    6165             :         unsigned long outer_start, outer_end;
    6166             :         int order;
    6167             :         int ret = 0;
    6168             : 
    6169             :         struct compact_control cc = {
    6170             :                 .nr_migratepages = 0,
    6171             :                 .order = -1,
    6172             :                 .zone = page_zone(pfn_to_page(start)),
    6173             :                 .mode = MIGRATE_SYNC,
    6174             :                 .ignore_skip_hint = true,
    6175             :                 .no_set_skip_hint = true,
    6176             :                 .gfp_mask = current_gfp_context(gfp_mask),
    6177             :                 .alloc_contig = true,
    6178             :         };
    6179             :         INIT_LIST_HEAD(&cc.migratepages);
    6180             : 
    6181             :         /*
    6182             :          * What we do here is mark all pageblocks in the range as
    6183             :          * MIGRATE_ISOLATE.  Because pageblock and max-order pages may
    6184             :          * have different sizes, and due to the way the page allocator
    6185             :          * works, start_isolate_page_range() has special handling for this.
    6186             :          *
    6187             :          * Once the pageblocks are marked as MIGRATE_ISOLATE, we
    6188             :          * migrate the pages from an unaligned range (i.e. the pages that
    6189             :          * we are interested in). This puts all the pages in the
    6190             :          * range back into the page allocator as MIGRATE_ISOLATE.
    6191             :          *
    6192             :          * When this is done, we take the pages in the range from the
    6193             :          * page allocator, removing them from the buddy system.  This way
    6194             :          * the page allocator will never consider using them.
    6195             :          *
    6196             :          * This lets us mark the pageblocks back as
    6197             :          * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
    6198             :          * aligned range but not in the unaligned, original range are
    6199             :          * put back into the page allocator so that the buddy can use them.
    6200             :          */
    6201             : 
    6202             :         ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
    6203             :         if (ret)
    6204             :                 goto done;
    6205             : 
    6206             :         drain_all_pages(cc.zone);
    6207             : 
    6208             :         /*
    6209             :          * In case of -EBUSY, we'd like to know which page causes the problem.
    6210             :          * So, just fall through. test_pages_isolated() has a tracepoint
    6211             :          * which will report the busy page.
    6212             :          *
    6213             :          * It is possible that busy pages could become available before
    6214             :          * the call to test_pages_isolated, and the range will actually be
    6215             :          * allocated.  So, if we fall through, be sure to clear ret so that
    6216             :          * -EBUSY is not accidentally used or returned to the caller.
    6217             :          */
    6218             :         ret = __alloc_contig_migrate_range(&cc, start, end);
    6219             :         if (ret && ret != -EBUSY)
    6220             :                 goto done;
    6221             :         ret = 0;
    6222             : 
    6223             :         /*
    6224             :          * Pages from [start, end) are within pageblock_nr_pages-
    6225             :          * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
    6226             :          * more, all pages in [start, end) are free in the page allocator.
    6227             :          * What we are going to do is allocate all pages from
    6228             :          * [start, end) (that is, remove them from the page allocator).
    6229             :          *
    6230             :          * The only problem is that pages at the beginning and at the
    6231             :          * end of the range of interest may not be aligned with the
    6232             :          * pages the page allocator holds, i.e. they can be part of
    6233             :          * higher-order pages.  Because of this, we reserve the bigger range
    6234             :          * and, once this is done, free the pages we are not interested in.
    6235             :          *
    6236             :          * We don't have to hold zone->lock here because the pages are
    6237             :          * isolated thus they won't get removed from buddy.
    6238             :          */
    6239             : 
    6240             :         order = 0;
    6241             :         outer_start = start;
    6242             :         while (!PageBuddy(pfn_to_page(outer_start))) {
    6243             :                 if (++order > MAX_ORDER) {
    6244             :                         outer_start = start;
    6245             :                         break;
    6246             :                 }
    6247             :                 outer_start &= ~0UL << order;
    6248             :         }
    6249             : 
    6250             :         if (outer_start != start) {
    6251             :                 order = buddy_order(pfn_to_page(outer_start));
    6252             : 
    6253             :                 /*
    6254             :                  * The outer_start page could be a small-order buddy page that
    6255             :                  * doesn't include the start page. Adjust outer_start in that
    6256             :                  * case so the failed page is reported properly by the
    6257             :                  * tracepoint in test_pages_isolated().
    6258             :                  */
    6259             :                 if (outer_start + (1UL << order) <= start)
    6260             :                         outer_start = start;
    6261             :         }
    6262             : 
    6263             :         /* Make sure the range is really isolated. */
    6264             :         if (test_pages_isolated(outer_start, end, 0)) {
    6265             :                 ret = -EBUSY;
    6266             :                 goto done;
    6267             :         }
    6268             : 
    6269             :         /* Grab isolated pages from freelists. */
    6270             :         outer_end = isolate_freepages_range(&cc, outer_start, end);
    6271             :         if (!outer_end) {
    6272             :                 ret = -EBUSY;
    6273             :                 goto done;
    6274             :         }
    6275             : 
    6276             :         /* Free head and tail (if any) */
    6277             :         if (start != outer_start)
    6278             :                 free_contig_range(outer_start, start - outer_start);
    6279             :         if (end != outer_end)
    6280             :                 free_contig_range(end, outer_end - end);
    6281             : 
    6282             : done:
    6283             :         undo_isolate_page_range(start, end, migratetype);
    6284             :         return ret;
    6285             : }
    6286             : EXPORT_SYMBOL(alloc_contig_range);
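
A hedged sketch of the contract documented above, from a hypothetical kernel-side caller: the PFN range is half-open, must sit inside one zone, and every page in it must eventually go back through free_contig_range(). The helper name and the PFN arguments are placeholders supplied by such a caller, not something this file defines:

/* Hypothetical caller sketch (requires CONFIG_CONTIG_ALLOC). */
#include <linux/gfp.h>          /* alloc_contig_range(), free_contig_range() */
#include <linux/mm.h>

static int claim_pfn_window(unsigned long start_pfn, unsigned long nr_pages)
{
        int ret;

        ret = alloc_contig_range(start_pfn, start_pfn + nr_pages,
                                 MIGRATE_MOVABLE, GFP_KERNEL);
        if (ret)
                return ret;     /* commonly -EBUSY or -EINTR, see above */

        /* ... exclusive use of pfn_to_page(start_pfn) .. start_pfn + nr_pages - 1 ... */

        free_contig_range(start_pfn, nr_pages);
        return 0;
}
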
    6287             : 
    6288             : static int __alloc_contig_pages(unsigned long start_pfn,
    6289             :                                 unsigned long nr_pages, gfp_t gfp_mask)
    6290             : {
    6291             :         unsigned long end_pfn = start_pfn + nr_pages;
    6292             : 
    6293             :         return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
    6294             :                                   gfp_mask);
    6295             : }
    6296             : 
    6297             : static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
    6298             :                                    unsigned long nr_pages)
    6299             : {
    6300             :         unsigned long i, end_pfn = start_pfn + nr_pages;
    6301             :         struct page *page;
    6302             : 
    6303             :         for (i = start_pfn; i < end_pfn; i++) {
    6304             :                 page = pfn_to_online_page(i);
    6305             :                 if (!page)
    6306             :                         return false;
    6307             : 
    6308             :                 if (page_zone(page) != z)
    6309             :                         return false;
    6310             : 
    6311             :                 if (PageReserved(page))
    6312             :                         return false;
    6313             : 
    6314             :                 if (PageHuge(page))
    6315             :                         return false;
    6316             :         }
    6317             :         return true;
    6318             : }
    6319             : 
    6320             : static bool zone_spans_last_pfn(const struct zone *zone,
    6321             :                                 unsigned long start_pfn, unsigned long nr_pages)
    6322             : {
    6323             :         unsigned long last_pfn = start_pfn + nr_pages - 1;
    6324             : 
    6325             :         return zone_spans_pfn(zone, last_pfn);
    6326             : }
    6327             : 
    6328             : /**
    6329             :  * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
    6330             :  * @nr_pages:   Number of contiguous pages to allocate
    6331             :  * @gfp_mask:   GFP mask to limit search and used during compaction
    6332             :  * @nid:        Target node
    6333             :  * @nodemask:   Mask for other possible nodes
    6334             :  *
    6335             :  * This routine is a wrapper around alloc_contig_range(). It scans over zones
    6336             :  * on an applicable zonelist to find a contiguous pfn range which can then be
    6337             :  * tried for allocation with alloc_contig_range(). This routine is intended
    6338             :  * for allocation requests which cannot be fulfilled by the buddy allocator.
    6339             :  *
    6340             :  * The allocated memory is always aligned to a page boundary. If nr_pages is a
    6341             :  * power of two, then the allocated range is also guaranteed to be aligned to
    6342             :  * nr_pages (e.g. a 1GB request would be aligned to 1GB).
    6343             :  *
    6344             :  * Allocated pages can be freed with free_contig_range() or by manually calling
    6345             :  * __free_page() on each allocated page.
    6346             :  *
    6347             :  * Return: pointer to contiguous pages on success, or NULL if not successful.
    6348             :  */
    6349             : struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
    6350             :                                 int nid, nodemask_t *nodemask)
    6351             : {
    6352             :         unsigned long ret, pfn, flags;
    6353             :         struct zonelist *zonelist;
    6354             :         struct zone *zone;
    6355             :         struct zoneref *z;
    6356             : 
    6357             :         zonelist = node_zonelist(nid, gfp_mask);
    6358             :         for_each_zone_zonelist_nodemask(zone, z, zonelist,
    6359             :                                         gfp_zone(gfp_mask), nodemask) {
    6360             :                 spin_lock_irqsave(&zone->lock, flags);
    6361             : 
    6362             :                 pfn = ALIGN(zone->zone_start_pfn, nr_pages);
    6363             :                 while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
    6364             :                         if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
    6365             :                                 /*
    6366             :                                  * We release the zone lock here because
    6367             :                                  * alloc_contig_range() will also lock the zone
    6368             :                                  * at some point. If there's an allocation
    6369             :                                  * spinning on this lock, it may win the race
    6370             :                                  * and cause alloc_contig_range() to fail...
    6371             :                                  */
    6372             :                                 spin_unlock_irqrestore(&zone->lock, flags);
    6373             :                                 ret = __alloc_contig_pages(pfn, nr_pages,
    6374             :                                                         gfp_mask);
    6375             :                                 if (!ret)
    6376             :                                         return pfn_to_page(pfn);
    6377             :                                 spin_lock_irqsave(&zone->lock, flags);
    6378             :                         }
    6379             :                         pfn += nr_pages;
    6380             :                 }
    6381             :                 spin_unlock_irqrestore(&zone->lock, flags);
    6382             :         }
    6383             :         return NULL;
    6384             : }
    6385             : #endif /* CONFIG_CONTIG_ALLOC */
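
Where the caller does not care which PFNs it gets, alloc_contig_pages() above performs the zone scan itself. A hedged sketch of allocating and returning a movable 1 GiB block on the local node (again only meaningful under CONFIG_CONTIG_ALLOC; the size is just an example):

/* Hypothetical gigantic-block round trip; error handling kept minimal. */
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/topology.h>     /* numa_node_id() */

static int contig_block_demo(void)
{
        unsigned long nr_pages = 1UL << (30 - PAGE_SHIFT);   /* 1 GiB */
        struct page *page;

        page = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_NOWARN,
                                  numa_node_id(), NULL);
        if (!page)
                return -ENOMEM;

        /* ... the range [page, page + nr_pages) now belongs to us ... */

        free_contig_range(page_to_pfn(page), nr_pages);
        return 0;
}
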
    6386             : 
    6387           0 : void free_contig_range(unsigned long pfn, unsigned long nr_pages)
    6388             : {
    6389           0 :         unsigned long count = 0;
    6390             : 
    6391           0 :         for (; nr_pages--; pfn++) {
    6392           0 :                 struct page *page = pfn_to_page(pfn);
    6393             : 
    6394           0 :                 count += page_count(page) != 1;
    6395           0 :                 __free_page(page);
    6396             :         }
    6397           0 :         WARN(count != 0, "%lu pages are still in use!\n", count);
    6398           0 : }
    6399             : EXPORT_SYMBOL(free_contig_range);
    6400             : 
    6401             : /*
    6402             :  * Effectively disable pcplists for the zone by setting the high limit to 0
    6403             :  * and draining all cpus. A concurrent page freeing on another CPU that's about
    6404             :  * to put the page on pcplist will either finish before the drain and the page
    6405             :  * will be drained, or observe the new high limit and skip the pcplist.
    6406             :  *
    6407             :  * Must be paired with a call to zone_pcp_enable().
    6408             :  */
    6409           0 : void zone_pcp_disable(struct zone *zone)
    6410             : {
    6411           0 :         mutex_lock(&pcp_batch_high_lock);
    6412           0 :         __zone_set_pageset_high_and_batch(zone, 0, 1);
    6413           0 :         __drain_all_pages(zone, true);
    6414           0 : }
    6415             : 
    6416           0 : void zone_pcp_enable(struct zone *zone)
    6417             : {
    6418           0 :         __zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch);
    6419           0 :         mutex_unlock(&pcp_batch_high_lock);
    6420           0 : }
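
The pairing the comment above insists on usually looks like the sketch below; this mirrors how callers such as memory offlining use it, with the placeholder in the middle standing for whatever work must not race with per-cpu page caching:

/* Sketch only: quiesce_work() is a placeholder, not a real kernel symbol. */
#include <linux/gfp.h>

static void pcp_quiesce_demo(struct zone *zone)
{
        zone_pcp_disable(zone);   /* pcp high/batch forced down, lists drained */

        quiesce_work(zone);       /* e.g. isolate or offline pages in the zone */

        zone_pcp_enable(zone);    /* restore limits and release the mutex */
}
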
    6421             : 
    6422           0 : void zone_pcp_reset(struct zone *zone)
    6423             : {
    6424             :         int cpu;
    6425             :         struct per_cpu_zonestat *pzstats;
    6426             : 
    6427           0 :         if (zone->per_cpu_pageset != &boot_pageset) {
    6428             :                 for_each_online_cpu(cpu) {
    6429             :                         pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
    6430             :                         drain_zonestat(zone, pzstats);
    6431             :                 }
    6432           0 :                 free_percpu(zone->per_cpu_pageset);
    6433           0 :                 zone->per_cpu_pageset = &boot_pageset;
    6434           0 :                 if (zone->per_cpu_zonestats != &boot_zonestats) {
    6435           0 :                         free_percpu(zone->per_cpu_zonestats);
    6436           0 :                         zone->per_cpu_zonestats = &boot_zonestats;
    6437             :                 }
    6438             :         }
    6439           0 : }
    6440             : 
    6441             : #ifdef CONFIG_MEMORY_HOTREMOVE
    6442             : /*
    6443             :  * The range must lie within a single zone, must not contain holes, must span
    6444             :  * full sections, and must be isolated before calling this function.
    6445             :  */
    6446             : void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
    6447             : {
    6448             :         unsigned long pfn = start_pfn;
    6449             :         struct page *page;
    6450             :         struct zone *zone;
    6451             :         unsigned int order;
    6452             :         unsigned long flags;
    6453             : 
    6454             :         offline_mem_sections(pfn, end_pfn);
    6455             :         zone = page_zone(pfn_to_page(pfn));
    6456             :         spin_lock_irqsave(&zone->lock, flags);
    6457             :         while (pfn < end_pfn) {
    6458             :                 page = pfn_to_page(pfn);
    6459             :                 /*
    6460             :                  * The HWPoisoned page may not be in the buddy system, and
    6461             :                  * page_count() is not 0.
    6462             :                  */
    6463             :                 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
    6464             :                         pfn++;
    6465             :                         continue;
    6466             :                 }
    6467             :                 /*
    6468             :                  * At this point all remaining PageOffline() pages have a
    6469             :                  * reference count of 0 and can simply be skipped.
    6470             :                  */
    6471             :                 if (PageOffline(page)) {
    6472             :                         BUG_ON(page_count(page));
    6473             :                         BUG_ON(PageBuddy(page));
    6474             :                         pfn++;
    6475             :                         continue;
    6476             :                 }
    6477             : 
    6478             :                 BUG_ON(page_count(page));
    6479             :                 BUG_ON(!PageBuddy(page));
    6480             :                 order = buddy_order(page);
    6481             :                 del_page_from_free_list(page, zone, order);
    6482             :                 pfn += (1 << order);
    6483             :         }
    6484             :         spin_unlock_irqrestore(&zone->lock, flags);
    6485             : }
    6486             : #endif
    6487             : 
    6488             : /*
    6489             :  * This function returns a stable result only if called under zone lock.
    6490             :  */
    6491           0 : bool is_free_buddy_page(struct page *page)
    6492             : {
    6493           0 :         unsigned long pfn = page_to_pfn(page);
    6494             :         unsigned int order;
    6495             : 
    6496           0 :         for (order = 0; order <= MAX_ORDER; order++) {
    6497           0 :                 struct page *page_head = page - (pfn & ((1 << order) - 1));
    6498             : 
    6499           0 :                 if (PageBuddy(page_head) &&
    6500           0 :                     buddy_order_unsafe(page_head) >= order)
    6501             :                         break;
    6502             :         }
    6503             : 
    6504           0 :         return order <= MAX_ORDER;
    6505             : }
    6506             : EXPORT_SYMBOL(is_free_buddy_page);
    6507             : 
    6508             : #ifdef CONFIG_MEMORY_FAILURE
    6509             : /*
    6510             :  * Break down a higher-order page into sub-pages, and keep our target out of
    6511             :  * the buddy allocator.
    6512             :  */
    6513             : static void break_down_buddy_pages(struct zone *zone, struct page *page,
    6514             :                                    struct page *target, int low, int high,
    6515             :                                    int migratetype)
    6516             : {
    6517             :         unsigned long size = 1 << high;
    6518             :         struct page *current_buddy, *next_page;
    6519             : 
    6520             :         while (high > low) {
    6521             :                 high--;
    6522             :                 size >>= 1;
    6523             : 
    6524             :                 if (target >= &page[size]) {
    6525             :                         next_page = page + size;
    6526             :                         current_buddy = page;
    6527             :                 } else {
    6528             :                         next_page = page;
    6529             :                         current_buddy = page + size;
    6530             :                 }
    6531             : 
    6532             :                 if (set_page_guard(zone, current_buddy, high, migratetype))
    6533             :                         continue;
    6534             : 
    6535             :                 if (current_buddy != target) {
    6536             :                         add_to_free_list(current_buddy, zone, high, migratetype);
    6537             :                         set_buddy_order(current_buddy, high);
    6538             :                         page = next_page;
    6539             :                 }
    6540             :         }
    6541             : }
    6542             : 
    6543             : /*
    6544             :  * Take a page that will be marked as poisoned off the buddy allocator.
    6545             :  */
    6546             : bool take_page_off_buddy(struct page *page)
    6547             : {
    6548             :         struct zone *zone = page_zone(page);
    6549             :         unsigned long pfn = page_to_pfn(page);
    6550             :         unsigned long flags;
    6551             :         unsigned int order;
    6552             :         bool ret = false;
    6553             : 
    6554             :         spin_lock_irqsave(&zone->lock, flags);
    6555             :         for (order = 0; order <= MAX_ORDER; order++) {
    6556             :                 struct page *page_head = page - (pfn & ((1 << order) - 1));
    6557             :                 int page_order = buddy_order(page_head);
    6558             : 
    6559             :                 if (PageBuddy(page_head) && page_order >= order) {
    6560             :                         unsigned long pfn_head = page_to_pfn(page_head);
    6561             :                         int migratetype = get_pfnblock_migratetype(page_head,
    6562             :                                                                    pfn_head);
    6563             : 
    6564             :                         del_page_from_free_list(page_head, zone, page_order);
    6565             :                         break_down_buddy_pages(zone, page_head, page, 0,
    6566             :                                                 page_order, migratetype);
    6567             :                         SetPageHWPoisonTakenOff(page);
    6568             :                         if (!is_migrate_isolate(migratetype))
    6569             :                                 __mod_zone_freepage_state(zone, -1, migratetype);
    6570             :                         ret = true;
    6571             :                         break;
    6572             :                 }
    6573             :                 if (page_count(page_head) > 0)
    6574             :                         break;
    6575             :         }
    6576             :         spin_unlock_irqrestore(&zone->lock, flags);
    6577             :         return ret;
    6578             : }
    6579             : 
    6580             : /*
    6581             :  * Cancel takeoff done by take_page_off_buddy().
    6582             :  */
    6583             : bool put_page_back_buddy(struct page *page)
    6584             : {
    6585             :         struct zone *zone = page_zone(page);
    6586             :         unsigned long pfn = page_to_pfn(page);
    6587             :         unsigned long flags;
    6588             :         int migratetype = get_pfnblock_migratetype(page, pfn);
    6589             :         bool ret = false;
    6590             : 
    6591             :         spin_lock_irqsave(&zone->lock, flags);
    6592             :         if (put_page_testzero(page)) {
    6593             :                 ClearPageHWPoisonTakenOff(page);
    6594             :                 __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
    6595             :                 if (TestClearPageHWPoison(page)) {
    6596             :                         ret = true;
    6597             :                 }
    6598             :         }
    6599             :         spin_unlock_irqrestore(&zone->lock, flags);
    6600             : 
    6601             :         return ret;
    6602             : }
    6603             : #endif
    6604             : 
    6605             : #ifdef CONFIG_ZONE_DMA
    6606             : bool has_managed_dma(void)
    6607             : {
    6608             :         struct pglist_data *pgdat;
    6609             : 
    6610             :         for_each_online_pgdat(pgdat) {
    6611             :                 struct zone *zone = &pgdat->node_zones[ZONE_DMA];
    6612             : 
    6613             :                 if (managed_zone(zone))
    6614             :                         return true;
    6615             :         }
    6616             :         return false;
    6617             : }
    6618             : #endif /* CONFIG_ZONE_DMA */
    6619             : 
    6620             : #ifdef CONFIG_UNACCEPTED_MEMORY
    6621             : 
    6622             : /* Counts number of zones with unaccepted pages. */
    6623             : static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages);
    6624             : 
    6625             : static bool lazy_accept = true;
    6626             : 
    6627             : static int __init accept_memory_parse(char *p)
    6628             : {
    6629             :         if (!strcmp(p, "lazy")) {
    6630             :                 lazy_accept = true;
    6631             :                 return 0;
    6632             :         } else if (!strcmp(p, "eager")) {
    6633             :                 lazy_accept = false;
    6634             :                 return 0;
    6635             :         } else {
    6636             :                 return -EINVAL;
    6637             :         }
    6638             : }
    6639             : early_param("accept_memory", accept_memory_parse);
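
The parameter above is set on the kernel command line; for example, to accept all of the guest's unaccepted memory up front instead of on demand, one would boot with something like:

        accept_memory=eager

The default, accept_memory=lazy, leaves pages on the per-zone unaccepted list that the helpers below work through.
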
    6640             : 
    6641             : static bool page_contains_unaccepted(struct page *page, unsigned int order)
    6642             : {
    6643             :         phys_addr_t start = page_to_phys(page);
    6644             :         phys_addr_t end = start + (PAGE_SIZE << order);
    6645             : 
    6646             :         return range_contains_unaccepted_memory(start, end);
    6647             : }
    6648             : 
    6649             : static void accept_page(struct page *page, unsigned int order)
    6650             : {
    6651             :         phys_addr_t start = page_to_phys(page);
    6652             : 
    6653             :         accept_memory(start, start + (PAGE_SIZE << order));
    6654             : }
    6655             : 
    6656             : static bool try_to_accept_memory_one(struct zone *zone)
    6657             : {
    6658             :         unsigned long flags;
    6659             :         struct page *page;
    6660             :         bool last;
    6661             : 
    6662             :         if (list_empty(&zone->unaccepted_pages))
    6663             :                 return false;
    6664             : 
    6665             :         spin_lock_irqsave(&zone->lock, flags);
    6666             :         page = list_first_entry_or_null(&zone->unaccepted_pages,
    6667             :                                         struct page, lru);
    6668             :         if (!page) {
    6669             :                 spin_unlock_irqrestore(&zone->lock, flags);
    6670             :                 return false;
    6671             :         }
    6672             : 
    6673             :         list_del(&page->lru);
    6674             :         last = list_empty(&zone->unaccepted_pages);
    6675             : 
    6676             :         __mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
    6677             :         __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
    6678             :         spin_unlock_irqrestore(&zone->lock, flags);
    6679             : 
    6680             :         accept_page(page, MAX_ORDER);
    6681             : 
    6682             :         __free_pages_ok(page, MAX_ORDER, FPI_TO_TAIL);
    6683             : 
    6684             :         if (last)
    6685             :                 static_branch_dec(&zones_with_unaccepted_pages);
    6686             : 
    6687             :         return true;
    6688             : }
    6689             : 
    6690             : static bool try_to_accept_memory(struct zone *zone, unsigned int order)
    6691             : {
    6692             :         long to_accept;
    6693             :         int ret = false;
    6694             : 
    6695             :         /* How much to accept to get to high watermark? */
    6696             :         to_accept = high_wmark_pages(zone) -
    6697             :                     (zone_page_state(zone, NR_FREE_PAGES) -
    6698             :                     __zone_watermark_unusable_free(zone, order, 0));
    6699             : 
    6700             :         /* Accept at least one page */
    6701             :         do {
    6702             :                 if (!try_to_accept_memory_one(zone))
    6703             :                         break;
    6704             :                 ret = true;
    6705             :                 to_accept -= MAX_ORDER_NR_PAGES;
    6706             :         } while (to_accept > 0);
    6707             : 
    6708             :         return ret;
    6709             : }
    6710             : 
    6711             : static inline bool has_unaccepted_memory(void)
    6712             : {
    6713             :         return static_branch_unlikely(&zones_with_unaccepted_pages);
    6714             : }
    6715             : 
    6716             : static bool __free_unaccepted(struct page *page)
    6717             : {
    6718             :         struct zone *zone = page_zone(page);
    6719             :         unsigned long flags;
    6720             :         bool first = false;
    6721             : 
    6722             :         if (!lazy_accept)
    6723             :                 return false;
    6724             : 
    6725             :         spin_lock_irqsave(&zone->lock, flags);
    6726             :         first = list_empty(&zone->unaccepted_pages);
    6727             :         list_add_tail(&page->lru, &zone->unaccepted_pages);
    6728             :         __mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
    6729             :         __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
    6730             :         spin_unlock_irqrestore(&zone->lock, flags);
    6731             : 
    6732             :         if (first)
    6733             :                 static_branch_inc(&zones_with_unaccepted_pages);
    6734             : 
    6735             :         return true;
    6736             : }
    6737             : 
    6738             : #else
    6739             : 
    6740             : static bool page_contains_unaccepted(struct page *page, unsigned int order)
    6741             : {
    6742             :         return false;
    6743             : }
    6744             : 
    6745             : static void accept_page(struct page *page, unsigned int order)
    6746             : {
    6747             : }
    6748             : 
    6749             : static bool try_to_accept_memory(struct zone *zone, unsigned int order)
    6750             : {
    6751             :         return false;
    6752             : }
    6753             : 
    6754             : static inline bool has_unaccepted_memory(void)
    6755             : {
    6756             :         return false;
    6757             : }
    6758             : 
    6759             : static bool __free_unaccepted(struct page *page)
    6760             : {
    6761             :         BUILD_BUG();
    6762             :         return false;
    6763             : }
    6764             : 
    6765             : #endif /* CONFIG_UNACCEPTED_MEMORY */

Generated by: LCOV version 1.14